INN commit: trunk/innd (keywords.c)

Sun Nov 1 16:56:19 UTC 2009

Date: Sunday, November 1, 2009 @ 08:56:18
  Author: iulius
Revision: 8697

When keyword generation was run on an article whose Keywords:
header was longer than the keylimit: parameter, innd crashed.
Fix this by replacing memcpy with strlcpy.

Modified:
  trunk/innd/keywords.c

------------+
 keywords.c |  110 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 53 insertions(+), 57 deletions(-)

Modified: keywords.c
===================================================================

--- keywords.c	2009-11-01 15:16:19 UTC (rev 8696)
+++ keywords.c	2009-11-01 16:56:18 UTC (rev 8697)
@@ -6,7 +6,7 @@
 **  order to provide better (scorable) OVER data, containing bits of article
 **  body content which have a reasonable expectation of utility.
 **
-**  Basic idea: Simple word-counting.  We find words in the article body,
+**  Basic idea:  simple word-counting.  We find words in the article body,
 **  separated by whitespace.  Remove punctuation.  Sort words, count unique
 **  words, sort those counts.  Write the resulting Keywords: header containing
 **  the poster's original Keywords: (if any) followed by a magic cookie
@@ -21,8 +21,9 @@
 #include "inn/innconf.h"
 #include "innd.h"
 
-/* If keyword support wasn't requested, stub out the main function provided by
-   this file. */
+/*  If keyword support wasn't requested, stub out the main function provided by
+**  this file.
+*/
 #if !DO_KEYWORDS
 void
 KEYgenerate(HDRCONTENT *header UNUSED, const char *body UNUSED,
@@ -33,15 +34,18 @@
 
 #else
 
-/* For regex-based common word elimination. */
+/*
+**  For regex-based common word elimination.
+*/
 #include <regex.h>
 
 #define	MIN_WORD_LENGTH	3	/* 1- and 2-char words don't count. */
-#define	MAX_WORD_LENGTH	28	/* fits "antidisestablishmentarianism". */
+#define	MAX_WORD_LENGTH	28	/* Fits "antidisestablishmentarianism". */
 
+
 /*
-** A trivial structure for keeping track of words via both
-** index to the overall word list and their counts.
+**  A trivial structure for keeping track of words via both
+**  index to the overall word list and their counts.
 */
 struct word_entry {
     int	index;
@@ -49,32 +53,32 @@
     int	count;
 };
 
+
 /*
-** Wrapper for qsort(3) comparison of word_entry (frequency).
+**  Wrapper for qsort(3) comparison of word_entry (frequency).
 */
-
 static int
 wvec_freq_cmp(const void *p1, const void *p2)
 {
-    return ((const struct word_entry *)p2)->count -	/* decreasing sort */
+    return ((const struct word_entry *)p2)->count -	/* Decreasing sort. */
            ((const struct word_entry *)p1)->count;
 }
 
+
 /*
-** Wrapper for qsort(3) comparison of word_entry (word length).
+**  Wrapper for qsort(3) comparison of word_entry (word length).
 */
-
 static int
 wvec_length_cmp(const void *p1, const void *p2)
 {
-    return ((const struct word_entry *)p2)->length -	/* decreasing sort */
+    return ((const struct word_entry *)p2)->length -	/* Decreasing sort. */
            ((const struct word_entry *)p1)->length;
 }
 
+
 /*
-** Wrapper for qsort(3), for pointer-to-pointer strings.
+**  Wrapper for qsort(3), for pointer-to-pointer strings.
 */
-
 static int
 ptr_strcmp(const void *p1, const void *p2)
 {
@@ -84,17 +88,18 @@
     return strcmp(*s1, *s2);
 }
 
+
 /*
-**  Build new Keywords.
+**  Build new Keywords: header.
 */
 
 void
 KEYgenerate(
-    HDRCONTENT	*hc,		/* header data */
-    const char	*body,		/* article body */
-    size_t      bodylen,	/* article body length */
-    const char	*v,		/* old kw value */
-    size_t	l)		/* old kw length */
+    HDRCONTENT	*hc,		/* Header data. */
+    const char	*body,		/* Article body. */
+    size_t      bodylen,	/* Article body length. */
+    const char	*v,		/* Old Keywords: value. */
+    size_t	l)		/* Old Keywords: length. */
 {
 
     int		word_count, word_length, word_index, distinct_words;
@@ -104,8 +109,7 @@
     static char		**word;
     static const char	*whitespace  = " \t\r\n";
 
-    /* ---------------------------------------------------------------- */
-    /* Prototype setup: Regex match preparation. */
+    /* Prototype setup:  regex match preparation. */
     static	int	regex_lib_init = 0;
     static	regex_t	preg;
     static const char	*elim_regexp = "^\\([-+/0-9][-+/0-9]*\\|.*1st\\|.*2nd\\|.*3rd\\|.*[04-9]th\\|about\\|after\\|ago\\|all\\|already\\|also\\|among\\|and\\|any\\|anybody\\|anyhow\\|anyone\\|anywhere\\|are\\|bad\\|because\\|been\\|before\\|being\\|between\\|but\\|can\\|could\\|did\\|does\\|doing\\|done\\|dont\\|during\\|eight\\|eighth\\|eleven\\|else\\|elsewhere\\|every\\|everywhere\\|few\\|five\\|fifth\\|first\\|for\\|four\\|fourth\\|from\\|get\\|going\\|gone\\|good\\|got\\|had\\|has\\|have\\|having\\|he\\|her\\|here\\|hers\\|herself\\|him\\|himself\\|his\\|how\\|ill\\|into\\|its\\|ive\\|just\\|kn[eo]w\\|least\\|less\\|let\\|like\\|look\\|many\\|may\\|more\\|m[ou]st\\|myself\\|next\\|nine\\|ninth\\|not\\|now\\|off\\|one\\|only\\|onto\\|our\\|out\\|over\\|really\\|said\\|saw\\|says\\|second\\|see\\|set\\|seven\\|seventh\\|several\\|shall\\|she\\|should\\|since\\|six\\|sixth\\|some\\|somehow\\|someone\\|something\\|somewhere\\|such\\|take\\|ten\\|tenth\\|than\\|that\\|the\\|their\\|them\\|then\\|there\\|therell\\|theres\\|these\\|they\\|thing\\|things\\|third\\|this\\|those\\|three\\|thus\\|together\\|told\\|too\\|twelve\\|two\\|under\\|upon\\|very\\|via\\|want\\|wants\\|was\\|wasnt\\|way\\|were\\|weve\\|what\\|whatever\\|when\\|where\\|wherell\\|wheres\\|whether\\|which\\|while\\|who\\|why\\|will\\|with\\|would\\|write\\|writes\\|wrote\\|yes\\|yet\\|you\\|your\\|youre\\|yourself\\)$";
@@ -129,24 +133,20 @@
 	    abort();
 	}
     }
-    /* ---------------------------------------------------------------- */
 
-    /* first re-init kw from original value.  this is a mostly arbitrary
-       cutoff leaving room for a minimal word vec */
-    if (l > (size_t) (innconf->keylimit - (MAX_WORD_LENGTH + 5)))
-        l = innconf->keylimit - (MAX_WORD_LENGTH + 5);
-    hc->Value = xmalloc(innconf->keylimit+1);
+    /* First re-init Keywords: from original value.  This is a mostly arbitrary
+     * cutoff leaving room for a minimal word vector. */
+    hc->Value = xmalloc(innconf->keylimit + 1);
     if ((v != NULL) && (*v != '\0')) {
-        memcpy(hc->Value, v, l);
-        hc->Value[l] = '\0';
+        if (l > (size_t) innconf->keylimit + 1)
+            l = innconf->keylimit + 1;
+        strlcpy(hc->Value, v, l);
     } else
         *hc->Value = '\0';
     l = hc->Length = strlen(hc->Value);
 
-    /*
-     * now figure acceptable extents, and copy body to working string.
-     * (Memory-intensive for hefty articles: limit to non-ABSURD articles.)
-     */
+    /* Now figure acceptable extents, and copy body to working string.
+     * (Memory-intensive for hefty articles:  limit to non-ABSURD articles.) */
     if ((bodylen < 100) || (bodylen > (size_t) innconf->keyartlimit)) /* Too small/big to bother. */
 	return;
 
@@ -158,30 +158,30 @@
 
     text_end = text + bodylen;
 
-    /* abusive punctuation stripping: turn it all into SPCs. */
+    /* Abusive punctuation stripping:  turn it all into spaces. */
     for (punc = text; *punc; punc++)
 	if (!CTYPE(isalpha, *punc))
 	    *punc = ' ';
 
-    /* move to first word. */
+    /* Move to first word. */
     text += strspn(text, whitespace);
     word_count = 0;
 
-    /* hunt down words */
-    while ((text < text_end) &&		/* while there might be words... */
+    /* Hunt down words. */
+    while ((text < text_end) &&		/* While there might be words... */
 	   (*text != '\0') &&
 	   (word_count < innconf->keymaxwords)) {
 
-	/* find a word. */
+	/* Find a word. */
 	word_length = strcspn(text, whitespace);
 	if (word_length == 0)
-	    break;			/* no words left */
+	    break;			/* No words left. */
 
-	/* bookkeep to save word location, then move through text. */
+	/* Bookkeep to save word location, then move through text. */
 	word[word_count++] = this_word = text;
 	text += word_length;
 	*(text++) = '\0';
-	text += strspn(text, whitespace);	/* move to next word. */
+	text += strspn(text, whitespace);	/* Move to next word. */
 
 	/* 1- and 2-char words don't count, nor do excessively long ones. */
 	if ((word_length < MIN_WORD_LENGTH) ||
@@ -190,7 +190,7 @@
 	    continue;
 	}
 
-	/* squash to lowercase. */
+	/* Squash to lowercase. */
 	for (chase = this_word; *chase; chase++)
 	    if (CTYPE(isupper, *chase))
 		*chase = tolower(*chase);
@@ -204,12 +204,12 @@
     qsort(word, word_count, sizeof(word[0]), ptr_strcmp);
 
     /* Count unique words. */
-    distinct_words = 0;			/* the 1st word is "pre-figured". */
+    distinct_words = 0;			/* The 1st word is "pre-figured". */
     word_vec[0].index = 0;
     word_vec[0].length = strlen(word[0]);
     word_vec[0].count = 1;
 
-    for (word_index = 1;		/* we compare (N-1)th and Nth words. */
+    for (word_index = 1;		/* We compare (N-1)th and Nth words. */
 	 word_index < word_count;
 	 word_index++) {
 	if (strcmp(word[word_index-1], word[word_index]) == 0)
@@ -223,11 +223,11 @@
     }
 
     /* Sort the counts. */
-    distinct_words++;			/* we were off-by-1 until this. */
+    distinct_words++;			/* We were off-by-1 until this. */
     qsort(word_vec, distinct_words, sizeof(struct word_entry), wvec_freq_cmp);
 
     /* Sub-sort same-frequency words on word length. */
-    for (last = 0, word_index = 1;	/* again, (N-1)th and Nth entries. */
+    for (last = 0, word_index = 1;	/* Again, (N-1)th and Nth entries. */
 	 word_index < distinct_words;
 	 word_index++) {
 	if (word_vec[last].count != word_vec[word_index].count) {
@@ -237,7 +237,7 @@
 	    last = word_index;
 	}
     }
-    /* do it one last time for the only-one-appearance words. */
+    /* Do it one last time for the only-one-appearance words. */
     if ((word_index - last) != 1)
 	qsort(&word_vec[last], word_index - last,
 	      sizeof(struct word_entry), wvec_length_cmp);
@@ -247,13 +247,12 @@
     for (chase = hc->Value + l + 2, word_index = 0;
 	 word_index < distinct_words;
 	 word_index++) {
-	/* ---------------------------------------------------------------- */
-	/* "noise" words don't count */
+	
+        /* "noise" words don't count. */
 	if (regexec(&preg, word[word_vec[word_index].index], 0, NULL, 0) == 0)
 	    continue;
-	/* ---------------------------------------------------------------- */
 
-	/* add to list. */
+	/* Add to list. */
 	*chase++ = ',';
 	strlcpy(chase, word[word_vec[word_index].index],
                 innconf->keylimit + 1 - (chase - hc->Value));
@@ -262,10 +261,7 @@
 	if (chase - hc->Value > (innconf->keylimit - (MAX_WORD_LENGTH + 4)))
 	    break;
     }
-    /* note #words we didn't get to add. */
-    /* This code can potentially lead to a buffer overflow if the number of
-       ignored words is greater than 100, under some circumstances.  It's
-       temporarily disabled until fixed. */
+
     hc->Length = strlen(hc->Value);
 
 out: