INN commit: branches/2.5/innd (keywords.c)
INN Commit
rra at isc.org
Sun Nov 15 09:22:57 UTC 2009
Date: Sunday, November 15, 2009 @ 01:22:56
Author: iulius
Revision: 8773
When keyword generation was run on an article whose Keywords: header
was longer than the keylimit: parameter, innd crashed.
Change memcpy to strlcpy.
Modified:
branches/2.5/innd/keywords.c
------------+
keywords.c | 110 ++++++++++++++++++++++++++++-------------------------------
1 file changed, 53 insertions(+), 57 deletions(-)
Modified: keywords.c
===================================================================
--- keywords.c 2009-11-15 09:22:45 UTC (rev 8772)
+++ keywords.c 2009-11-15 09:22:56 UTC (rev 8773)
@@ -6,7 +6,7 @@
** order to provide better (scorable) OVER data, containing bits of article
** body content which have a reasonable expectation of utility.
**
-** Basic idea: Simple word-counting. We find words in the article body,
+** Basic idea: simple word-counting. We find words in the article body,
** separated by whitespace. Remove punctuation. Sort words, count unique
** words, sort those counts. Write the resulting Keywords: header containing
** the poster's original Keywords: (if any) followed by a magic cookie
@@ -21,8 +21,9 @@
#include "inn/innconf.h"
#include "innd.h"
-/* If keyword support wasn't requested, stub out the main function provided by
- this file. */
+/* If keyword support wasn't requested, stub out the main function provided by
+** this file.
+*/
#if !DO_KEYWORDS
void
KEYgenerate(HDRCONTENT *header UNUSED, const char *body UNUSED,
@@ -33,15 +34,18 @@
#else
-/* For regex-based common word elimination. */
+/*
+** For regex-based common word elimination.
+*/
#include <regex.h>
#define MIN_WORD_LENGTH 3 /* 1- and 2-char words don't count. */
-#define MAX_WORD_LENGTH 28 /* fits "antidisestablishmentarianism". */
+#define MAX_WORD_LENGTH 28 /* Fits "antidisestablishmentarianism". */
+
/*
-** A trivial structure for keeping track of words via both
-** index to the overall word list and their counts.
+** A trivial structure for keeping track of words via both
+** index to the overall word list and their counts.
*/
struct word_entry {
int index;
@@ -49,32 +53,32 @@
int count;
};
+
/*
-** Wrapper for qsort(3) comparison of word_entry (frequency).
+** Wrapper for qsort(3) comparison of word_entry (frequency).
*/
-
static int
wvec_freq_cmp(const void *p1, const void *p2)
{
- return ((const struct word_entry *)p2)->count - /* decreasing sort */
+ return ((const struct word_entry *)p2)->count - /* Decreasing sort. */
((const struct word_entry *)p1)->count;
}
+
/*
-** Wrapper for qsort(3) comparison of word_entry (word length).
+** Wrapper for qsort(3) comparison of word_entry (word length).
*/
-
static int
wvec_length_cmp(const void *p1, const void *p2)
{
- return ((const struct word_entry *)p2)->length - /* decreasing sort */
+ return ((const struct word_entry *)p2)->length - /* Decreasing sort. */
((const struct word_entry *)p1)->length;
}
+
/*
-** Wrapper for qsort(3), for pointer-to-pointer strings.
+** Wrapper for qsort(3), for pointer-to-pointer strings.
*/
-
static int
ptr_strcmp(const void *p1, const void *p2)
{
@@ -84,17 +88,18 @@
return strcmp(*s1, *s2);
}
+
/*
-** Build new Keywords.
+** Build new Keywords: header.
*/
void
KEYgenerate(
- HDRCONTENT *hc, /* header data */
- const char *body, /* article body */
- size_t bodylen, /* article body length */
- const char *v, /* old kw value */
- size_t l) /* old kw length */
+ HDRCONTENT *hc, /* Header data. */
+ const char *body, /* Article body. */
+ size_t bodylen, /* Article body length. */
+ const char *v, /* Old Keywords: value. */
+ size_t l) /* Old Keywords: length. */
{
int word_count, word_length, word_index, distinct_words;
@@ -104,8 +109,7 @@
static char **word;
static const char *whitespace = " \t\r\n";
- /* ---------------------------------------------------------------- */
- /* Prototype setup: Regex match preparation. */
+ /* Prototype setup: regex match preparation. */
static int regex_lib_init = 0;
static regex_t preg;
static const char *elim_regexp = "^\\([-+/0-9][-+/0-9]*\\|.*1st\\|.*2nd\\|.*3rd\\|.*[04-9]th\\|about\\|after\\|ago\\|all\\|already\\|also\\|among\\|and\\|any\\|anybody\\|anyhow\\|anyone\\|anywhere\\|are\\|bad\\|because\\|been\\|before\\|being\\|between\\|but\\|can\\|could\\|did\\|does\\|doing\\|done\\|dont\\|during\\|eight\\|eighth\\|eleven\\|else\\|elsewhere\\|every\\|everywhere\\|few\\|five\\|fifth\\|first\\|for\\|four\\|fourth\\|from\\|get\\|going\\|gone\\|good\\|got\\|had\\|has\\|have\\|having\\|he\\|her\\|here\\|hers\\|herself\\|him\\|himself\\|his\\|how\\|ill\\|into\\|its\\|ive\\|just\\|kn[eo]w\\|least\\|less\\|let\\|like\\|look\\|many\\|may\\|more\\|m[ou]st\\|myself\\|next\\|nine\\|ninth\\|not\\|now\\|off\\|one\\|only\\|onto\\|our\\|out\\|over\\|really\\|said\\|saw\\|says\\|second\\|see\\|set\\|seven\\|seventh\\|several\\|shall\\|she\\|should\\|since\\|six\\|sixth\\|some\\|somehow\\|someone\\|something\\|somewhere\\|such\\|take\\|ten\\|tenth\\|than\\|that\\|the\\|their\\!|them\\|then\\|there\\|therell\\|theres\\|these\\|they\\|thing\\|things\\|third\\|this\\|those\\|three\\|thus\\|together\\|told\\|too\\|twelve\\|two\\|under\\|upon\\|very\\|via\\|want\\|wants\\|was\\|wasnt\\|way\\|were\\|weve\\|what\\|whatever\\|when\\|where\\|wherell\\|wheres\\|whether\\|which\\|while\\|who\\|why\\|will\\|with\\|would\\|write\\|writes\\|wrote\\|yes\\|yet\\|you\\|your\\|youre\\|yourself\\)$";
@@ -129,24 +133,20 @@
abort();
}
}
- /* ---------------------------------------------------------------- */
- /* first re-init kw from original value. this is a mostly arbitrary
- cutoff leaving room for a minimal word vec */
- if (l > (size_t) (innconf->keylimit - (MAX_WORD_LENGTH + 5)))
- l = innconf->keylimit - (MAX_WORD_LENGTH + 5);
- hc->Value = xmalloc(innconf->keylimit+1);
+ /* First re-init Keywords: from original value. This is a mostly arbitrary
+ * cutoff leaving room for a minimal word vector. */
+ hc->Value = xmalloc(innconf->keylimit + 1);
if ((v != NULL) && (*v != '\0')) {
- memcpy(hc->Value, v, l);
- hc->Value[l] = '\0';
+ if (l > (size_t) innconf->keylimit + 1)
+ l = innconf->keylimit + 1;
+ strlcpy(hc->Value, v, l);
} else
*hc->Value = '\0';
l = hc->Length = strlen(hc->Value);
- /*
- * now figure acceptable extents, and copy body to working string.
- * (Memory-intensive for hefty articles: limit to non-ABSURD articles.)
- */
+ /* Now figure acceptable extents, and copy body to working string.
+ * (Memory-intensive for hefty articles: limit to non-ABSURD articles.) */
if ((bodylen < 100) || (bodylen > (size_t) innconf->keyartlimit)) /* Too small/big to bother. */
return;
@@ -158,30 +158,30 @@
text_end = text + bodylen;
- /* abusive punctuation stripping: turn it all into SPCs. */
+ /* Abusive punctuation stripping: turn it all into spaces. */
for (punc = text; *punc; punc++)
if (!CTYPE(isalpha, *punc))
*punc = ' ';
- /* move to first word. */
+ /* Move to first word. */
text += strspn(text, whitespace);
word_count = 0;
- /* hunt down words */
- while ((text < text_end) && /* while there might be words... */
+ /* Hunt down words. */
+ while ((text < text_end) && /* While there might be words... */
(*text != '\0') &&
(word_count < innconf->keymaxwords)) {
- /* find a word. */
+ /* Find a word. */
word_length = strcspn(text, whitespace);
if (word_length == 0)
- break; /* no words left */
+ break; /* No words left. */
- /* bookkeep to save word location, then move through text. */
+ /* Bookkeep to save word location, then move through text. */
word[word_count++] = this_word = text;
text += word_length;
*(text++) = '\0';
- text += strspn(text, whitespace); /* move to next word. */
+ text += strspn(text, whitespace); /* Move to next word. */
/* 1- and 2-char words don't count, nor do excessively long ones. */
if ((word_length < MIN_WORD_LENGTH) ||
@@ -190,7 +190,7 @@
continue;
}
- /* squash to lowercase. */
+ /* Squash to lowercase. */
for (chase = this_word; *chase; chase++)
if (CTYPE(isupper, *chase))
*chase = tolower(*chase);
@@ -204,12 +204,12 @@
qsort(word, word_count, sizeof(word[0]), ptr_strcmp);
/* Count unique words. */
- distinct_words = 0; /* the 1st word is "pre-figured". */
+ distinct_words = 0; /* The 1st word is "pre-figured". */
word_vec[0].index = 0;
word_vec[0].length = strlen(word[0]);
word_vec[0].count = 1;
- for (word_index = 1; /* we compare (N-1)th and Nth words. */
+ for (word_index = 1; /* We compare (N-1)th and Nth words. */
word_index < word_count;
word_index++) {
if (strcmp(word[word_index-1], word[word_index]) == 0)
@@ -223,11 +223,11 @@
}
/* Sort the counts. */
- distinct_words++; /* we were off-by-1 until this. */
+ distinct_words++; /* We were off-by-1 until this. */
qsort(word_vec, distinct_words, sizeof(struct word_entry), wvec_freq_cmp);
/* Sub-sort same-frequency words on word length. */
- for (last = 0, word_index = 1; /* again, (N-1)th and Nth entries. */
+ for (last = 0, word_index = 1; /* Again, (N-1)th and Nth entries. */
word_index < distinct_words;
word_index++) {
if (word_vec[last].count != word_vec[word_index].count) {
@@ -237,7 +237,7 @@
last = word_index;
}
}
- /* do it one last time for the only-one-appearance words. */
+ /* Do it one last time for the only-one-appearance words. */
if ((word_index - last) != 1)
qsort(&word_vec[last], word_index - last,
sizeof(struct word_entry), wvec_length_cmp);
@@ -247,13 +247,12 @@
for (chase = hc->Value + l + 2, word_index = 0;
word_index < distinct_words;
word_index++) {
- /* ---------------------------------------------------------------- */
- /* "noise" words don't count */
+
+ /* "noise" words don't count. */
if (regexec(&preg, word[word_vec[word_index].index], 0, NULL, 0) == 0)
continue;
- /* ---------------------------------------------------------------- */
- /* add to list. */
+ /* Add to list. */
*chase++ = ',';
strlcpy(chase, word[word_vec[word_index].index],
innconf->keylimit + 1 - (chase - hc->Value));
@@ -262,10 +261,7 @@
if (chase - hc->Value > (innconf->keylimit - (MAX_WORD_LENGTH + 4)))
break;
}
- /* note #words we didn't get to add. */
- /* This code can potentially lead to a buffer overflow if the number of
- ignored words is greater than 100, under some circumstances. It's
- temporarily disabled until fixed. */
+
hc->Length = strlen(hc->Value);
out:
More information about the inn-committers
mailing list