INN commit: trunk (6 files)

INN Commit Russ_Allbery at isc.org
Mon Mar 24 09:16:04 UTC 2008


    Date: Monday, March 24, 2008 @ 02:16:03
  Author: iulius
Revision: 7718

Implementation of the Diablo hashfeed algorithm.  It allows
to distribute the messages among several peers (new Q flag
for newsfeeds).

Thanks to Miquel van Smoorenburg for this implementation in INN.


Also fix inncheck after commit 7713 for the legacy Diablo
quickhash (which uses the Q flag too).

Modified:
  trunk/doc/pod/newsfeeds.pod
  trunk/innd/art.c
  trunk/innd/innd.h
  trunk/innd/newsfeeds.c
  trunk/innd/site.c
  trunk/scripts/inncheck.in

-----------------------+
 doc/pod/newsfeeds.pod |   47 ++++++++++++++++--------
 innd/art.c            |   92 +++++++++++++++++++++++++++++++++++++++---------
 innd/innd.h           |   19 ++++++---
 innd/newsfeeds.c      |   41 ++++++++++++++++-----
 innd/site.c           |   11 +++--
 scripts/inncheck.in   |    1 
 6 files changed, 159 insertions(+), 52 deletions(-)

Modified: doc/pod/newsfeeds.pod
===================================================================
--- doc/pod/newsfeeds.pod	2008-03-23 17:59:00 UTC (rev 7717)
+++ doc/pod/newsfeeds.pod	2008-03-24 09:16:03 UTC (rev 7718)
@@ -386,31 +386,48 @@
 
 =item B<Q> I<hashfeed>
 
-Specifies the I<hashfeed> value for this site.  The Message-ID of the
-article is hashed using a quick (but dirty) hash method (31-bits sum
-of the value of all characters).  The I<hashfeed> value must be given
-as C<value/mod> or as C<start-end/mod>.  If the Message-ID hash modulus
-C<mod> plus one equals C<value> or is between C<start> and C<end>,
-the article will be fed to this site.  All these numbers must be integers.
+Specifies the I<hashfeed> match expression for this site.  It must be in
+the form C<value/mod> or C<start-end/mod>.  The Message-ID of the article
+is hashed using MD5, which results in a 128-bit hash.  The lowest
+S<32 bits> are then taken by default as the hashfeed value (which is an
+integer).  If the hashfeed value modulus C<mod> plus one equals C<value> or
+is between C<start> and C<end>, the article will be fed to this site.  All
+these numbers must be integers.
 
+It is a deterministic way to control the flow of articles and to split a feed.
 For instance:
 
-    Q1/2      Feeds about 50% of the messages to this site.
-    Q2/2      Feeds the other 50% of the messages.
+    Q1/2      Feeds about 50% of all articles to this site.
+    Q2/2      Feeds the other 50% of all articles.
 
 Another example with three sites:
 
-    Q1-3/10   Feeds about 30% of the messages.
-    Q4-5/10   Feeds about 20% of the messages.
-    Q6-10/10  Feeds about 50% of the messages.
+    Q1-3/10   Feeds about 30% of all articles.
+    Q4-5/10   Feeds about 20% of all articles.
+    Q6-10/10  Feeds about 50% of all articles.
 
 If this flag is specified multiple times, the contents will be
-logically C<OR>ed together (just one matched flag is needed).
+logically C<OR>ed together (just one match is needed).
 
-The algorithm is compatible with the legacy Quickhash used by Diablo.
-Please note that the distribution of the messages is not perfect with
-this algorithm.
+You can use an extended syntax of the form C<value/mod_offset> or
+C<start-end/mod_offset>.  As MD5 generates a 128-bit return value,
+it is possible to specify from which byte-offset the 32-bit integer
+used by hashfeed starts.  The default value for C<offset> is C<_0>
+and thirteen overlapping values from C<_0> to C<_12> can be used.
+Only up to four totally independent values exist:  C<_0>, C<_4>,
+C<_8> and C<_12>.
 
+Therefore, it allows to a generate a second level of deterministic
+distribution.  Indeed, if a news server is fed C<Q1/2>, it can go on
+splitting thanks to C<Q1-3/9_4> for instance.
+
+The algorithm is compatible with the one used by S<Diablo 5.1> and up.
+If you want to use the legacy quickhashing method used by Diablo
+before 5.1, you can put an C<@> sign just after the B<Q> flag (for
+instance C<Q at 1-3/10>, but the distribution of the messages is not
+perfect with this legacy method whose use is discouraged and for
+which offsets cannot be used).
+
 =item B<S> I<size>
 
 If the amount of data queued for the site gets to be larger than I<size>

Modified: innd/art.c
===================================================================
--- innd/art.c	2008-03-23 17:59:00 UTC (rev 7717)
+++ innd/art.c	2008-03-24 09:16:03 UTC (rev 7718)
@@ -8,10 +8,11 @@
 #include <sys/uio.h>
 
 #include "inn/innconf.h"
+#include "inn/md5.h"
+#include "inn/ov.h"
+#include "inn/storage.h"
 #include "inn/wire.h"
 #include "innd.h"
-#include "inn/ov.h"
-#include "inn/storage.h"
 
 typedef struct iovec	IOVEC;
 
@@ -1538,23 +1539,81 @@
 }
 
 /*
-**  Return true if an element of the QHASHLIST matches the
-**  quickhash of the Message-ID.
+**  Even though we have already calculated the Message-ID MD5sum,
+**  we have to do it again since unfortunately HashMessageID()
+**  lowercases the Message-ID first.  We also need to remain
+**  compatible with Diablo's hashfeed.
 */
-static bool
-QHashMatch(QHASHLIST *qh, char *MessageID)
+static unsigned int
+HashFeedMD5(char *MessageID, unsigned int offset)
 {
-  unsigned char *p = (unsigned char *)MessageID;
-  int h = 0;
+  static char LastMessageID[128];
+  static char *LastMessageIDPtr;
+  static struct md5_context context;
+  unsigned int ret;
 
+  if (offset > 12)
+    return 0;
+
+  /* Some light caching. */
+  if (MessageID != LastMessageIDPtr ||
+      strcmp(MessageID, LastMessageID) != 0) {
+    md5_init(&context);
+    md5_update(&context, (unsigned char *)MessageID, strlen(MessageID));
+    md5_final(&context);
+    LastMessageIDPtr = MessageID;
+    strncpy(LastMessageID, MessageID, sizeof(LastMessageID) - 1);
+    LastMessageID[sizeof(LastMessageID) - 1] = 0;
+  }
+
+  memcpy(&ret, &context.digest[12 - offset], 4);
+
+  return ntohl(ret);
+}
+
+/*
+** Old-style Diablo (< 5.1) quickhash.
+*/
+static unsigned int
+HashFeedQH(char *MessageID, unsigned int *tmp)
+{
+  unsigned char *p;
+  int n;
+
+  if (*tmp != (unsigned int)-1)
+    return *tmp;
+
+  p = (unsigned char *)MessageID;
+  n = 0;
   while (*p)
-    h += *p++;
+    n += *p++;
+  *tmp = (unsigned int)n;
 
-  while (qh) {
-    if ((h % qh->mod + 1) >= qh->begin &&
-        (h % qh->mod + 1) <= qh->end)
-          return true;
-    qh = qh->next;
+  return *tmp;
+}
+
+/*
+**  Return true if an element of the HASHFEEDLIST matches
+**  the hash of the Message-ID.
+*/
+static bool
+HashFeedMatch(HASHFEEDLIST *hf, char *MessageID)
+{
+  unsigned int qh = (unsigned int)-1;
+  unsigned int h;
+
+  while (hf) {
+    if (hf->type == HASHFEED_MD5)
+      h = HashFeedMD5(MessageID, hf->offset);
+    else if (hf->type == HASHFEED_QH)
+      h = HashFeedQH(MessageID, &qh);
+    else
+      continue;
+
+    if ((h % hf->mod + 1) >= hf->begin &&
+        (h % hf->mod + 1) <= hf->end)
+      return true;
+    hf = hf->next;
   }
 
   return false;
@@ -1633,8 +1692,9 @@
        * cross-posting. */
       continue;
 
-    if (sp->QHashList && !QHashMatch(sp->QHashList, HDR(HDR__MESSAGE_ID)))
-      /* Quickhash doesn't match. */
+    if (sp->HashFeedList
+      && !HashFeedMatch(sp->HashFeedList, HDR(HDR__MESSAGE_ID)))
+      /* Hashfeed doesn't match. */
       continue;
 
     if (list && *list != NULL && sp->Distributions &&

Modified: innd/innd.h
===================================================================
--- innd/innd.h	2008-03-23 17:59:00 UTC (rev 7717)
+++ innd/innd.h	2008-03-24 09:16:03 UTC (rev 7718)
@@ -401,13 +401,18 @@
 /*
 **  Diablo-style hashed feeds or hashfeeds.
 */
-typedef struct _QHASHLIST {
-  int             begin;
-  int             end;
-  int             mod;
-  struct _QHASHLIST   *next;
-} QHASHLIST;
+#define HASHFEED_QH 1
+#define HASHFEED_MD5 2
 
+typedef struct _HASHFEEDLIST {
+  int type;
+  unsigned int begin;
+  unsigned int end;
+  unsigned int mod;
+  unsigned int offset;
+  struct _HASHFEEDLIST *next;
+} HASHFEEDLIST;
+
 /*
 **  A site may reject something in its subscription list if it has
 **  too many hops, or a bad distribution.
@@ -460,7 +465,7 @@
   struct buffer	  Buffer;
   bool		  Buffered;
   char	      **  Originator;
-  QHASHLIST    *  QHashList;
+  HASHFEEDLIST *  HashFeedList;
   int		  Next;
   int		  Prev;
 } SITE;

Modified: innd/newsfeeds.c
===================================================================
--- innd/newsfeeds.c	2008-03-23 17:59:00 UTC (rev 7717)
+++ innd/newsfeeds.c	2008-03-24 09:16:03 UTC (rev 7718)
@@ -448,7 +448,7 @@
     int			isp;
     SITE		*nsp;
     struct buffer	b;
-    QHASHLIST           *qh;
+    HASHFEEDLIST        *hf;
 
     b = sp->Buffer;
     *sp = SITEnull;
@@ -468,7 +468,7 @@
     sp->NeedOverviewCreation = false;
     sp->FeedwithoutOriginator = false;
     sp->DropFiltered = false;
-    sp->QHashList = NULL;
+    sp->HashFeedList = NULL;
 
     /* Nip off the first field, the site name. */
     if ((f2 = strchr(Entry, NF_FIELD_SEP)) == NULL)
@@ -601,16 +601,37 @@
                 sp->Nice = atoi(p);
             break;
         case 'Q':
-            qh = xmalloc(sizeof(QHASHLIST));
+            hf = xmalloc(sizeof(HASHFEEDLIST));
             p++;
-            if (sscanf(p, "%d/%d", &qh->begin, &qh->mod) == 2) {
-                qh->end = qh->begin;
-            } else if (sscanf(p, "%d-%d/%d", &qh->begin, &qh->end, &qh->mod) != 3) {
-                free(qh);
-                return "hash for Q not in x/z or x-y/z format";
+            /* Check whether it is a quickhash or a MD5 hashfeed. */
+            if (*p == '@') {
+                p++;
+                hf->type = HASHFEED_QH;
+            } else {
+                hf->type = HASHFEED_MD5;
             }
-            qh->next = sp->QHashList;
-            sp->QHashList = qh;
+            /* Check the presence of a starting byte-offset for hashfeed. */
+            if ((u = strchr(p, '_')) != NULL) {
+                if (sscanf(u + 1, "%u", &hf->offset) != 1 || hf->offset > 12) {
+                    free(hf);
+                    return "invalid hash offset for Q param in field 3";
+                }
+            } else {
+                hf->offset = 0;
+            }
+            if (sscanf(p, "%u/%u", &hf->begin, &hf->mod) == 2) {
+                hf->end = hf->begin;
+            } else if (sscanf(p, "%u-%u/%u", &hf->begin, &hf->end,
+                              &hf->mod) != 3) {
+                free(hf);
+                return "hash not in x/z or x-y/z format for Q param in field 3";
+            }
+            if (hf->begin > hf->end || hf->end > hf->mod) {
+                free(hf);
+                return "incorrect hash values for Q param in field 3";
+            }
+            hf->next = sp->HashFeedList;
+            sp->HashFeedList = hf;
             break;
 	case 'S':
 	    if (*++p && CTYPE(isdigit, *p))

Modified: innd/site.c
===================================================================
--- innd/site.c	2008-03-23 17:59:00 UTC (rev 7717)
+++ innd/site.c	2008-03-24 09:16:03 UTC (rev 7718)
@@ -1005,7 +1005,7 @@
 SITEfree(SITE *sp)
 {
     SITE                *s;
-    QHASHLIST           *qh, *qn;
+    HASHFEEDLIST        *hf, *hn;
     int                 new;
     int                 i;
     
@@ -1060,9 +1060,12 @@
 	sp->FNLnames.data = NULL;
 	sp->FNLnames.size = 0;
     }
-    for (qh = sp->QHashList; qh; qh = qn) {
-        qn = qh->next;
-        free(qh);
+    if (sp->HashFeedList) {
+        for (hf = sp->HashFeedList; hf; hf = hn) {
+            hn = hf->next;
+            free(hf);
+        }
+        sp->HashFeedList = NULL;
     }
 
     /* If this site was a master, find a new one. */

Modified: scripts/inncheck.in
===================================================================
--- scripts/inncheck.in	2008-03-23 17:59:00 UTC (rev 7717)
+++ scripts/inncheck.in	2008-03-24 09:16:03 UTC (rev 7718)
@@ -360,6 +360,7 @@
     'N',	'^[mu]$',
     'O',	'^\S+$',
     'P',	'^\d+$',
+    'Q',	'^@?\d+(-\d+)?/\d+(_\d+)?$',
     'S',	'^\d+$',
     'T',	'^[cflmpx]$',
     'U',        '^\d+$',



More information about the inn-committers mailing list