PATCH: diablo style hashfeeds

"Miquel van Smoorenburg" list-inn-workers at news.cistron.nl
Mon Mar 24 00:18:34 UTC 2008


In article <D837E9C8EA3F4A50BB80C3F1B49ED58D at Iulius>,
Julien ÉLIE  <julien at trigofacile.com> wrote:
>Hi Miquel,
>
>It should be:
>
>    md5_update(&context, (unsigned char *)MessageID, strlen(MessageID));
[..]
>I would put the quickhash calculation in the condition
>(hf->type == HASHFEED_QH) because it is otherwise always computed.
[..]
>Hmm... Something is then missing in the documentation:
[..]
>        return "incorrect hash values for Q";
[..]
>+    'Q',       '^@?\d(-\d)?/\d(:\d)?$',
[..]

Thanks for all the suggestions. A revised version is attached. I
took your suggestion to change the offset separation character
from ':' to '_'. In fact I had already changed it to ';' in my
development version - but forgot to update the patch. I like ';'
more, but in case INN ever moves to bind-like syntax for config
files the ';' might not be a good choice.

diff -ruN t/inn-2.4.3/doc/pod/newsfeeds.pod inn-2.4.3/doc/pod/newsfeeds.pod
--- t/inn-2.4.3/doc/pod/newsfeeds.pod	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/doc/pod/newsfeeds.pod	2008-03-24 01:06:28.000000000 +0100
@@ -382,6 +382,43 @@
 new process will run with.  This flag can be used to raise the priority to
 normal if you're using the I<nicekids> parameter in F<inn.conf>.
 
+=item B<Q> I<hashfeed>
+
+The hashfeed match expression for this site. It must be in the form
+C<value/mod> or C<start-end/mod>. The message-id of the article is
+hashed using MD5, which results in a 128 bit hash. The lowest 32 bits
+(by default) are then taken as the C<hashfeed value>. If the hashfeed
+value modulus C<mod> plus one equals C<value> or is between C<start>
+and C<end> (inclusive), the article will be fed to the site.
+
+Example:
+
+    Q1/2     Feeds about 50% of all articles to this site
+    Q2/2     Feeds the other 50% of all articles
+
+    Q1-3/10  Feeds about 30% of all articles
+    Q4-5/10  Feeds about 20% of all articles
+    Q6-10/10 Feeds about 50% of all articles
+
+If this flag is specified multiple times the contents will be
+logically ORed together (just one match needed).
+
+You can use an extended syntax of the form C<start-end/mod_offset>.
+As MD5 generates a 128-bit return value, it is possible to specify
+from which byte-offset the 32-bit integer used by hashfeed starts.
+The default for C<offset> is C<_0> and thirteen overlapping values
+from C<_0> to C<_12> can be used. Only four totally independent
+values exist: C<_0>, C<_4>, C<_8> and C<_12>.
+
+Therefore, it allows generating a second level of deterministic
+distribution.  Indeed, if a news server is fed C<Q1/2>, it can go on
+splitting thanks to C<Q1-3/9_4> for instance.
+
+The algorithm is compatible with the one used by Diablo 5.1 and up.
+If you want to use the hashing method Diablo used before 5.1,
+put an '@' sign just after the 'Q' (for example: Q@1/2). Note that
+offsets can't be used with the older hashing method.
+
 =item B<S> I<size>
 
 If the amount of data queued for the site gets to be larger than I<size>
diff -ruN t/inn-2.4.3/innd/art.c inn-2.4.3/innd/art.c
--- t/inn-2.4.3/innd/art.c	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/innd/art.c	2008-03-24 00:54:08.000000000 +0100
@@ -9,6 +9,7 @@
 
 #include "inn/innconf.h"
 #include "inn/wire.h"
+#include "inn/md5.h"
 #include "innd.h"
 #include "ov.h"
 #include "storage.h"
@@ -1553,6 +1562,88 @@
 }
 
 /*
+**  Even though we have already calculated the message-id MD5sum,
+**  we have to do it again since unfortunately HashMessageID()
+**  lowercases the message-id first.
+*/
+
+static unsigned int
+HashFeedMD5(char *MessageID, unsigned int offset)
+{
+  /* One-entry cache: last ID hashed, where it lived, and its MD5 state. */
+  static char CachedID[128];
+  static char *CachedPtr;
+  static struct md5_context md5;
+  unsigned int word;
+
+  /* A 4-byte window into the 16-byte digest can only start at 0..12. */
+  if (offset > 12)
+    return 0;
+
+  if (!(MessageID == CachedPtr && strcmp(MessageID, CachedID) == 0)) {
+    md5_init(&md5);
+    md5_update(&md5, (unsigned char *)MessageID, strlen(MessageID));
+    md5_final(&md5);
+    CachedPtr = MessageID;
+    strncpy(CachedID, MessageID, sizeof(CachedID) - 1);
+    CachedID[sizeof(CachedID) - 1] = 0;
+  }
+
+  /* Offset 0 picks the last four digest bytes; they are read big-endian. */
+  memcpy(&word, &md5.digest[12 - offset], 4);
+  return ntohl(word);
+}
+
+/*
+**  Old-style Diablo quickhash (the method Diablo used before 5.1): the
+**  sum of the bytes of the message-id.  *tmp caches the result; the
+**  value (unsigned int)-1 in *tmp means "not computed yet".
+*/
+static unsigned int
+HashFeedQH(char *MessageID, unsigned int *tmp)
+{
+  const unsigned char *p;
+  unsigned int sum = 0;
+
+  /* Reuse the sum if an earlier call already stored it in *tmp. */
+  if (*tmp != (unsigned int)-1)
+    return *tmp;
+
+  /* Read bytes as unsigned char so 8-bit characters are not negative. */
+  for (p = (const unsigned char *)MessageID; *p != '\0'; p++)
+    sum += *p;
+  *tmp = sum;
+
+  return sum;
+}
+
+/*
+**  Return true if an element of the HASHFEEDLIST matches the hash of the
+**  message-id, i.e. (hash % mod) + 1 lies in [begin, end].  Elements of
+**  unknown type, or with mod == 0 (h % 0 is undefined), never match.
+*/
+static bool
+HashFeedMatch(HASHFEEDLIST *hf, char *MessageID)
+{
+  unsigned int qh = (unsigned int)-1;	/* quickhash not computed yet */
+  unsigned int h, bucket;
+
+  /* Step in the for-header so that "continue" still advances the list. */
+  for (; hf != NULL; hf = hf->next) {
+    if (hf->mod == 0
+        || (hf->type != HASHFEED_MD5 && hf->type != HASHFEED_QH))
+      continue;
+    h = (hf->type == HASHFEED_MD5) ? HashFeedMD5(MessageID, hf->offset)
+                                   : HashFeedQH(MessageID, &qh);
+    bucket = h % hf->mod + 1;
+    if (bucket >= hf->begin && bucket <= hf->end)
+      return true;
+  }
+
+  return false;
+}
+
+/*
 **  Propagate an article to the sites have "expressed an interest."
 */
 static void
@@ -1625,6 +1716,11 @@
        * cross-posting. */
       continue;
 
+    if (sp->HashFeedList &&
+      !HashFeedMatch(sp->HashFeedList, HDR(HDR__MESSAGE_ID)))
+      /* hashfeed doesn't match */
+      continue;
+
     if (list && *list != NULL && sp->Distributions &&
       !DISTwantany(sp->Distributions, list))
       /* Not in the site's desired list of distributions. */
diff -ruN t/inn-2.4.3/innd/innd.h inn-2.4.3/innd/innd.h
--- t/inn-2.4.3/innd/innd.h	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/innd/innd.h	2008-03-24 00:52:29.000000000 +0100
@@ -407,6 +409,22 @@
 
 
 /*
+**  Diablo-style hashed feeds or hashfeeds.
+*/
+#define HASHFEED_QH	1	/* old Diablo quickhash, selected by "Q@" */
+#define HASHFEED_MD5	2	/* MD5 of the message-id, the default */
+
+typedef struct _HASHFEEDLIST {
+  int			type;	/* HASHFEED_QH or HASHFEED_MD5 */
+  unsigned int		begin;	/* first matching value (counted from 1) */
+  unsigned int		end;	/* last matching value, inclusive */
+  unsigned int		mod;	/* modulus applied to the hash */
+  unsigned int		offset;	/* byte offset into the MD5 digest, 0..12 */
+  struct _HASHFEEDLIST	*next;	/* next entry of the site's list, or NULL */
+} HASHFEEDLIST;
+
+
+/*
 **  A site may reject something in its subscription list if it has
 **  too many hops, or a bad distribution.
 */
@@ -458,6 +476,7 @@
   struct buffer	  Buffer;
   bool		  Buffered;
   char	      **  Originator;
+  HASHFEEDLIST *  HashFeedList;
   int		  Next;
   int		  Prev;
 } SITE;
diff -ruN t/inn-2.4.3/innd/newsfeeds.c inn-2.4.3/innd/newsfeeds.c
--- t/inn-2.4.3/innd/newsfeeds.c	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/innd/newsfeeds.c	2008-03-24 00:51:43.000000000 +0100
@@ -448,6 +448,7 @@
     int			isp;
     SITE		*nsp;
     struct buffer	b;
+    HASHFEEDLIST	*hf;
 
     b = sp->Buffer;
     *sp = SITEnull;
@@ -467,6 +468,7 @@
     sp->NeedOverviewCreation = false;
     sp->FeedwithoutOriginator = false;
     sp->DropFiltered = false;
+    sp->HashFeedList = NULL;
 
     /* Nip off the first field, the site name. */
     if ((f2 = strchr(Entry, NF_FIELD_SEP)) == NULL)
@@ -603,6 +605,36 @@
             if (*++p && CTYPE(isdigit, *p))
                 sp->Nice = atoi(p);
             break;
+	case 'Q':
+	    hf = xmalloc(sizeof(HASHFEEDLIST));
+	    p++;
+	    if (*p == '@') {
+		p++;
+		hf->type = HASHFEED_QH;
+	    } else
+		hf->type = HASHFEED_MD5;
+	    if ((u = strchr(p, '_')) != NULL) {
+		if (sscanf(u + 1, "%u", &hf->offset) != 1 || hf->offset > 12) {
+		    free(hf);
+		    return "invalid hash offset for Q";
+		}
+	    } else
+		hf->offset = 0;
+	    if (sscanf(p, "%u-%u/%u", &hf->begin, &hf->end, &hf->mod) != 3) {
+		if (sscanf(p, "%u/%u", &hf->begin, &hf->mod) != 2) {
+		    free(hf);
+		    return "hash not in x/z or x-y/z format for Q";
+		}
+		hf->end = hf->begin;
+	    }
+	    /* Validate both forms; mod == 0 would make h % mod undefined. */
+	    if (hf->mod == 0 || hf->begin > hf->end || hf->end > hf->mod) {
+		free(hf);
+		return "incorrect hash values for Q";
+	    }
+	    hf->next = sp->HashFeedList;
+	    sp->HashFeedList = hf;
+	    break;
 	case 'S':
 	    if (*++p && CTYPE(isdigit, *p))
 		sp->StartSpooling = atol(p);
diff -ruN t/inn-2.4.3/innd/site.c inn-2.4.3/innd/site.c
--- t/inn-2.4.3/innd/site.c	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/innd/site.c	2008-03-15 14:52:31.000000000 +0100
@@ -997,6 +1004,7 @@
 SITEfree(SITE *sp)
 {
     SITE                *s;
+    HASHFEEDLIST	*hf, *hn;
     int                 new;
     int                 i;
     
@@ -1051,6 +1059,13 @@
 	sp->FNLnames.data = NULL;
 	sp->FNLnames.size = 0;
     }
+    if (sp->HashFeedList) {
+	for (hf = sp->HashFeedList; hf; hf = hn) {
+	    hn = hf->next;
+	    free(hf);
+	}
+	sp->HashFeedList = NULL;
+    }
 
     /* If this site was a master, find a new one. */
     if (sp->IsMaster) {
diff -ruN t/inn-2.4.3/scripts/inncheck.in inn-2.4.3/scripts/inncheck.in
--- t/inn-2.4.3/scripts/inncheck.in	2006-03-20 05:14:57.000000000 +0100
+++ inn-2.4.3/scripts/inncheck.in	2008-03-24 00:54:49.000000000 +0100
@@ -367,6 +367,7 @@
     'N',	'^[mu]$',
     'O',	'^\S+$',
     'P',	'^\d+$',
+    'Q',	'^@?\d+(-\d+)?/\d+(_\d+)?$',
     'S',	'^\d+$',
     'T',	'^[cflmpx]$',
     'W',	'^[befghmnpst*DGHNPOR]*$',
-- 
The From: and Reply-To: addresses are internal news2mail gateway addresses.
Reply to the list or to "Miquel van Smoorenburg" <miquels at cistron.nl>


More information about the inn-workers mailing list