PATCH inn-2.4.5 : fixes for buffindex storage method

Kirill Berezin kyb at online.ru
Thu Nov 27 11:03:22 UTC 2008


Hie workers.

I finally decided to post a patch for buffindex storage method. I hope 
it will be helpful.
This patch is for inn version 2.4.5, because it was tested on server 
running this version.

Kirill.

Here is a detailed description:

lines 40, 1185, 1207, 1286, 1323, 1343, 1509, 1573, 1709
  this fix allows use of buffers larger then 4 Gigs.
  I sent this patch about 2 years ago, but it was skipped. I still argue 
for it.

line 217 -- fixes lost initialization resolves a potential sigfault.

line 651 -- data placement optimization; the main idea is to place data 
as tight as possible; current implementation scatters data over all 
unavailable disk space. For our server this fix led to reduction of  
expireover runtime from 9 to about 2 hours (it seems that one hour or so 
is due to improvements of 2.4.5 version :) ).

line 1930 -- fixes lost initialization of group descriptor after 
exipiration of all articles for a group at once.

line 1980 -- rollback for the case of error during group expiration.
    This fix requires a long description. Expiration process for 
buffindex method is straightforward --- retrieve full chain of indexes, 
sort, look through the chain and COPY data and indexes to keep in new 
location (in other words create a new chain of indexes); update group 
description with new chain location and free all blocks allocated for 
old chain. This means it must be enough space to copy all group's 
indexes and headers.

For the case of error, for example not enough room available, blocks 
allocated for old chain are freed, group description remains unchanged 
and blocks allocated for new chain  are still allocated, but not linked 
to any group. Such a failure have a very interesting outcomes. According 
to group description first block of a group exists and valid, but in 
reality it is unallocated and server  free to write a header into it. 
And when  occasionally  server writes a header into this location 
expiration process dumps a following error message:

Nov  6 04:29:48 andromeda expireover[23091]: buffindexed: ovgroupmmap ovbuff is null(ovindex is 12854, ovblock is 808793140

as you can see the number of buffer is 12854, but it is not possible for a server with buffers numbers 0, 1, 2 and 3.

This patch changes error processing as follows: free just allocated 
chain, keep old chain and group description as it was before expiration.
   
line 61 -- just a comment, for a clarity.


diff -urN inn-2.4.5/storage/buffindexed/buffindexed.c 
inn-2.4.5-patched/storage/buffindexed/buffindexed.c
--- inn-2.4.5/storage/buffindexed/buffindexed.c    2008-06-29 
21:56:57.000000000 +0400
+++ inn-2.4.5-patched/storage/buffindexed/buffindexed.c    2008-11-27 
10:32:57.279655426 +0300
@@ -40,8 +40,8 @@
 #define    OV_BLOCKSIZE    8192
 #define    OV_BEFOREBITF    (1 * OV_BLOCKSIZE)
 #define    OV_FUDGE    1024
+#define OV_OFFSET(block) (block*(off_t)OV_BLOCKSIZE)
 
-/* ovblock pointer */
 typedef struct _OV {
   unsigned int    blocknum;
   short        index;
@@ -61,7 +61,7 @@
 
 /* ovbuff info */
 typedef struct _OVBUFF {
-  unsigned int        index;            /* ovbuff index */
+  unsigned int        index;            /* ovbuff(partition or file) 
index */
   char            path[OVBUFFPASIZ];    /* Path to file */
   int            magicver;        /* Magic version number */
   int            fd;            /* file descriptor for this
@@ -217,7 +217,7 @@
 
 static char LocalLogName[] = "buffindexed";
 static long        pagesize = 0;
-static OVBUFF        *ovbufftab;
+static OVBUFF        *ovbufftab = (OVBUFF*)NULL;
 static int              GROUPfd;
 static GROUPHEADER      *GROUPheader = NULL;
 static GROUPENTRY       *GROUPentries = NULL;
@@ -651,28 +651,16 @@
     last++;
   }
   table = ((ULONG *) ovbuff->bitfield + (OV_BEFOREBITF / sizeof(long)));
-  for (i = ovbuff->nextchunk ; i < last ; i++) {
-    if (i == last - 1 && left != 0) {
-      for (j = 1 ; j < left ; j++) {
-    mask |= mask >> 1;
-      }
-      if ((table[i] & mask) != mask)
-    break;
-    } else {
-      if ((table[i] ^ ~0) != 0)
-    break;
-    }
+  /* for tighter placement look for unused block from the begining on 
every run */
+  for (i = 0 ; i < last ; i++) {
+    if ((table[i] ^ ~0) != 0)
+      break;
   }
   if (i == last) {
-    for (i = 0 ; i < ovbuff->nextchunk ; i++) {
-      if ((table[i] ^ ~0) != 0)
-        break;
-    }
-    if (i == ovbuff->nextchunk) {
-      ovbuff->freeblk = ovbuff->totalblk;
-      return;
-    }
+    ovbuff->freeblk = ovbuff->totalblk;
+    return;
   }
+  /* */
   if ((i - 1) >= 0 && (last - 1 == i) && left != 0) {
     lastbit = left;
   } else {
@@ -1185,9 +1173,9 @@
   ovindexhead.low = 0;
   ovindexhead.high = 0;
 #ifdef MMAP_MISSES_WRITES
-  if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+  if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ov.blocknum)) != sizeof(OVINDEXHEAD)) {
 #else
-  if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+  if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ov.blocknum)) != sizeof(OVINDEXHEAD)) {
 #endif /* MMAP_MISSES_WRITES */
     syslog(L_ERROR, "%s: could not write index record index '%d', 
blocknum '%d': %m", LocalLogName, ge->curindex.index, 
ge->curindex.blocknum);
     return true;
@@ -1207,9 +1195,9 @@
     ovindexhead.low = ge->curlow;
     ovindexhead.high = ge->curhigh;
 #ifdef MMAP_MISSES_WRITES
-    if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != 
sizeof(OVINDEXHEAD)) {
+    if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
 #else
-    if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != 
sizeof(OVINDEXHEAD)) {
+    if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
 #endif /* MMAP_MISSES_WRITES */
       syslog(L_ERROR, "%s: could not write index record index '%d', 
blocknum '%d': %m", LocalLogName, ge->curindex.index, 
ge->curindex.blocknum);
       return false;
@@ -1286,9 +1274,9 @@
   }
 #endif /* OV_DEBUG */
 #ifdef MMAP_MISSES_WRITES
-  if (mmapwrite(ovbuff->fd, data, len, ovbuff->base + 
ge->curdata.blocknum * OV_BLOCKSIZE + ge->curoffset) != len) {
+  if (mmapwrite(ovbuff->fd, data, len, ovbuff->base + 
OV_OFFSET(ge->curdata.blocknum) + ge->curoffset) != len) {
 #else
-  if (pwrite(ovbuff->fd, data, len, ovbuff->base + ge->curdata.blocknum 
* OV_BLOCKSIZE + ge->curoffset) != len) {
+  if (pwrite(ovbuff->fd, data, len, ovbuff->base + 
OV_OFFSET(ge->curdata.blocknum) + ge->curoffset) != len) {
 #endif /* MMAP_MISSES_WRITES */
     syslog(L_ERROR, "%s: could not append overview record index '%d', 
blocknum '%d': %m", LocalLogName, ge->curdata.index, ge->curdata.blocknum);
     return false;
@@ -1323,9 +1311,9 @@
   }
 #endif /* OV_DEBUG */
 #ifdef MMAP_MISSES_WRITES
-  if (mmapwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + 
ge->curindex.blocknum * OV_BLOCKSIZE + sizeof(OVINDEXHEAD) + sizeof(ie) 
* ge->curindexoffset) != sizeof(ie)) {
+  if (mmapwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + 
OV_OFFSET(ge->curindex.blocknum) + sizeof(OVINDEXHEAD) + sizeof(ie) * 
ge->curindexoffset) != sizeof(ie)) {
 #else
-  if (pwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + 
ge->curindex.blocknum * OV_BLOCKSIZE + sizeof(OVINDEXHEAD) + sizeof(ie) 
* ge->curindexoffset) != sizeof(ie)) {
+  if (pwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + 
OV_OFFSET(ge->curindex.blocknum) + sizeof(OVINDEXHEAD) + sizeof(ie) * 
ge->curindexoffset) != sizeof(ie)) {
 #endif /* MMAP_MISSES_WRITES */
     syslog(L_ERROR, "%s: could not write index record index '%d', 
blocknum '%d': %m", LocalLogName, ge->curindex.index, 
ge->curindex.blocknum);
     return true;
@@ -1343,9 +1331,9 @@
     ovindexhead.low = ge->curlow;
     ovindexhead.high = ge->curhigh;
 #ifdef MMAP_MISSES_WRITES
-    if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != 
sizeof(OVINDEXHEAD)) {
+    if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
 #else
-    if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != 
sizeof(OVINDEXHEAD)) {
+    if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), 
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
 #endif /* MMAP_MISSES_WRITES */
       syslog(L_ERROR, "%s: could not write index record index '%d', 
blocknum '%d': %m", LocalLogName, ge->curindex.index, 
ge->curindex.blocknum);
       return true;
@@ -1509,7 +1497,7 @@
       ovgroupunmap();
       return false;
     }
-    offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+    offset = ovbuff->base + OV_OFFSET(ov.blocknum);
     pagefudge = offset % pagesize;
     mmapoffset = offset - pagefudge;
     len = pagefudge + OV_BLOCKSIZE;
@@ -1573,11 +1561,12 @@
   if (count * OV_BLOCKSIZE > innconf->keepmmappedthreshold * 1024)
     /* large retrieval, mmap is done in ovsearch() */
     return true;
+  // datablocks are being mmapped, not copied
   for (i = 0 ; i < GROUPDATAHASHSIZE ; i++) {
     for (gdb = groupdatablock[i] ; gdb != NULL ; gdb = gdb->next) {
       ov = gdb->datablk;
       ovbuff = getovbuff(ov);
-      offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+      offset = ovbuff->base + OV_OFFSET(ov.blocknum);
       pagefudge = offset % pagesize;
       mmapoffset = offset - pagefudge;
       gdb->len = pagefudge + OV_BLOCKSIZE;
@@ -1709,7 +1698,7 @@
         search->gdb.datablk.blocknum = srchov.blocknum;
         search->gdb.datablk.index = srchov.index;
         ovbuff = getovbuff(srchov);
-        offset = ovbuff->base + (srchov.blocknum * OV_BLOCKSIZE);
+        offset = ovbuff->base + OV_OFFSET(srchov.blocknum);
         pagefudge = offset % pagesize;
         mmapoffset = offset - pagefudge;
         search->gdb.len = pagefudge + OV_BLOCKSIZE;
@@ -1930,6 +1919,7 @@
       ovgroupunmap();
       ge->expired = time(NULL);
       ge->count = 0;
+      ge->baseindex = ge->curindex = ge->curdata = ovnull;
       GROUPlock(gloc, INN_LOCK_UNLOCK);
     }
     return true;
@@ -1980,10 +1970,18 @@
 #else
     if (!ovaddrec(&newge, artnum, token, data, len, arrived, expires)) {
 #endif /* OV_DEBUG */
-      ovclosesearch(handle, true);
+      ovclosesearch(handle, false); /* old group cannot be freed */
       ge->expired = time(NULL);
       GROUPlock(gloc, INN_LOCK_UNLOCK);
-      syslog(L_ERROR, "%s: could not add new overview for '%s'", 
LocalLogName, group);
+      syslog(L_ERROR, "%s: not enough room to expire overview for group 
'%s'", LocalLogName, group);
+      /* clean just reserved overview entries */
+      if (!ovgroupmmap(&newge, newge.low, newge.high, true)){
+        syslog(L_ERROR, "%s: cannot prepare free operation", LocalLogName);
+        return false;
+      }
+      freegroupblock();
+      ovgroupunmap();
+      /* */
       return false;
     }
   }




More information about the inn-workers mailing list