INN commit: trunk (3 files)

INN Commit Russ_Allbery at isc.org
Sat Nov 29 11:07:03 UTC 2008


    Date: Saturday, November 29, 2008 @ 03:07:02
  Author: iulius
Revision: 8195

Allow the use of buffers larger than 2 GB with buffindexed.

Patch from Kirill Berezin.


Idea of the patch:

  if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
      ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {

This is part of the current ovsetcurindexblock function.  The last argument
is the offset from the beginning of the buffer:  ovbuff->base is an off_t and
ov.blocknum is an unsigned int.  When INN is compiled without
--enable-largefiles, this works fine, but otherwise off_t is 8 bytes wide
while unsigned int is still only 4 bytes wide.

Now say we have a 5 GB buffer, which is roughly equal to 625000 blocks of
8000 bytes, and we want to access block #620000, which is at an offset of
4960000000 bytes, that is, near the very end of the buffer.  However, the C
standard does not require every operand of an expression to be converted to
the widest type involved before the result is calculated; instead, it only
requires converting the operands of the current operation.
As a result, because multiplication has higher precedence than addition,
blocknum is 4 bytes wide and the constant has no size hint, the product is
computed in 32 bits and wraps around:  we end up with an offset somewhere
near the beginning of the buffer instead of at the very end of it.
To remove this limitation, a new macro (OV_OFFSET) now calculates the
offset, performing the multiplication in off_t instead of unsigned int.

Modified:
  trunk/doc/pod/buffindexed.conf.pod
  trunk/doc/pod/news.pod
  trunk/storage/buffindexed/buffindexed.c

-----------------------------------+
 doc/pod/buffindexed.conf.pod      |    6 ++----
 doc/pod/news.pod                  |    7 +++++++
 storage/buffindexed/buffindexed.c |   20 +++++++++++---------
 3 files changed, 20 insertions(+), 13 deletions(-)

Modified: doc/pod/buffindexed.conf.pod
===================================================================
--- doc/pod/buffindexed.conf.pod	2008-11-29 10:39:08 UTC (rev 8194)
+++ doc/pod/buffindexed.conf.pod	2008-11-29 11:07:02 UTC (rev 8195)
@@ -49,9 +49,7 @@
 <size> is the length of the buffer in kilobytes (S<1 KB = 1024 bytes>).  If
 <filename> does not specify a special device, the file size of the buffer
 must be S<< <size> * 1024 bytes >>.  If it does specify a special device, that
-device must have at least <size> space available.  Buffers over S<2 GB> are
-not supported (regardless of whether INN was compiled with large file
-support); this limitation may be fixed in the future.  For more
+device must have at least <size> space available.  For more
 information on setting up the buffers, see L<CREATING BUFFERS>.
 
 An example of F<buffindexed.conf> file can be:
@@ -99,7 +97,7 @@
 locking on block devices, and therefore this method should not be used on
 Solaris.
 
-Partition the disk to make each partition equal to or smaller than S<2 GB>.
+Partition the disk.
 If you're using Solaris, set up your partitions to avoid the first
 cylinder of the disk (or otherwise the buffindexed header will overwrite
 the disk partition table and render the buffers inaccessible).  Then,

Modified: doc/pod/news.pod
===================================================================
--- doc/pod/news.pod	2008-11-29 10:39:08 UTC (rev 8194)
+++ doc/pod/news.pod	2008-11-29 11:07:02 UTC (rev 8195)
@@ -61,6 +61,13 @@
 
 =item *
 
+Thanks to Kirill Berezin, the buffindexed overview method now supports buffers
+larger than S<2 GB>.  It is not necessary to compile INN with large file support
+to use such large buffers with buffindexed.  Buffindexed is also more robust
+with mmaped files.
+
+=item *
+
 B<tinyleaf>, a miniature IHAVE-only leaf server, is now included.  See the
 tinyleaf(8) man page for more information.
 

Modified: storage/buffindexed/buffindexed.c
===================================================================
--- storage/buffindexed/buffindexed.c	2008-11-29 10:39:08 UTC (rev 8194)
+++ storage/buffindexed/buffindexed.c	2008-11-29 11:07:02 UTC (rev 8195)
@@ -69,6 +69,7 @@
 #define OV_BEFOREBITF   (1 * OV_BLOCKSIZE)
 #define	OV_BLOCKSIZE	8192
 #define	OV_FUDGE	1024
+#define OV_OFFSET(block) (block*(off_t) OV_BLOCKSIZE)
 
 /* ovblock pointer */
 typedef struct _OV {
@@ -96,7 +97,7 @@
 
 /* ovbuff info */
 typedef struct _OVBUFF {
-  unsigned int		index;			/* ovbuff index */
+  unsigned int		index;			/* ovbuff (partition or file) */
   char			path[OVBUFFPASIZ];	/* Path to file */
   int			fd;			/* file descriptor for this
 						   ovbuff */
@@ -1330,7 +1331,7 @@
   ovindexhead.next = ovnull;
   ovindexhead.low = 0;
   ovindexhead.high = 0;
-  if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+  if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + OV_OFFSET(ov.blocknum)) != sizeof(OVINDEXHEAD)) {
     syswarn("buffindexed: could not write index record index '%d', blocknum"
             " '%d'", ge->curindex.index, ge->curindex.blocknum);
     return true;
@@ -1352,7 +1353,7 @@
     ovindexhead.next = ov;
     ovindexhead.low = ge->curlow;
     ovindexhead.high = ge->curhigh;
-    if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+    if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
       syswarn("buffindexed: could not write index record index '%d', blocknum"
               " '%d'", ge->curindex.index, ge->curindex.blocknum);
       return false;
@@ -1432,7 +1433,7 @@
 #endif /* OV_DEBUG */
   }
 
-  if (PWRITE(ovbuff->fd, data, len, ovbuff->base + ge->curdata.blocknum * OV_BLOCKSIZE + ge->curoffset) != len) {
+  if (PWRITE(ovbuff->fd, data, len, ovbuff->base + OV_OFFSET(ge->curdata.blocknum) + ge->curoffset) != len) {
     syswarn("buffindexed: could not append overview record index '%d',"
             " blocknum '%d'", ge->curdata.index, ge->curdata.blocknum);
     return false;
@@ -1469,7 +1470,7 @@
     ovusedblock(ovbuff, ge->curindex.blocknum, true, true);
 #endif /* OV_DEBUG */
   }
-  if (PWRITE(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE + sizeof(OVINDEXHEAD) + sizeof(ie) * ge->curindexoffset) != sizeof(ie)) {
+  if (PWRITE(ovbuff->fd, &ie, sizeof(ie), ovbuff->base + OV_OFFSET(ge->curindex.blocknum) + sizeof(OVINDEXHEAD) + sizeof(ie) * ge->curindexoffset) != sizeof(ie)) {
     syswarn("buffindexed: could not write index record index '%d', blocknum"
             " '%d'", ge->curindex.index, ge->curindex.blocknum);
     return true;
@@ -1486,7 +1487,7 @@
     ovindexhead.next = ovnull;
     ovindexhead.low = ge->curlow;
     ovindexhead.high = ge->curhigh;
-    if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+    if (PWRITE(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD), ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
       syswarn("buffindexed: could not write index record index '%d', blocknum"
               " '%d'", ge->curindex.index, ge->curindex.blocknum);
       return true;
@@ -1653,7 +1654,7 @@
       ovgroupunmap();
       return false;
     }
-    offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+    offset = ovbuff->base + OV_OFFSET(ov.blocknum);
     pagefudge = offset % pagesize;
     mmapoffset = offset - pagefudge;
     len = pagefudge + OV_BLOCKSIZE;
@@ -1717,11 +1718,12 @@
   if (count * OV_BLOCKSIZE > innconf->keepmmappedthreshold * 1024)
     /* large retrieval, mmap is done in ovsearch() */
     return true;
+  /* Data blocks are being mmapped, not copied. */
   for (i = 0 ; i < GROUPDATAHASHSIZE ; i++) {
     for (gdb = groupdatablock[i] ; gdb != NULL ; gdb = gdb->next) {
       ov = gdb->datablk;
       ovbuff = getovbuff(ov);
-      offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+      offset = ovbuff->base + OV_OFFSET(ov.blocknum);
       pagefudge = offset % pagesize;
       mmapoffset = offset - pagefudge;
       gdb->len = pagefudge + OV_BLOCKSIZE;
@@ -1857,7 +1859,7 @@
 	    search->gdb.datablk.blocknum = srchov.blocknum;
 	    search->gdb.datablk.index = srchov.index;
 	    ovbuff = getovbuff(srchov);
-	    offset = ovbuff->base + (srchov.blocknum * OV_BLOCKSIZE);
+	    offset = ovbuff->base + OV_OFFSET(srchov.blocknum);
 	    pagefudge = offset % pagesize;
 	    mmapoffset = offset - pagefudge;
 	    search->gdb.len = pagefudge + OV_BLOCKSIZE;




More information about the inn-committers mailing list