PATCH inn-2.4.5 : fixes for buffindex storage method
Kirill Berezin
kyb at online.ru
Thu Nov 27 11:03:22 UTC 2008
Hie workers.
I finally decided to post a patch for buffindex storage method. I hope
it will be helpful.
This patch is for inn version 2.4.5, because it was tested on server
running this version.
Kirill.
Here is a detailed description:
lines 40, 1185, 1207, 1286, 1323, 1343, 1509, 1573, 1709
this fix allows use of buffers larger then 4 Gigs.
I sent this patch about 2 years ago, but it was skipped. I still argue
for it.
line 217 -- fixes lost initialization resolves a potential sigfault.
line 651 -- data placement optimization; the main idea is to place data
as tight as possible; current implementation scatters data over all
unavailable disk space. For our server this fix led to reduction of
expireover runtime from 9 to about 2 hours (it seems that one hour or so
is due to improvements of 2.4.5 version :) ).
line 1930 -- fixes lost initialization of group descriptor after
exipiration of all articles for a group at once.
line 1980 -- rollback for the case of error during group expiration.
This fix requires a long description. Expiration process for
buffindex method is straightforward --- retrieve full chain of indexes,
sort, look through the chain and COPY data and indexes to keep in new
location (in other words create a new chain of indexes); update group
description with new chain location and free all blocks allocated for
old chain. This means it must be enough space to copy all group's
indexes and headers.
For the case of error, for example not enough room available, blocks
allocated for old chain are freed, group description remains unchanged
and blocks allocated for new chain are still allocated, but not linked
to any group. Such a failure have a very interesting outcomes. According
to group description first block of a group exists and valid, but in
reality it is unallocated and server free to write a header into it.
And when occasionally server writes a header into this location
expiration process dumps a following error message:
Nov 6 04:29:48 andromeda expireover[23091]: buffindexed: ovgroupmmap ovbuff is null(ovindex is 12854, ovblock is 808793140
as you can see the number of buffer is 12854, but it is not possible for a server with buffers numbers 0, 1, 2 and 3.
This patch changes error processing as follows: free just allocated
chain, keep old chain and group description as it was before expiration.
line 61 -- just a comment, for a clarity.
diff -urN inn-2.4.5/storage/buffindexed/buffindexed.c
inn-2.4.5-patched/storage/buffindexed/buffindexed.c
--- inn-2.4.5/storage/buffindexed/buffindexed.c 2008-06-29
21:56:57.000000000 +0400
+++ inn-2.4.5-patched/storage/buffindexed/buffindexed.c 2008-11-27
10:32:57.279655426 +0300
@@ -40,8 +40,8 @@
#define OV_BLOCKSIZE 8192
#define OV_BEFOREBITF (1 * OV_BLOCKSIZE)
#define OV_FUDGE 1024
+#define OV_OFFSET(block) (block*(off_t)OV_BLOCKSIZE)
-/* ovblock pointer */
typedef struct _OV {
unsigned int blocknum;
short index;
@@ -61,7 +61,7 @@
/* ovbuff info */
typedef struct _OVBUFF {
- unsigned int index; /* ovbuff index */
+ unsigned int index; /* ovbuff(partition or file)
index */
char path[OVBUFFPASIZ]; /* Path to file */
int magicver; /* Magic version number */
int fd; /* file descriptor for this
@@ -217,7 +217,7 @@
static char LocalLogName[] = "buffindexed";
static long pagesize = 0;
-static OVBUFF *ovbufftab;
+static OVBUFF *ovbufftab = (OVBUFF*)NULL;
static int GROUPfd;
static GROUPHEADER *GROUPheader = NULL;
static GROUPENTRY *GROUPentries = NULL;
@@ -651,28 +651,16 @@
last++;
}
table = ((ULONG *) ovbuff->bitfield + (OV_BEFOREBITF / sizeof(long)));
- for (i = ovbuff->nextchunk ; i < last ; i++) {
- if (i == last - 1 && left != 0) {
- for (j = 1 ; j < left ; j++) {
- mask |= mask >> 1;
- }
- if ((table[i] & mask) != mask)
- break;
- } else {
- if ((table[i] ^ ~0) != 0)
- break;
- }
+ /* for tighter placement look for unused block from the begining on
every run */
+ for (i = 0 ; i < last ; i++) {
+ if ((table[i] ^ ~0) != 0)
+ break;
}
if (i == last) {
- for (i = 0 ; i < ovbuff->nextchunk ; i++) {
- if ((table[i] ^ ~0) != 0)
- break;
- }
- if (i == ovbuff->nextchunk) {
- ovbuff->freeblk = ovbuff->totalblk;
- return;
- }
+ ovbuff->freeblk = ovbuff->totalblk;
+ return;
}
+ /* */
if ((i - 1) >= 0 && (last - 1 == i) && left != 0) {
lastbit = left;
} else {
@@ -1185,9 +1173,9 @@
ovindexhead.low = 0;
ovindexhead.high = 0;
#ifdef MMAP_MISSES_WRITES
- if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+ if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ov.blocknum)) != sizeof(OVINDEXHEAD)) {
#else
- if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ov.blocknum * OV_BLOCKSIZE) != sizeof(OVINDEXHEAD)) {
+ if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ov.blocknum)) != sizeof(OVINDEXHEAD)) {
#endif /* MMAP_MISSES_WRITES */
syslog(L_ERROR, "%s: could not write index record index '%d',
blocknum '%d': %m", LocalLogName, ge->curindex.index,
ge->curindex.blocknum);
return true;
@@ -1207,9 +1195,9 @@
ovindexhead.low = ge->curlow;
ovindexhead.high = ge->curhigh;
#ifdef MMAP_MISSES_WRITES
- if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) !=
sizeof(OVINDEXHEAD)) {
+ if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
#else
- if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) !=
sizeof(OVINDEXHEAD)) {
+ if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
#endif /* MMAP_MISSES_WRITES */
syslog(L_ERROR, "%s: could not write index record index '%d',
blocknum '%d': %m", LocalLogName, ge->curindex.index,
ge->curindex.blocknum);
return false;
@@ -1286,9 +1274,9 @@
}
#endif /* OV_DEBUG */
#ifdef MMAP_MISSES_WRITES
- if (mmapwrite(ovbuff->fd, data, len, ovbuff->base +
ge->curdata.blocknum * OV_BLOCKSIZE + ge->curoffset) != len) {
+ if (mmapwrite(ovbuff->fd, data, len, ovbuff->base +
OV_OFFSET(ge->curdata.blocknum) + ge->curoffset) != len) {
#else
- if (pwrite(ovbuff->fd, data, len, ovbuff->base + ge->curdata.blocknum
* OV_BLOCKSIZE + ge->curoffset) != len) {
+ if (pwrite(ovbuff->fd, data, len, ovbuff->base +
OV_OFFSET(ge->curdata.blocknum) + ge->curoffset) != len) {
#endif /* MMAP_MISSES_WRITES */
syslog(L_ERROR, "%s: could not append overview record index '%d',
blocknum '%d': %m", LocalLogName, ge->curdata.index, ge->curdata.blocknum);
return false;
@@ -1323,9 +1311,9 @@
}
#endif /* OV_DEBUG */
#ifdef MMAP_MISSES_WRITES
- if (mmapwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base +
ge->curindex.blocknum * OV_BLOCKSIZE + sizeof(OVINDEXHEAD) + sizeof(ie)
* ge->curindexoffset) != sizeof(ie)) {
+ if (mmapwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base +
OV_OFFSET(ge->curindex.blocknum) + sizeof(OVINDEXHEAD) + sizeof(ie) *
ge->curindexoffset) != sizeof(ie)) {
#else
- if (pwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base +
ge->curindex.blocknum * OV_BLOCKSIZE + sizeof(OVINDEXHEAD) + sizeof(ie)
* ge->curindexoffset) != sizeof(ie)) {
+ if (pwrite(ovbuff->fd, &ie, sizeof(ie), ovbuff->base +
OV_OFFSET(ge->curindex.blocknum) + sizeof(OVINDEXHEAD) + sizeof(ie) *
ge->curindexoffset) != sizeof(ie)) {
#endif /* MMAP_MISSES_WRITES */
syslog(L_ERROR, "%s: could not write index record index '%d',
blocknum '%d': %m", LocalLogName, ge->curindex.index,
ge->curindex.blocknum);
return true;
@@ -1343,9 +1331,9 @@
ovindexhead.low = ge->curlow;
ovindexhead.high = ge->curhigh;
#ifdef MMAP_MISSES_WRITES
- if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) !=
sizeof(OVINDEXHEAD)) {
+ if (mmapwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
#else
- if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + ge->curindex.blocknum * OV_BLOCKSIZE) !=
sizeof(OVINDEXHEAD)) {
+ if (pwrite(ovbuff->fd, &ovindexhead, sizeof(OVINDEXHEAD),
ovbuff->base + OV_OFFSET(ge->curindex.blocknum)) != sizeof(OVINDEXHEAD)) {
#endif /* MMAP_MISSES_WRITES */
syslog(L_ERROR, "%s: could not write index record index '%d',
blocknum '%d': %m", LocalLogName, ge->curindex.index,
ge->curindex.blocknum);
return true;
@@ -1509,7 +1497,7 @@
ovgroupunmap();
return false;
}
- offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+ offset = ovbuff->base + OV_OFFSET(ov.blocknum);
pagefudge = offset % pagesize;
mmapoffset = offset - pagefudge;
len = pagefudge + OV_BLOCKSIZE;
@@ -1573,11 +1561,12 @@
if (count * OV_BLOCKSIZE > innconf->keepmmappedthreshold * 1024)
/* large retrieval, mmap is done in ovsearch() */
return true;
+ // datablocks are being mmapped, not copied
for (i = 0 ; i < GROUPDATAHASHSIZE ; i++) {
for (gdb = groupdatablock[i] ; gdb != NULL ; gdb = gdb->next) {
ov = gdb->datablk;
ovbuff = getovbuff(ov);
- offset = ovbuff->base + (ov.blocknum * OV_BLOCKSIZE);
+ offset = ovbuff->base + OV_OFFSET(ov.blocknum);
pagefudge = offset % pagesize;
mmapoffset = offset - pagefudge;
gdb->len = pagefudge + OV_BLOCKSIZE;
@@ -1709,7 +1698,7 @@
search->gdb.datablk.blocknum = srchov.blocknum;
search->gdb.datablk.index = srchov.index;
ovbuff = getovbuff(srchov);
- offset = ovbuff->base + (srchov.blocknum * OV_BLOCKSIZE);
+ offset = ovbuff->base + OV_OFFSET(srchov.blocknum);
pagefudge = offset % pagesize;
mmapoffset = offset - pagefudge;
search->gdb.len = pagefudge + OV_BLOCKSIZE;
@@ -1930,6 +1919,7 @@
ovgroupunmap();
ge->expired = time(NULL);
ge->count = 0;
+ ge->baseindex = ge->curindex = ge->curdata = ovnull;
GROUPlock(gloc, INN_LOCK_UNLOCK);
}
return true;
@@ -1980,10 +1970,18 @@
#else
if (!ovaddrec(&newge, artnum, token, data, len, arrived, expires)) {
#endif /* OV_DEBUG */
- ovclosesearch(handle, true);
+ ovclosesearch(handle, false); /* old group cannot be freed */
ge->expired = time(NULL);
GROUPlock(gloc, INN_LOCK_UNLOCK);
- syslog(L_ERROR, "%s: could not add new overview for '%s'",
LocalLogName, group);
+ syslog(L_ERROR, "%s: not enough room to expire overview for group
'%s'", LocalLogName, group);
+ /* clean just reserved overview entries */
+ if (!ovgroupmmap(&newge, newge.low, newge.high, true)){
+ syslog(L_ERROR, "%s: cannot prepare free operation", LocalLogName);
+ return false;
+ }
+ freegroupblock();
+ ovgroupunmap();
+ /* */
return false;
}
}
More information about the inn-workers
mailing list