CNFS: round up writev to CNFS_BLOCKSIZE [PATCH]

"Miquel van Smoorenburg" list-inn-workers at news.cistron.nl
Sun May 8 11:09:29 UTC 2005


Right now, CNFS blocks are written starting on a CNFS_BLOCKSIZE
boundary, but not ending on one.

That means that even if CNFS_BLOCKSIZE is a multiple of the
filesystem blocksize, the OS still needs to read the last block
from disk in order to complete the write (remember - partial
block writes need a read from disk first).

On a system that gets a full feed, that means ~70 random reads/sec
from disk that can easily be eliminated by writing some extra
zero bytes so that a multiple of CNFS_BLOCKSIZE gets written.

This will only make a real difference on filesystems that have
a blocksize of which CNFS_BLOCKSIZE (512 by default) is a multiple.
If you have a filesystem with 4K blocksize (as most are) then it
would make sense to change CNFS_BLOCKSIZE to 4096 as well on
your system.

I think CNFS_BLOCKSIZE should be made tunable, defaulting to
4K, and its value should be written to the CNFS superblock. If
the value in the superblock is unset it would be 512 for backwards
compatibility, but that's a patch for another day.

inn-2.4.2-cnfs-align.diff

--- t/inn-2.4.2/storage/cnfs/cnfs.c	2004-12-22 05:21:19.000000000 +0100
+++ inn-2.4.2/storage/cnfs/cnfs.c	2005-05-05 22:42:23.000000000 +0200
@@ -1062,6 +1062,7 @@
     METACYCBUFF		*metacycbuff = NULL;
     int			i;
     static char		buf[1024];
+    static char		alignbuf[CNFS_BLOCKSIZE];
     char		*artcycbuffname;
     off_t               artoffset, middle;
     uint32_t		artcyclenum;
@@ -1071,6 +1072,7 @@
     int			tonextblock;
     CNFSEXPIRERULES	*metaexprule;
     off_t		left;
+    int			totlen;
 
     for (metaexprule = metaexprulestab; metaexprule != (CNFSEXPIRERULES *)NULL; metaexprule = metaexprule->next) {
 	if (metaexprule->class == class)
@@ -1097,6 +1099,12 @@
 	token.type = TOKEN_EMPTY;
 	return token;
     }
+
+    /* Align cycbuff->free just to be sure. */
+    tonextblock = CNFS_BLOCKSIZE - (cycbuff->free & (CNFS_BLOCKSIZE - 1));
+    if (tonextblock != CNFS_BLOCKSIZE)
+	cycbuff->free += (off_t)tonextblock;
+
     /* Article too big? */
     if (cycbuff->len - cycbuff->free < CNFS_BLOCKSIZE + 1)
         left = 0;
@@ -1139,6 +1147,7 @@
 	  CNFSflushhead(cycbuff);		/* Flush, just for giggles */
 	}
     }
+
     /* Ah, at least we know all three important data */
     artcycbuffname = cycbuff->name;
     artoffset = cycbuff->free;
@@ -1161,19 +1170,28 @@
 	return token;
     }
     if (iovcnt == 0) {
-	iov = xmalloc((article.iovcnt + 1) * sizeof(struct iovec));
-	iovcnt = article.iovcnt + 1;
-    } else if (iovcnt < article.iovcnt + 1) {
-        iov = xrealloc(iov, (article.iovcnt + 1) * sizeof(struct iovec));
-	iovcnt = article.iovcnt + 1;
+	iov = xmalloc((article.iovcnt + 2) * sizeof(struct iovec));
+	iovcnt = article.iovcnt + 2;
+    } else if (iovcnt < article.iovcnt + 2) {
+        iov = xrealloc(iov, (article.iovcnt + 2) * sizeof(struct iovec));
+	iovcnt = article.iovcnt + 2;
     }
     iov[0].iov_base = (char *) &cah;
     iov[0].iov_len = sizeof(cah);
+    totlen = iov[0].iov_len;
     for (i = 1 ; i <= article.iovcnt ; i++) {
         iov[i].iov_base = article.iov[i-1].iov_base;
         iov[i].iov_len = article.iov[i-1].iov_len;
+	totlen += iov[i].iov_len;
+    }
+    if (totlen & (CNFS_BLOCKSIZE - 1)) {
+	/* Want to xwritev an exact multiple of CNFS_BLOCKSIZE */
+	iov[i].iov_base = alignbuf;
+	iov[i].iov_len = CNFS_BLOCKSIZE - (totlen & (CNFS_BLOCKSIZE - 1));
+	totlen += iov[i].iov_len;
+	i++;
     }
-    if (xwritev(cycbuff->fd, iov, article.iovcnt + 1) < 0) {
+    if (xwritev(cycbuff->fd, iov, i) < 0) {
 	SMseterror(SMERR_INTERNAL, "cnfs_store() xwritev() failed");
 	syslog(L_ERROR,
 	       "%s: cnfs_store xwritev failed for '%s' offset 0x%s: %m",
@@ -1185,9 +1203,7 @@
     cycbuff->needflush = true;
 
     /* Now that the article is written, advance the free pointer & flush */
-    cycbuff->free += (off_t) article.len + sizeof(cah);
-    tonextblock = CNFS_BLOCKSIZE - (cycbuff->free & (CNFS_BLOCKSIZE - 1));
-    cycbuff->free += (off_t) tonextblock;
+    cycbuff->free += totlen;
     /*
     ** If cycbuff->free > cycbuff->len, don't worry.  The next cnfs_store()
     ** will detect the situation & wrap around correctly.

-- 
The From: and Reply-To: addresses are internal news2mail gateway addresses.
Reply to the list or to "Miquel van Smoorenburg" <miquels at cistron.nl>


More information about the inn-workers mailing list