Rewritten ARTparse committed

Russ Allbery rra at stanford.edu
Fri May 21 09:36:48 UTC 2004


I had a brainstorm late the other night about a way to simplify ARTparse
considerably, and implemented it last night.  Tonight, I worked out how to
unit test it, and tested it pretty thoroughly before committing it to
CURRENT.  I'm pretty confident that it works properly (380 tests in the
test suite), but it's always possible that I missed something and the test
suite can always be more comprehensive.

If people could keep an eye out for any errors, that would be greatly
appreciated.

I made another change to the logic while I was at it.  maxartsize is now
checked against the article size in wire format, not in native format.  I
think this is the least surprising behavior (and it's far easier to
implement; the previous algorithm checked against native size but without
undoing dot stuffing).

I'm hoping that this will also solve the problems with innd being unable
to count properly, but I'm not positive (since I'm not sure what was
behind them and didn't find anything obvious).

Here is the new code, if anyone is curious to look it over and is willing
to put some more eyeballs on it.

/*
**  Report a rejection of an article by putting the reason for rejection into
**  the Error field of the supplied channel.  The field is filled with the
**  numeric value of NNTP_REJECTIT_VAL, a space, and then the printf-style
**  message built from format and the following arguments.  Output is
**  truncated to the size of the Error field.
*/
static void
ARTerror(CHANNEL *cp, const char *format, ...)
{
    va_list args;
    int prefix;

    /* Use the length snprintf reports for the response code prefix rather
       than assuming that it is always exactly four characters long. */
    prefix = snprintf(cp->Error, sizeof(cp->Error), "%d ", NNTP_REJECTIT_VAL);
    if (prefix < 0)
        prefix = 0;

    /* If the prefix alone filled (or overflowed) the buffer, snprintf has
       already nul-terminated it and there is no room for the message. */
    if ((size_t) prefix >= sizeof(cp->Error))
        return;

    va_start(args, format);
    vsnprintf(cp->Error + prefix, sizeof(cp->Error) - (size_t) prefix, format,
              args);
    va_end(args);
}

/*
**  Check to see if an article exceeds the local size limit and set the
**  channel state appropriately.  If the article has been fully received, also
**  update the Error buffer in the channel if needed.
**
**  The size checked is the number of bytes between cp->Start and cp->Next,
**  i.e. the amount of the article consumed by the parser so far.  A
**  maxartsize of zero or less disables the check.
*/
static void
ARTchecksize(CHANNEL *cp)
{
    size_t size;
    /* hc is not referenced by name below; presumably the HDR_* macros expand
       to expressions that use it, so the name must stay as is. */
    HDRCONTENT *hc = cp->Data.HdrContent;
    const char *msgid;

    size = cp->Next - cp->Start;
    if (innconf->maxartsize > 0 && size > (size_t) innconf->maxartsize) {
        /* An article whose terminator has already been seen must end up in
           (or stay in) CSgotlargearticle.  ARTparsebody moves an article that
           was being eaten straight to CSgotlargearticle, so that state has to
           be treated as complete here; otherwise it would be reset to
           CSeatarticle and the rejection below would never be reported. */
        if (cp->State == CSgotarticle || cp->State == CSgotlargearticle)
            cp->State = CSgotlargearticle;
        else
            cp->State = CSeatarticle;
    }
    if (cp->State == CSgotlargearticle) {
        notice("%s internal rejecting huge article (%lu > %ld)", CHANname(cp),
               (unsigned long) size, innconf->maxartsize);
        ARTerror(cp, "Article of %lu bytes exceeds local limit of %ld bytes",
                 (unsigned long) size, innconf->maxartsize);

        /* Write a local cancel entry so nobody else gives it to us. */
        if (HDR_FOUND(HDR__MESSAGE_ID)) {
            HDR_PARSE_START(HDR__MESSAGE_ID);
            msgid = HDR(HDR__MESSAGE_ID);
            if (!HIScheck(History, msgid) && !InndHisRemember(msgid))
                warn("SERVER cant write %s", msgid);
        }
    }
}

/*
**  Parse a section of the header of an article.  This is called by ARTparse()
**  while the channel state is CSgetheader.  If we find the beginning of the
**  body, change the channel state and hand control off to ARTparsebody.
**
**  Scanning starts at cp->Next in the channel's input buffer, and cp->Next is
**  updated before returning so that the next call resumes where this one
**  stopped.  Problems found along the way are recorded with ARTerror().
*/
static void
ARTparseheader(CHANNEL *cp)
{
    struct buffer *bp = &cp->In;
    ARTDATA *data = &cp->Data;
    size_t i;
    unsigned long length;

    for (i = cp->Next; i < bp->used; i++) {
        if (bp->data[i] == '\0')
            ARTerror(cp, "Nul character in header");
        if (bp->data[i] == '\n')
            data->LFwithoutCR++;
        if (bp->data[i] != '\r')
            continue;

        /* We saw a \r, which is the beginning of everything interesting.  The
           longest possibly interesting thing we could see is an article
           terminator (five characters).  If we don't have at least five more
           characters, we're guaranteed that the article isn't complete, so
           save ourselves complexity and just return and wait for more
           data. */
        if (bp->used - i < 5) {
            cp->Next = i;
            return;
        }
        if (memcmp(&bp->data[i], "\r\n.\r\n", 5) == 0) {
            if (i == cp->Start) {
                ARTerror(cp, "Empty article");
                cp->State = CSnoarticle;
            } else {
                ARTerror(cp, "No body");
                cp->State = CSgotarticle;
            }
            cp->Next = i + 5;
            return;
        } else if (bp->data[i + 1] == '\n') {
            /* Length of the line that just ended, measured from the byte
               recorded in LastCRLF up to this \r. */
            length = i - data->LastCRLF - 1;
            if (data->LastCRLF == cp->Start)
                length++;
            if (length > MAXHEADERSIZE)
                ARTerror(cp, "Header line too long (%lu bytes)", length);

            /* Be a little tricky here.  Normally, the headers end at the
               first occurrence of \r\n\r\n, so since we've seen \r\n, we want
               to advance i and then look to see if we have another one.  The
               exception is the degenerate case of an article with no headers.
               In that case, log an error and *don't* advance i so that we'll
               still see the end of headers. */
            if (i == cp->Start) {
                ARTerror(cp, "No headers");
            } else {
                i += 2;
                ARTcheckheader(cp, i - data->CurHeader);
            }

            /* The five-byte check above guarantees that both bytes examined
               here are inside the buffer even after advancing i by two. */
            if (bp->data[i] == '\r' && bp->data[i + 1] == '\n') {
                /* NOTE(review): cp->Next skips the blank line's CRLF, so a
                   body consisting only of ".\r\n" will not match the
                   five-byte terminator in ARTparsebody.  Confirm whether
                   empty bodies need to be handled. */
                cp->Next = i + 2;
                data->Body = i + 2;
                cp->State = CSgetbody;
                ARTparsebody(cp);
                return;
            } else if (bp->data[i] != ' ' && bp->data[i] != '\t') {
                /* Not a continuation line, so a new header starts here. */
                data->CurHeader = i;
            }
            data->HeaderLines++;
            data->LastCRLF = i - 1;

            /* i now points at the first character of the next line.  Step
               back onto the \n so that the loop increment lands on that
               character instead of skipping it; otherwise a nul, bare LF, or
               bare CR at the very start of a line would go unnoticed. */
            i--;
        } else {
            data->CRwithoutLF++;
        }
    }
    cp->Next = i;
}

/*
**  Parse a section of the body of an article.  This is called by ARTparse()
**  while the channel state is CSgetbody or CSeatarticle.
**
**  Scanning starts at cp->Next in the channel's input buffer and looks for
**  the article terminator ("\r\n.\r\n"), counting lines and stray CR or LF
**  characters along the way.  When the terminator is found, the state becomes
**  CSgotlargearticle if the article was being eaten and CSgotarticle
**  otherwise.  cp->Next is updated before returning so that the next call
**  resumes where this one stopped.
*/
static void
ARTparsebody(CHANNEL *cp)
{
    struct buffer *bp = &cp->In;
    ARTDATA *data = &cp->Data;
    size_t i;

    for (i = cp->Next; i < bp->used; i++) {
        if (bp->data[i] == '\0')
            ARTerror(cp, "Nul character in body");
        if (bp->data[i] == '\n')
            data->LFwithoutCR++;
        if (bp->data[i] != '\r')
            continue;

        /* Saw \r.  We're just scanning for the article terminator, so if we
           don't have at least five characters left, we can save effort and
           stop now. */
        if (bp->used - i < 5) {
            cp->Next = i;
            return;
        }
        if (memcmp(&bp->data[i], "\r\n.\r\n", 5) == 0) {
            if (cp->State == CSeatarticle)
                cp->State = CSgotlargearticle;
            else
                cp->State = CSgotarticle;
            cp->Next = i + 5;
            data->Lines++;
            return;
        } else if (bp->data[i + 1] == '\n') {
            /* A proper CRLF line ending.  Step over the \n so that the next
               iteration doesn't see it and count it as an LF without CR. */
            i++;
            data->Lines++;
        } else {
            /* A \r that isn't followed by \n is a bare CR. */
            data->CRwithoutLF++;
        }
    }
    cp->Next = i;
}

/*
**  Entry point for article parsing, called by NCproc each time more data has
**  arrived on the channel.  Header data goes to ARTparseheader() and
**  everything else to ARTparsebody(); the size limit is then checked, and if
**  the article is now complete and an error was recorded for it, the
**  rejection is logged.
*/
void
ARTparse(CHANNEL *cp)
{
    /* Dispatch on the current channel state. */
    if (cp->State != CSgetheader) {
        ARTparsebody(cp);
    } else {
        ARTparseheader(cp);
    }

    ARTchecksize(cp);

    /* Nothing more to do until the whole article has been seen. */
    if (cp->State != CSgotarticle && cp->State != CSgotlargearticle) {
        return;
    }

    /* Only articles with a recorded error get a rejection logged. */
    if (cp->Error[0] == '\0') {
        return;
    }
    ARTlogreject(cp);
}

-- 
Russ Allbery (rra at stanford.edu)             <http://www.eyrie.org/~eagle/>


More information about the inn-workers mailing list