INN commit: trunk (5 files)

Mon Feb 4 14:21:58 UTC 2019

Date: Monday, February 4, 2019 @ 06:21:58
  Author: iulius
Revision: 10325

Allow the use of UTF-8 in header fields again

This fixes a regression present since INN 2.6.1:  posts with
internationalized header fields (UTF-8) are now accepted again.

Modified:
  trunk/doc/pod/news.pod
  trunk/lib/headers.c
  trunk/lib/uwildmat.c
  trunk/tests/lib/headers-t.c
  trunk/tests/lib/uwildmat-t.c

------------------------+
 doc/pod/news.pod       |    6 ++++++
 lib/headers.c          |   21 +++++++++++++--------
 lib/uwildmat.c         |   21 +++++++++++++++------
 tests/lib/headers-t.c  |   11 ++++++++++-
 tests/lib/uwildmat-t.c |    3 ++-
 5 files changed, 46 insertions(+), 16 deletions(-)

Modified: doc/pod/news.pod
===================================================================

--- doc/pod/news.pod	2018-12-29 13:49:20 UTC (rev 10324)
+++ doc/pod/news.pod	2019-02-04 14:21:58 UTC (rev 10325)
@@ -20,6 +20,12 @@
 
 =item *
 
+Fixed a regression since S<INN 2.6.1> that prevented articles with
+internationalized header fields (that is to say encoded in UTF-8)
+from being posted.
+
+=item *
+
 Support for S<Python 3> has been added to INN.  Embedded Python filtering
 and authentication hooks for B<innd> and B<nnrpd> can now use S<version
 3.3.0> or later of the Python interpreter.  In the 2.x series, S<version

Modified: lib/headers.c
===================================================================
--- lib/headers.c	2018-12-29 13:49:20 UTC (rev 10324)
+++ lib/headers.c	2019-02-04 14:21:58 UTC (rev 10325)
@@ -40,6 +40,7 @@
 /*
 **  Check whether the argument is a valid header field body.  It starts
 **  after the space following the header field name and its colon.
+**  Internationalized header fields encoded in UTF-8 are allowed.
 **
 **  We currently assume the maximal line length has already been checked.
 */
@@ -52,13 +53,11 @@
     if (p == NULL || *p == '\0')
         return false;
 
+    if (!is_valid_utf8(p))
+        return false;
+
     for (; *p != '\0'; p++) {
-        if (isgraph((unsigned char) *p)) {
-            /* Current header content line contains a (non-whitespace)
-             * printable char. */
-            emptycontentline = false;
-            continue;
-        } else if (ISWHITE(*p)) {
+        if (ISWHITE(*p)) {
             /* Skip SP and TAB. */
             continue;
         } else if (*p == '\n' || (*p == '\r' && *++p == '\n')) {
@@ -75,9 +74,15 @@
              * re-initialize emptycontentline to true. */
             emptycontentline = true;
             continue;
+        } else if (p[-1] == '\r') {
+            /* Case of CR not followed by LF (handled at the previous
+             * if statement). */
+            return false;
         } else {
-            /* Invalid character found. */
-            return false;
+            /* Current header content line contains a (non-whitespace)
+             * character. */
+            emptycontentline = false;
+            continue;
         }
     }
 

Modified: lib/uwildmat.c
===================================================================
--- lib/uwildmat.c	2018-12-29 13:49:20 UTC (rev 10324)
+++ lib/uwildmat.c	2019-02-04 14:21:58 UTC (rev 10325)
@@ -63,6 +63,8 @@
 
 #include "config.h"
 #include "clibrary.h"
+#include <ctype.h>
+
 #include "inn/libinn.h"
 
 #define ABORT -1
@@ -100,7 +102,8 @@
 
 
 /*
-**  Check whether a string contains only valid UTF-8 characters.
+**  Check whether a string contains only valid UTF-8 characters, without
+**  any ASCII control characters except for \r, \n and \t.
 */
 bool
 is_valid_utf8(const char *text)
@@ -120,10 +123,16 @@
 
         p++;
 
-        /* Valid ASCII. */
-        if (length == 0)
-            continue;
-        
+        /* Valid printable ASCII character or CR, LF or HTAB. */
+        if (length == 0) {
+            if(isprint((unsigned char) p[-1])
+               || p[-1] == '\r' || p[-1] == '\n' || p[-1] == '\t') {
+                continue;
+            } else {
+                return false;
+            }
+        }
+
         /* Invalid length. */
         if (length < 2 || length > 6)
             return false;
@@ -350,7 +359,7 @@
         return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
     end = start + strlen((const char *) start) - 1;
 
-    /* Main match loop.  Find each comma that separates patterns, and attempt 
+    /* Main match loop.  Find each comma that separates patterns, and attempt
        to match the text with each pattern in order.  The last matching
        pattern determines whether the whole expression matches. */
     for (; p <= end + 1; p = split + 1) {

Modified: tests/lib/headers-t.c
===================================================================
--- tests/lib/headers-t.c	2018-12-29 13:49:20 UTC (rev 10324)
+++ tests/lib/headers-t.c	2019-02-04 14:21:58 UTC (rev 10325)
@@ -12,7 +12,7 @@
 int
 main(void)
 {
-    plan(9+3+9+7+12+5);
+    plan(9+3+11+8+14+7);
 
     ok(!IsValidHeaderName(NULL), "bad header name 1");
     ok(!IsValidHeaderName(""), "bad header name 2");
@@ -38,6 +38,8 @@
     ok(!IsValidHeaderBody("\r\n b"), "bad header body 7");
     ok(!IsValidHeaderBody("a\r\n b\r\n"), "bad header body 8");
     ok(!IsValidHeaderBody("a\n\tb\n \t\n c"), "bad header body 9");
+    ok(!IsValidHeaderBody("a\003b"), "bad header body 10");
+    ok(!IsValidHeaderBody("a\r b"), "bad header body 11");
 
     ok(IsValidHeaderBody(":"), "good header body 1");
     ok(IsValidHeaderBody("a b"), "good header body 2");
@@ -46,6 +48,7 @@
     ok(IsValidHeaderBody("a\r\n\tb"), "good header body 5");
     ok(IsValidHeaderBody("a\n   b"), "good header body 6");
     ok(IsValidHeaderBody("a\n\tb\n \tc\n d"), "good header body 7");
+    ok(IsValidHeaderBody("\317\205\317\204\317\2068"), "good header body 8");
 
     ok(!IsValidHeaderField(NULL), "bad header field 1");
     ok(!IsValidHeaderField(""), "bad header field 2");
@@ -59,6 +62,8 @@
     ok(!IsValidHeaderField("\177Subject: a"), "bad header field 10");
     ok(!IsValidHeaderField("Subject: a\177b"), "bad header field 11");
     ok(!IsValidHeaderField("Subject: a\nb"), "bad header field 12");
+    ok(!IsValidHeaderField("UT\317\2068: a"), "bad header field 13");
+    ok(!IsValidHeaderField("Control\004: a"), "bad header field 14");
 
     ok(IsValidHeaderField("Subject: a"), "good header field 1");
     ok(IsValidHeaderField("Subject: a\n\tb"), "good header field 2");
@@ -65,6 +70,10 @@
     ok(IsValidHeaderField("Sub: ject"), "good header field 3");
     ok(IsValidHeaderField("X-#%-T`?!: yeah"), "good header field 4");
     ok(IsValidHeaderField("Subject: a\r\n\tb"), "good header field 5");
+    ok(IsValidHeaderField("Newsgroups: local.\317\205\317\204\317\2068"),
+       "good header field 6");
+    ok(IsValidHeaderField("Subject: \317\205\317\204\317\2068\r\n testing"),
+       "good header field 7");
 
     return 0;
 }

Modified: tests/lib/uwildmat-t.c
===================================================================
--- tests/lib/uwildmat-t.c	2018-12-29 13:49:20 UTC (rev 10324)
+++ tests/lib/uwildmat-t.c	2019-02-04 14:21:58 UTC (rev 10325)
@@ -58,7 +58,7 @@
 int
 main(void)
 {
-    test_init(187);
+    test_init(188);
 
     /* Basic wildmat features. */
     test_r(  1, "foo",            "foo",               true);
@@ -276,6 +276,7 @@
                                                        false);
     test_v(186, "",                                    true);
     test_v(187, "a\303\251b\303\0c",                   false);
+    test_v(188, "two words",                           true);
     
     return 0;
 }