INN commit: trunk (3 files)

INN Commit Russ_Allbery at isc.org
Sun Feb 1 16:00:02 UTC 2009


    Date: Sunday, February 1, 2009 @ 08:00:01
  Author: iulius
Revision: 8315

Add encoding to newgroup processing:  controlchan now looks
for the charset and tries to convert the description into
the local encoding.
Defaults are cp1252 for the incoming encoding (unless the charset
is properly set in the newgroup message, or control.ctl
dictates another charset) and utf-8 for the local encoding.

Add a few encoding exceptions to control.ctl.local.

(The man pages will be updated when checkgroups are also
processed the same way.)

see #4

Modified:
  trunk/control/controlchan.in
  trunk/control/modules/newgroup.pl
  trunk/samples/control.ctl.local

-----------------------------+
 control/controlchan.in      |   62 +++++++++++++++++++++++++++++++-----------
 control/modules/newgroup.pl |   33 +++++++++++++++++++++-
 samples/control.ctl.local   |   27 ++++++++++++++++++
 3 files changed, 106 insertions(+), 16 deletions(-)

Modified: control/controlchan.in
===================================================================
--- control/controlchan.in	2009-01-31 22:04:46 UTC (rev 8314)
+++ control/controlchan.in	2009-02-01 16:00:01 UTC (rev 8315)
@@ -31,6 +31,7 @@
 
 require 5.004_03;
 use MIME::Parser;
+use Encode;
 use strict;
 
 delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
@@ -131,9 +132,10 @@
     $progname = $1;
 
     # Do we want to process the message?  Let's check the permissions.
-    my ($action, $logname, $newsgrouppats) =
+    my @charset_from;
+    my ($action, $logname, $newsgrouppats, $charset_to) =
         ctlperm($progname, $sender, $progparams[0],
-                $token, $article);
+                $token, $article, \@charset_from);
 
     if ($action eq 'drop') {
         $parser->filer->purge;
@@ -185,7 +187,8 @@
         . ($logname ? "=$logname" : '') .", $approved");
 
     &$subfind(\@progparams, $sender, $replyto, $sitepath,
-        $action, $logname, $approved, $article);
+        $action, $logname, $approved, $article,
+        \@charset_from, $charset_to);
 
     $parser->filer->purge;
 }
@@ -261,11 +264,14 @@
             chop;
             # Not a comment or blank?  Convert wildmat to regex.
             next if not /^(\s+)?[^\#]/ or /^$/;
-            if (not /:(?:doit|doifarg|drop|log|mail|verify-.*)(?:=.*)?$/) {
+
+            if (not /^\/(?:local)?encoding\/:/ and
+                not /:(?:doit|doifarg|drop|log|mail|verify-.*)(?:=.*)?$/) {
                 s/.*://;
                 logmsg("$_ is not a valid action for control.ctl", 'err');
                 next;
             }
+
             # Convert to a ':'-separated list of regexps.
             s/^all:/*:/i;
             s/([\$\+\.])/\\$1/g;
@@ -274,6 +280,8 @@
             s/(.*)/^$1\$/;
             s/:/\$:^/g;
             s/\|/\$|^/g;
+            s/\//\\\//g;
+
             push(@ctllist, $_);
         }
         close(CTLFILE);
@@ -285,10 +293,10 @@
 
 # Parse a control message's permissions.
 sub ctlperm {
-    my ($type, $sender, $newsgroup, $token, $article) = @_;
+    my ($type, $sender, $newsgroup, $token, $article, $charset_from) = @_;
 
-    my $action = 'drop';    # default
-    my ($logname, $hier);
+    my $action = 'drop';       # Default action.
+    my ($logname, $hier, $charset_to);
 
     # newgroup and rmgroup require newsgroup names; check explicitly for that
     # here and return drop if the newsgroup is missing (to avoid a bunch of
@@ -300,21 +308,45 @@
     }
 
     my $ctllist = readctlfile();
+    my $matchedaction = 0;
+    my $matchedencoding = 0;
     foreach (@$ctllist) {
         my @ctlline = split /:/;
         # 0: type  1: from at addr  2: group.*  3: action
         if ($type =~ /$ctlline[0]/ and $sender =~ /$ctlline[1]/i and
             ($type !~ /(?:new|rm)group/ or $newsgroup =~ /$ctlline[2]/)) {
-            $action = $ctlline[3];
-            $action =~ s/\^(.+)\$/$1/;
-            $action =~ s/\\//g;
-            $hier = $ctlline[2] if $type eq 'checkgroups';
-            # @ctllist is a reversed list so the first match is the last
-            # one in control.ctl followed by control.ctl.local.
-            last;
+            if (not $matchedaction) {
+                $action = $ctlline[3];
+                $action =~ s/\^(.+)\$/$1/;
+                $action =~ s/\\//g;
+                $hier = $ctlline[2] if $type eq 'checkgroups';
+                # @ctllist is a reversed list so the first match is the last
+                # one in control.ctl followed by control.ctl.local.
+                $matchedaction = 1;
+            }
         }
+        # 0: /localencoding/  1: encoding
+        if ($ctlline[0] eq '^\/localencoding\/$') {
+            if (not $matchedencoding) {
+                $charset_to = $ctlline[1];
+                $charset_to =~ s/\^(.+)\$/$1/;
+                $charset_to =~ s/\\//g;
+                $matchedencoding = 1;
+            }
+        }
+        # 0: /encoding/  1: from at addr  2: group.*  3: encoding[=force]
+        if ($ctlline[0] eq '^\/encoding\/$') {
+            if ($sender =~ /$ctlline[1]/i) {
+                push (@$charset_from, $ctlline[2].':'.$ctlline[3]);
+            }
+        }
     }
 
+    if (not defined $charset_to
+        or not defined Encode::find_encoding($charset_to)) {
+        $charset_to = 'UTF-8';  # Default local encoding.
+    }
+
     ($action, $logname) = split(/=/, $action);
 
     if ($action =~ /^verify-(.+)/) {
@@ -332,7 +364,7 @@
         }
     }
 
-    return ($action, $logname, $hier);
+    return ($action, $logname, $hier, $charset_to);
 }
 
 # Write stuff to a log or send mail to the news admin.

Modified: control/modules/newgroup.pl
===================================================================
--- control/modules/newgroup.pl	2009-01-31 22:04:46 UTC (rev 8314)
+++ control/modules/newgroup.pl	2009-02-01 16:00:01 UTC (rev 8315)
@@ -19,7 +19,7 @@
 
 sub control_newgroup {
     my ($par, $sender, $replyto, $site, $action, $log, $approved,
-        $article) = @_;
+        $article, $charset_from, $charset_to) = @_;
     my ($groupname, $modflag) = @$par;
 
     my $head = $article->head;
@@ -28,6 +28,11 @@
     my (@body, $part, $part_head);
     my $mimegroupinfo = 0;
 
+    my $charset_message;
+    if (defined $head->mime_attr('Content-Type.charset')) {
+        $charset_message = $head->mime_attr('Content-Type.charset');
+    }
+
     # Check if it is a multipart message.  The body is restricted to
     # the application/news-groupinfo part, if any.
     if ($article->parts > 0) {
@@ -36,6 +41,9 @@
 
             if ($part_head->mime_type eq 'application/news-groupinfo') {
                 @body = split(/\r?\n/, $part->stringify_body);
+                if (defined $part_head->mime_attr('Content-Type.charset')) {
+                    $charset_message = $part_head->mime_attr('Content-Type.charset');
+                }
                 $mimegroupinfo = 1;
             }
         }
@@ -50,6 +58,24 @@
 
     @body = @fullbody if not $mimegroupinfo;
 
+    # Find the right charset if absent or forced by control.ctl.
+    foreach (@$charset_from) {
+        my ($group, $charset) = split /:/;
+        if ($groupname =~ /$group/) {
+            if (not defined $charset_message or $charset =~ /=force/) {
+                $charset_message = $charset;
+                $charset_message =~ s/\^(.+)\$/$1/;
+                $charset_message =~ s/\\//g;
+                $charset_message =~ s/=force//;
+            }
+            last;
+        }
+    }
+    if (not defined $charset_message
+        or not defined Encode::find_encoding($charset_message)) {
+        $charset_message = "cp1252";  # Default charset, when undefined.
+    }
+
     $modflag ||= '';
     my $modcmd = $modflag eq 'moderated' ? 'm' : 'y';
 
@@ -98,11 +124,13 @@
 
     if ($found) {
       ($ngname, $ngdesc) = split(/\s+/, $ngline, 2);
+
       if ($ngdesc) {
           $ngdesc =~ s/\s+$//;
           $ngdesc =~ s/\s+\(moderated\)\s*$//i;
           $ngdesc .= ' (Moderated)' if $modflag eq 'moderated';
       }
+
       # Scan newsgroups to see the previous description, if any.
       open(NEWSGROUPS, $INN::Config::newsgroups)
           or logdie("Cannot open $INN::Config::newsgroups: $!");
@@ -115,6 +143,9 @@
       close NEWSGROUPS;
     }
 
+    # Properly encode the newsgroup description.
+    Encode::from_to($ngdesc, $charset_message, $charset_to);
+
     if (@oldgroup) {
         if ($oldgroup[3] eq 'm' and $modflag ne 'moderated') {
             $status = 'be made unmoderated';

Modified: samples/control.ctl.local
===================================================================
--- samples/control.ctl.local	2009-01-31 22:04:46 UTC (rev 8314)
+++ samples/control.ctl.local	2009-02-01 16:00:01 UTC (rev 8315)
@@ -5,5 +5,32 @@
 ##  It defines local access control for control messages.
 ##  The rules in this file are executed after those defined
 ##  in control.ctl.
+##  Lines are matched in order and the last matching line
+##  will be used.
+##
 ##  See the control.ctl man page for more information.
 
+##  Output encoding for the newsgroups file.
+/localencoding/:utf-8
+
+##  Incoming encodings in newgroup and checkgroups control articles.
+
+# Default (for any description).
+/encoding/:*:*:cp1252
+
+/encoding/:*:cn.*:gb18030
+/encoding/:*:han.*:gb18030
+
+/encoding/:*:fido7.*:cp1251
+/encoding/:*:medlux.*:cp1251
+/encoding/:*:ukr.*:cp1251
+
+/encoding/:*:fr.*:iso-8859-15
+
+/encoding/:*:nctu.*:big5
+/encoding/:*:ncu.*:big5
+/encoding/:*:tw.*:big5
+/encoding/:*:scout.forum.chinese:big5
+/encoding/:*:scout.forum.korean:big5
+
+/encoding/:*:fido.*:utf-8




More information about the inn-committers mailing list