Author: tille
Date: 2008-07-26 20:04:47 +0000 (Sat, 26 Jul 2008)
New Revision: 2331

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
several fixes


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 18:03:08 UTC (rev 2330)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 20:04:47 UTC (rev 2331)
@@ -8,7 +8,7 @@
 my $BASEURL  = "http://lists.debian.org/debian"; ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES  = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', 
'11', '12');
-my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive 
Administrator');
+my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive 
Administrator', 'hostmaster');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -57,8 +57,16 @@
                    my $line;
                    my $linestart = '';
                    foreach $line (@lines) {
+                       if ( $line =~ /^\s*$/) { next ; }
                        if ( $linestart =~ /.+/ ) {
-                           $line = $linestart . $line;
+                           if ( $line =~ /^\s*<\/?ul>\s*$/ || 
+                                $line =~ /^\s*<\/?li>\s*$/ ) {
+                               # fix broken formatting if there is a useless 
EOL and next line is <ul> or </li>
+                               $line = $linestart;
+                           } else {
+                               # Append next line
+                               $line = $linestart . $line;
+                           }
                            print "DEBUG: Whole line is $line\n" ;
                            $linestart = '';
                        }
@@ -66,6 +74,7 @@
                             $line =~ /^\s*<\/?li>\s*$/ ||
                             $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
                             $line =~ /^\s*<li><em>Message not available<\/em>/ 
||
+                            $line =~ /^<em>(continued)<\/em>\s*$/ ||
                             $line =~ /^\s*$/) { next ; }
                        if ( ($subject, $author) = $line =~ 
m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
                            $_ = $subject ;
@@ -74,7 +83,7 @@
                            $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
                            $subject = $_ ;
                            if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
-                               print "Warning: Potential SPAM line: $line\n";
+                               print "Potential SPAM line - strange subject: 
$project $year-$month: $subject\n";
                                $spamlines++ ;
                            } else {
                                print HTMLSNIP "$subject ; $author\n";
@@ -98,9 +107,10 @@
                                unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # 
sometimes there are continued lines ...
                                    print "DEBUG: Continued line $line\n" ;
                                    $linestart = $line;
+                                   ##next ; ##### ??????? if this line is 
missing line we get $linestart$linestart ...
                                } else {
                                    if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ 
) { # sometimes SPAM has no sender ...
-                                       print "Warning: Potential SPAM line: 
$line\n";
+                                       print "Potential SPAM line - no author: 
$project $year-$month\n";
                                        $spamlines++ ;
                                    } else {
                                        print "Warning: unknown Line: $line\n";


_______________________________________________
debian-med-commit mailing list
debian-med-commit@lists.alioth.debian.org
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to