Author: tille
Date: 2008-07-26 20:04:47 +0000 (Sat, 26 Jul 2008)
New Revision: 2331
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
Log: several fixes

Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 18:03:08 UTC (rev 2330)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 20:04:47 UTC (rev 2331)
@@ -8,7 +8,7 @@
 my $BASEURL = "http://lists.debian.org/debian" ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
-my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator');
+my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -57,8 +57,16 @@
     my $line;
     my $linestart = '';
     foreach $line (@lines) {
+      if ( $line =~ /^\s*$/) { next ; }
       if ( $linestart =~ /.+/ ) {
-        $line = $linestart . $line;
+        if ( $line =~ /^\s*<\/?ul>\s*$/ ||
+             $line =~ /^\s*<\/?li>\s*$/ ) {
+          # fix broken formatting if there is a useless EOL and next line is <ul> or </li>
+          $line = $linestart;
+        } else {
+          # Append next line
+          $line = $linestart . $line;
+        }
         print "DEBUG: Whole line is $line\n" ;
         $linestart = '';
       }
@@ -66,6 +74,7 @@
            $line =~ /^\s*<\/?li>\s*$/ ||
            $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
            $line =~ /^\s*<li><em>Message not available<\/em>/ ||
+           $line =~ /^<em>(continued)<\/em>\s*$/ ||
            $line =~ /^\s*$/) { next ; }
       if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
         $_ = $subject ;
@@ -74,7 +83,7 @@
         $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
         $subject = $_ ;
         if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
-          print "Warning: Potential SPAM line: $line\n";
+          print "Potential SPAM line - strange subject: $project $year-$month: $subject\n";
           $spamlines++ ;
         } else {
           print HTMLSNIP "$subject ; $author\n";
@@ -98,9 +107,10 @@
         unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
           print "DEBUG: Continued line $line\n" ;
           $linestart = $line;
+          ##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
         } else {
           if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
-            print "Warning: Potential SPAM line: $line\n";
+            print "Potential SPAM line - no author: $project $year-$month\n";
             $spamlines++ ;
           } else {
             print "Warning: unknown Line: $line\n";

_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit
