Author: tille
Date: 2008-07-26 16:23:02 +0000 (Sat, 26 Jul 2008)
New Revision: 2327

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Better parsing features.


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 15:30:38 UTC (rev 2326)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 16:23:02 UTC (rev 2327)
@@ -41,15 +41,23 @@
            #print "$year-$month\n$data\n";
            my $datafile = "${year}-${month}" ;
            unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open 
$datafile"); }
-           my ($content, $subject, $author) ;
+           my ($content, $subject, $author, $messages, $pages) ;
            foreach $content (@data) {
                my @lines = split(/(\n)/, $content);
                # print "------> @lines\n" ;
                my $line;
+               my $linestart = '';
+               my $messagelines = 0;
+               my $spamlines = 0;
                foreach $line (@lines) {
+                   if ( $linestart ) {
+                       $line = $linestart . $line;
+                       $linestart = '';
+                   }
                    if ( $line =~ /^\s*<\/?ul>\s*$/ || 
                          $line =~ /^\s*<\/?li>\s*$/ ||
                          $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+                        $line =~ /^\s*<li><em>Message not available<\/em>/ ||
                          $line =~ /^\s*$/) { next ; }
                    if ( ($subject, $author) = $line =~ 
m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
                        $_ = $subject ;
@@ -58,8 +66,30 @@
                        $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
                        $subject = $_ ;
                        print HTMLSNIP "$subject ; $author\n";
+                       $messagelines++ ;
                    } else {
-                       print HTMLSNIP "$line\n";
+                       if ( ($messages, $pages) = $line 
+                               =~ m#The last update .* There are (\d+) 
messages. Page 1 of (\d+).<br>#gs ) {
+                           if ( $pages > 1 ) {
+                               print "Warning: More than one page ($pages) in 
$year/$month of $project\n";
+                           }
+                           print HTMLSNIP "$messages Messages (counted 
$messagelines)\n";
+                           if ( $messages != $messagelines + $spamlines ) {
+                               print "Warning: $project $year/$month counted 
$messagelines and $spamlines but page says $messages\n";
+                           }
+                       } else {
+                           unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+                               print "DEBUG: Continued line $line\n" ;
+                               $linestart = $line;
+                           } else {
+                               if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+                                   print "Warning: Potential SPAM line: 
$line\n";
+                                   $spamlines++ ;
+                               } else {
+                                   print "Warning: unknown Line: $line\n";
+                               }
+                           }
+                       }
                    }
                }
            }


_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to