Author: tille
Date: 2008-07-26 17:47:25 +0000 (Sat, 26 Jul 2008)
New Revision: 2329

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Remove some obvious spam entries


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 
17:22:18 UTC (rev 2328)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 
17:47:25 UTC (rev 2329)
@@ -8,6 +8,7 @@
 my $BASEURL  = "http://lists.debian.org/debian"; ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES  = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', 
'11', '12');
+my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive 
Administrator');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -33,60 +34,67 @@
                last;
            }
            my $url = "${URL}/${year}/${month}/";
-           #print "$year-$month: $url\n";
-           my $uri = URI->new($url);
-           my $page = $ua->get($url, Host => $uri->host );
-           unless ( $page->is_success ) { next } ; # some mailing lists 
startet later ...
-           (my @data) = $page->content =~ 
m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-           #print "$year-$month\n$data\n";
-           my $datafile = "${year}-${month}" ;
-           unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open 
$datafile"); }
-           my ($content, $subject, $author, $messages, $pages) ;
-           foreach $content (@data) {
-               my @lines = split(/(\n)/, $content);
-               # print "------> @lines\n" ;
-               my $line;
-               my $linestart = '';
-               my $messagelines = 0;
-               my $spamlines = 0;
-               foreach $line (@lines) {
-                   if ( $linestart ) {
-                       $line = $linestart . $line;
-                       $linestart = '';
-                   }
-                   if ( $line =~ /^\s*<\/?ul>\s*$/ || 
-                         $line =~ /^\s*<\/?li>\s*$/ ||
-                         $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
-                        $line =~ /^\s*<li><em>Message not available<\/em>/ ||
-                         $line =~ /^\s*$/) { next ; }
-                   if ( ($subject, $author) = $line =~ 
m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
-                       $_ = $subject ;
-                       $_ =~ s/^Re:\s*//i ;       # Remove Re:
-                       $_ =~ s/^\[[^\]]+\]\s*// ; # Remove other list markers
-                       $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
-                       $subject = $_ ;
-                       print HTMLSNIP "$subject ; $author\n";
-                       $messagelines++ ;
-                   } else {
-                       if ( ($messages, $pages) = $line 
-                               =~ m#The last update .* There are (\d+) 
messages. Page 1 of (\d+).<br>#gs ) {
-                           if ( $pages > 1 ) {
-                               print "Warning: More than one page ($pages) in 
$year/$month of $project\n";
-                           }
-                           print HTMLSNIP "$messages Messages (counted 
$messagelines)\n";
-                           if ( $messages != $messagelines + $spamlines ) {
-                               print "Warning: $project $year/$month counted 
$messagelines and $spamlines but page says $messages\n";
-                           }
+           while ( $url =~ /.+/ ) { # if only one page $url is set to ''
+               # print "$year-$month: $url\n";
+               my $uri = URI->new($url);
+               my $indexpage = $ua->get($url, Host => $uri->host );
+               unless ( $indexpage->is_success ) { $url = ''; next; } ; # some 
mailing lists startet later ...
+               (my @data) = $indexpage->content =~ 
m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+               #print "$year-$month\n$data\n";
+               my $datafile = "${year}-${month}" ;
+               unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open 
$datafile"); }
+               my ($content, $subject, $author, $messages, $pages, $page) ;
+               foreach $content (@data) {
+                   my @lines = split(/(\n)/, $content);
+                   # print "------> @lines\n" ;
+                   my $line;
+                   my $linestart = '';
+                   my $messagelines = 0;
+                   my $spamlines = 0;
+                   foreach $line (@lines) {
+                       if ( $linestart =~ /.+/ ) {
+                           $line = $linestart . $line;
+                           print "DEBUG: Whole line is $line\n" ;
+                           $linestart = '';
+                       }
+                       if ( $line =~ /^\s*<\/?ul>\s*$/ || 
+                            $line =~ /^\s*<\/?li>\s*$/ ||
+                            $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+                            $line =~ /^\s*<li><em>Message not available<\/em>/ 
||
+                            $line =~ /^\s*$/) { next ; }
+                       if ( ($subject, $author) = $line =~ 
m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+                           $_ = $subject ;
+                           $_ =~ s/^Re:\s*//i ;       # Remove Re:
+                           $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other 
list markers (but only if something is following)
+                           $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
+                           $subject = $_ ;
+                           print HTMLSNIP "$subject ; $author\n";
+                           $messagelines++ ;
                        } else {
-                           unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # 
sometimes there are continued lines ...
-                               print "DEBUG: Continued line $line\n" ;
-                               $linestart = $line;
+                           if ( ($messages, $page, $pages) = $line 
+                                =~ m#The last update .* There are (\d+) 
messages. Page (\d+) of (\d+).<br>#gs ) {
+                               if ( $page != $pages ) { # handle following 
pages
+                                   print "Warning: Page %page of $pages in 
$year/$month of $project\n";
+                                   $page++;
+                                   $url = "$url/thrd${page}.html";
+                               } else {
+                                   $url = '';
+                               }
+                               print HTMLSNIP "$messages Messages (counted 
$messagelines)\n";
+                               if ( $messages != $messagelines + $spamlines ) {
+                                   print "Warning: $project $year/$month 
counted $messagelines and $spamlines but page says $messages\n";
+                               }
                            } else {
-                               if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { 
# sometimes SPAM has no sender ...
-                                   print "Warning: Potential SPAM line: 
$line\n";
-                                   $spamlines++ ;
+                               unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # 
sometimes there are continued lines ...
+                                   print "DEBUG: Continued line $line\n" ;
+                                   $linestart = $line;
                                } else {
-                                   print "Warning: unknown Line: $line\n";
+                                   if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ 
) { # sometimes SPAM has no sender ...
+                                       print "Warning: Potential SPAM line: 
$line\n";
+                                       $spamlines++ ;
+                                   } else {
+                                       print "Warning: unknown Line: $line\n";
+                                   }
                                }
                            }
                        }


_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to