Author: tille
Date: 2008-07-26 18:03:08 +0000 (Sat, 26 Jul 2008)
New Revision: 2330

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Fixed some bugs, better SPAM detection


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 17:47:25 UTC (rev 2329)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-07-26 18:03:08 UTC (rev 2330)
@@ -34,23 +34,28 @@
                last;
            }
            my $url = "${URL}/${year}/${month}/";
+           my $datafile = "${year}-${month}" ;
+           unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
+           my $messagelines = 0;
+           my $spamlines = 0;
            while ( $url =~ /.+/ ) { # if only one page $url is set to ''
                # print "$year-$month: $url\n";
                my $uri = URI->new($url);
                my $indexpage = $ua->get($url, Host => $uri->host );
-               unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ...
+               unless ( $indexpage->is_success ) { # some mailing lists startet later ...
+                   $url = '';
+                   close HTMLSNIP ;
+                   # remove empty file
+                   unlink($datafile);
+                   next;
+               } ; 
                (my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-               #print "$year-$month\n$data\n";
-               my $datafile = "${year}-${month}" ;
-               unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
                my ($content, $subject, $author, $messages, $pages, $page) ;
                foreach $content (@data) {
                    my @lines = split(/(\n)/, $content);
                    # print "------> @lines\n" ;
                    my $line;
                    my $linestart = '';
-                   my $messagelines = 0;
-                   my $spamlines = 0;
                    foreach $line (@lines) {
                        if ( $linestart =~ /.+/ ) {
                            $line = $linestart . $line;
@@ -68,13 +73,18 @@
                            $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
                            $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
                            $subject = $_ ;
-                           print HTMLSNIP "$subject ; $author\n";
-                           $messagelines++ ;
+                           if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
+                               print "Warning: Potential SPAM line: $line\n";
+                               $spamlines++ ;
+                           } else {
+                               print HTMLSNIP "$subject ; $author\n";
+                               $messagelines++ ;
+                           }
                        } else {
                            if ( ($messages, $page, $pages) = $line 
                                 =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
                                if ( $page != $pages ) { # handle following pages
-                                   print "Warning: Page %page of $pages in $year/$month of $project\n";
+                                   print "Warning: Page $page of $pages in $year/$month of $project\n";
                                    $page++;
                                    $url = "$url/thrd${page}.html";
                                } else {


_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to