Author: tille Date: 2008-07-26 17:47:25 +0000 (Sat, 26 Jul 2008) New Revision: 2329
Modified: trunk/community/talks/200808_debconf8/get-archive-pages Log: Remove some obvious spam entries Modified: trunk/community/talks/200808_debconf8/get-archive-pages =================================================================== --- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 17:22:18 UTC (rev 2328) +++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 17:47:25 UTC (rev 2329) @@ -8,6 +8,7 @@ my $BASEURL = "http://lists.debian.org/debian" ; my @PROJECTS = ('med', 'edu', 'jr') ; my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'); +my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator'); # Debian-Jr starts in 2000 my $YEARSTART = 2000; @@ -33,60 +34,67 @@ last; } my $url = "${URL}/${year}/${month}/"; - #print "$year-$month: $url\n"; - my $uri = URI->new($url); - my $page = $ua->get($url, Host => $uri->host ); - unless ( $page->is_success ) { next } ; # some mailing lists startet later ... - (my @data) = $page->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs; - #print "$year-$month\n$data\n"; - my $datafile = "${year}-${month}" ; - unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); } - my ($content, $subject, $author, $messages, $pages) ; - foreach $content (@data) { - my @lines = split(/(\n)/, $content); - # print "------> @lines\n" ; - my $line; - my $linestart = ''; - my $messagelines = 0; - my $spamlines = 0; - foreach $line (@lines) { - if ( $linestart ) { - $line = $linestart . $line; - $linestart = ''; - } - if ( $line =~ /^\s*<\/?ul>\s*$/ || - $line =~ /^\s*<\/?li>\s*$/ || - $line =~ /^\s*<li>[^<]+<\/li>\s*$/ || - $line =~ /^\s*<li><em>Message not available<\/em>/ || - $line =~ /^\s*$/) { next ; } - if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) { - $_ = $subject ; - $_ =~ s/^Re:\s*//i ; # Remove Re: - $_ =~ s/^\[[^\]]+\]\s*// ; # Remove other list markers - $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd) - $subject = $_ ; - print HTMLSNIP "$subject ; $author\n"; - $messagelines++ ; - } else { - if ( ($messages, $pages) = $line - =~ m#The last update .* There are (\d+) messages. Page 1 of (\d+).<br>#gs ) { - if ( $pages > 1 ) { - print "Warning: More than one page ($pages) in $year/$month of $project\n"; - } - print HTMLSNIP "$messages Messages (counted $messagelines)\n"; - if ( $messages != $messagelines + $spamlines ) { - print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n"; - } + while ( $url =~ /.+/ ) { # if only one page $url is set to '' + # print "$year-$month: $url\n"; + my $uri = URI->new($url); + my $indexpage = $ua->get($url, Host => $uri->host ); + unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ... + (my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs; + #print "$year-$month\n$data\n"; + my $datafile = "${year}-${month}" ; + unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); } + my ($content, $subject, $author, $messages, $pages, $page) ; + foreach $content (@data) { + my @lines = split(/(\n)/, $content); + # print "------> @lines\n" ; + my $line; + my $linestart = ''; + my $messagelines = 0; + my $spamlines = 0; + foreach $line (@lines) { + if ( $linestart =~ /.+/ ) { + $line = $linestart . $line; + print "DEBUG: Whole line is $line\n" ; + $linestart = ''; + } + if ( $line =~ /^\s*<\/?ul>\s*$/ || + $line =~ /^\s*<\/?li>\s*$/ || + $line =~ /^\s*<li>[^<]+<\/li>\s*$/ || + $line =~ /^\s*<li><em>Message not available<\/em>/ || + $line =~ /^\s*$/) { next ; } + if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) { + $_ = $subject ; + $_ =~ s/^Re:\s*//i ; # Remove Re: + $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following) + $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd) + $subject = $_ ; + print HTMLSNIP "$subject ; $author\n"; + $messagelines++ ; } else { - unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ... - print "DEBUG: Continued line $line\n" ; - $linestart = $line; + if ( ($messages, $page, $pages) = $line + =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) { + if ( $page != $pages ) { # handle following pages + print "Warning: Page %page of $pages in $year/$month of $project\n"; + $page++; + $url = "$url/thrd${page}.html"; + } else { + $url = ''; + } + print HTMLSNIP "$messages Messages (counted $messagelines)\n"; + if ( $messages != $messagelines + $spamlines ) { + print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n"; + } } else { - if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ... - print "Warning: Potential SPAM line: $line\n"; - $spamlines++ ; + unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ... + print "DEBUG: Continued line $line\n" ; + $linestart = $line; } else { - print "Warning: unknown Line: $line\n"; + if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ... + print "Warning: Potential SPAM line: $line\n"; + $spamlines++ ; + } else { + print "Warning: unknown Line: $line\n"; + } } } } _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit
