Author: tille
Date: 2008-08-15 03:17:31 +0000 (Fri, 15 Aug 2008)
New Revision: 2410

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Parsing Alioth should work


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-08-13 
01:25:27 UTC (rev 2409)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-08-15 
03:17:31 UTC (rev 2410)
@@ -38,7 +38,8 @@
 my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive 
Administrator', 'hostmaster',
                 'Debian-med-request', 'Debian testing watch', 'Debian Bug 
Tracking System',
                 'Skolelinux archive Installer', 'Debian Wiki', 
'gentoo-\w+\+help',
-                'Debichem-commits');
+                'Debichem-commits', 'Weekly infolist of updatable packages for 
the debichem project',
+                'bts-link-upstream at lists.alioth.debian.org', 'DDPOMail 
robot');
 
 ## TODO: just consider mails containing these strings as SPAM
 ##       This has to be implemented in the code below
@@ -75,7 +76,7 @@
 
 foreach $project (@PROJECTS) {
     $ALLPROJECTS{$project} = { 'url'     => "${BASEURL}-${project}",
-                              'type'    => 0 # == lists.debian.org
+                              'type'    => 0, # == lists.debian.org
     };
 }
 
@@ -85,14 +86,23 @@
     };
 }
 
+my $SEPARATOR='<!-- -->';
+# different mailing list systems use different separators between message URL, 
subject and author
+my @SEP1 = ( '<li><strong>.*href="', '\s*' );
+my @SEP2 = ( '">',                   "\\s*<!-- -->\\s*" );
+my @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+my @SEP4 = ( '</em>',                '\s*');
+
 # foreach $project (keys %ALLPROJECTS) {
 #    print "$project: $ALLPROJECTS{$project}{'url'}, 
$ALLPROJECTS{$project}{'type'}\n"
 #}
 
+my ($query, $daten);
+
 foreach $project (keys %ALLPROJECTS) {
     # Remove database entries for this project
-    my $query  = "DELETE FROM listarchive WHERE project = '$project'";
-    my($daten) = $dbh->prepare_cached($query);
+    $query = "DELETE FROM listarchive WHERE project = '$project'";
+    $daten = $dbh->prepare_cached($query);
     $daten->execute() ;
     $daten->finish() ;
 
@@ -103,12 +113,13 @@
     my $URL="$ALLPROJECTS{$project}{'url'}";
     my ( $year, $month, $url, @data, @lines ) ;
     my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) 
;
+    my $type = $ALLPROJECTS{$project}{'type'};
     for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
        foreach $month (@MONTHES) {
            if ( $year == $YEAREND && $month == $MONTHEND ) {
                last;
            }
-           if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+           if ( $type == 0 ) {
                $url = "${URL}/${year}/${month}/";
            } else {
                $url = "${URL}/${year}-$monthdict{$month}/";
@@ -121,7 +132,7 @@
            my $spamlines    = 0;
            my $robotlines   = 0;
            while ( $url =~ /.+/ ) { # if only one page $url is set to ''
-               print "DEBUG: $year-$month: $url\n";
+               # print "DEBUG: $year-$month: $url\n";
                my $uri = URI->new($url);
                my $indexpage = $ua->get($url, Host => $uri->host );
                unless ( $indexpage->is_success ) { # some mailing lists 
startet later ...
@@ -131,48 +142,49 @@
                    unlink($datafile);
                    next;
                } ; 
-               if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+               if ( $type == 1 ) {
                    # make sure the loop will end in case of Alioth lists.  
Seems these list do
                    # not feature more than one page per Month so there is no 
point in looping over them
                    $url = '';
                }
-               if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+               if ( $type == 0 ) {
                    @data = $indexpage->content =~ 
m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
                } else {
                    my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> 
<i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message 
date:</b></a>.*#gs;
-                   @data = ();
+                   my $tmpdata = '';
+                    my $tmpline = '';
                    foreach $content (@tmpdata) {
                        @lines = split(/(\n)/, $content);
-                       foreach (@lines) {
+                       foreach $line (@lines) {
+                           $_ = $line;
                            s/\s+/ /g;
                            if ( $_ =~ /^\s*$/   || $_ =~ /^<!--\d+ /      ||
                                 $_ =~ /^<\/I>$/ || $_ =~ /^\s*<\/?p>\s*$/ ||
                                  $_ =~ /^\s*<\/?UL>\s*$/i ||
                                 $_ =~ /^<\/A><A NAME="\d+">&nbsp;<\/A>$/ ) { 
next ; }
-                           if ( ($subject) = $_ =~ /^\s*<LI><A 
HREF="\d+.html">\[[-\w]+\]\s*(.+)$/ ) {
+                           if ( ($msgurl, $subject) = $_ =~ /^\s*<LI><A 
HREF="(\d+.html)">\[[-\w]+\]\s*(.+)$/ ) {
                                $_ = $subject ;
                                $_ =~ s/^\s*Re:\s*//i ;       # Remove Re:
-                               @data = (@data, $subject) ;
+                               $_ =~ s/^\s*//i ;             # Remove blanks
+                               $tmpline = $msgurl . $SEPARATOR . $subject ;
                            } else {
                                if ( $_ =~ /<I>/ || $_ =~ /<b>Messages:<\/b>/ ) 
{
-                                   @data = (@data, "$_\n" ) ;
+                                   $tmpline = "$_\n"  ;
                                } else {
-                                   @data = (@data, "$_" ) ;
+                                   $tmpline = "$_"  ;
                                }
                            }
+                           $tmpdata = $tmpdata . $tmpline;
                        }
                    }
-                   if ( $storefiles ) {
-                       print HTMLSNIP "@data\n";
-                   }
+                   @data = ($tmpdata);
                }
                foreach $content (@data) {
                    @lines = split(/(\n)/, $content);
-                   # print "------> @lines\n" ;
                    my $linestart = '';
                    foreach $line (@lines) {
                        if ( $line =~ /^\s*$/) { next ; }
-                       if ( $linestart =~ /.+/ ) {
+                       if ( $linestart =~ /.+/ && $type == 0 ) {
                            if ( $line =~ /^\s*<\/?ul>\s*$/ || 
                                 $line =~ /^\s*<\/?li>\s*$/ ) {
                                # fix broken formatting if there is a useless 
EOL and next line is <ul> or </li>
@@ -189,8 +201,16 @@
                             $line =~ /^\s*<li><em>Message not available<\/em>/ 
||
                             $line =~ /<em>\(continued\)<\/em>\s*$/ ||
                             $line =~ /^\s*$/) { next ; }
+# @SEP1 = ( '<li><strong>.*href="', '\s*' );
+# @SEP2 = ( '">',                   "\s*$SEPARATOR\s*" );
+# @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+# @SEP4 = ( '</em>',                '\s*');
+                       # print "DEBUG: $line\n";
+                       if ( $storefiles ) {
+                           print HTMLSNIP "$line\n";
+                       }
                        if ( ($msgurl, $subject, $author) = 
-                              $line =~ 
m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+                              $line =~ 
m#$SEP1[$type]([msg]*\d+\.html)$SEP2[$type](.+)$SEP3[$type](.+)$SEP4[$type]#gs 
) {
                            $_ = $subject ;
                            $_ =~ s/^Re:\s*//i ;       # Remove Re:
                            $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other 
list markers (but only if something is following)
@@ -229,31 +249,39 @@
                                }
                            }
                        } else {
-                           if ( ($messages, $page, $pages) = $line 
-                                =~ m#The last update .* There are (\d+) 
messages. Page (\d+) of (\d+).<br>#gs ) {
-                               if ( $page != $pages ) { # handle following 
pages
-                                   print "Warning: Page $page of $pages in 
$year/$month of $project\n";
-                                   $page++;
-                                   $url = 
"${URL}/${year}/${month}/thrd${page}.html";
+                           if ( $type == 0 ) {
+                               if ( ($messages, $page, $pages) = $line 
+                                    =~ m#The last update .* There are (\d+) 
messages. Page (\d+) of (\d+).<br>#gs ) {
+                                   if ( $page != $pages ) { # handle following 
pages
+                                       print "Warning: Page $page of $pages in 
$year/$month of $project\n";
+                                       $page++;
+                                       $url = 
"${URL}/${year}/${month}/thrd${page}.html";
+                                   } else {
+                                       $url = '';
+                                   }
+                                   if ( $storefiles ) {
+                                       print HTMLSNIP "$messages Messages 
($messagelines real messages, $spamlines SPAM, $robotlines messages by 
robots)\n";
+                                   }
+                                   if ( $messages != $messagelines + 
$spamlines + $robotlines ) {
+                                       print "Warning: $project $year/$month 
counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page 
says $messages\n";
+                                   }
                                } else {
-                                   $url = '';
+                                   unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { 
# sometimes there are continued lines ...
+                                       $linestart = $line;
+                                       ##next ; ##### ??????? if this line is 
missing line we get $linestart$linestart ...
+                                   } else {
+                                       if ( $line =~ 
/<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+                                           print "Potential SPAM line - no 
author: $project $year-$month\n";
+                                           $spamlines++ ;
+                                       } else {
+                                           print "Warning: unknown Line: 
$line\n";
+                                       }
+                                   }
                                }
-                               if ( $storefiles ) {
-                                   print HTMLSNIP "$messages Messages 
($messagelines real messages, $spamlines SPAM, $robotlines messages by 
robots)\n";
-                               }
-                               if ( $messages != $messagelines + $spamlines + 
$robotlines ) {
-                                   print "Warning: $project $year/$month 
counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page 
says $messages\n";
-                               }
                            } else {
-                               unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # 
sometimes there are continued lines ...
-                                   $linestart = $line;
-                                   ##next ; ##### ??????? if this line is 
missing line we get $linestart$linestart ...
-                               } else {
-                                   if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ 
) { # sometimes SPAM has no sender ...
-                                       print "Potential SPAM line - no author: 
$project $year-$month\n";
-                                       $spamlines++ ;
-                                   } else {
-                                       print "Warning: unknown Line: $line\n";
+                               if ( ($messages) = $line =~ 
m#^\s*<b>Messages:</b>\s*(\d+)<p>#gs ) {
+                                   if ( $storefiles ) {
+                                       print HTMLSNIP "$messages Messages 
($messagelines real messages, $spamlines SPAM, $robotlines messages by 
robots)\n";
                                    }
                                }
                            }
@@ -271,9 +299,16 @@
 
 # Database has shown that Ralf Gsellenstetter is posting with several names
 # in Debian Edu.  This script cleans up this
-system("./0fix_ralf_edu");
+# system("./0fix_ralf_edu");
+$query = "UPDATE listarchive SET author = 'Ralf Gesellensetter' WHERE project 
= 'edu' AND author LIKE 'Ralf%setter';" ;
+$query = "UPDATE listarchive SET author = 'Vagrant Cascadian'   WHERE project 
= 'edu' AND author LIKE '%vagrant%';" ;
 
+$daten = $dbh->prepare_cached($query);
+$daten->execute() ;
+$daten->finish() ;
+
+
 # Just do the graphing of all lists we got
-foreach $project (@PROJECTS) {
+foreach $project (keys %ALLPROJECTS) {
     system("./author_stats $project") ;
 }


_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to