Author: tille
Date: 2008-08-10 23:36:05 +0000 (Sun, 10 Aug 2008)
New Revision: 2403

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Try to include alioth lists.  Not working now and some Debugging output added.


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages     2008-08-10 
01:25:03 UTC (rev 2402)
+++ trunk/community/talks/200808_debconf8/get-archive-pages     2008-08-10 
23:36:05 UTC (rev 2403)
@@ -10,8 +10,13 @@
 my @PROJECTS = ('med', 'edu', 'jr', 'accessibility', 'desktop', 'enterprise', 
'lex',
                 'nonprofit', 'science', 'custom',
                 'i18n', 'devel', 'project') ; # ... just for the sake of 
interest
+
+## DEBUG
[EMAIL PROTECTED] = ('enterprise'); # Just find a very short list while testing 
Alioth ...
+
 # Well, there is also interest in alioth lists ...
-my @ALIOTHPRJ= ('debichem-devel', 'pkg-grass-general') ;
+my $BASEALIOTH = 'http://lists.alioth.debian.org/pipermail/';
+my @ALIOTHPRJ  = ('debichem-devel', 'pkg-grass-general') ;
 
 ## 
http://lists.alioth.debian.org/pipermail/debichem-devel/2008-August/thread.html
 ## 
http://lists.alioth.debian.org/pipermail/pkg-grass-general/2008-July/thread.html
@@ -42,7 +47,7 @@
 
 # if != 0 then extract of mailing list archives is stored in files in dirs
 # The prefered method is to use only the database
-my $storefiles = 0;
+my $storefiles = 1; # Just store the files again for debugging issues of 
alioth lists
 
 # Debian-Devel starts in 1995
 my $YEARSTART = 1995;
@@ -65,7 +70,25 @@
 my $datain = $dbh->prepare_cached($insert);
 my ( $robot, $robotflag );
 
+my %ALLPROJECTS;
+
 foreach $project (@PROJECTS) {
+    $ALLPROJECTS{$project} = { 'url'     => "${BASEURL}-${project}",
+                              'type'    => 0 # == lists.debian.org
+    };
+}
+
+foreach $project (@ALIOTHPRJ) {
+    $ALLPROJECTS{$project} = { 'url'     => "${BASEALIOTH}/${project}",
+                              'type'    => 1 # == lists.alioth.debian.org
+    };
+}
+
+# foreach $project (keys %ALLPROJECTS) {
+#    print "$project: $ALLPROJECTS{$project}{'url'}, 
$ALLPROJECTS{$project}{'type'}\n"
+#}
+
+foreach $project (keys %ALLPROJECTS) {
     # Remove database entries for this project
     my $query  = "DELETE FROM listarchive WHERE project = '$project'";
     my($daten) = $dbh->prepare_cached($query);
@@ -76,15 +99,19 @@
        mkdir($project,0777);
        chdir($project);
     }
-    my $URL="${BASEURL}-${project}";
-    my $year;
-    my $month;
+    my $URL="$ALLPROJECTS{$project}{'url'}";
+    my ( $year, $month, $url, @data, @lines ) ;
+    my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) 
;
     for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
        foreach $month (@MONTHES) {
            if ( $year == $YEAREND && $month == $MONTHEND ) {
                last;
            }
-           my $url = "${URL}/${year}/${month}/";
+           if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+               $url = "${URL}/${year}/${month}/";
+           } else {
+               $url = "${URL}/${year}-$monthdict{$month}/";
+           }
            my $datafile = "${year}-${month}" ;
            if ( $storefiles ) {
                unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open 
$datafile"); }
@@ -93,7 +120,7 @@
            my $spamlines    = 0;
            my $robotlines   = 0;
            while ( $url =~ /.+/ ) { # if only one page $url is set to ''
-               # print "$year-$month: $url\n";
+               print "DEBUG: $year-$month: $url\n";
                my $uri = URI->new($url);
                my $indexpage = $ua->get($url, Host => $uri->host );
                unless ( $indexpage->is_success ) { # some mailing lists 
startet later ...
@@ -103,12 +130,36 @@
                    unlink($datafile);
                    next;
                } ; 
-               (my @data) = $indexpage->content =~ 
m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-               my ($content, $msgurl, $subject, $author, $messages, $pages, 
$page) ;
+               if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+                   # make sure the loop will end in case of Alioth lists.  
Seems these list do
+                   # not feature more than one page per Month so there is no 
point in looping over them
+                   $url = '';
+               }
+               if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+                   @data = $indexpage->content =~ 
m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+               } else {
+                   my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> 
<i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message 
date:</b></a>.*#gs;
+                   @data = ();
+                   foreach $content (@tmpdata) {
+                       @lines = split(/(\n)/, $content);
+                       foreach $line (@lines) {
+                           if ( $line =~ /^\s*$/   || $line =~ /^<!--\d+ / ||
+                                $line =~ /^<\/I>$/ || $line =~ /^<UL>$/    ||
+                                $line =~ /^<\/A><A NAME="\d+">&nbsp;<\/A>$/ ) 
{ next ; }
+                           if ( $line =~ /^<LI><A HREF="\d+.html">\[[-\w]+\]/ 
) {
+                               @data = (@data, $line) ;
+                           } else {
+                               @data = (@data, "$line\n" ) ;
+                           }
+                       }
+                   }
+                   if ( $storefiles ) {
+                       print HTMLSNIP "@data\n";
+                   }
+               }
                foreach $content (@data) {
-                   my @lines = split(/(\n)/, $content);
+                   @lines = split(/(\n)/, $content);
                    # print "------> @lines\n" ;
-                   my $line;
                    my $linestart = '';
                    foreach $line (@lines) {
                        if ( $line =~ /^\s*$/) { next ; }


_______________________________________________
debian-med-commit mailing list
[email protected]
http://lists.alioth.debian.org/mailman/listinfo/debian-med-commit

Reply via email to