https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115474

Revision: 115474
Author:   ezachte
Date:     2012-06-03 22:40:18 +0000 (Sun, 03 Jun 2012)
Log Message:
-----------
Two processing fixes in the count job, and improved job feedback to standard output

Modified Paths:
--------------
    trunk/wikistats/squids/SquidCountArchive.pl
    trunk/wikistats/squids/SquidCountArchiveConfig.pm
    trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
    trunk/wikistats/squids/SquidCountArchiveReadInput.pm

Modified: trunk/wikistats/squids/SquidCountArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidCountArchive.pl 2012-06-03 20:29:54 UTC (rev 
115473)
+++ trunk/wikistats/squids/SquidCountArchive.pl 2012-06-03 22:40:18 UTC (rev 
115474)
@@ -98,6 +98,7 @@
     if ($do_phase1) # Collect IP frequencies
     { &ProcessPhase1 ($days_ago, $date_collect_files, $time_to_start, 
$time_to_stop,  $path_out, @files) ; }
 
+    $lines_processed = 0 ;
     if ($do_phase2) # collect other data
     { &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, 
$time_to_stop,  $path_out, $path_out_month, @files) ; }
 

Modified: trunk/wikistats/squids/SquidCountArchiveConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveConfig.pm   2012-06-03 20:29:54 UTC 
(rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveConfig.pm   2012-06-03 22:40:18 UTC 
(rev 115474)
@@ -1,9 +1,8 @@
 #!/usr/bin/perl
 
-  $developer = "engels" ;
-  $cfg_liblocation = "/a/squid/stats/dev_$developer/scripts" ;
+  $cfg_liblocation = "/a/squid/stats/scripts" ;
 
-  $cfg_path_root_production = "/a/squid/stats/dev_$developer/csv" ; 
+  $cfg_path_root_production = "/a/squid/stats/csv" ; 
   $cfg_path_root_test       = "w:/! perl/squids/archive/test" ;  # Erik
 
   $cfg_dir_in_production = "/a/squid/archive/sampled" ;
@@ -12,7 +11,7 @@
   $cfg_logname = "sampled-1000.log" ;
   
 # set default arguments for test on local machine
-  $cfg_default_argv = "-d 2012/04/01-2012/04/30" ;
+  $cfg_default_argv = "-d 2011/10/16-2011/10/16" ;
 
   $cfg_file_test = "w:/! Perl/Squids/Archive/sampled-1000.log-20111016.txt" ; 
# Erik
   $cfg_test_maxlines = 4000000 ;

Modified: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-06-03 
20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-06-03 
22:40:18 UTC (rev 115474)
@@ -5,6 +5,8 @@
   my $line = shift ;
 
   $time = $fields [2] ;
+  if ($time !~ /\.\d\d\d/) # for column alignment
+  { $time .= ".000" ; }
   $date = substr ($time,0,10) ;
 
   $client_ip  = $fields [4] ;
@@ -17,7 +19,7 @@
     if (($url =~ /\.m\..*?\/wiki\//) || ($url =~ /\.m\..*?\/w\/index.php/))
     { $mime = "text/html" ; }
   }
-  
+
   $count_event = 1 ;
   # from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two 
servers which process requests to the mobile site did not send log lines
   # since the two servers are load-balanced, selected stats (e.g. breakdown 
browser, OS) can be repaired by counting requests to mobile site twice in this 
period
@@ -230,7 +232,7 @@
   elsif ($agent2 =~ /Trident[\/ ]?\d/io)
   { ($browserengine = $agent2) =~ s/^.*?Trident[\/ ]?(\d+\.?\d*).*$/Trident 
$1/io ; }
   elsif ($agent2 =~ /Presto[\/ ]?\d/io)
-  { ($browserengine = $agent2) =~ s/^.*?Presto[\/ ]?(\d+\.?\d*).*$/Presto 
$1/io ; }  
+  { ($browserengine = $agent2) =~ s/^.*?Presto[\/ ]?(\d+\.?\d*).*$/Presto 
$1/io ; }
   elsif ($agent2 =~ /AppleWebKit/io)
   { $browserengine = "AppleWebKit" ; }
   elsif ($agent2 =~ /Trident/io)
@@ -301,7 +303,7 @@
 
   # MOBILE
   $mobile = '-' ;
-  if (($url =~ /\?seg=/) or ($url =~ /&seg=/))
+  if (($url =~ /\?seg=/) or ($url =~ /&seg=/)) # Andre: the second test is already 
covered by the first, and seg= is very generic, it could also be pageseg= or something 
similar  if (($url =~ /\?seg=/) or ($url =~ /&seg=/))
   { $mobile = 'P' ; }
   elsif ($agent2 =~ /(?:$tags_wiki_mobile)/io)
   { $mobile = 'W' ; }
@@ -637,7 +639,7 @@
     $clients {"$mobile,$version,$mimecat"} += $count_event ; ;
 
     $operating_systems =~ s/,/,/go ;
-    ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for 
SquidReportClients only
+    ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, use 
'M' for reporting scripts, 'W' etc was introduced for SquidReportClients only
     $operating_systems {"$mobile2,$os"} += $count_event ; ;
   }
 
@@ -871,13 +873,13 @@
     if (! $test)
     {
       $time2    = substr ($time,0,19) ; # omit msec
-      ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for 
SquidReportClients only
+      ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, 
use 'M' for reporting scripts, 'W' etc was introduced for SquidReportClients 
only
       $line = 
"$time2,$client_ip,$domain,$ind_bot2,$mobile2,$os,$version,$mimecat\n" ;
       $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to 
$file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
     }
   }
 
-  ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for 
SquidReportClients only
+ ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, use 
'M' for reporting scripts, 'W' etc was introduced for SquidReportClients only
   $records {"$mobile2,$mimecat"} += $count_event ;
   $records {"*,$mimecat"}       += $count_event ;
   $records {"$mobile2,*"}        += $count_event ;
@@ -973,7 +975,7 @@
     next if $parm eq '*' ;
 
     if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
-    { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid 
= $true ; last }
+    { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid 
= $true ; $parms_invalid ++ ; last }
 
     ($keyword,$data) = split ('\=', $parm) ;
     if ($keyword eq "")
@@ -987,7 +989,7 @@
 
   if ($invalid)
   {
-    print $error ;
+    # print $error ;
     print ERR $error ;
     return ("?","?") ;
   }

Modified: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveReadInput.pm        2012-06-03 
20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveReadInput.pm        2012-06-03 
22:40:18 UTC (rev 115474)
@@ -24,7 +24,7 @@
 
   my ($date_archived) ;
 
-  $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : 
$cfg_dir_in_test ; 
+  $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : 
$cfg_dir_in_test ;
 
   $some_files_found = $false ;
   $full_range_found = $false ;
@@ -70,12 +70,12 @@
       # assuming only one file is archived per day !
       if ($head_found && $tail_found)
       {
-        $full_range_found = $true ;     
+        $full_range_found = $true ;
         last ;
       }
     }
   }
-  
+
   if (! $some_files_found)
   { print "Not any file was found which contains log records for $days_ago 
days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
   if (! $full_range_found)
@@ -150,7 +150,6 @@
     my $file_csv_views_viz2 = $file_csv_views_viz ;
     my $date = substr ($time_to_start,0,4) . substr ($time_to_start,5,2) . 
substr ($time_to_start,8,2) ;
     $file_csv_views_viz2 =~ s/date/$date/ ;
-    print $file_csv_views_viz2 ;
     $gz_csv_views_viz = gzopen ($file_csv_views_viz2, "wb") || die "Unable to 
write $file_csv_views_viz2 $!\n" ;
 
     $comment = "# Data from $time_to_start till $time_to_stop 
(yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 
1000)\n" ;
@@ -186,7 +185,7 @@
     $line = "" ;
     while ($line = <IN>)
     {
-      $lines_in_file ++ ;
+      $lines_to_process ++ ;
 
     # if ($line =~ /fy\.wikipedia\.org/o) # test/debug
     # {
@@ -236,10 +235,19 @@
 }
       }
 
+      if (! $scan_ip_frequencies) # phase 2
+      {
+        if ($lines_to_process % 1000000 == 0)
+        { print "Field count: " .
+          sprintf ("%.5f\%", 100 * $fields_too_few / $lines_to_process) . " of 
" . ($lines_to_process/1000000) . " M lines have too few fields, " .
+          sprintf ("%.5f\%", 100 * $fields_too_many / $lines_to_process) . " 
have too many fields, " .
+          sprintf ("%.5f\%", 100 * $parms_invalid / $lines_to_process) . " 
have invalid parms\n" ; }
+      }
+
       if ($#fields < $fields_expected)
       {
         $fields_too_few  ++ ;
-        print "invalid field count " . $#fields . "\n" ;
+      # print "invalid field count " . $#fields . "\n" ;
         print ERR $#fields . " fields: \"$line\"\n" ;
         next ;
       }
@@ -248,12 +256,16 @@
       {
         @a = @fields ;
         $fields_too_many ++ ;
-        print "invalid field count " . $#fields . "\n" ;
+      # print "invalid field count " . $#fields . "\n" ;
         print ERR $#fields . " fields: \"$line\"\n" ;
         next ;
       }
 
+      $fields_just_enough ++ ;
+
       $time = $fields [2] ;
+      if ($time !~ /\.\d\d\d/) # for column alignment
+      { $time .= ".000" ; }
 
       if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
       { $oldest_time_read = $time ; }
@@ -267,6 +279,8 @@
       {
         if (++ $times % 1000000 == 0)
         { print "[$time]\n" ; }
+        $lines_to_process = 0 ;
+        $lines_processed  = 0 ;
         next ;
       }
 
@@ -301,7 +315,7 @@
 #next if $line !~ /http:\/\/\w+\.m\./ ;
 #print "$line\n" ;
       &ProcessLine ($line) ;
-      if (++ $lines_processed % 50000 == 0)
+      if (++ $lines_processed % 1000000 == 0)
       {
         if (! $scan_ip_frequencies) # phase 2
         {
@@ -315,9 +329,9 @@
         }
 
         if ($banner_requests_ignored == 0)
-        { print "$time $lines_processed$perc_mobile\n" ; }
+        { print "$time " . ($lines_processed / 1000000). " M 
lines$perc_mobile\n" ; }
         else
-        { print "$time $lines_processed$perc_mobile ($banner_requests_ignored 
banner requests ignored)\n" ; }
+        { print "$time " . ($lines_processed / 1000000). " M lines$perc_mobile 
($banner_requests_ignored banner requests ignored)\n" ; }
       }
       if ($test and $lines_processed >= $test_maxlines)
       { last ; }
@@ -342,7 +356,7 @@
     print "No data found for $time_to_start - $time_to_stop\n" ;
   }
   else
-  { print "$lines_this_day out $lines_in_file processed\n" ; }
+  { print "$lines_this_day out $lines_to_process examined\n" ; }
 
   if ($url_wikipedia_mobile > 0)
   {


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to