https://www.mediawiki.org/wiki/Special:Code/MediaWiki/112317

Revision: 112317
Author:   ezachte
Date:     2012-02-24 12:20:57 +0000 (Fri, 24 Feb 2012)
Log Message:
-----------
Double-count mobile records for a certain date range; use the cfg_ prefix for 
variables defined in the *Config.pm files

Modified Paths:
--------------
    trunk/wikistats/squids/SquidCountArchive.pl
    trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
    trunk/wikistats/squids/SquidCountArchiveReadInput.pm
    trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
    trunk/wikistats/squids/SquidCountryScan.pl
    trunk/wikistats/squids/SquidCountryScanConfig.pm
    trunk/wikistats/squids/SquidReportArchive.pl
    trunk/wikistats/squids/SquidReportArchiveConfig.pm

Removed Paths:
-------------
    trunk/wikistats/squids/config.pm

Modified: trunk/wikistats/squids/SquidCountArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidCountArchive.pl 2012-02-24 12:01:15 UTC (rev 
112316)
+++ trunk/wikistats/squids/SquidCountArchive.pl 2012-02-24 12:20:57 UTC (rev 
112317)
@@ -1,7 +1,7 @@
  #!/usr/bin/perl
 
-  use config ;
-  use lib $liblocation ;
+  use SquidCountArchiveConfig ;
+  use lib $cfg_liblocation ;
   use EzLib ;
 
   $trace_on_exit = $true ;
@@ -13,8 +13,7 @@
   use SquidCountArchiveReadInput ;
   use SquidCountArchiveWriteOutput ;
 
-  # set defaults mainly for tests on local machine
-  default_argv $default_argv;
+  default_argv $cfg_default_argv ;
 
 # http://wikitech.wikimedia.org/view/Squid_log_format
 # 1. Hostname
@@ -43,25 +42,24 @@
 # todo: parm -r root folder
 
   $test = $false  ;
-  $test_maxlines = 4000000 ;
+  $test_maxlines = $cfg_text_maxlines ;
+  $file_test     = $cfg_file_test ;
 
   if (! $job_runs_on_production_server)
   {
     $test = $true ;
-    $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ;
-  # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
-    if (! -e $file_test)
+    if (! -e $cfg_file_test)
     { abort "Test input file '$file_test' not found" ; }
   }
 
   $time_start = time ;
 
-  $path_root = "/srv/erik" ;
+  $path_root = $job_runs_on_production_server ? $cfg_path_root_production : 
$cfg_path_root_test ;
 
   $tags_wiki_mobile = "Wikiamo|Wikipanion|Wikimedia" ;
 
   $tags_mobile      = "Android|BlackBerry|Windows 
CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux 
arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera 
Mobi|Palm|Playstation 
Portable|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|HTC_Touch|KDDI|FOMA|HTC_HD2|Polaris|Teleca"
 ;
-  $tags_mobile_upd  = "August 2011" ;
+  $tags_mobile_upd  = "February 2012" ;
 
   $pattern_url_pre  = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
   $pattern_url_post = 
"\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$"
 ;
@@ -502,6 +500,7 @@
   undef %origins_external ;
   undef %origins_unsimplified ;
   undef %referers_internal ;
+  undef %records ;
   undef %requests ;
   undef %scripts ;
   undef %search ;
@@ -510,6 +509,7 @@
   undef %squid_events ;
   undef %squid_seqno ;
   undef %statusses ;
+  undef %total_clients ;
   undef %unrecognized_domains ;
   undef %wikis ;
 # undef @files ;

Modified: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-02-24 
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-02-24 
12:20:57 UTC (rev 112317)
@@ -18,14 +18,22 @@
     { $mime = "text/html" ; }
   }
 
+  $count_event = 1 ;
+  # from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two 
servers which process requests to the mobile site did not send log lines
+  # since the two servers are load-balanced, selected stats (e.g. breakdown 
browser, OS) can be repaired by counting requests to mobile site twice in this 
period
+  # note: do not count twice for metrics where specific ip addresses are 
considered, this would inflate number of assumed bots (based on ip address 
freq.)
+  if ((($time ge '2011-10-16T00') && ($time lt '2011-11-29T20')) and
+       ($url =~ /\.m\./))
+  { $count_event = 2 ; }
+
   if ($scan_ip_frequencies) # phase 1
   {
     return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ;
 
     if ($mime eq "text/html")
     {
-      $ip_frequencies {$client_ip} ++ ;
-      $html_pages_found ++ ;
+      $ip_frequencies {$client_ip} ++ ; # do not use $count_event here!
+      $html_pages_found += $count_event ;
     }
 
     return ;
@@ -33,8 +41,8 @@
 
 
   # remember for each squid per hour lowest and highest sequence number and 
number of events
-  # later calc per hour average distance between events = (higest - lowest 
sequence number) / events - 1
-  # distance between consecutive events that lay in different hour bin are 
ignored, begligible
+  # later calc per hour average distance between events = (highest - lowest 
sequence number) / events - 1
+  # distance between consecutive events that lay in different hour bin are 
ignored, negligible
   $squid = $fields [0] ;
   $seqno = $fields [1] ;
   $hour = substr ($time, 11, 2) ;
@@ -64,12 +72,12 @@
 
   if ($url =~ /\.m\.wikipedia.org/)
   {
-    $url_wikipedia_mobile ++ ;
-    $status_url_wikipedia_mobile {$status} ++ ;
-    $status_mime_url_wikipedia_mobile {"$status,$mime"} ++ ;
+    $url_wikipedia_mobile += $count_event ;
+    $status_url_wikipedia_mobile {$status} += $count_event ;
+    $status_mime_url_wikipedia_mobile {"$status,$mime"} += $count_event ;
     if ($status eq "TCP_MISS/302")
     {
-      $redirected_to_mobile ++ ;
+      $redirected_to_mobile += $count_event ;
       return ;
     }
   }
@@ -81,13 +89,13 @@
 
   ($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more 
readable on debugging
   $agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on 
debugging
-  $agents_raw {$agent2}++ ;
+  $agents_raw {$agent2} += $count_event ;
 
   ($file,$ext) = &GetFileExt ($url) ;
-  $exts {$ext}++ ;
+  $exts {$ext} += $count_event ;
 
   if (($ext eq "js") || ($ext eq "css"))
-  { $scripts {"$ext,$file,"} ++ ; }
+  { $scripts {"$ext,$file,"} += $count_event ; }
 
   $title = "" ;
   $parm  = "" ;
@@ -103,14 +111,14 @@
     if ($parm eq "?") { return ; } # error
     $file =~ s/,/,/go ;
     $parm =~ s/,/,/go ;
-    $scripts {"php,$file,$parm"} ++ ;
+    $scripts {"php,$file,$parm"} += $count_event ;
     $ext .= "($file)" ; # add filename behind extension php
   }
 
   if ($mime eq "text/html")
   {
     $mimecat = "page" ;
-    $tot_mime_html ++ ;
+    $tot_mime_html += $count_event ;
   }
   elsif ($mime =~ /(?:gif|png|jpeg)/o)
   { $mimecat = "image" ; }
@@ -137,12 +145,12 @@
 
   if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io)
   {
-    $banners {"$country,$url"} ++ ;
-    $banner_requests_ignored ++ ;
+    $banners {"$country,$url"} += $count_event ;
+    $banner_requests_ignored += $count_event ;
     return ;
   }
 
-  $countries {$country}++ ;
+  $countries {$country} += $count_event ; ;
 
   $agent2 = $agent ;
   $agent2 =~ s/\%20/ /g ;
@@ -202,13 +210,13 @@
     if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives
     {
       $bot = $true ;
-      @bots {"$mime,$agent2"} ++ ;
+      @bots {"$mime,$agent2"} += $count_event ;
     }
   }
   elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~ 
/MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io))
   {
     $bot = $true ;
-    @bots {"$mime,$agent2"} ++ ;
+    @bots {"$mime,$agent2"} += $count_event ;
   }
 
   # GECKO
@@ -337,7 +345,7 @@
   if (($os eq '..') && ($mobile eq 'M' || $mobile eq 'W'))
   {
     $os = "Mobile other" ;
-    $mobile_other {$agent2} ++ ;
+    $mobile_other {$agent2} += $count_event ; 
   }
 
   if ($version =~ /(?:Ipod|Iphone)/io)
@@ -557,21 +565,21 @@
   {
     $engine  =~ s/,/,/go ;
     if ($gecko ne "")
-    { $engines {$gecko} ++ ; }
+    { $engines {$gecko} += $count_event ; }
     elsif ($applewebkit ne "")
     {
       $applewebkit =~ s/AppleWebKit\//AppleWebKit /o ;
-      $engines {$applewebkit} ++ ;
+      $engines {$applewebkit} += $count_event ; ;
     }
 
     $version =~ s/,/,/go ;
     if ($os =~ /playstation/io)
     { $version = "NetFront (PlayStation)" ; }
 
-    $clients {"$mobile,$version,$mimecat"}++ ;
+    $clients {"$mobile,$version,$mimecat"} += $count_event ; ;
 
     $operating_systems =~ s/,/,/go ;
-    $operating_systems {"$mobile,$os"} ++ ;
+    $operating_systems {"$mobile,$os"} += $count_event ; ;
   }
 
   if ($count_hits_per_ip_range)
@@ -583,16 +591,16 @@
 
   if ($status =~ /^TCP/)
   {
-    $statusses {"$method:$status"}++ ;
-    $statusses {"$method:total"}++ ;
+    $statusses {"$method:$status"} += $count_event ;
+    $statusses {"$method:total"}   += $count_event ;
   }
   else
-  { $statusses_non_tcp ++ ; }
+  { $statusses_non_tcp += $count_event ; }
 
   if ($url =~ /org\/skins/o)
   {
     ($url2 = $url) =~ s/^.*?\/skins/skins/o ;
-    $skins {$url2} ++ ;
+    $skins {$url2} += $count_event ; ;
   }
 
   if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable
@@ -604,7 +612,7 @@
 
   # for diagnostics
   if (($referer =~ /google/o) || ($agent =~ /google/io))
-  { $googles++ ; }
+  { $googles += $count_event ; }
 
   $referer =~ s/^http\w?\:\/\///o ;
   $referer =~ s/\.php\?.*$/\.php\?../go ;
@@ -622,7 +630,7 @@
   if (($domain =~ /\./o) ||
       ($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o))
   {
-    $unrecognized_domains {$domain_original} ++ ;
+    $unrecognized_domains {$domain_original} += $count_event ;
     $domain = 'other' ;
   }
 
@@ -641,24 +649,24 @@
 
   if ($referer_external)
   {
-    $tot_referers_external++ ;
+    $tot_referers_external += $count_event ; ;
 
     ($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original, 
$agent, $mime, $mimecat, $service, $ext) ;
 
     &CountOrigin ("external", $origin, $toplevel, $mimecat) ;
 
     if ($origin !~ /^\!/o)
-    { $origins_unsimplified {$referer_original} ++ ; }
+    { $origins_unsimplified {$referer_original} += $count_event ; }
     else
     {
-      $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ;
-      $origins_external   {$origin} ++ ;
+      $origin_simplified {"$origin [$referer] <- $referer_original"} += 
$count_event ; ;
+      $origins_external   {$origin} += $count_event ;
     }
   }
   else
   {
-    $tot_referers_internal ++ ;
-    $referers_internal {$referer} ++ ;
+    $tot_referers_internal += $count_event ;
+    $referers_internal {$referer} += $count_event ;
     $referer =~ s/!//go ; # ! was marker to signal pattern was recognized as 
wikimedia project
     &CountOrigin ("internal", $referer, "org" , $mimecat) ;
   }
@@ -670,10 +678,9 @@
   if ($domain =~ /!/o)
   { print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; }
 
-  $requests {"$domain|$referer|$ext|$mime|$parm"}++ ;
+  $requests {"$domain|$referer|$ext|$mime|$parm"} += $count_event ; ;
+  $clients_by_wiki {"$mobile,$version,$domain"}   += $count_event ; ;
 
-  $clients_by_wiki {"$mobile,$version,$domain"}++ ;
-
   # different output use either 'bot=N' or 'M'(anual) / 'bot=Y' or 'B'(ot)
   if ($bot)
   {
@@ -689,13 +696,13 @@
   if (($domain =~ /^\@/) || ($domain =~ /^\*/))
   {
     # print "Requests wap $domain | $ext | $mime | $parm | $country | 
$ind_bot\n" ;
-    $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
+    $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += 
$count_event ; ;
   }
 
   if ($domain =~ /^\%/)
   {
     # print "Requests m $domain | $ext | $mime | $parm | $country | 
$ind_bot\n" ;
-    $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
+    $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event 
; ;
   }
                               # $title !~ /:/ -> only namespace 0 (minus few 
titles with colon in name)
   if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html") 
&& (($url =~ /action=edit/o) || ($url =~ /action=submit/o)))
@@ -708,34 +715,40 @@
     $key =~ s/,/&comma;/go ;
     $key =~ s/\|/,/go ;
 
-    $index_php_raw {$key}++ ;
+    $index_php_raw {$key} += $count_event ; ;
     $client_ip_record_cnt {$client_ip}++ ;
   }
 
   if ($mimecat eq "page")
   {
-    $tot_mime_html2 ++ ;
+    $tot_mime_html2 += $count_event ;
 
     if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2))
     { $ind_bot = 'bot=Y' ; }
 
-    $countries_views {"$ind_bot,$domain,$country"} ++ ;
-
+    $countries_views {"$ind_bot,$domain,$country"} += $count_event ; ;
                                   # $title !~ /:/ -> only namespace 0 (minus 
few titles with colon in name)
     if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html") 
&& ($url =~ /action=submit/o) && ($status =~ /302/o))
-    { $countries_saves {"$ind_bot,$domain,$country"} ++ ; }
+    { $countries_saves {"$ind_bot,$domain,$country"} += $count_event ; }
 
     $time_hh = substr ($time,11,2) ;
     $time_mm = substr ($time,14,2) ;
     $time_tt = $time_hh * 60 + $time_mm ;
     $time_tt2 = $time_tt - $time_tt % 15 ;
-    $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ;
+    $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} += $count_event ;
 
+    if (! $test)
+    {
+      $time2    = substr ($time,0,19) ; # omit msec
+      $line = 
"$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
+      $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to 
$file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
+    }
+  }
 
-    $time2    = substr ($time,0,19) ; # omit msec
-    $line = 
"$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
-    $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to 
$file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
-  }
+  $records {"$mobile,$mimecat"} += $count_event ;
+  $records {"*,$mimecat"}       += $count_event ;
+  $records {"$mobile,*"}        += $count_event ;
+  $records {"*,*"}              += $count_event ;
 }
 
 sub ExtractLanguage
@@ -747,10 +760,10 @@
   $regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ;
   ($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ;
   if ($language eq $agent)
-  { $languages_unrecognized {$agent} ++ ; }
+  { $languages_unrecognized {$agent} += $count_event ; }
   else
   {
-    $languages {"$application,$language"} ++ ;
+    $languages {"$application,$language"} += $count_event ;
     $agent =~ s/ $language//o ;
   }
   return ($agent) ;
@@ -823,7 +836,8 @@
 
   foreach $parm (@parms)
   {
-    next if $parm eq "" ;
+    next if $parm eq '' ;
+    next if $parm eq '*' ;
 
     if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
     { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid 
= $true ; last }
@@ -901,7 +915,7 @@
   $domain =~ s/\.m\./.%/o ;
 
   if ($domain =~ /^error:/o)
-  { $domain_errors {$domain}++ ; }
+  { $domain_errors {$domain} += $count_event ; }
   $domain =~ s/error:.*$/!error:1/o ;
 
   $domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ;
@@ -1018,16 +1032,16 @@
     if (($googlematch eq "- - z") && ($service =~ /GoogleBot/io))
     {
       $service = "GoogleBot?" ;
-      $google_imposters {$agent}++ ;
+      $google_imposters {$agent} += $count_event ;
     }
 
     # obsolete? to be considered ?
     # if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o))
-    # { $search 
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} 
++ ; }
+    # { $search 
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} 
+= $count_event ; }
     # else
     # { $accept = "not" ; }
 
-    $search 
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} 
++ ;
+    $search 
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"} 
+= $count_event ;
 
     $googlebins2 {"$accept [$googlematch]  " . sprintf ("%-14s",$service) . 
$referer} ++ ;
     $googlebins {$googlematch}++ ;
@@ -1055,7 +1069,7 @@
   #  }
 
   if ($origin =~ /wiki/o)
-  { $wikis {$origin} ++ ; }
+  { $wikis {$origin} += $count_event ; }
 
   if ($origin eq "wikipedia")
   {
@@ -1187,7 +1201,7 @@
 
   if ($source eq "external")
   {
-    $tot_origins_external_counted ++ ;
+    $tot_origins_external_counted += $count_event ;
     $origin =~ s/\:.*$//o ;
     if (is_valid_ip_address ($origin))
     { $origin = "unmatched ip address" ; $toplevel = "" ; }
@@ -1203,7 +1217,7 @@
       # print "$origin\n" ;
     }
   }
-  $origins {"$source,$origin,$toplevel,$mimecat"} ++ ;
+  $origins {"$source,$origin,$toplevel,$mimecat"} += $count_event ;
 }
 
 sub ProcessUploadPath
@@ -1213,7 +1227,7 @@
   ($path = $url) =~ s/^.*?\.org\///o ;
   ($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path
 
-  $binaries {$path} ++ ; # Jan 2012 store path, not file only
+  $binaries {$path} += $count_event ; # Jan 2012 store path, not file only
 
   if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io)
   {
@@ -1227,10 +1241,10 @@
     {
       ($size = $file) =~ s/^.*?(\d+)px.*$/$1/o ;
        $sizerange = sprintf ("%5d",(int ($size / 20)) * 20) . "-"  . sprintf 
("%5d",(((int ($size / 20))+1) * 20 - 1)) ;
-       $imagesizes {$sizerange} ++ ;
+       $imagesizes {$sizerange} += $count_event ;
     }
     else
-    { $imagesizes {"???"} ++ ; }
+    { $imagesizes {"???"} += $count_event ; }
   }
 }
 

Modified: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveReadInput.pm        2012-02-24 
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveReadInput.pm        2012-02-24 
12:20:57 UTC (rev 112317)
@@ -4,7 +4,7 @@
 # test:
 # echo 125.123.123.123 | /usr/local/bin/geoiplogtag 1
 # refresh: bayes:/usr/share/GeoIP> wget 
http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
-use config ;
+use SquidCountArchiveConfig ;
 
 sub CollectFilesToProcess
 {
@@ -24,6 +24,8 @@
 
   my ($date_archived) ;
 
+  $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : 
$cfg_dir_in_test ; 
+
   $some_files_found = $false ;
   $full_range_found = $false ;
 
@@ -47,7 +49,7 @@
     $date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ;
     print "\n- Inspect file saved $days_ago_inspect days ago: 
$logname-$date_archived.gz\n" ;
 
-    my $file = "$dir_in/$logname-$date_archived.gz" ;
+    my $file = "$dir_in/$cfg_logname-$date_archived.gz" ;
 
     if (! -e $file)
     { print "- File not found: $file\n" ; }
@@ -173,8 +175,8 @@
     else
     {
       open IN, '<', $file_in ;
-    #  $fields_expected = 14 ;
-      $fields_expected = 13 ;
+      $fields_expected = 14 ; # add fake country code
+    # $fields_expected = 13 ;
     }
 
     $line = "" ;
@@ -192,6 +194,12 @@
 # ugly Q&D code to circumvent spaces in agent string
 # $line2 = $line ;
       chomp $line ;
+
+      if ($test)
+      { $line .= ' XX' ; }
+
+      $line =~ s/x-www-form-urlencoded; 
charset=UTF-8/x-www-form-urlencoded;%20charset=UTF-8/ ; # log lines are space 
delimited, other spaces should be encoded
+
       @fields = split (' ', $line) ;
 # next if $line =~ /upload/ ;
 # next if $line !~ /en\.m\.wikipedia/ ;
@@ -201,23 +209,46 @@
 #next if $fields [9] =~ /NONE/ ;
      if ($#fields > 14)
      {
+if (! $scan_ip_frequencies)
+{
 # print "line $line2\n" ;
 # print "fields " . $#fields . "\n$line\n" ;
+}
+
       $country_code = $fields [$#fields] ;
       $fields [$#fields] = '' ;
       $line = join (' ', @fields) ;
-# print "2 $line\n" ;
       @fields = split (' ', $line, 14) ;
       $fields [14] = $country_code ;
+ $fields [13] =~ s/ /%20/g ;
+
+if (! $scan_ip_frequencies)
+{
+# print "2 $line\n" ;
 # print "\n\n12: " . $fields [12] . "\n"  ;
 # print "13: " . $fields [13] . "\n"  ;
 # print "14: " . $fields [14] . "\n"  ;
 # print "15: " . $fields [15] . "\n"  ;
+}
       }
 
-      if ($#fields < $fields_expected) { $fields_too_few  ++ ; print "invalid 
field count " . $#fields . "\n" ; next ; }
-      if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid 
field count " . $#fields . "\n" ; next ; }
+      if ($#fields < $fields_expected)
+      {
+        $fields_too_few  ++ ;
+        print "invalid field count " . $#fields . "\n" ;
+        print ERR $#fields . " fields: \"$line\"\n" ;
+        next ;
+      }
 
+      if ($#fields > $fields_expected)
+      {
+        @a = @fields ;
+        $fields_too_many ++ ;
+        print "invalid field count " . $#fields . "\n" ;
+        print ERR $#fields . " fields: \"$line\"\n" ;
+        next ;
+      }
+
       $time = $fields [2] ;
 
       if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
@@ -230,7 +261,7 @@
 
       if ($time lt $time_to_start)
       {
-        if (++ $times % 100000 == 0)
+        if (++ $times % 1000000 == 0)
         { print "[$time]\n" ; }
         next ;
       }
@@ -266,12 +297,23 @@
 #next if $line !~ /http:\/\/\w+\.m\./ ;
 #print "$line\n" ;
       &ProcessLine ($line) ;
-      if (++ $lines_processed % 10000 == 0)
+      if (++ $lines_processed % 50000 == 0)
       {
+        if (! $scan_ip_frequencies) # phase 2
+        {
+          $perc_mobile_all = '-' ;
+          if ($records {"*,*"} > 0)
+          { $perc_mobile_all = sprintf ("%.1f", 100 * $records {"M,*"} / 
$records {"*,*"}) ; }
+          $perc_mobile_pages = '-' ;
+          if ($records {"*,page"} > 0)
+          { $perc_mobile_pages = sprintf ("%.1f", 100 * $records {"M,page"} / 
$records {"*,page"}) ; }
+          $perc_mobile = " (mobile: all $perc_mobile_all\%, pages 
$perc_mobile_pages\%)" ;
+        }
+
         if ($banner_requests_ignored == 0)
-        { print "$time $lines_processed\n" ; }
+        { print "$time $lines_processed$perc_mobile\n" ; }
         else
-        { print "$time $lines_processed ($banner_requests_ignored banner 
requests ignored)\n" ; }
+        { print "$time $lines_processed$perc_mobile ($banner_requests_ignored 
banner requests ignored)\n" ; }
       }
       if ($test and $lines_processed >= $test_maxlines)
       { last ; }

Modified: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm      2012-02-24 
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm      2012-02-24 
12:20:57 UTC (rev 112317)
@@ -1,7 +1,7 @@
  #!/usr/bin/perl
 
-  use config ;
-  use lib $liblocation ;
+  use SquidCountArchiveConfig ;
+  use lib $cfg_liblocation ;
   use EzLib ;
 
 sub WriteOutputIpFrequencies

Modified: trunk/wikistats/squids/SquidCountryScan.pl
===================================================================
--- trunk/wikistats/squids/SquidCountryScan.pl  2012-02-24 12:01:15 UTC (rev 
112316)
+++ trunk/wikistats/squids/SquidCountryScan.pl  2012-02-24 12:20:57 UTC (rev 
112317)
@@ -4,7 +4,7 @@
 ## sub ProcessRawData <- SquidDataCountries.csv -> ??
 
   use SquidCountryScanConfig ;
-  use lib $liblocation ;
+  use lib $cfg_liblocation ;
   use EzLib ;
   $trace_on_exit = $true ;
 
@@ -23,7 +23,7 @@
   # exit ;
   }
 
-  $path_root = $job_runs_on_production_server ? $path_root_production : 
$path_root_test ;
+  $path_root = $job_runs_on_production_server ? $cfg_path_root_production : 
$cfg_path_root_test ;
 
   $file_raw_data_monthly_visits  = 
"$path_root/SquidDataVisitsPerCountryMonthly.csv" ;
   $file_raw_data_daily_visits    = 
"$path_root/SquidDataVisitsPerCountryDaily.csv" ;

Modified: trunk/wikistats/squids/SquidCountryScanConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidCountryScanConfig.pm    2012-02-24 12:01:15 UTC 
(rev 112316)
+++ trunk/wikistats/squids/SquidCountryScanConfig.pm    2012-02-24 12:20:57 UTC 
(rev 112317)
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
 
-  $liblocation = "/home/ezachte/lib" ;
+  $cfg_liblocation = "/home/ezachte/lib" ;
 
-  $path_root_production = "/a/ezachte/" ;
-  $path_root_test       = "w:/! perl/squids/archive/" ; # Erik 
-# $path_root_test       = "?" ;                         # Andr\xE9 
+  $cfg_path_root_production = "/a/ezachte/" ;
+  $cfg_path_root_test       = "w:/! perl/squids/archive/" ; # Erik 
+# $cfg_path_root_test       = "?" ;                         # Andr\xE9 

Modified: trunk/wikistats/squids/SquidReportArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidReportArchive.pl        2012-02-24 12:01:15 UTC 
(rev 112316)
+++ trunk/wikistats/squids/SquidReportArchive.pl        2012-02-24 12:20:57 UTC 
(rev 112317)
@@ -1,13 +1,13 @@
 #!/usr/bin/perl
 
   use SquidReportArchiveConfig ;
-  use lib $liblocation ;
+  use lib $cfg_liblocation ;
 
   use EzLib ;
   $trace_on_exit = $true ;
   ez_lib_version (2) ;
 
-  default_argv ($default_argv) ;
+  default_argv ($cfg_default_argv) ;
 
 # to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
 # ReportOrigin how to handle '!error <-> other
@@ -32,12 +32,12 @@
 
   undef %country_code_not_specified_reported ;
 
-  if (-d "/a/squid")
-  {
-    &Log ("\n\nJob runs on server $hostname\n\n") ;
-    $path_in  = "/a/ezachte" ;
-    $path_out = "/a/ezachte" ;
-  }
+  $path_in  = $job_runs_on_production_server ? $cfg_path_in_production  : 
$cfg_path_in_test ;
+  $path_out = $job_runs_on_production_server ? $cfg_path_out_production : 
$cfg_path_out_test ;
+
+  &Log ("Path in  = $path_in\n") ;
+  &Log ("Path out = $path_out\n") ;
+
 # following test needs to change -> remove server name dependency (new run 
argument ?)
 # elsif ($hostname eq 'bayes')
 # {
@@ -45,16 +45,7 @@
 #   $path_in  = "/home/ezachte/wikistats/animation" ;
 #   $path_out = "/home/ezachte/wikistats/animation" ;
 # }
-  else
-  {
-    print "Job runs local for tests\n\n" ;
-    $path_in  = $path_in_local ;
-    $path_out = $path_out_local ;
-  }
 
-  &Log ("Path in  = $path_in\n") ;
-  &Log ("Path out = $path_out\n") ;
-
   $file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
 
   # periodically harvest updated metrics from

Modified: trunk/wikistats/squids/SquidReportArchiveConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidReportArchiveConfig.pm  2012-02-24 12:01:15 UTC 
(rev 112316)
+++ trunk/wikistats/squids/SquidReportArchiveConfig.pm  2012-02-24 12:20:57 UTC 
(rev 112317)
@@ -1,15 +1,16 @@
 #!/usr/bin/perl
 
-  $liblocation = "/home/ezachte/lib" ;
+  $cfg_liblocation = "/home/ezachte/lib" ;
 
-# $path_in_local  = "W:/# Out Locke" ;      # Erik
-# $path_out_local = "W:/# Out Test/Locke" ; # Erik
+  $cfg_path_in_production  = "/a/ezachte" ;
+  $cfg_path_out_production = "/a/ezachte" ;
+# $cfg_path_in_test        = "W:/# Out Locke" ;      # Erik
+# $cfg_path_out_test       = "W:/# Out Test/Locke" ; # Erik
+  $cfg_path_in_test        = "/srv/erik/" ;          # Andr\xE9
+  $cfg_path_out_test       = "/srv/erik/" ;          # Andr\xE9
 
-  $path_in  = "/srv/erik/" ;                # Andr\xE9
-  $path_out = "/srv/erik/" ;                # Andr\xE9
-
-# set defaults for tests on local machine
-# $default_argv = "-m 2011-07" ;   # monthly report
-# $default_argv = "-w" ;           # refresh country info from Wikipedia 
(population etc)
-# $default_argv = "-c" ;           # country/regional reports
-  $default_argv = "-c -q 2011Q4" ; # country/regional reports based on data 
for one quarter only
+# set default arguments for test on local machine
+# $cfg_default_argv = "-m 2011-07" ;   # monthly report
+# $cfg_default_argv = "-w" ;           # refresh country info from Wikipedia 
(population etc)
+# $cfg_default_argv = "-c" ;           # country/regional reports
+  $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on 
data for one quarter only

Deleted: trunk/wikistats/squids/config.pm
===================================================================
--- trunk/wikistats/squids/config.pm    2012-02-24 12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/config.pm    2012-02-24 12:20:57 UTC (rev 112317)
@@ -1,6 +0,0 @@
- #!/usr/bin/perl
-
- $liblocation = "/home/ezachte/lib" ;
- $default_argv =  "-d 2011/04/01" ;
- $dir_in  = "/a/squid/archive" ;
- $logname = "sampled-1000.log" ;
\ No newline at end of file


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to