https://www.mediawiki.org/wiki/Special:Code/MediaWiki/112317
Revision: 112317
Author: ezachte
Date: 2012-02-24 12:20:57 +0000 (Fri, 24 Feb 2012)
Log Message:
-----------
Double-count mobile records for a certain date range; use the cfg_ prefix for
variables in ..Config.pm
Modified Paths:
--------------
trunk/wikistats/squids/SquidCountArchive.pl
trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
trunk/wikistats/squids/SquidCountArchiveReadInput.pm
trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
trunk/wikistats/squids/SquidCountryScan.pl
trunk/wikistats/squids/SquidCountryScanConfig.pm
trunk/wikistats/squids/SquidReportArchive.pl
trunk/wikistats/squids/SquidReportArchiveConfig.pm
Removed Paths:
-------------
trunk/wikistats/squids/config.pm
Modified: trunk/wikistats/squids/SquidCountArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidCountArchive.pl 2012-02-24 12:01:15 UTC (rev
112316)
+++ trunk/wikistats/squids/SquidCountArchive.pl 2012-02-24 12:20:57 UTC (rev
112317)
@@ -1,7 +1,7 @@
#!/usr/bin/perl
- use config ;
- use lib $liblocation ;
+ use SquidCountArchiveConfig ;
+ use lib $cfg_liblocation ;
use EzLib ;
$trace_on_exit = $true ;
@@ -13,8 +13,7 @@
use SquidCountArchiveReadInput ;
use SquidCountArchiveWriteOutput ;
- # set defaults mainly for tests on local machine
- default_argv $default_argv;
+ default_argv $cfg_default_argv ;
# http://wikitech.wikimedia.org/view/Squid_log_format
# 1. Hostname
@@ -43,25 +42,24 @@
# todo: parm -r root folder
$test = $false ;
- $test_maxlines = 4000000 ;
+ $test_maxlines = $cfg_text_maxlines ;
+ $file_test = $cfg_file_test ;
if (! $job_runs_on_production_server)
{
$test = $true ;
- $file_test = "w:/# Out Locke/sampled-1000-log-20110401.txt" ;
- # $file_test = getcwd . "/SquidDataFilterFY.txt" ;
- if (! -e $file_test)
+ if (! -e $cfg_file_test)
{ abort "Test input file '$file_test' not found" ; }
}
$time_start = time ;
- $path_root = "/srv/erik" ;
+ $path_root = $job_runs_on_production_server ? $cfg_path_root_production :
$cfg_path_root_test ;
$tags_wiki_mobile = "Wikiamo|Wikipanion|Wikimedia" ;
$tags_mobile = "Android|BlackBerry|Windows
CE|DoCoMo|iPad|iPod|iPhone|HipTop|Kindle|LGE|Linux
arm|Mobile|MIDP|NetFront|Nintendo|Nokia|Obigo|Opera Mini|Opera
Mobi|Palm|Playstation
Portable|Samsung|SoftBank|SonyEricsson|SymbianOS|UP\.Browser|Vodafone|WAP|webOS|HTC_Touch|KDDI|FOMA|HTC_HD2|Polaris|Teleca"
;
- $tags_mobile_upd = "August 2011" ;
+ $tags_mobile_upd = "February 2012" ;
$pattern_url_pre = "(?:^|[a-zA-Z0-9-]+\\.)*?" ;
$pattern_url_post =
"\\.(?:biz|com|info|name|net|org|pro|aero|asia|cat|coop|edu|gov|int|jobs|mil|mobi|museum|tel|travel|arpa|[a-zA-Z0-9-]{2}|(?:com?|ne)\\.[a-zA-Z0-9-]{2})\$"
;
@@ -502,6 +500,7 @@
undef %origins_external ;
undef %origins_unsimplified ;
undef %referers_internal ;
+ undef %records ;
undef %requests ;
undef %scripts ;
undef %search ;
@@ -510,6 +509,7 @@
undef %squid_events ;
undef %squid_seqno ;
undef %statusses ;
+ undef %total_clients ;
undef %unrecognized_domains ;
undef %wikis ;
# undef @files ;
Modified: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-02-24
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-02-24
12:20:57 UTC (rev 112317)
@@ -18,14 +18,22 @@
{ $mime = "text/html" ; }
}
+ $count_event = 1 ;
+ # from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two
servers which process requests to the mobile site did not send log lines
+ # since the two servers are load-balanced, selected stats (e.g. breakdown
browser, OS) can be repaired by counting requests to mobile site twice in this
period
+ # note: do not count twice for metrics where specific ip addresses are
considered, this would inflate number of assumed bots (based on ip address
freq.)
+ if ((($time ge '2011-10-16T00') && ($time lt '2011-11-29T20')) and
+ ($url =~ /\.m\./))
+ { $count_event = 2 ; }
+
if ($scan_ip_frequencies) # phase 1
{
return if $line =~ /Banner(?:Cont|List|Load|beheer)/io ;
if ($mime eq "text/html")
{
- $ip_frequencies {$client_ip} ++ ;
- $html_pages_found ++ ;
+ $ip_frequencies {$client_ip} ++ ; # do not use $count_event here!
+ $html_pages_found += $count_event ;
}
return ;
@@ -33,8 +41,8 @@
# remember for each squid per hour lowest and highest sequence number and
number of events
- # later calc per hour average distance between events = (higest - lowest
sequence number) / events - 1
- # distance between consecutive events that lay in different hour bin are
ignored, begligible
+ # later calc per hour average distance between events = (highest - lowest
sequence number) / events - 1
+ # distance between consecutive events that lay in different hour bin are
ignored, negligible
$squid = $fields [0] ;
$seqno = $fields [1] ;
$hour = substr ($time, 11, 2) ;
@@ -64,12 +72,12 @@
if ($url =~ /\.m\.wikipedia.org/)
{
- $url_wikipedia_mobile ++ ;
- $status_url_wikipedia_mobile {$status} ++ ;
- $status_mime_url_wikipedia_mobile {"$status,$mime"} ++ ;
+ $url_wikipedia_mobile += $count_event ;
+ $status_url_wikipedia_mobile {$status} += $count_event ;
+ $status_mime_url_wikipedia_mobile {"$status,$mime"} += $count_event ;
if ($status eq "TCP_MISS/302")
{
- $redirected_to_mobile ++ ;
+ $redirected_to_mobile += $count_event ;
return ;
}
}
@@ -81,13 +89,13 @@
($agent2 = $agent) =~ s/\%20/ /g ; # mainly to make line content more
readable on debugging
$agent2 =~ s/\%2F/\//g ; # mainly to make line content more readable on
debugging
- $agents_raw {$agent2}++ ;
+ $agents_raw {$agent2} += $count_event ;
($file,$ext) = &GetFileExt ($url) ;
- $exts {$ext}++ ;
+ $exts {$ext} += $count_event ;
if (($ext eq "js") || ($ext eq "css"))
- { $scripts {"$ext,$file,"} ++ ; }
+ { $scripts {"$ext,$file,"} += $count_event ; }
$title = "" ;
$parm = "" ;
@@ -103,14 +111,14 @@
if ($parm eq "?") { return ; } # error
$file =~ s/,/,/go ;
$parm =~ s/,/,/go ;
- $scripts {"php,$file,$parm"} ++ ;
+ $scripts {"php,$file,$parm"} += $count_event ;
$ext .= "($file)" ; # add filename behind extension php
}
if ($mime eq "text/html")
{
$mimecat = "page" ;
- $tot_mime_html ++ ;
+ $tot_mime_html += $count_event ;
}
elsif ($mime =~ /(?:gif|png|jpeg)/o)
{ $mimecat = "image" ; }
@@ -137,12 +145,12 @@
if ($line =~ /(?:BannerCont|BannerList|BannerLoad|Bannerbeheer)/io)
{
- $banners {"$country,$url"} ++ ;
- $banner_requests_ignored ++ ;
+ $banners {"$country,$url"} += $count_event ;
+ $banner_requests_ignored += $count_event ;
return ;
}
- $countries {$country}++ ;
+ $countries {$country} += $count_event ; ;
$agent2 = $agent ;
$agent2 =~ s/\%20/ /g ;
@@ -202,13 +210,13 @@
if ($agent2 !~ /MSIE \d+\/\d+/o) # most likely false positives
{
$bot = $true ;
- @bots {"$mime,$agent2"} ++ ;
+ @bots {"$mime,$agent2"} += $count_event ;
}
}
elsif (($agent2 =~ /bot/io) || (($agent2 =~ /crawl(?:er)?/io) && ($agent2 !~
/MSIEcrawler/io)) || ($agent2 =~ /spider/io) || ($agent2 =~ /parser/io))
{
$bot = $true ;
- @bots {"$mime,$agent2"} ++ ;
+ @bots {"$mime,$agent2"} += $count_event ;
}
# GECKO
@@ -337,7 +345,7 @@
if (($os eq '..') && ($mobile eq 'M' || $mobile eq 'W'))
{
$os = "Mobile other" ;
- $mobile_other {$agent2} ++ ;
+ $mobile_other {$agent2} += $count_event ;
}
if ($version =~ /(?:Ipod|Iphone)/io)
@@ -557,21 +565,21 @@
{
$engine =~ s/,/,/go ;
if ($gecko ne "")
- { $engines {$gecko} ++ ; }
+ { $engines {$gecko} += $count_event ; }
elsif ($applewebkit ne "")
{
$applewebkit =~ s/AppleWebKit\//AppleWebKit /o ;
- $engines {$applewebkit} ++ ;
+ $engines {$applewebkit} += $count_event ; ;
}
$version =~ s/,/,/go ;
if ($os =~ /playstation/io)
{ $version = "NetFront (PlayStation)" ; }
- $clients {"$mobile,$version,$mimecat"}++ ;
+ $clients {"$mobile,$version,$mimecat"} += $count_event ; ;
$operating_systems =~ s/,/,/go ;
- $operating_systems {"$mobile,$os"} ++ ;
+ $operating_systems {"$mobile,$os"} += $count_event ; ;
}
if ($count_hits_per_ip_range)
@@ -583,16 +591,16 @@
if ($status =~ /^TCP/)
{
- $statusses {"$method:$status"}++ ;
- $statusses {"$method:total"}++ ;
+ $statusses {"$method:$status"} += $count_event ;
+ $statusses {"$method:total"} += $count_event ;
}
else
- { $statusses_non_tcp ++ ; }
+ { $statusses_non_tcp += $count_event ; }
if ($url =~ /org\/skins/o)
{
($url2 = $url) =~ s/^.*?\/skins/skins/o ;
- $skins {$url2} ++ ;
+ $skins {$url2} += $count_event ; ;
}
if ($url =~ /^upload\.wikimedia\.org\//o) # count image size if applicable
@@ -604,7 +612,7 @@
# for diagnostics
if (($referer =~ /google/o) || ($agent =~ /google/io))
- { $googles++ ; }
+ { $googles += $count_event ; }
$referer =~ s/^http\w?\:\/\///o ;
$referer =~ s/\.php\?.*$/\.php\?../go ;
@@ -622,7 +630,7 @@
if (($domain =~ /\./o) ||
($domain !~ /^[\*\@\%]?!(wb|wn|wp|wq|ws|wv|wk|wx|xx|wm|mw|wmf)\:/o))
{
- $unrecognized_domains {$domain_original} ++ ;
+ $unrecognized_domains {$domain_original} += $count_event ;
$domain = 'other' ;
}
@@ -641,24 +649,24 @@
if ($referer_external)
{
- $tot_referers_external++ ;
+ $tot_referers_external += $count_event ; ;
($origin, $toplevel) = &DetectOrigin ($client_ip, $referer_original,
$agent, $mime, $mimecat, $service, $ext) ;
&CountOrigin ("external", $origin, $toplevel, $mimecat) ;
if ($origin !~ /^\!/o)
- { $origins_unsimplified {$referer_original} ++ ; }
+ { $origins_unsimplified {$referer_original} += $count_event ; }
else
{
- $origin_simplified {"$origin [$referer] <- $referer_original"} ++ ;
- $origins_external {$origin} ++ ;
+ $origin_simplified {"$origin [$referer] <- $referer_original"} +=
$count_event ; ;
+ $origins_external {$origin} += $count_event ;
}
}
else
{
- $tot_referers_internal ++ ;
- $referers_internal {$referer} ++ ;
+ $tot_referers_internal += $count_event ;
+ $referers_internal {$referer} += $count_event ;
$referer =~ s/!//go ; # ! was marker to signal pattern was recognized as
wikimedia project
&CountOrigin ("internal", $referer, "org" , $mimecat) ;
}
@@ -670,10 +678,9 @@
if ($domain =~ /!/o)
{ print ERR "still ! in domain: '$domain' <- '$domain_original'\n" ; }
- $requests {"$domain|$referer|$ext|$mime|$parm"}++ ;
+ $requests {"$domain|$referer|$ext|$mime|$parm"} += $count_event ; ;
+ $clients_by_wiki {"$mobile,$version,$domain"} += $count_event ; ;
- $clients_by_wiki {"$mobile,$version,$domain"}++ ;
-
# different output use either 'bot=N' or 'M'(anual) / 'bot=Y' or 'B'(ot)
if ($bot)
{
@@ -689,13 +696,13 @@
if (($domain =~ /^\@/) || ($domain =~ /^\*/))
{
# print "Requests wap $domain | $ext | $mime | $parm | $country |
$ind_bot\n" ;
- $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
+ $requests_wap {"$domain|$ext|$mime|$parm|$country|$ind_bot"} +=
$count_event ; ;
}
if ($domain =~ /^\%/)
{
# print "Requests m $domain | $ext | $mime | $parm | $country |
$ind_bot\n" ;
- $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} ++ ;
+ $requests_m {"$domain|$ext|$mime|$parm|$country|$ind_bot"} += $count_event
; ;
}
# $title !~ /:/ -> only namespace 0 (minus few
titles with colon in name)
if (($url =~ /index.php\?/o) && ($title !~ /:/o) && ($mime eq "text/html")
&& (($url =~ /action=edit/o) || ($url =~ /action=submit/o)))
@@ -708,34 +715,40 @@
$key =~ s/,/,/go ;
$key =~ s/\|/,/go ;
- $index_php_raw {$key}++ ;
+ $index_php_raw {$key} += $count_event ; ;
$client_ip_record_cnt {$client_ip}++ ;
}
if ($mimecat eq "page")
{
- $tot_mime_html2 ++ ;
+ $tot_mime_html2 += $count_event ;
if (($ind_bot =~ /N/) and ($ip_frequencies {$client_ip} > 2))
{ $ind_bot = 'bot=Y' ; }
- $countries_views {"$ind_bot,$domain,$country"} ++ ;
-
+ $countries_views {"$ind_bot,$domain,$country"} += $count_event ; ;
# $title !~ /:/ -> only namespace 0 (minus
few titles with colon in name)
if (($url =~ /index.php\?/o) && ($title !~ /:/) && ($mime eq "text/html")
&& ($url =~ /action=submit/o) && ($status =~ /302/o))
- { $countries_saves {"$ind_bot,$domain,$country"} ++ ; }
+ { $countries_saves {"$ind_bot,$domain,$country"} += $count_event ; }
$time_hh = substr ($time,11,2) ;
$time_mm = substr ($time,14,2) ;
$time_tt = $time_hh * 60 + $time_mm ;
$time_tt2 = $time_tt - $time_tt % 15 ;
- $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} ++ ;
+ $countries_timed {"$ind_bot,$domain,$country,$time_tt2"} += $count_event ;
+ if (! $test)
+ {
+ $time2 = substr ($time,0,19) ; # omit msec
+ $line =
"$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
+ $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to
$file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
+ }
+ }
- $time2 = substr ($time,0,19) ; # omit msec
- $line =
"$time2,$client_ip,$domain,$ind_bot2,$mobile,$os,$version,$mimecat\n" ;
- $gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to
$file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
- }
+ $records {"$mobile,$mimecat"} += $count_event ;
+ $records {"*,$mimecat"} += $count_event ;
+ $records {"$mobile,*"} += $count_event ;
+ $records {"*,*"} += $count_event ;
}
sub ExtractLanguage
@@ -747,10 +760,10 @@
$regexp_lang = "[a-z]{2}(?:-[a-zA-Z]{2,3})?(?:-[a-zA-Z]{2,3})?" ;
($language = $agent) =~ s/^.*?; ($regexp_lang)[\);].*$/$1/o ;
if ($language eq $agent)
- { $languages_unrecognized {$agent} ++ ; }
+ { $languages_unrecognized {$agent} += $count_event ; }
else
{
- $languages {"$application,$language"} ++ ;
+ $languages {"$application,$language"} += $count_event ;
$agent =~ s/ $language//o ;
}
return ($agent) ;
@@ -823,7 +836,8 @@
foreach $parm (@parms)
{
- next if $parm eq "" ;
+ next if $parm eq '' ;
+ next if $parm eq '*' ;
if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
{ $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid
= $true ; last }
@@ -901,7 +915,7 @@
$domain =~ s/\.m\./.%/o ;
if ($domain =~ /^error:/o)
- { $domain_errors {$domain}++ ; }
+ { $domain_errors {$domain} += $count_event ; }
$domain =~ s/error:.*$/!error:1/o ;
$domain =~ s/^([^\.\/]+)\.([^\.\/]+)\.org/$2:$1/o ;
@@ -1018,16 +1032,16 @@
if (($googlematch eq "- - z") && ($service =~ /GoogleBot/io))
{
$service = "GoogleBot?" ;
- $google_imposters {$agent}++ ;
+ $google_imposters {$agent} += $count_event ;
}
# obsolete? to be considered ?
# if (($googlematch ne "- - z") || ($service =~ /(?:Earth|Desktop)/o))
- # { $search
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"}
++ ; }
+ # { $search
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"}
+= $count_event ; }
# else
# { $accept = "not" ; }
- $search
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"}
++ ;
+ $search
{"'$googlematch',google,$referer2,$service,$agent2,$mimecat,$top_level_domain"}
+= $count_event ;
$googlebins2 {"$accept [$googlematch] " . sprintf ("%-14s",$service) .
$referer} ++ ;
$googlebins {$googlematch}++ ;
@@ -1055,7 +1069,7 @@
# }
if ($origin =~ /wiki/o)
- { $wikis {$origin} ++ ; }
+ { $wikis {$origin} += $count_event ; }
if ($origin eq "wikipedia")
{
@@ -1187,7 +1201,7 @@
if ($source eq "external")
{
- $tot_origins_external_counted ++ ;
+ $tot_origins_external_counted += $count_event ;
$origin =~ s/\:.*$//o ;
if (is_valid_ip_address ($origin))
{ $origin = "unmatched ip address" ; $toplevel = "" ; }
@@ -1203,7 +1217,7 @@
# print "$origin\n" ;
}
}
- $origins {"$source,$origin,$toplevel,$mimecat"} ++ ;
+ $origins {"$source,$origin,$toplevel,$mimecat"} += $count_event ;
}
sub ProcessUploadPath
@@ -1213,7 +1227,7 @@
($path = $url) =~ s/^.*?\.org\///o ;
($file = $path) =~ s/^.*\/([^\/]*)$/$1/go ; # remove path
- $binaries {$path} ++ ; # Jan 2012 store path, not file only
+ $binaries {$path} += $count_event ; # Jan 2012 store path, not file only
if ($file =~ /(?:gif|jpg|jpeg|png|svg)$/io)
{
@@ -1227,10 +1241,10 @@
{
($size = $file) =~ s/^.*?(\d+)px.*$/$1/o ;
$sizerange = sprintf ("%5d",(int ($size / 20)) * 20) . "-" . sprintf
("%5d",(((int ($size / 20))+1) * 20 - 1)) ;
- $imagesizes {$sizerange} ++ ;
+ $imagesizes {$sizerange} += $count_event ;
}
else
- { $imagesizes {"???"} ++ ; }
+ { $imagesizes {"???"} += $count_event ; }
}
}
Modified: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveReadInput.pm 2012-02-24
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveReadInput.pm 2012-02-24
12:20:57 UTC (rev 112317)
@@ -4,7 +4,7 @@
# test:
# echo 125.123.123.123 | /usr/local/bin/geoiplogtag 1
# refresh: bayes:/usr/share/GeoIP> wget
http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
-use config ;
+use SquidCountArchiveConfig ;
sub CollectFilesToProcess
{
@@ -24,6 +24,8 @@
my ($date_archived) ;
+ $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production :
$cfg_dir_in_test ;
+
$some_files_found = $false ;
$full_range_found = $false ;
@@ -47,7 +49,7 @@
$date_archived = sprintf ("%4d%02d%02d", $year+1900, $month+1, $day) ;
print "\n- Inspect file saved $days_ago_inspect days ago:
$logname-$date_archived.gz\n" ;
- my $file = "$dir_in/$logname-$date_archived.gz" ;
+ my $file = "$dir_in/$cfg_logname-$date_archived.gz" ;
if (! -e $file)
{ print "- File not found: $file\n" ; }
@@ -173,8 +175,8 @@
else
{
open IN, '<', $file_in ;
- # $fields_expected = 14 ;
- $fields_expected = 13 ;
+ $fields_expected = 14 ; # add fake country code
+ # $fields_expected = 13 ;
}
$line = "" ;
@@ -192,6 +194,12 @@
# ugly Q&D code to circumvent spaces in agent string
# $line2 = $line ;
chomp $line ;
+
+ if ($test)
+ { $line .= ' XX' ; }
+
+ $line =~ s/x-www-form-urlencoded;
charset=UTF-8/x-www-form-urlencoded;%20charset=UTF-8/ ; # log lines are space
delimited, other spaces should be encoded
+
@fields = split (' ', $line) ;
# next if $line =~ /upload/ ;
# next if $line !~ /en\.m\.wikipedia/ ;
@@ -201,23 +209,46 @@
#next if $fields [9] =~ /NONE/ ;
if ($#fields > 14)
{
+if (! $scan_ip_frequencies)
+{
# print "line $line2\n" ;
# print "fields " . $#fields . "\n$line\n" ;
+}
+
$country_code = $fields [$#fields] ;
$fields [$#fields] = '' ;
$line = join (' ', @fields) ;
-# print "2 $line\n" ;
@fields = split (' ', $line, 14) ;
$fields [14] = $country_code ;
+ $fields [13] =~ s/ /%20/g ;
+
+if (! $scan_ip_frequencies)
+{
+# print "2 $line\n" ;
# print "\n\n12: " . $fields [12] . "\n" ;
# print "13: " . $fields [13] . "\n" ;
# print "14: " . $fields [14] . "\n" ;
# print "15: " . $fields [15] . "\n" ;
+}
}
- if ($#fields < $fields_expected) { $fields_too_few ++ ; print "invalid
field count " . $#fields . "\n" ; next ; }
- if ($#fields > $fields_expected) { $fields_too_many ++ ; print "invalid
field count " . $#fields . "\n" ; next ; }
+ if ($#fields < $fields_expected)
+ {
+ $fields_too_few ++ ;
+ print "invalid field count " . $#fields . "\n" ;
+ print ERR $#fields . " fields: \"$line\"\n" ;
+ next ;
+ }
+ if ($#fields > $fields_expected)
+ {
+ @a = @fields ;
+ $fields_too_many ++ ;
+ print "invalid field count " . $#fields . "\n" ;
+ print ERR $#fields . " fields: \"$line\"\n" ;
+ next ;
+ }
+
$time = $fields [2] ;
if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
@@ -230,7 +261,7 @@
if ($time lt $time_to_start)
{
- if (++ $times % 100000 == 0)
+ if (++ $times % 1000000 == 0)
{ print "[$time]\n" ; }
next ;
}
@@ -266,12 +297,23 @@
#next if $line !~ /http:\/\/\w+\.m\./ ;
#print "$line\n" ;
&ProcessLine ($line) ;
- if (++ $lines_processed % 10000 == 0)
+ if (++ $lines_processed % 50000 == 0)
{
+ if (! $scan_ip_frequencies) # phase 2
+ {
+ $perc_mobile_all = '-' ;
+ if ($records {"*,*"} > 0)
+ { $perc_mobile_all = sprintf ("%.1f", 100 * $records {"M,*"} /
$records {"*,*"}) ; }
+ $perc_mobile_pages = '-' ;
+ if ($records {"*,page"} > 0)
+ { $perc_mobile_pages = sprintf ("%.1f", 100 * $records {"M,page"} /
$records {"*,page"}) ; }
+ $perc_mobile = " (mobile: all $perc_mobile_all\%, pages
$perc_mobile_pages\%)" ;
+ }
+
if ($banner_requests_ignored == 0)
- { print "$time $lines_processed\n" ; }
+ { print "$time $lines_processed$perc_mobile\n" ; }
else
- { print "$time $lines_processed ($banner_requests_ignored banner
requests ignored)\n" ; }
+ { print "$time $lines_processed$perc_mobile ($banner_requests_ignored
banner requests ignored)\n" ; }
}
if ($test and $lines_processed >= $test_maxlines)
{ last ; }
Modified: trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm 2012-02-24
12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/SquidCountArchiveWriteOutput.pm 2012-02-24
12:20:57 UTC (rev 112317)
@@ -1,7 +1,7 @@
#!/usr/bin/perl
- use config ;
- use lib $liblocation ;
+ use SquidCountArchiveConfig ;
+ use lib $cfg_liblocation ;
use EzLib ;
sub WriteOutputIpFrequencies
Modified: trunk/wikistats/squids/SquidCountryScan.pl
===================================================================
--- trunk/wikistats/squids/SquidCountryScan.pl 2012-02-24 12:01:15 UTC (rev
112316)
+++ trunk/wikistats/squids/SquidCountryScan.pl 2012-02-24 12:20:57 UTC (rev
112317)
@@ -4,7 +4,7 @@
## sub ProcessRawData <- SquidDataCountries.csv -> ??
use SquidCountryScanConfig ;
- use lib $liblocation ;
+ use lib $cfg_liblocation ;
use EzLib ;
$trace_on_exit = $true ;
@@ -23,7 +23,7 @@
# exit ;
}
- $path_root = $job_runs_on_production_server ? $path_root_production :
$path_root_test ;
+ $path_root = $job_runs_on_production_server ? $cfg_path_root_production :
$cfg_path_root_test ;
$file_raw_data_monthly_visits =
"$path_root/SquidDataVisitsPerCountryMonthly.csv" ;
$file_raw_data_daily_visits =
"$path_root/SquidDataVisitsPerCountryDaily.csv" ;
Modified: trunk/wikistats/squids/SquidCountryScanConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidCountryScanConfig.pm 2012-02-24 12:01:15 UTC
(rev 112316)
+++ trunk/wikistats/squids/SquidCountryScanConfig.pm 2012-02-24 12:20:57 UTC
(rev 112317)
@@ -1,7 +1,7 @@
#!/usr/bin/perl
- $liblocation = "/home/ezachte/lib" ;
+ $cfg_liblocation = "/home/ezachte/lib" ;
- $path_root_production = "/a/ezachte/" ;
- $path_root_test = "w:/! perl/squids/archive/" ; # Erik
-# $path_root_test = "?" ; # Andr\xE9
+ $cfg_path_root_production = "/a/ezachte/" ;
+ $cfg_path_root_test = "w:/! perl/squids/archive/" ; # Erik
+# $cfg_path_root_test = "?" ; # Andr\xE9
Modified: trunk/wikistats/squids/SquidReportArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidReportArchive.pl 2012-02-24 12:01:15 UTC
(rev 112316)
+++ trunk/wikistats/squids/SquidReportArchive.pl 2012-02-24 12:20:57 UTC
(rev 112317)
@@ -1,13 +1,13 @@
#!/usr/bin/perl
use SquidReportArchiveConfig ;
- use lib $liblocation ;
+ use lib $cfg_liblocation ;
use EzLib ;
$trace_on_exit = $true ;
ez_lib_version (2) ;
- default_argv ($default_argv) ;
+ default_argv ($cfg_default_argv) ;
# to do: add text from http://wiki.squid-cache.org/SquidFaq/SquidLogs
# ReportOrigin how to handle '!error <-> other
@@ -32,12 +32,12 @@
undef %country_code_not_specified_reported ;
- if (-d "/a/squid")
- {
- &Log ("\n\nJob runs on server $hostname\n\n") ;
- $path_in = "/a/ezachte" ;
- $path_out = "/a/ezachte" ;
- }
+ $path_in = $job_runs_on_production_server ? $cfg_path_in_production :
$cfg_path_in_test ;
+ $path_out = $job_runs_on_production_server ? $cfg_path_out_production :
$cfg_path_out_test ;
+
+ &Log ("Path in = $path_in\n") ;
+ &Log ("Path out = $path_out\n") ;
+
# following test needs to change -> remove server name dependency (new run
argument ?)
# elsif ($hostname eq 'bayes')
# {
@@ -45,16 +45,7 @@
# $path_in = "/home/ezachte/wikistats/animation" ;
# $path_out = "/home/ezachte/wikistats/animation" ;
# }
- else
- {
- print "Job runs local for tests\n\n" ;
- $path_in = $path_in_local ;
- $path_out = $path_out_local ;
- }
- &Log ("Path in = $path_in\n") ;
- &Log ("Path out = $path_out\n") ;
-
$file_csv_country_meta_info = "SquidReportCountryMetaInfo.csv" ;
# periodically harvest updated metrics from
Modified: trunk/wikistats/squids/SquidReportArchiveConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidReportArchiveConfig.pm 2012-02-24 12:01:15 UTC
(rev 112316)
+++ trunk/wikistats/squids/SquidReportArchiveConfig.pm 2012-02-24 12:20:57 UTC
(rev 112317)
@@ -1,15 +1,16 @@
#!/usr/bin/perl
- $liblocation = "/home/ezachte/lib" ;
+ $cfg_liblocation = "/home/ezachte/lib" ;
-# $path_in_local = "W:/# Out Locke" ; # Erik
-# $path_out_local = "W:/# Out Test/Locke" ; # Erik
+ $cfg_path_in_production = "/a/ezachte" ;
+ $cfg_path_out_production = "/a/ezachte" ;
+# $cfg_path_in_test = "W:/# Out Locke" ; # Erik
+# $cfg_path_out_test = "W:/# Out Test/Locke" ; # Erik
+ $cfg_path_in_test = "/srv/erik/" ; # Andr\xE9
+ $cfg_path_out_test = "/srv/erik/" ; # Andr\xE9
- $path_in = "/srv/erik/" ; # Andr\xE9
- $path_out = "/srv/erik/" ; # Andr\xE9
-
-# set defaults for tests on local machine
-# $default_argv = "-m 2011-07" ; # monthly report
-# $default_argv = "-w" ; # refresh country info from Wikipedia
(population etc)
-# $default_argv = "-c" ; # country/regional reports
- $default_argv = "-c -q 2011Q4" ; # country/regional reports based on data
for one quarter only
+# set default arguments for test on local machine
+# $cfg_default_argv = "-m 2011-07" ; # monthly report
+# $cfg_default_argv = "-w" ; # refresh country info from Wikipedia
(population etc)
+# $cfg_default_argv = "-c" ; # country/regional reports
+ $cfg_default_argv = "-c -q 2011Q4" ; # country/regional reports based on
data for one quarter only
Deleted: trunk/wikistats/squids/config.pm
===================================================================
--- trunk/wikistats/squids/config.pm 2012-02-24 12:01:15 UTC (rev 112316)
+++ trunk/wikistats/squids/config.pm 2012-02-24 12:20:57 UTC (rev 112317)
@@ -1,6 +0,0 @@
- #!/usr/bin/perl
-
- $liblocation = "/home/ezachte/lib" ;
- $default_argv = "-d 2011/04/01" ;
- $dir_in = "/a/squid/archive" ;
- $logname = "sampled-1000.log" ;
\ No newline at end of file
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs