https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115474
Revision: 115474
Author: ezachte
Date: 2012-06-03 22:40:18 +0000 (Sun, 03 Jun 2012)
Log Message:
-----------
Two processing fixes in count job and improved job feedback to standard out
Modified Paths:
--------------
trunk/wikistats/squids/SquidCountArchive.pl
trunk/wikistats/squids/SquidCountArchiveConfig.pm
trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
trunk/wikistats/squids/SquidCountArchiveReadInput.pm
Modified: trunk/wikistats/squids/SquidCountArchive.pl
===================================================================
--- trunk/wikistats/squids/SquidCountArchive.pl 2012-06-03 20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchive.pl 2012-06-03 22:40:18 UTC (rev 115474)
@@ -98,6 +98,7 @@
if ($do_phase1) # Collect IP frequencies
{ &ProcessPhase1 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, @files) ; }
+ $lines_processed = 0 ;
if ($do_phase2) # collect other data
{ &ProcessPhase2 ($days_ago, $date_collect_files, $time_to_start, $time_to_stop, $path_out, $path_out_month, @files) ; }
Modified: trunk/wikistats/squids/SquidCountArchiveConfig.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveConfig.pm 2012-06-03 20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveConfig.pm 2012-06-03 22:40:18 UTC (rev 115474)
@@ -1,9 +1,8 @@
#!/usr/bin/perl
- $developer = "engels" ;
- $cfg_liblocation = "/a/squid/stats/dev_$developer/scripts" ;
+ $cfg_liblocation = "/a/squid/stats/scripts" ;
- $cfg_path_root_production = "/a/squid/stats/dev_$developer/csv" ;
+ $cfg_path_root_production = "/a/squid/stats/csv" ;
$cfg_path_root_test = "w:/! perl/squids/archive/test" ; # Erik
$cfg_dir_in_production = "/a/squid/archive/sampled" ;
@@ -12,7 +11,7 @@
$cfg_logname = "sampled-1000.log" ;
# set default arguments for test on local machine
- $cfg_default_argv = "-d 2012/04/01-2012/04/30" ;
+ $cfg_default_argv = "-d 2011/10/16-2011/10/16" ;
$cfg_file_test = "w:/! Perl/Squids/Archive/sampled-1000.log-20111016.txt" ;
# Erik
$cfg_test_maxlines = 4000000 ;
Modified: trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-06-03 20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveProcessLogRecord.pm 2012-06-03 22:40:18 UTC (rev 115474)
@@ -5,6 +5,8 @@
my $line = shift ;
$time = $fields [2] ;
+ if ($time !~ /\.\d\d\d/) # for column alignment
+ { $time .= ".000" ; }
$date = substr ($time,0,10) ;
$client_ip = $fields [4] ;
@@ -17,7 +19,7 @@
if (($url =~ /\.m\..*?\/wiki\//) || ($url =~ /\.m\..*?\/w\/index.php/))
{ $mime = "text/html" ; }
}
-
+
$count_event = 1 ;
# from Oct 16, 2011 00:00 hrs till Nov 29, 2011 20:00 hrs one of the two servers which process requests to the mobile site did not send log lines
# since the two servers are load-balanced, selected stats (e.g. breakdown browser, OS) can be repaired by counting requests to mobile site twice in this period
@@ -230,7 +232,7 @@
elsif ($agent2 =~ /Trident[\/ ]?\d/io)
{ ($browserengine = $agent2) =~ s/^.*?Trident[\/ ]?(\d+\.?\d*).*$/Trident $1/io ; }
elsif ($agent2 =~ /Presto[\/ ]?\d/io)
- { ($browserengine = $agent2) =~ s/^.*?Presto[\/ ]?(\d+\.?\d*).*$/Presto $1/io ; }
+ { ($browserengine = $agent2) =~ s/^.*?Presto[\/ ]?(\d+\.?\d*).*$/Presto $1/io ; }
elsif ($agent2 =~ /AppleWebKit/io)
{ $browserengine = "AppleWebKit" ; }
elsif ($agent2 =~ /Trident/io)
@@ -301,7 +303,7 @@
# MOBILE
$mobile = '-' ;
- if (($url =~ /\?seg=/) or ($url =~ /&seg=/))
+ if (($url =~ /\?seg=/) or ($url =~ /&seg=/)) # Andre, the second check is already covered by the first, and seg= is rather generic, it could also be pageseg= or something like that: if (($url =~ /\?seg=/) or ($url =~ /&seg=/))
{ $mobile = 'P' ; }
elsif ($agent2 =~ /(?:$tags_wiki_mobile)/io)
{ $mobile = 'W' ; }
@@ -637,7 +639,7 @@
$clients {"$mobile,$version,$mimecat"} += $count_event ; ;
$operating_systems =~ s/,/,/go ;
- ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for SquidReportClients only
+ ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, use 'M' for reporting scripts, 'W' etc was introduced for SquidReportClients only
$operating_systems {"$mobile2,$os"} += $count_event ; ;
}
@@ -871,13 +873,13 @@
if (! $test)
{
$time2 = substr ($time,0,19) ; # omit msec
- ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for SquidReportClients only
+ ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, use 'M' for reporting scripts, 'W' etc was introduced for SquidReportClients only
$line = "$time2,$client_ip,$domain,$ind_bot2,$mobile2,$os,$version,$mimecat\n" ;
$gz_csv_views_viz->gzwrite($line) || die "Zlib error writing to $file_csv_views_viz: $gz_csv_views_viz->gzerror\n" ;
}
}
- ($mobile2 = $mobile) =~ s/W/M/; # code 'W' was introduced for SquidReportClients only
+ ($mobile2 = $mobile) =~ s/[^\-]/M/; # everything except '-' is mobile, use 'M' for reporting scripts, 'W' etc was introduced for SquidReportClients only
$records {"$mobile2,$mimecat"} += $count_event ;
$records {"*,$mimecat"} += $count_event ;
$records {"$mobile2,*"} += $count_event ;
@@ -973,7 +975,7 @@
next if $parm eq '*' ;
if (($parm !~ /=/) && ($parm !~ /^[\w\d\-\_]+$/o))
- { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid
= $true ; last }
+ { $error = "parm probably invalid: '$parm' in '$url' -> skip\n" ; $invalid
= $true ; $parms_invalid ++ ; last }
($keyword,$data) = split ('\=', $parm) ;
if ($keyword eq "")
@@ -987,7 +989,7 @@
if ($invalid)
{
- print $error ;
+ # print $error ;
print ERR $error ;
return ("?","?") ;
}
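
For reference, a minimal standalone sketch (not part of the committed diff; the sample timestamp and mobile code are assumptions) of the two record-level changes above: timestamps lacking milliseconds get ".000" appended for column alignment, and every mobile code other than '-' is collapsed to 'M' for the reporting scripts:

#!/usr/bin/perl
use strict ;
use warnings ;

my $time   = "2011-10-16T00:00:00" ;    # hypothetical sampled-log timestamp without msec
my $mobile = "W" ;                      # hypothetical mobile code as set elsewhere in ProcessLine

if ($time !~ /\.\d\d\d/)                # pad missing milliseconds for column alignment
  { $time .= ".000" ; }

(my $mobile2 = $mobile) =~ s/[^\-]/M/ ; # everything except '-' counts as mobile

print "$time $mobile2\n" ;              # prints: 2011-10-16T00:00:00.000 M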
Modified: trunk/wikistats/squids/SquidCountArchiveReadInput.pm
===================================================================
--- trunk/wikistats/squids/SquidCountArchiveReadInput.pm 2012-06-03 20:29:54 UTC (rev 115473)
+++ trunk/wikistats/squids/SquidCountArchiveReadInput.pm 2012-06-03 22:40:18 UTC (rev 115474)
@@ -24,7 +24,7 @@
my ($date_archived) ;
- $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : $cfg_dir_in_test ;
+ $dir_in = $job_runs_on_production_server ? $cfg_dir_in_production : $cfg_dir_in_test ;
$some_files_found = $false ;
$full_range_found = $false ;
@@ -70,12 +70,12 @@
# assuming only one file is archived per day !
if ($head_found && $tail_found)
{
- $full_range_found = $true ;
+ $full_range_found = $true ;
last ;
}
}
}
-
+
if (! $some_files_found)
{ print "Not any file was found which contains log records for $days_ago
days ago. Skip processing for $date_collect_files.\n\n" ; return $false ; }
if (! $full_range_found)
@@ -150,7 +150,6 @@
my $file_csv_views_viz2 = $file_csv_views_viz ;
my $date = substr ($time_to_start,0,4) . substr ($time_to_start,5,2) . substr ($time_to_start,8,2) ;
$file_csv_views_viz2 =~ s/date/$date/ ;
- print $file_csv_views_viz2 ;
$gz_csv_views_viz = gzopen ($file_csv_views_viz2, "wb") || die "Unable to write $file_csv_views_viz2 $!\n" ;
$comment = "# Data from $time_to_start till $time_to_stop (yyyy-mm-ddThh:mm:ss) - all counts in thousands due to sample rate of log (1 = 1000)\n" ;
@@ -186,7 +185,7 @@
$line = "" ;
while ($line = <IN>)
{
- $lines_in_file ++ ;
+ $lines_to_process ++ ;
# if ($line =~ /fy\.wikipedia\.org/o) # test/debug
# {
@@ -236,10 +235,19 @@
}
}
+ if (! $scan_ip_frequencies) # phase 2
+ {
+ if ($lines_to_process % 1000000 == 0)
+ { print "Field count: " .
+ sprintf ("%.5f\%", 100 * $fields_too_few / $lines_to_process) . " of
" . ($lines_to_process/1000000) . " M lines have too few fields, " .
+ sprintf ("%.5f\%", 100 * $fields_too_many / $lines_to_process) . "
have too many fields, " .
+ sprintf ("%.5f\%", 100 * $parms_invalid / $lines_to_process) . "
have invalid parms\n" ; }
+ }
+
if ($#fields < $fields_expected)
{
$fields_too_few ++ ;
- print "invalid field count " . $#fields . "\n" ;
+ # print "invalid field count " . $#fields . "\n" ;
print ERR $#fields . " fields: \"$line\"\n" ;
next ;
}
@@ -248,12 +256,16 @@
{
@a = @fields ;
$fields_too_many ++ ;
- print "invalid field count " . $#fields . "\n" ;
+ # print "invalid field count " . $#fields . "\n" ;
print ERR $#fields . " fields: \"$line\"\n" ;
next ;
}
+ $fields_just_enough ++ ;
+
$time = $fields [2] ;
+ if ($time !~ /\.\d\d\d/) # for column alignment
+ { $time .= ".000" ; }
if (($oldest_time_read eq "") || ($time lt $oldest_time_read))
{ $oldest_time_read = $time ; }
@@ -267,6 +279,8 @@
{
if (++ $times % 1000000 == 0)
{ print "[$time]\n" ; }
+ $lines_to_process = 0 ;
+ $lines_processed = 0 ;
next ;
}
@@ -301,7 +315,7 @@
#next if $line !~ /http:\/\/\w+\.m\./ ;
#print "$line\n" ;
&ProcessLine ($line) ;
- if (++ $lines_processed % 50000 == 0)
+ if (++ $lines_processed % 1000000 == 0)
{
if (! $scan_ip_frequencies) # phase 2
{
@@ -315,9 +329,9 @@
}
if ($banner_requests_ignored == 0)
- { print "$time $lines_processed$perc_mobile\n" ; }
+ { print "$time " . ($lines_processed / 1000000). " M
lines$perc_mobile\n" ; }
else
- { print "$time $lines_processed$perc_mobile ($banner_requests_ignored
banner requests ignored)\n" ; }
+ { print "$time " . ($lines_processed / 1000000). " M lines$perc_mobile
($banner_requests_ignored banner requests ignored)\n" ; }
}
if ($test and $lines_processed >= $test_maxlines)
{ last ; }
@@ -342,7 +356,7 @@
print "No data found for $time_to_start - $time_to_stop\n" ;
}
else
- { print "$lines_this_day out $lines_in_file processed\n" ; }
+ { print "$lines_this_day out $lines_to_process examined\n" ; }
if ($url_wikipedia_mobile > 0)
{
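
For reference, a minimal standalone sketch (an assumption, not the committed code; the field threshold is a placeholder) of the new standard-out feedback in SquidCountArchiveReadInput.pm: counters for malformed records are kept while reading and a one-line summary is printed every 1000000 lines:

#!/usr/bin/perl
use strict ;
use warnings ;

my ($lines_to_process, $fields_too_few, $fields_too_many, $parms_invalid) = (0,0,0,0) ;
my $fields_expected = 14 ;              # placeholder, the real value comes from the job config

while (my $line = <STDIN>)
{
  $lines_to_process ++ ;
  my @fields = split (' ', $line) ;
  $fields_too_few  ++ if $#fields < $fields_expected ;     # simplified checks; $parms_invalid is
  $fields_too_many ++ if $#fields > $fields_expected + 1 ; # incremented in SquidCountArchiveProcessLogRecord.pm and stays 0 here

  if ($lines_to_process % 1000000 == 0)
  { print "Field count: " .
          sprintf ("%.5f%%", 100 * $fields_too_few  / $lines_to_process) . " of " . ($lines_to_process/1000000) . " M lines have too few fields, " .
          sprintf ("%.5f%%", 100 * $fields_too_many / $lines_to_process) . " have too many fields, " .
          sprintf ("%.5f%%", 100 * $parms_invalid   / $lines_to_process) . " have invalid parms\n" ; }
}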