Diederik has submitted this change and it was merged.
Change subject: Fix for mingle 356
......................................................................
Fix for mingle 356
* Added test for 356. It passes. The bug report could not be
reproduced in current version of the code, but a test was written
to make sure it will never occur
* Added travis support (I also added it for the 341 gerrit patchset which is
in review )
* Removed verbose output from test 09
* Modified t/CommonConfig.pm to use $ENV{WORKSPACE} for the path of
the workspace on Jenkins ( I found this out from Krinkle of
#wikimedia-operations, thanks )
* 5-th field is now IP and Country Code separated by "|" and Wikistats
agrees with this
Change-Id: I5c74df172e1f3505cfd72aaaab068d7d9a4476ce
---
A .travis.yml
M squids/.gitignore
M squids/perl/SquidCountArchiveProcessLogRecord.pm
M squids/perl/SquidCountArchiveReadInput.pm
M squids/t/06-regression-mismatch-world-north-south-unknown.t
M squids/t/09-regression-totals-fixes-squidreportclients.t
A squids/t/10-regression-mingle-356-bugzilla-46269.t
M squids/t/CommonConfig.pm
M squids/t/Generate/Squid.pm
A
squids/testdata/regression-mingle-356-bugzilla-46269/SquidCountArchiveConfig.pm
A
squids/testdata/regression-mingle-356-bugzilla-46269/SquidReportArchiveConfig.pm
A squids/testdata/regression-mingle-356-bugzilla-46269/ua.txt.gz
12 files changed, 232 insertions(+), 64 deletions(-)
Approvals:
Erik Zachte: Checked
Diederik: Looks good to me, approved
jenkins-bot: Verified
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..e950543
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,23 @@
+
+language: perl
+perl:
+ - "5.16"
+ - "5.14"
+ - "5.12"
+ - "5.10"
+
+
+before_install:
+
+ - sudo apt-get update -qq
+ - # sudo apt-get install -qq
+ - cpanm --quiet --notest Template
+ - cpanm --quiet --notest JSON::XS HTML::TreeBuilder::XPath
+
+install: ""
+
+script:
+ - hostname
+ - pwd
+ - echo "=================================================="
+ - cd squids; prove -v -Iperl/ t/
diff --git a/squids/.gitignore b/squids/.gitignore
index a25cb96..3b337db 100644
--- a/squids/.gitignore
+++ b/squids/.gitignore
@@ -29,8 +29,11 @@
!*.sh
# file specifying line-ending characters
!.gitattributes
+!testdata
!testdata/
!testdata/*
+!testdata/*/*.pm
+!testdata/*/*.txt
!testdata/*/*.gz
# and readme's
@@ -45,6 +48,7 @@
!logs_basic/
+!csv
# advice Linus
# use add -f to add few files in ignored folders
diff --git a/squids/perl/SquidCountArchiveProcessLogRecord.pm
b/squids/perl/SquidCountArchiveProcessLogRecord.pm
index 7d32ef1..d537512 100755
--- a/squids/perl/SquidCountArchiveProcessLogRecord.pm
+++ b/squids/perl/SquidCountArchiveProcessLogRecord.pm
@@ -97,7 +97,8 @@
$redirected_to_mobile += $count_event ;
return ;
}
- }
+ };
+
$url =~ s/^http\w?\:\/\///o ;
$url =~ s/\%3A/:/gio ;
@@ -142,58 +143,12 @@
else
{ $mimecat = "other" ; }
+
+
if ($job_runs_on_production_server)
{
$country = "" ;
-
- #################################################
- # udp-filters has logic for gelocating and it's using
- # the x-forwarded-for field to geolocate first if that's
- # present but only the first ip inside that field.
- #
- # At the present moment there is not enough time to
- # check all the ips in XFF field but we can come back
- # at this later -- Stefan Petrea
- #
- # The reason for this is that now 2012-Dec-04 we have a high
- # priority for getting editor reports for November out
- # there.
- #################################################
-
- #@xffparts = split('%20',$xff) ;
- #foreach $ip (@xffparts)
- #{
- #next if $country ne "" ;
- #if ($ip =~ /^\d+\.\d+\.\d+\.\d+$/)
- #{
- #$country = $savedipcountry { $ip } ;
- #if ($country eq "")
- #{
- #$country = `echo $ip | /usr/local/bin/geoiplogtag 1` ;
- #$country =~ s/.*\s([\w-\(])/$1/ ;
- #$country =~ s/\s//g ;
- #$savedipcountry { $ip } = $country ;
- #}
- #$foundip = $ip ;
- #if ($country =~ /(^(--|-P|A1|A2|AB|BL|G2|GF|KO|MF|O1|TE|TK)$|null)/
) # Non-countries
- #{ $country = "" ; }
- #}
- #}
- if ($country eq "")
- {
- $country = $fields [14] ;
- $foundip = $client_ip ;
- }
- if (($country eq "") || ($country =~ /null/))
- {
- $country = "--" ;
- if ($foundip =~ /:/)
- {
- $country = "-P" ;
- }
- }
- if (&IsInternal($foundip))
- { $country = "-X" ; }
+ ($country) = $client_ip =~ /\|(..)$/;
}
else
{
diff --git a/squids/perl/SquidCountArchiveReadInput.pm
b/squids/perl/SquidCountArchiveReadInput.pm
index 6e8ad7d..d578444 100755
--- a/squids/perl/SquidCountArchiveReadInput.pm
+++ b/squids/perl/SquidCountArchiveReadInput.pm
@@ -182,12 +182,12 @@
else
{ open IN,"<$file_in" ; } # http://perldoc.perl.org/functions/open.html
- $fields_expected = 14 ;
+ $fields_expected = 13 ;
}
else
{
open IN, '<', $file_in ;
- $fields_expected = 14 ; # add fake country code
+ $fields_expected = 13 ; # add fake country code
# $fields_expected = 13 ;
}
@@ -220,7 +220,7 @@
# print "mime " . $fields[10] . "\n" ;
#next if $fields [9] eq '-' ;
#next if $fields [9] =~ /NONE/ ;
- if ($#fields > 14)
+ if ($#fields >= 13)
{
if (! $scan_ip_frequencies)
{
@@ -228,11 +228,7 @@
# print "fields " . $#fields . "\n$line\n" ;
}
- $country_code = $fields [$#fields] ;
- $fields [$#fields] = '' ;
- $line = join (' ', @fields) ;
@fields = split (' ', $line, 14) ;
- $fields [14] = $country_code ;
$fields [13] =~ s/ /%20/g ;
if (! $scan_ip_frequencies)
diff --git a/squids/t/06-regression-mismatch-world-north-south-unknown.t
b/squids/t/06-regression-mismatch-world-north-south-unknown.t
index 8f6b015..d8a3933 100644
--- a/squids/t/06-regression-mismatch-world-north-south-unknown.t
+++ b/squids/t/06-regression-mismatch-world-north-south-unknown.t
@@ -134,6 +134,10 @@
unknown = $unknown
";
+ok($world_total > 2, "world_total > 2");
+ok($global_south > 2, "globla_south > 2");
+ok($global_north > 2, "globla_north > 2");
+
warn "world_total - global_north - global_south - ipv6 - unknown =
".($world_total - $global_north - $global_south - $ipv6 - $unknown);
my $country_code_invariant_1 = $world_total - $global_north - $global_south -
$ipv6 - $unknown;
diff --git a/squids/t/09-regression-totals-fixes-squidreportclients.t
b/squids/t/09-regression-totals-fixes-squidreportclients.t
index 5d5fa95..7b533af 100644
--- a/squids/t/09-regression-totals-fixes-squidreportclients.t
+++ b/squids/t/09-regression-totals-fixes-squidreportclients.t
@@ -198,7 +198,7 @@
};
my $wikistats_run_cmd_output = `$wikistats_run_cmd`;
-warn $wikistats_run_cmd_output;
+#warn $wikistats_run_cmd_output;
use HTML::TreeBuilder::XPath;
diff --git a/squids/t/10-regression-mingle-356-bugzilla-46269.t
b/squids/t/10-regression-mingle-356-bugzilla-46269.t
new file mode 100644
index 0000000..2376510
--- /dev/null
+++ b/squids/t/10-regression-mingle-356-bugzilla-46269.t
@@ -0,0 +1,115 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More qw/no_plan/;
+require 't/CommonConfig.pm';
+use lib "testdata/regression-mingle-356-bugzilla-46269";
+use Data::Dumper;
+use SquidReportArchiveConfig;
+use Carp;
+use lib "./t";
+use Generate::Squid;
+use List::Util qw/sum/;
+
+our $__DATA_BASE;
+our $__CODE_BASE;
+
+my $o = Generate::Squid->new({
+ start_date => "2013-01-31" ,
+ prefix => "sampled-1000.tab.log-" ,
+ output_dir => "$__DATA_BASE",
+});
+
+my @ua1 = split(/\n/,`zcat $__DATA_BASE/ua.txt.gz`);
+my @ua = @ua1[0..10_000];
+
+$o->generate_line({ geocode=>"--" }) for 1..30;
+$o->__increase_day;
+$o->generate_line({ geocode=>"--" }) for 1..30;
+$o->dump_to_disk; # 2013-02-01
+
+
+$o->generate_line({ geocode=>"CA" }) for 1..100;
+$o->generate_line({ geocode=>"US" }) for 1..60;
+$o->__increase_day;
+
+$o->generate_line({ geocode=>"BR" , user_agent_header => $_ }) for @ua;
+$o->generate_line({ geocode=>"BE" }) for 1..30;
+$o->generate_line({ geocode=>"NL" }) for 1..30;
+$o->dump_to_disk; # 2013-02-02
+
+
+$o->generate_line({ geocode=>"NL" }) for 1..30;
+$o->__increase_day;
+$o->generate_line({ geocode=>"--" });
+$o->dump_to_disk; # 2013-02-03
+
+
+
+my $wikistats_run_cmd = qq{
+
+ cd $__DATA_BASE;
+ rm -f sampled-1000.*.gz;
+ ls sampled-1000.tab* | grep -v "\.gz" | xargs gzip;
+ cd $__CODE_BASE;
+
+ echo "FINISHED gzip";
+
+ rm -rf $__DATA_BASE/csv/;
+ rm -rf $__DATA_BASE/reports/;
+ rm -rf $__DATA_BASE/logs/;
+
+ echo "FINISHED cleaning";
+
+ mkdir $__DATA_BASE/csv/;
+ ln -s ../../../csv/meta $__DATA_BASE/csv/meta;
+
+ echo "FINISHED cleaning 2";
+
+ ########################
+ # Run Count Archive
+ ########################
+ nice perl \\
+ -I ./perl \\
+ perl/SquidCountArchive.pl \\
+ -d 2013/02/02-2013/02/02 \\
+ -r $__DATA_BASE/SquidCountArchiveConfig.pm \\
+ -p 2>&1;
+
+ echo "FINISHED counting";
+ ########################
+ # Make the reports
+ ########################
+ nice perl perl/SquidReportArchive.pl \\
+ -r $__DATA_BASE/SquidReportArchiveConfig.pm \\
+ -m 2013-02 \\
+ -p 2>&1;
+};
+
+my $wikistats_run_cmd_output = `$wikistats_run_cmd`;
+
+use HTML::TreeBuilder::XPath;
+my $p = HTML::TreeBuilder::XPath->new;
+$p->parse_file("$__DATA_BASE/reports/2013-02/SquidReportDevices.htm");
+# 3rd column
+my @percentages = map {
+ my @d = $_->descendants("/td");
+ my $val;
+ if(@d==8) {
+ if($d[0]->as_text) {
+ $val = $d[3]->as_text;
+ };
+ };
+ $val;
+ } $p->findnodes("/html/body/p/table/tr");
+
+@percentages = map { /(\d+\.\d+)/ } grep { defined } @percentages;
+
+
+my $sum = sum(@percentages);
+print "Sum=$sum\n";
+
+ok( @percentages > 2 , "More than 2 device classes");
+ok( abs( $sum - 100) < 0.2, "Percentages of device classes add up to 100%");
+
+
diff --git a/squids/t/CommonConfig.pm b/squids/t/CommonConfig.pm
index 224441d..367e5e9 100644
--- a/squids/t/CommonConfig.pm
+++ b/squids/t/CommonConfig.pm
@@ -12,18 +12,21 @@
#
my $hostname = `hostname`;
+my $pwd = `pwd`;
chomp $hostname;
our $__CODE_BASE;
if($hostname eq "gallium") {
# Running on Jenkins
- $__CODE_BASE = "/var/lib/jenkins/jobs/analytics-wikistats/workspace/squids";
+ $__CODE_BASE = $ENV{WORKSPACE}."/squids";
} elsif($hostname eq "stat1" && $ENV{HOME} eq "/home/ezachte") {
# Running on Erik's account on stat1
$__CODE_BASE = "/home/ezachte/wikistats/squids";
} elsif($hostname eq "stat1" && $ENV{HOME} eq "/home/diederik") {
# Running on Diederik's account on stat1
$__CODE_BASE = "/home/diederik/wikistats/squids";
+} elsif($pwd =~ /\/travis\//) {
+ $__CODE_BASE = "/home/travis/build/wsdookadr/analytics-wikistats/squids";
} else {
# Anywhere else
$__CODE_BASE = "/a/wikistats_git/squids";
diff --git a/squids/t/Generate/Squid.pm b/squids/t/Generate/Squid.pm
index 1bce9e8..eb5b80b 100644
--- a/squids/t/Generate/Squid.pm
+++ b/squids/t/Generate/Squid.pm
@@ -156,14 +156,37 @@
$ALL_COUNTRY_CODES[int(rand(@ALL_COUNTRY_CODES))];
};
-
sub dump_to_disk_and_increase_day {
my ($self) = @_;
+ $self->dump_to_disk;
+ $self->__increase_day;
+};
+sub get_date_current {
+ my ($self) = @_;
my $tp_object = $self->__parse_self_current_datetime;
my $filename_date = $tp_object->ymd;
$filename_date =~ s/-//g;
+ return $filename_date;
+};
+sub get_date_previous {
+ my ($self) = @_;
+ my $tp_object = $self->__parse_self_current_datetime;
+ $tp_object-=86_400;
+ my $filename_date = $tp_object->ymd;
+ $filename_date =~ s/-//g;
+ return $filename_date;
+};
+
+sub dump_to_disk_previous {
+ my ($self) = @_;
+ my $filename_date = $self->get_date_previous;
+ $self->_write_to_disk($filename_date);
+}
+
+sub _write_to_disk {
+ my ($self,$filename_date) = @_;
my $output_dir = $self->{output_dir};
my $log_filename = $self->{prefix}.$filename_date;
my $log_fullpath = "$output_dir/$log_filename";
@@ -175,7 +198,12 @@
# reset buffer and close filehandle
close($log_fh);
$self->{buffer} = "";
- $self->__increase_day;
+}
+
+sub dump_to_disk {
+ my ($self) = @_;
+ my $filename_date = $self->get_date_current;
+ $self->_write_to_disk($filename_date);
};
sub __mobile_url_country {
"http://$_[0].m.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince_(film)"
};
@@ -224,7 +252,7 @@
referer_header =>
"http://en.wikipedia.org/wiki/Phil_of_the_Future",
x_forwarded_for_header => "-",
user_agent_header =>
"Mozilla/5.0%20(Windows%20NT%206.1;%20WOW64;%20rv:15.0)%20Gecko/20100101%20Firefox/15.0.1",
- geocode => "US",
+ geocode => "--",
};
my $field_client_ip;
@@ -257,7 +285,11 @@
my $field_referer_header = $params->{referer_header} //
$default->{referer_header};
my $field_x_forwarded_for_header = $params->{x_forwarded_for_header} //
$default->{x_forwarded_for_header};
my $field_user_agent_header = $params->{user_agent_header} //
$default->{user_agent_header};
- my $field_geocode = $params->{geocode} //
$default->{geocode};
+
+ my $geocode = $params->{geocode} //
$default->{geocode};
+
+ $field_client_ip .= "|$geocode";
+
# Currently wikistats accepts 15-field entries, geocode field included (will
have to talk to Erik about fields
# 15 and 16 in the description above because we seem to be dropping those at
the 15-field-count-check)
@@ -276,7 +308,6 @@
$field_referer_header ,
$field_x_forwarded_for_header,
$field_user_agent_header ,
- $field_geocode ,
);
$self->{buffer} .= "$raw_logline\n";
diff --git
a/squids/testdata/regression-mingle-356-bugzilla-46269/SquidCountArchiveConfig.pm
b/squids/testdata/regression-mingle-356-bugzilla-46269/SquidCountArchiveConfig.pm
new file mode 100644
index 0000000..71d56da
--- /dev/null
+++
b/squids/testdata/regression-mingle-356-bugzilla-46269/SquidCountArchiveConfig.pm
@@ -0,0 +1,16 @@
+#!/usr/bin/perl
+require 't/CommonConfig.pm';
+
+$__DATA_BASE =
"$__CODE_BASE/testdata/regression-mingle-356-bugzilla-46269";
+#no tracing
+$trace_on_exit = $false;
+$trace_on_exit_verbose = $false;
+$trace_on_exit_concise = $false;
+
+$cfg_liblocation = "$__CODE_BASE/perl" ;
+$squids = "$__CODE_BASE" ;
+
+$cfg_path_root_production = "$__DATA_BASE/csv" ;
+$cfg_dir_in_production = "$__DATA_BASE" ;
+
+$cfg_logname = "sampled-1000.tab.log" ;
diff --git
a/squids/testdata/regression-mingle-356-bugzilla-46269/SquidReportArchiveConfig.pm
b/squids/testdata/regression-mingle-356-bugzilla-46269/SquidReportArchiveConfig.pm
new file mode 100644
index 0000000..24c2494
--- /dev/null
+++
b/squids/testdata/regression-mingle-356-bugzilla-46269/SquidReportArchiveConfig.pm
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+require 't/CommonConfig.pm';
+
+$__DATA_BASE =
"$__CODE_BASE/testdata/regression-mingle-356-bugzilla-46269";
+#no tracing
+$trace_on_exit = $false;
+$trace_on_exit_verbose = $false;
+$trace_on_exit_concise = $false;
+
+# Code configuration
+$cfg_liblocation = "$__CODE_BASE/perl" ;
+$squids = "$__CODE_BASE" ;
+
+
+# Data configuration
+
+$cfg_path_csv = "$__DATA_BASE/csv" ;
+$cfg_path_reports = "$__DATA_BASE/reports" ;
+$cfg_path_log = "$__DATA_BASE/logs" ;
+
+$cfg_default_argv = "-m 2011-08" ; # monthly report
diff --git a/squids/testdata/regression-mingle-356-bugzilla-46269/ua.txt.gz
b/squids/testdata/regression-mingle-356-bugzilla-46269/ua.txt.gz
new file mode 100644
index 0000000..f220f76
--- /dev/null
+++ b/squids/testdata/regression-mingle-356-bugzilla-46269/ua.txt.gz
Binary files differ
--
To view, visit https://gerrit.wikimedia.org/r/62206
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I5c74df172e1f3505cfd72aaaab068d7d9a4476ce
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikistats
Gerrit-Branch: master
Gerrit-Owner: Stefan.petrea <[email protected]>
Gerrit-Reviewer: Diederik <[email protected]>
Gerrit-Reviewer: Erik Zachte <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits