http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88643
Revision: 88643
Author: ezachte
Date: 2011-05-23 15:00:50 +0000 (Mon, 23 May 2011)
Log Message:
-----------
scripts and data files related to new 'analytics' MySQL database, which will
serve high level aggregated WMF metrics through an API
Added Paths:
-----------
trunk/wikistats/analytics/
trunk/wikistats/analytics/MySQLPrepComscoreData.pl
Added: trunk/wikistats/analytics/MySQLPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/MySQLPrepComscoreData.pl
(rev 0)
+++ trunk/wikistats/analytics/MySQLPrepComscoreData.pl 2011-05-23 15:00:50 UTC
(rev 88643)
@@ -0,0 +1,438 @@
+#!/usr/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email [email protected]
+
+# Functionality:
+# comScore data can be downloaded as csv file, which each contain 14 months
history
+# This script uses these files to update 'master' csv files which contain all
known history
+# Then it merges these master files into one csv file which can be loaded into
analytics database
+# Data are: reach by region, unique visitors by region, unique visitors by web
property
+
+# Parameters:
+# -m (required) folder with 'master' csv files (files with all known history)
+# -u (required) folder with 'update' csv files (files with lastest 14 months
history, produced by comScore)
+
+# Output:
+# updated master csv files + merged and formatted csv for import in MySQL
+
+ use Getopt::Std ;
+ use Cwd;
+
+ my $options ;
+ getopt ("mu", \%options) ;
+
+ $true = 1 ;
+ $false = 0 ;
+
+ $script_name = "MySQLPrepComscoreData.pl" ;
+ $script_version = "0.3" ;
+
+# test
+# $source = "comscore" ;
+# $server = "ez_test" ;
+# $generated = "2011-05-06 00:00:00" ;
+# $user = "ezachte" ;
+
+ $dir_analytics = $options {"m"} ;
+ $dir_comscore_updates = $options {"u"} ;
+
+ $dir_analytics = "c:/MySQL/analytics" ; # EZ test only
+ $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
+
+ if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
+ { abort ("Specify folder for 'master' csv files as '-m folder', folder for
'update' csv files as -u folder'") ; }
+
+ $file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ;
+ $file_comscore_reach_update = "*reach*by*region*csv" ;
+ $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ;
+ $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
+ $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
+ $file_comscore_uv_property_update = "*UV*trend*csv" ;
+
+ print "Directories:\nAnalytics '$dir_analytics'\nUpdates
'$dir_comscore_updates'\n\n" ;
+
+ %region_codes = (
+ "Europe"=>"EU",
+ "North America"=>"NA",
+ "Latin America"=>"LA",
+ "World-Wide" => "W",
+ "Middle East - Africa" => "MA",
+ "Asia Pacific"=> "AS",
+ "India" => "I",
+ "China" => "C"
+ ) ;
+
+ foreach $region_name (keys %region_codes)
+ { $region_names {$region_codes {$region_name}} = $region_name ; }
+
+ @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
+
+ # &ReadDataReachPerRegion ($file_comscore_reach_master,
$file_comscore_reach_update, "%.1f") ;
+ # %reach_region_code = %data ;
+
+ &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master,
$file_comscore_uv_region_update, "%.0f") ;
+ %visitors_region_code = %data ;
+
+ exit ;
+ &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master,
$file_comscore_uv_property_update, "%.0f") ;
+ %visitors_web_property = %data ;
+ exit ;
+
+ &WriteDataAnalytics ;
+
+ print "\nReady\n\n" ;
+ exit ;
+
+sub UpdateFromLatestComscoreData
+{
+ my ($file_comscore_master, $file_comscore_updates) = @_ ;
+
+ if (! -e "$dir_analytics/$file_comscore_master")
+ { abort ("File $file_comscore_master not found!") ; }
+
+ $age_all = -M "$dir_analytics/$file_comscore_master" ;
+ print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . "
days old: '$file_comscore_master'\n" ;
+
+ my $cwd = getcwd ;
+ chdir $dir_comscore_updates ;
+
+ @files = glob($file_comscore_updates) ;
+ $min_age_upd = 999999 ;
+ $file_comscore_updates_latest = '' ;
+ foreach $file (@files)
+ {
+ $age = -M $file ;
+ if ($age < $min_age_upd)
+ {
+ $min_age_upd = $age ;
+ $file_comscore_updates_latest = $file ;
+ }
+ }
+ print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . "
days old: '$file_comscore_updates_latest'\n" ;
+
+ if ($min_age_upd == 999999)
+ {
+ print "No valid update file found. Nothing to update." ;
+ return ;
+ }
+
+ if ($age_all > $min_age_upd)
+ {
+ print "File with master data more recent than latest update csv from
comScore. Nothing to update." ;
+ return ;
+ }
+
+ my $updates_found = $false ;
+
+ print "\nRead updates\n\n" ;
+ open CSV, '<', $file_comscore_updates_latest ;
+ while ($line = <CSV>)
+ {
+ chomp $line ;
+ if ($line =~ /^Location.*?-.*?-.*?-/) # e.g.
'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
+ {
+ ($dummy1,$dummy2,@months) = split (',', $line) ;
+ @months = &mmm_yyyy2yyyy_mm (@months) ;
+ }
+ if ($line =~ /^\d,/)
+ {
+ ($index,$region,@data) = split (',', $line) ;
+ $region =~ s/^\s+// ;
+ $region =~ s/\s+$// ;
+ $region_code = $region_codes {$region} ;
+
+ for ($m = 0 ; $m <= $#months ; $m++)
+ {
+ $yyyymm = $months [$m] ;
+ $months {$yyyymm} ++ ;
+ $yyyymm_region_code = "$yyyymm,$region_code" ;
+ $data = $data [$m] ;
+
+ if (! defined $data {$yyyymm_region_code})
+ {
+ $updates_found = $true ;
+ print "New data found: $yyyymm_region_code = $data\n" ;
+ $data {$yyyymm_region_code} = $data ;
+ }
+ }
+ }
+ }
+
+ if (! $updates_found)
+ { print "No new updates found.\n" ; }
+ else
+ { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n"
; }
+
+ return ($updates_found) ;
+}
+
+sub ReadDataReachPerRegion
+{
+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+
+ undef %months ;
+ undef %data ;
+ undef @regions ;
+
+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
+
+ $lines = 0 ;
+ while ($line = <IN>)
+ {
+ chomp $line ;
+
+ ($yyyymm,@data) = split (',', $line) ;
+
+ if ($lines++ == 0)
+ { @regions = @data ; next ; }
+
+ $field_ndx = 0 ;
+ foreach (@data)
+ {
+ $region = $regions [$field_ndx] ;
+ $region_code = $region_codes {$region} ;
+
+ $data = $data [$field_ndx] ;
+ if ($data eq '')
+ { $data = '0' ; }
+ $months {$yyyymm} ++ ;
+ $data {"$yyyymm,$region_code"} = $data ;
+ # print "Old data $yyyymm,$region_code = $data\n" ;
+ $field_ndx++ ;
+ }
+ }
+ close IN ;
+
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ return if ! $updates_found ;
+
+ rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
+
+ $line_out = "yyyymm" ;
+ foreach $region_name (@regions)
+ { $line_out .= ",$region_name" ; }
+ print OUT "$line_out" ;
+
+ foreach $yyyymm (sort {$b cmp $a} keys %months)
+ {
+ $line_out = "\n$yyyymm" ;
+ foreach $region_name (@regions)
+ {
+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+ }
+ print OUT "$line_out" ;
+ }
+
+ close OUT ;
+}
+
+sub ReadDataVisitorsPerRegion
+{
+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+
+ undef %months ;
+ undef %data ;
+ undef @regions ;
+
+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
+
+ $lines = 0 ;
+ $metric = 'unique_visitors' ;
+ while ($line = <IN>)
+ {
+ chomp $line ;
+ $line = &GetNumberOnly ($line) ;
+
+ ($yyyymm,@data) = split (',', $line) ;
+
+ if ($lines++ == 0)
+ { @regions = @data ; next ; }
+
+ $field_ndx = 0 ;
+ foreach (@data)
+ {
+ $region = $regions [$field_ndx++] ;
+ $region_code = $region_codes {$region} ;
+
+ $data = $data [$field_ndx] ;
+ if ($data eq '')
+ { $data = '0' ; }
+
+ $months {$yyyymm} ++ ;
+ $data {"$yyyymm,$region_code"} = $data ;
+ }
+ }
+ close IN ;
+
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ return if ! $updates_found ;
+
+ rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
+
+ $line_out = "yyyymm" ;
+ foreach $region_name (@regions)
+ { $line_out .= ",$region_name" ; }
+ print OUT "$line_out" ;
+
+ foreach $yyyymm (sort {$b cmp $a} keys %months)
+ {
+ $line_out = "\n$yyyymm" ;
+ foreach $region_name (@regions)
+ {
+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+ }
+ print OUT "$line_out" ;
+ }
+
+ close OUT ;
+}
+
+sub ReadDataVisitorsPerProperty
+{
+ my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+
+ undef %months ;
+ undef %data ;
+
+ open IN, '<', "$dir_analytics/$file_comscore_master" ;
+
+ $lines = 0 ;
+ $metric = 'unique_visitors' ;
+ while ($line = <IN>)
+ {
+ chomp $line ;
+
+ $line = &GetNumberOnly ($line) ;
+
+ ($yyyymm,@data) = split (',', $line) ;
+ if ($lines++ == 0)
+ { @properties = @data ; next ; }
+
+ $field_ndx = 0 ;
+ foreach (@data)
+ {
+ $property = $properties [$field_ndx++] ;
+
+ $data = $data [$field_ndx] ;
+ if ($data eq '')
+ { $data = '0' ; }
+
+ $months {$yyyymm} ++ ;
+ $data {"$yyyymm,$property"} = $data ;
+ }
+ }
+ close IN ;
+
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ return if ! $updates_found ;
+
+ rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
+ open OUT, '>', "$dir_analytics/$file_comscore_master" ;
+
+ $line_out = "yyyymm" ;
+ foreach $region_name (@regions)
+ { $line_out .= ",$region_name" ; }
+ print OUT "$line_out" ;
+
+ foreach $yyyymm (sort {$b cmp $a} keys %months)
+ {
+ $line_out = "\n$yyyymm" ;
+ foreach $region_name (@regions)
+ {
+ $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+ }
+ print OUT "$line_out" ;
+ }
+
+ close OUT ;
+}
+
+sub WriteDataAnalytics
+{
+ open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
+
+ $metric = 'unique_visitors' ;
+ foreach $yyyymm (sort keys %months)
+ {
+ # store meta data elsewhere
+ # $line =
"$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n"
;
+ foreach $region_code (sort values %region_codes)
+ {
+ $country_code = '-' ;
+ $property = '-' ;
+ $project = '-' ;
+ $reach = $reach_region_code {"$yyyymm,$region_code"} ;
+ $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
+
+ if (! defined $reach) { $reach = -1 ; }
+ if (! defined $visitors) { $reach = -1 ; }
+
+ $line =
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+ print OUT $line ;
+ print $line ;
+ }
+
+ foreach $property (sort @properties)
+ {
+ $country_code = '-' ;
+ $region_code = '-' ;
+ $project = '-' ;
+ $reach = '-1' ;
+ $visitors = $visitors_web_property {"$yyyymm,$property"} ;
+
+ next if ! defined $visitors ;
+
+ $line =
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+ print OUT $line ;
+ print $line ;
+ }
+ }
+}
+
+sub GetNumberOnly
+{
+ my $line = shift ;
+ $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove
comma's inside double quotes
+ $line =~ s/"//g ;
+ return $line ;
+}
+
+sub mmm_yyyy2yyyy_mm
+{
+ my @months = @_ ;
+ # Jan -> 01, etc
+ foreach my $month (@months)
+ {
+ my ($mmm,$yyyy) = split ('-', $month) ;
+ for ($m = 0 ; $m <= $#months_short ; $m++)
+ {
+ if ($mmm eq $months_short [$m])
+ { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; }
+ }
+ }
+ return @months ;
+}
+
+sub abort
+{
+ $msg = shift ;
+
+ print "\nAbort, reason: $msg\n\n" ;
+ exit ;
+}
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs