http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88649
Revision: 88649
Author: ezachte
Date: 2011-05-23 16:50:38 +0000 (Mon, 23 May 2011)
Log Message:
-----------
perl script ready for integration test + added test file
Modified Paths:
--------------
trunk/wikistats/analytics/MySQLPrepComscoreData.pl
Added Paths:
-----------
trunk/wikistats/analytics/create_and_use_db_analytics.txt
Modified: trunk/wikistats/analytics/MySQLPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/MySQLPrepComscoreData.pl 2011-05-23 16:37:56 UTC
(rev 88648)
+++ trunk/wikistats/analytics/MySQLPrepComscoreData.pl 2011-05-23 16:50:38 UTC
(rev 88649)
@@ -16,6 +16,7 @@
# Functionality:
# comScore data can be downloaded as csv file, which each contain 14 months
history
# This script uses these files to update 'master' csv files which contain all
known history
+# Note: only entities which are already in master file will be updated!
# Then it merges these master files into one csv file which can be loaded into
analytics database
# Data are: reach by region, unique visitors by region, unique visitors by web
property
@@ -26,6 +27,8 @@
# Output:
# updated master csv files + merged and formatted csv for import in MySQL
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
use Getopt::Std ;
use Cwd;
@@ -38,7 +41,7 @@
$script_name = "MySQLPrepComscoreData.pl" ;
$script_version = "0.3" ;
-# test
+# EZ test only
# $source = "comscore" ;
# $server = "ez_test" ;
# $generated = "2011-05-06 00:00:00" ;
@@ -60,6 +63,10 @@
$file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
$file_comscore_uv_property_update = "*UV*trend*csv" ;
+ $layout_csv_reach = 1 ;
+ $layout_csv_regions = 2 ;
+ $layout_csv_properties = 3 ;
+
print "Directories:\nAnalytics '$dir_analytics'\nUpdates
'$dir_comscore_updates'\n\n" ;
%region_codes = (
@@ -69,6 +76,7 @@
"World-Wide" => "W",
"Middle East - Africa" => "MA",
"Asia Pacific"=> "AS",
+ "United States" => "US",
"India" => "I",
"China" => "C"
) ;
@@ -78,16 +86,14 @@
@months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
- # &ReadDataReachPerRegion ($file_comscore_reach_master,
$file_comscore_reach_update, "%.1f") ;
- # %reach_region_code = %data ;
+ &ReadDataReachPerRegion ($file_comscore_reach_master,
$file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
+ %reach_region_code = %data ;
- &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master,
$file_comscore_uv_region_update, "%.0f") ;
+ &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master,
$file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
%visitors_region_code = %data ;
- exit ;
- &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master,
$file_comscore_uv_property_update, "%.0f") ;
+ &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master,
$file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
%visitors_web_property = %data ;
- exit ;
&WriteDataAnalytics ;
@@ -96,8 +102,14 @@
sub UpdateFromLatestComscoreData
{
- my ($file_comscore_master, $file_comscore_updates) = @_ ;
+ my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv,
@update_only) = @_ ;
+ undef %update_only ;
+ undef %do_not_update ;
+
+ foreach $id (@update_only)
+ { $update_only {$id} = $true ; }
+
if (! -e "$dir_analytics/$file_comscore_master")
{ abort ("File $file_comscore_master not found!") ; }
@@ -140,46 +152,85 @@
while ($line = <CSV>)
{
chomp $line ;
- if ($line =~ /^Location.*?-.*?-.*?-/) # e.g.
'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
+ $line = &GetNumberOnly ($line) ;
+
+ if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g.
'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
{
- ($dummy1,$dummy2,@months) = split (',', $line) ;
+ if ($layout_csv == $layout_csv_properties)
+ { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web
properties csv file
+ else
+ { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv /
reach csv files
+
@months = &mmm_yyyy2yyyy_mm (@months) ;
}
- if ($line =~ /^\d,/)
+
+ if ($line =~ /^\d+,/)
{
- ($index,$region,@data) = split (',', $line) ;
- $region =~ s/^\s+// ;
- $region =~ s/\s+$// ;
- $region_code = $region_codes {$region} ;
+ if ($layout_csv == $layout_csv_properties)
+ {
+ ($index,$dummy,$property,@data) = split (',', $line) ;
+ $property =~ s/^\s+// ;
+ $property =~ s/\s+$// ;
+ $property =~ s/.*Google.*/Google/i ;
+ $property =~ s/.*Microsoft.*/Microsoft/i ;
+ $property =~ s/.*FACEBOOK.*/Facebook/i ;
+ $property =~ s/.*Yahoo.*/Yahoo/i ;
+ $property =~ s/.*Amazon.*/Amazon/i ;
+ $property =~ s/.*Apple.*/Apple/i ;
+ $property =~ s/.*AOL.*/AOL/i ;
+ $property =~ s/.*Wikimedia.*/Wikimedia/i ;
+ $property =~ s/.*Tencent.*/Tencent/i ;
+ $property =~ s/.*Baidu.*/Baidu/i ;
+ $property =~ s/.*CBS.*/CBS/i ;
+
+ $id = $property ;
+ }
+ else
+ {
+ ($index,$region,@data) = split (',', $line) ;
+ $region =~ s/^\s+// ;
+ $region =~ s/\s+$// ;
+ $id = $region_codes {$region} ;
+ }
+
+ if ($update_only {$id} == 0)
+ {
+ $do_not_update {$id}++ ;
+ next ;
+ }
+
for ($m = 0 ; $m <= $#months ; $m++)
{
$yyyymm = $months [$m] ;
$months {$yyyymm} ++ ;
- $yyyymm_region_code = "$yyyymm,$region_code" ;
- $data = $data [$m] ;
+ $yyyymm_id = "$yyyymm,$id" ;
+ $data = $data [$m] * $multiplier ;
- if (! defined $data {$yyyymm_region_code})
+ if (! defined $data {$yyyymm_id})
{
$updates_found = $true ;
- print "New data found: $yyyymm_region_code = $data\n" ;
- $data {$yyyymm_region_code} = $data ;
+ print "New data found: $yyyymm_id = $data\n" ;
+ $data {$yyyymm_id} = $data ;
}
}
}
}
+ $ignored = join ', ', sort keys %do_not_update ;
+ print "\nEntities ignored:\n$ignored\n\n" ;
+
if (! $updates_found)
- { print "No new updates found.\n" ; }
+ { print "No new updates found\n" ; }
else
- { print "\nUpdates found, rewrite master file '$file_comscore_master'.\n\n"
; }
+ { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ;
}
return ($updates_found) ;
}
sub ReadDataReachPerRegion
{
- my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+ my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv)
= @_ ;
undef %months ;
undef %data ;
@@ -214,7 +265,7 @@
}
close IN ;
- my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates, 1, $layout_csv, @regions) ;
return if ! $updates_found ;
rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
@@ -241,7 +292,7 @@
sub ReadDataVisitorsPerRegion
{
- my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+ my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier,
$layout_csv) = @_ ;
undef %months ;
undef %data ;
@@ -264,20 +315,24 @@
$field_ndx = 0 ;
foreach (@data)
{
- $region = $regions [$field_ndx++] ;
+ $region = $regions [$field_ndx] ;
$region_code = $region_codes {$region} ;
$data = $data [$field_ndx] ;
if ($data eq '')
{ $data = '0' ; }
+ # print "Old data $yyyymm,$region = $data\n" ;
+
$months {$yyyymm} ++ ;
$data {"$yyyymm,$region_code"} = $data ;
+
+ $field_ndx++ ;
}
}
close IN ;
- my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates, 1000, $layout_csv, @regions) ;
return if ! $updates_found ;
rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
@@ -304,10 +359,11 @@
sub ReadDataVisitorsPerProperty
{
- my ($file_comscore_master, $file_comscore_updates, $precision) = @_ ;
+ my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier,
$layout_csv) = @_ ;
undef %months ;
undef %data ;
+ undef @properties ;
open IN, '<', "$dir_analytics/$file_comscore_master" ;
@@ -317,8 +373,6 @@
{
chomp $line ;
- $line = &GetNumberOnly ($line) ;
-
($yyyymm,@data) = split (',', $line) ;
if ($lines++ == 0)
{ @properties = @data ; next ; }
@@ -326,36 +380,40 @@
$field_ndx = 0 ;
foreach (@data)
{
- $property = $properties [$field_ndx++] ;
-
+ $property = $properties [$field_ndx] ;
+ $property =~ s/.*Yahoo.*/Yahoo/ ;
$data = $data [$field_ndx] ;
if ($data eq '')
{ $data = '0' ; }
+ # print "Old data $yyyymm,$property = $data\n" ;
+
$months {$yyyymm} ++ ;
$data {"$yyyymm,$property"} = $data ;
+
+ $field_ndx++ ;
}
}
close IN ;
- my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates) ;
+ my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master,
$file_comscore_updates, 1000, $layout_csv, @properties) ;
return if ! $updates_found ;
rename "$dir_analytics/$file_comscore_master",
"$dir_analytics/$file_comscore_master.~" ;
open OUT, '>', "$dir_analytics/$file_comscore_master" ;
$line_out = "yyyymm" ;
- foreach $region_name (@regions)
- { $line_out .= ",$region_name" ; }
+ foreach $property (@properties)
+ { $line_out .= ",$property" ; }
print OUT "$line_out" ;
foreach $yyyymm (sort {$b cmp $a} keys %months)
{
$line_out = "\n$yyyymm" ;
- foreach $region_name (@regions)
+ foreach $property (@properties)
{
- $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
- $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+ $yyyymm_property = "$yyyymm,$property" ;
+ $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
}
print OUT "$line_out" ;
}
@@ -367,7 +425,7 @@
{
open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
- $metric = 'unique_visitors' ;
+ $metric = 'unique_visitors' ;
foreach $yyyymm (sort keys %months)
{
# store meta data elsewhere
@@ -400,7 +458,7 @@
$line =
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
print OUT $line ;
- print $line ;
+ # print $line ;
}
}
}
Added: trunk/wikistats/analytics/create_and_use_db_analytics.txt
===================================================================
--- trunk/wikistats/analytics/create_and_use_db_analytics.txt
(rev 0)
+++ trunk/wikistats/analytics/create_and_use_db_analytics.txt 2011-05-23
16:50:38 UTC (rev 88649)
@@ -0,0 +1,77 @@
+-- invoke this file with "mysql --user=root --password=[...] <
create_and_use_db_analytics.txt"
+
+/* Create database and two tables from scratch */
+
+DROP DATABASE IF EXISTS `analytics` ;
+CREATE DATABASE `analytics` ;
+USE `analytics` ;
+
+CREATE TABLE `comscore` (
+-- meta data (mostly for auditing, may not be sent over API on default)
+/*
+ store meta dat elsewhere,tbd
+ `id` int auto_increment NOT NULL,
+ `generated` timestamp,
+ `source` char (20),
+ `server` char (20),
+ `script_name` char (30),
+ `script_version` char (8),
+ `user` char (20),
+*/
+-- analytics data
+ `yyyymm` char (7),
+ `country_code` char (3),
+ `region_code` char (2),
+ `property` char (20),
+ `project` char (10),
+ `reach` decimal (4,1) DEFAULT '-1',
+ `visitors` decimal (15) DEFAULT '-1',
+ PRIMARY KEY (yyyymm,country_code,region_code,property)
+) ;
+
+CREATE TABLE `comscore_regions` (
+ `region_code` char (2),
+ `language_code` char (10),
+ `region_name` char (18),
+ PRIMARY KEY (language_code,region_code)
+) ;
+
+-- SHOW TABLES ;
+-- DESCRIBE comscore ;
+-- DESCRIBE comscore_regions ;
+
+/* Database Manipulation */
+/* Obviously in real world this is a separate script */
+
+
+-- show contents (debugging only)
+-- SELECT *
+-- FROM comscore_regions ;
+
+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
+ INTO TABLE comscore_regions
+ FIELDS TERMINATED BY ','
+ OPTIONALLY ENCLOSED BY '"'
+ (language_code,region_code,region_name) ;
+
+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore.csv'
+ INTO TABLE comscore
+ FIELDS TERMINATED BY ','
+ OPTIONALLY ENCLOSED BY '"'
+ (yyyymm,country_code,region_code,property,project,reach,visitors) ;
+
+-- show contents (debugging only)
+ SELECT *
+ FROM comscore
+ ORDER BY yyyymm,country_code,region_code,property,project
+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_full_table.csv'
+ FIELDS TERMINATED BY ',' ;
+
+-- export all relevant non-meta data from comsCore's reach by region (and
comScore treats India and China as regions in this context)
+SELECT yyyymm,region_name,reach
+ FROM comscore LEFT JOIN comscore_regions ON comscore.region_code =
comscore_regions.region_code AND comscore_regions.language_code = 'en'
+ WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
+ ORDER BY yyyymm,region_name
+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv'
+ FIELDS TERMINATED BY ',' ;
+
Property changes on: trunk/wikistats/analytics/create_and_use_db_analytics.txt
___________________________________________________________________
Added: svn:eol-style
+ native
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs