http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88716

Revision: 88716
Author:   ezachte
Date:     2011-05-24 11:53:02 +0000 (Tue, 24 May 2011)
Log Message:
-----------
rename MySQLPrepComscoreData.pl to AnalyticsPrepComscoreData.pl (storage 
solution can change later)

Added Paths:
-----------
    trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl

Removed Paths:
-------------
    trunk/wikistats/analytics/MySQLPrepComscoreData.pl

Copied: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl (from rev 88649, 
trunk/wikistats/analytics/MySQLPrepComscoreData.pl)
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl                      
        (rev 0)
+++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-05-24 
11:53:02 UTC (rev 88716)
@@ -0,0 +1,496 @@
+#!/usr/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email [email protected]
+
+# Functionality:
+# comScore data can be downloaded as csv files, each of which contains 14 months of history
+# This script uses these files to update 'master' csv files which contain all 
known history
+# Note: only entities which are already in master file will be updated!
+# Then it merges these master files into one csv file which can be loaded into 
analytics database
+# Data are: reach by region, unique visitors by region, unique visitors by web 
property
+
+# Parameters:
+# -m (required) folder with 'master' csv files (files with all known history)
+# -u (required) folder with 'update' csv files (files with latest 14 months history, produced by comScore)
+
+# Output:
+# updated master csv files + merged and formatted csv for import in MySQL
+
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
+  use Getopt::Std ;
+  use Cwd;
+
+  # Command line options: -m <folder with master csv files>, -u <folder with update csv files>
+  my %options ;
+  getopt ("mu", \%options) ;
+
+  $true  = 1 ;
+  $false = 0 ;
+
+  $script_name    = "AnalyticsPrepComscoreData.pl" ;
+  $script_version = "0.3" ;
+
+# EZ test only
+# $source       = "comscore" ;
+# $server       = "ez_test" ;
+# $generated    = "2011-05-06 00:00:00" ;
+# $user         = "ezachte" ;
+
+  $dir_analytics        = $options {"m"} ;
+  $dir_comscore_updates = $options {"u"} ;
+
+# EZ test only: uncomment to bypass the command line (left active, these lines override -m and -u)
+# $dir_analytics        = "c:/MySQL/analytics" ;
+# $dir_comscore_updates = "W:/@ Report Card/Data" ;
+
+  # Both folders are required; an omitted option leaves the variable undefined
+  if ((! defined $dir_analytics)        || ($dir_analytics        eq '') ||
+      (! defined $dir_comscore_updates) || ($dir_comscore_updates eq ''))
+  { abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as '-u folder'") ; }
+
+  # Master files hold all known history; update patterns match the csv files downloaded from comScore
+  $file_comscore_reach_master       = "excel_out_comscore_reach_regions.csv" ;
+  $file_comscore_reach_update       = "*reach*by*region*csv" ;
+  $file_comscore_uv_region_master   = "excel_out_comscore_UV_regions.csv" ;
+  $file_comscore_uv_region_update   = "*UVs*by*region*csv" ;
+  $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
+  $file_comscore_uv_property_update = "*UV*trend*csv" ;
+
+  # Identifiers for the three csv layouts (the properties layout has one extra leading column)
+  $layout_csv_reach      = 1 ;
+  $layout_csv_regions    = 2 ;
+  $layout_csv_properties = 3 ;
+
+  print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ;
+
+  # Region name as used in the csv files => short region code used in the output
+  %region_codes = (
+    "Europe"               => "EU",
+    "North America"        => "NA",
+    "Latin America"        => "LA",
+    "World-Wide"           => "W",
+    "Middle East - Africa" => "MA",
+    "Asia Pacific"         => "AS",
+    "United States"        => "US",
+    "India"                => "I",
+    "China"                => "C"
+  ) ;
+
+  # Reverse lookup: region code => region name
+  foreach my $region_name (keys %region_codes)
+  { $region_names {$region_codes {$region_name}} = $region_name ; }
+
+  @months_short = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) ;
+
+  # Each Read.. sub leaves its result in the global %data, which is copied before the next call resets it
+  &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
+  %reach_region_code = %data ;
+
+  &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
+  %visitors_region_code = %data ;
+
+  &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
+  %visitors_web_property = %data ;
+
+  &WriteDataAnalytics ;
+
+  print "\nReady\n\n" ;
+  exit ;
+
+sub UpdateFromLatestComscoreData
+{
+  # Merge the most recent comScore 'update' csv file into the global tables
+  # %data (key "yyyy-mm,id") and %months, which the caller filled from the master csv file.
+  # Only values that are not yet present in %data are added.
+  #
+  # Parameters:
+  #   $file_comscore_master  - master csv file name, relative to $dir_analytics
+  #   $file_comscore_updates - glob pattern for update csv files inside $dir_comscore_updates
+  #   $multiplier            - factor applied to each value read from the update file
+  #   $layout_csv            - $layout_csv_properties selects the layout with one extra leading column
+  #   @update_only           - entities present in the master file (region names or codes, or property names);
+  #                            rows for any other entity are skipped and reported as ignored
+  #
+  # Returns: true when at least one new value was stored, false otherwise.
+  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ;
+
+  undef %update_only ;
+  undef %do_not_update ;
+
+  foreach my $entity (@update_only)
+  {
+    $update_only {$entity} = $true ;
+
+    # Region callers pass the region names from the master header, whereas update rows
+    # are looked up by region code below: register the code as well
+    if (defined $region_codes {$entity})
+    { $update_only {$region_codes {$entity}} = $true ; }
+  }
+
+  if (! -e "$dir_analytics/$file_comscore_master")
+  { abort ("File $file_comscore_master not found!") ; }
+
+  my $age_all = -M "$dir_analytics/$file_comscore_master" ;
+  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
+
+  # The pattern is matched from inside the updates folder
+  # NOTE(review): presumably because glob splits its argument on whitespace and the folder name may contain spaces
+  my $cwd = getcwd ;
+  chdir $dir_comscore_updates or abort ("Folder '$dir_comscore_updates' not accessible: $!") ;
+
+  my $min_age_upd = 999999 ;
+  my $file_comscore_updates_latest = '' ;
+  foreach my $file (glob ($file_comscore_updates))
+  {
+    my $age = -M $file ;
+    if ($age < $min_age_upd)
+    {
+      $min_age_upd = $age ;
+      $file_comscore_updates_latest = $file ;
+    }
+  }
+
+  # Step back, so that relative folders keep working for the callers and for later invocations
+  chdir $cwd or abort ("Folder '$cwd' not accessible: $!") ;
+
+  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;
+
+  if ($min_age_upd == 999999)
+  {
+    print "No valid update file found. Nothing to update." ;
+    return $false ;
+  }
+
+  # -M yields the age in days: the smaller age belongs to the more recently modified file
+  if ($age_all < $min_age_upd)
+  {
+    print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
+    return $false ;
+  }
+
+  my $updates_found = $false ;
+  my @months ; # column headers of the update file, converted to yyyy-mm
+
+  # Leading columns that precede the monthly values: 3 in the web properties file, 2 in the uv / reach files
+  my $columns_leading = ($layout_csv == $layout_csv_properties) ? 3 : 2 ;
+
+  print "\nRead updates\n\n" ;
+  my $path_updates_latest = "$dir_comscore_updates/$file_comscore_updates_latest" ;
+  open CSV, '<', $path_updates_latest or abort ("File '$path_updates_latest' could not be opened: $!") ;
+  while (my $line = <CSV>)
+  {
+    chomp $line ;
+    $line =~ s/\r$// ; # tolerate Windows line endings
+    $line = &GetNumberOnly ($line) ;
+
+    my @fields = split (',', $line) ;
+
+    # header line, e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
+    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/)
+    { @months = &mmm_yyyy2yyyy_mm (@fields [$columns_leading .. $#fields]) ; }
+
+    # data lines start with a numeric index
+    if ($line =~ /^\d+,/)
+    {
+      my $id ;
+      my $name = $fields [$columns_leading - 1] ;
+      $name = '' if ! defined $name ;
+      $name =~ s/^\s+// ;
+      $name =~ s/\s+$// ;
+
+      my @values = @fields [$columns_leading .. $#fields] ;
+
+      if ($layout_csv == $layout_csv_properties)
+      {
+        # reduce e.g. a name that merely contains 'Google' to the bare brand name
+        $id = $name ;
+        foreach my $brand (qw (Google Microsoft Facebook Yahoo Amazon Apple AOL Wikimedia Tencent Baidu CBS))
+        { $id =~ s/.*$brand.*/$brand/i ; }
+      }
+      else
+      {
+        $id = $region_codes {$name} ;
+
+        # unknown region: report it by name rather than as an empty entry
+        if (! defined $id)
+        {
+          $do_not_update {$name}++ ;
+          next ;
+        }
+      }
+
+      if (! $update_only {$id})
+      {
+        $do_not_update {$id}++ ;
+        next ;
+      }
+
+      for (my $m = 0 ; $m <= $#months ; $m++)
+      {
+        my $yyyymm    = $months [$m] ;
+        my $yyyymm_id = "$yyyymm,$id" ;
+        $months {$yyyymm} ++ ;
+
+        # history already known from the master file is never overwritten
+        next if defined $data {$yyyymm_id} ;
+
+        my $data = $values [$m] * $multiplier ;
+        $updates_found = $true ;
+        print "New data found: $yyyymm_id = $data\n" ;
+        $data {$yyyymm_id} = $data ;
+      }
+    }
+  }
+  close CSV ;
+
+  $ignored = join ', ', sort keys %do_not_update ;
+  print "\nEntities ignored:\n$ignored\n\n" ;
+
+  if (! $updates_found)
+  { print "No new updates found\n" ; }
+  else
+  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }
+
+  return ($updates_found) ;
+}
+
+sub ReadDataReachPerRegion
+{
+  # Load the 'reach per region' master csv file into the global tables %data
+  # (key "yyyymm,region_code") and %months, merge the latest comScore update file into them
+  # and, when new values were found, rewrite the master file (the old one is kept as '<name>.~').
+  #
+  # Parameters:
+  #   $file_comscore_master  - master csv file name, relative to $dir_analytics
+  #   $file_comscore_updates - glob pattern for the update csv files
+  #   $precision             - sprintf format for the values written back to the master file
+  #   $multiplier            - factor applied to values from the update file
+  #   $layout_csv            - csv layout identifier, passed on to UpdateFromLatestComscoreData
+  #
+  # Side effects: resets and fills the globals %months, %data and @regions.
+  my ($file_comscore_master, $file_comscore_updates, $precision, @rest) = @_ ;
+
+  # This sub used to declare four parameters (no multiplier) while being called with five;
+  # accept both forms, a missing multiplier means 1
+  my ($multiplier, $layout_csv) = (@rest >= 2) ? @rest : (1, $rest [0]) ;
+
+  undef %months ;
+  undef %data ;
+  undef @regions ;
+
+  my $path_master = "$dir_analytics/$file_comscore_master" ;
+  open IN, '<', $path_master or abort ("File '$path_master' could not be opened: $!") ;
+
+  my $lines = 0 ;
+  while (my $line = <IN>)
+  {
+    chomp $line ;
+
+    my ($yyyymm, @fields) = split (',', $line) ;
+
+    # first line holds the region names
+    if ($lines++ == 0)
+    { @regions = @fields ; next ; }
+
+    for (my $field_ndx = 0 ; $field_ndx <= $#fields ; $field_ndx++)
+    {
+      my $region_code = $region_codes {$regions [$field_ndx]} ;
+
+      my $value = $fields [$field_ndx] ;
+      $value = '0' if $value eq '' ; # empty cell counts as zero
+
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$region_code"} = $value ;
+      # print "Old data $yyyymm,$region_code = $value\n" ;
+    }
+  }
+  close IN ;
+
+  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
+  return if ! $updates_found ;
+
+  # keep the previous master file as backup
+  rename ($path_master, "$path_master.~") or print "Backup of '$path_master' failed: $!\n" ;
+  open OUT, '>', $path_master or abort ("File '$path_master' could not be written: $!") ;
+
+  print OUT join (',', 'yyyymm', @regions) ;
+
+  # newest month first; each line is preceded (not followed) by its newline
+  foreach my $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    my @values = map { sprintf ($precision, $data {"$yyyymm," . $region_codes {$_}}) } @regions ;
+    print OUT "\n" . join (',', $yyyymm, @values) ;
+  }
+
+  close OUT ;
+}
+
+sub ReadDataVisitorsPerRegion
+{
+  # Load the 'unique visitors per region' master csv file into the global tables %data
+  # (key "yyyymm,region_code") and %months, merge the latest comScore update file into them
+  # and, when new values were found, rewrite the master file (the old one is kept as '<name>.~').
+  #
+  # Parameters:
+  #   $file_comscore_master  - master csv file name, relative to $dir_analytics
+  #   $file_comscore_updates - glob pattern for the update csv files
+  #   $precision             - sprintf format for the values written back to the master file
+  #   $multiplier            - factor applied to values from the update file
+  #   $layout_csv            - csv layout identifier, passed on to UpdateFromLatestComscoreData
+  #
+  # Side effects: resets and fills the globals %months, %data and @regions, sets $metric.
+  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
+
+  undef %months ;
+  undef %data ;
+  undef @regions ;
+
+  my $path_master = "$dir_analytics/$file_comscore_master" ;
+  open IN, '<', $path_master or abort ("File '$path_master' could not be opened: $!") ;
+
+  my $lines = 0 ;
+  $metric = 'unique_visitors' ;
+  while (my $line = <IN>)
+  {
+    chomp $line ;
+    $line = &GetNumberOnly ($line) ; # drop quotes and the thousands separators inside them
+
+    my ($yyyymm, @fields) = split (',', $line) ;
+
+    # first line holds the region names
+    if ($lines++ == 0)
+    { @regions = @fields ; next ; }
+
+    for (my $field_ndx = 0 ; $field_ndx <= $#fields ; $field_ndx++)
+    {
+      my $region_code = $region_codes {$regions [$field_ndx]} ;
+
+      my $value = $fields [$field_ndx] ;
+      $value = '0' if $value eq '' ; # empty cell counts as zero
+
+      # print "Old data $yyyymm,$region_code = $value\n" ;
+
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$region_code"} = $value ;
+    }
+  }
+  close IN ;
+
+  # use the multiplier supplied by the caller rather than a fixed 1000
+  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
+  return if ! $updates_found ;
+
+  # keep the previous master file as backup
+  rename ($path_master, "$path_master.~") or print "Backup of '$path_master' failed: $!\n" ;
+  open OUT, '>', $path_master or abort ("File '$path_master' could not be written: $!") ;
+
+  print OUT join (',', 'yyyymm', @regions) ;
+
+  # newest month first; each line is preceded (not followed) by its newline
+  foreach my $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    my @values = map { sprintf ($precision, $data {"$yyyymm," . $region_codes {$_}}) } @regions ;
+    print OUT "\n" . join (',', $yyyymm, @values) ;
+  }
+
+  close OUT ;
+}
+
+sub ReadDataVisitorsPerProperty
+{
+  # Load the 'unique visitors per web property' master csv file into the global tables %data
+  # (key "yyyymm,property") and %months, merge the latest comScore update file into them
+  # and, when new values were found, rewrite the master file (the old one is kept as '<name>.~').
+  #
+  # Parameters:
+  #   $file_comscore_master  - master csv file name, relative to $dir_analytics
+  #   $file_comscore_updates - glob pattern for the update csv files
+  #   $precision             - sprintf format for the values written back to the master file
+  #   $multiplier            - factor applied to values from the update file
+  #   $layout_csv            - csv layout identifier, passed on to UpdateFromLatestComscoreData
+  #
+  # Side effects: resets and fills the globals %months, %data and @properties, sets $metric.
+  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
+
+  undef %months ;
+  undef %data ;
+  undef @properties ;
+
+  my $path_master = "$dir_analytics/$file_comscore_master" ;
+  open IN, '<', $path_master or abort ("File '$path_master' could not be opened: $!") ;
+
+  my $lines = 0 ;
+  $metric = 'unique_visitors' ;
+  while (my $line = <IN>)
+  {
+    chomp $line ;
+
+    my ($yyyymm, @fields) = split (',', $line) ;
+
+    # first line holds the property names
+    if ($lines++ == 0)
+    {
+      @properties = @fields ;
+
+      # Values are stored under the normalized name 'Yahoo'; normalize the header list itself,
+      # as it is also used to look the values up again and to select rows from the update file
+      foreach my $property (@properties)
+      { $property =~ s/.*Yahoo.*/Yahoo/ ; }
+
+      next ;
+    }
+
+    for (my $field_ndx = 0 ; $field_ndx <= $#fields ; $field_ndx++)
+    {
+      my $property = $properties [$field_ndx] ;
+
+      my $value = $fields [$field_ndx] ;
+      $value = '0' if $value eq '' ; # empty cell counts as zero
+
+      # print "Old data $yyyymm,$property = $value\n" ;
+
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$property"} = $value ;
+    }
+  }
+  close IN ;
+
+  # use the multiplier supplied by the caller rather than a fixed 1000
+  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @properties) ;
+  return if ! $updates_found ;
+
+  # keep the previous master file as backup
+  rename ($path_master, "$path_master.~") or print "Backup of '$path_master' failed: $!\n" ;
+  open OUT, '>', $path_master or abort ("File '$path_master' could not be written: $!") ;
+
+  print OUT join (',', 'yyyymm', @properties) ;
+
+  # newest month first; each line is preceded (not followed) by its newline
+  foreach my $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    my @values = map { sprintf ($precision, $data {"$yyyymm,$_"}) } @properties ;
+    print OUT "\n" . join (',', $yyyymm, @values) ;
+  }
+
+  close OUT ;
+}
+
+sub WriteDataAnalytics
+{
+  # Write the merged csv file 'analytics_in_comscore.csv' into $dir_analytics.
+  # One line per month and region (reach and unique visitors) and one line per month and
+  # web property (unique visitors only), in the format
+  #   yyyymm,country_code,region_code,property,project,reach,visitors
+  # Dimensions that do not apply are written as '-', missing values as -1.
+  # Region lines are echoed to STDOUT as well.
+  #
+  # Reads the globals %months, %region_codes, @properties, %reach_region_code,
+  # %visitors_region_code and %visitors_web_property.
+  # NOTE(review): %months holds only the months left by the last Read.. call - confirm that is intended.
+  my $file_out = "$dir_analytics/analytics_in_comscore.csv" ;
+  open OUT, '>', $file_out or abort ("File '$file_out' could not be written: $!") ;
+
+  $metric = 'unique_visitors' ;
+  foreach my $yyyymm (sort keys %months)
+  {
+    # store meta data elsewhere
+    # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;
+    foreach my $region_code (sort values %region_codes)
+    {
+      my $country_code = '-' ;
+      my $property     = '-' ;
+      my $project      = '-' ;
+      my $reach        = $reach_region_code    {"$yyyymm,$region_code"} ;
+      my $visitors     = $visitors_region_code {"$yyyymm,$region_code"} ;
+
+      # -1 marks a value that is not available
+      if (! defined $reach)    { $reach    = -1 ; }
+      if (! defined $visitors) { $visitors = -1 ; }
+
+      my $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+      print OUT $line ;
+      print     $line ;
+    }
+
+    foreach my $property (sort @properties)
+    {
+      my $country_code = '-' ;
+      my $region_code  = '-' ;
+      my $project      = '-' ;
+      my $reach        = '-1' ; # reach is not available per web property
+      my $visitors     = $visitors_web_property {"$yyyymm,$property"} ;
+
+      next if ! defined $visitors ;
+
+      my $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+      print OUT $line ;
+      # print     $line ;
+    }
+  }
+
+  close OUT ;
+}
+
+sub GetNumberOnly
+{
+  # Prepare a csv line for a plain split on commas:
+  # remove the commas inside double-quoted fields (e.g. "1,234,567" -> 1234567)
+  # and then remove all double quotes.
+  # Parameter: the csv line. Returns: the cleaned line.
+  my $line = shift ;
+
+  # a lexical copy is edited, so that the global $a (used by sort) stays untouched
+  $line =~ s/("[^\"]+")/my $quoted = $1 ; $quoted =~ s#,##g ; $quoted/ge ;
+  $line =~ s/"//g ;
+  return $line ;
+}
+
+sub mmm_yyyy2yyyy_mm
+{
+  # Convert month labels like 'Jan-2010' into the form '2010-01'.
+  # Parameter: list of labels. Returns: converted list, same order and length.
+  # Labels that do not start with a month abbreviation from @months_short are returned unchanged.
+  my @months = @_ ;
+
+  # Jan -> 01, etc
+  my %month_number ;
+  for my $ndx (0 .. $#months_short)
+  { $month_number {$months_short [$ndx]} = sprintf ("%02d", $ndx + 1) ; }
+
+  foreach my $month (@months)
+  {
+    my ($mmm, $yyyy) = split ('-', $month) ;
+    next if (! defined $mmm) || (! exists $month_number {$mmm}) ;
+    $yyyy = '' if ! defined $yyyy ;
+    $month = "$yyyy-" . $month_number {$mmm} ;
+  }
+  return @months ;
+}
+
+sub abort
+{
+  # Print the reason for aborting to STDOUT and end the script.
+  # Parameter: message text. Does not return.
+  my $msg = shift ;
+
+  print  "\nAbort, reason: $msg\n\n" ;
+  exit 1 ; # non-zero exit status, so that a calling shell or scheduler can detect the failure
+}

Deleted: trunk/wikistats/analytics/MySQLPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/MySQLPrepComscoreData.pl  2011-05-24 07:08:07 UTC 
(rev 88715)
+++ trunk/wikistats/analytics/MySQLPrepComscoreData.pl  2011-05-24 11:53:02 UTC 
(rev 88716)
@@ -1,496 +0,0 @@
-#!/usr/bin/perl
-
-# Copyright (C) 2011 Wikimedia Foundation
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License version 2
-# as published by the Free Software Foundation.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-# See the GNU General Public License for more details, at
-# http://www.fsf.org/licenses/gpl.html
-
-# Author:
-# Erik Zachte, email [email protected]
-
-# Functionality:
-# comScore data can be downloaded as csv file, which each contain 14 months 
history
-# This script uses these files to update 'master' csv files which contain all 
known history
-# Note: only entities which are already in master file will be updated!
-# Then it merges these master files into one csv file which can be loaded into 
analytics database
-# Data are: reach by region, unique visitors by region, unique visitors by web 
property
-
-# Parameters:
-# -m (required) folder with 'master' csv files (files with all known history)
-# -u (required) folder with 'update' csv files (files with lastest 14 months 
history, produced by comScore)
-
-# Output:
-# updated master csv files + merged and formatted csv for import in MySQL
-
-# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
-
-  use Getopt::Std ;
-  use Cwd;
-
-  my $options ;
-  getopt ("mu", \%options) ;
-
-  $true  = 1 ;
-  $false = 0 ;
-
-  $script_name    = "MySQLPrepComscoreData.pl" ;
-  $script_version = "0.3" ;
-
-# EZ test only
-# $source       = "comscore" ;
-# $server       = "ez_test" ;
-# $generated    = "2011-05-06 00:00:00" ;
-# $user         = "ezachte" ;
-
-  $dir_analytics        = $options {"m"} ;
-  $dir_comscore_updates = $options {"u"} ;
-
-  $dir_analytics        = "c:/MySQL/analytics" ;    # EZ test only
-  $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
-
-  if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
-  { abort ("Specify folder for 'master' csv files as '-m folder', folder for 
'update' csv files as -u folder'") ; }
-
-  $file_comscore_reach_master     = "excel_out_comscore_reach_regions.csv" ;
-  $file_comscore_reach_update     = "*reach*by*region*csv" ;
-  $file_comscore_uv_region_master = "excel_out_comscore_UV_regions.csv" ;
-  $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
-  $file_comscore_uv_property_master = "excel_out_comscore_UV_properties.csv" ;
-  $file_comscore_uv_property_update = "*UV*trend*csv" ;
-
-  $layout_csv_reach      = 1 ;
-  $layout_csv_regions    = 2 ;
-  $layout_csv_properties = 3 ;
-
-  print "Directories:\nAnalytics '$dir_analytics'\nUpdates 
'$dir_comscore_updates'\n\n" ;
-
-  %region_codes = (
-    "Europe"=>"EU",
-    "North America"=>"NA",
-    "Latin America"=>"LA",
-    "World-Wide" => "W",
-    "Middle East - Africa" => "MA",
-    "Asia Pacific"=> "AS",
-    "United States" => "US",
-    "India" => "I",
-    "China" => "C"
-  ) ;
-
-  foreach $region_name (keys %region_codes)
-  { $region_names {$region_codes {$region_name}} = $region_name ; }
-
-  @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
-
-  &ReadDataReachPerRegion ($file_comscore_reach_master, 
$file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
-  %reach_region_code = %data ;
-
-  &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, 
$file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
-  %visitors_region_code = %data ;
-
-  &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, 
$file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
-  %visitors_web_property = %data ;
-
-  &WriteDataAnalytics ;
-
-  print "\nReady\n\n" ;
-  exit ;
-
-sub UpdateFromLatestComscoreData
-{
-  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, 
@update_only) = @_ ;
-
-  undef %update_only ;
-  undef %do_not_update ;
-
-  foreach $id (@update_only)
-  { $update_only {$id} = $true ; }
-
-  if (! -e "$dir_analytics/$file_comscore_master")
-  { abort ("File $file_comscore_master not found!") ; }
-
-  $age_all = -M "$dir_analytics/$file_comscore_master" ;
-  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " 
days old: '$file_comscore_master'\n" ;
-
-  my $cwd = getcwd ;
-  chdir $dir_comscore_updates ;
-
-  @files = glob($file_comscore_updates) ;
-  $min_age_upd = 999999 ;
-  $file_comscore_updates_latest = '' ;
-  foreach $file (@files)
-  {
-    $age = -M $file ;
-    if ($age < $min_age_upd)
-    {
-      $min_age_upd = $age ;
-      $file_comscore_updates_latest = $file ;
-    }
-  }
-  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " 
days old: '$file_comscore_updates_latest'\n" ;
-
-  if ($min_age_upd == 999999)
-  {
-    print "No valid update file found. Nothing to update." ;
-    return ;
-  }
-
-  if ($age_all > $min_age_upd)
-  {
-    print "File with master data more recent than latest update csv from 
comScore. Nothing to update." ;
-    return ;
-  }
-
-  my $updates_found = $false ;
-
-  print "\nRead updates\n\n" ;
-  open CSV, '<', $file_comscore_updates_latest ;
-  while ($line = <CSV>)
-  {
-    chomp $line ;
-    $line = &GetNumberOnly ($line) ;
-
-    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 
'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
-    {
-      if ($layout_csv == $layout_csv_properties)
-      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web 
properties csv file
-      else
-      { ($dummy1,$dummy2,@months) = split (',', $line) ; }         # uv / 
reach csv files
-
-      @months = &mmm_yyyy2yyyy_mm (@months) ;
-    }
-
-    if ($line =~ /^\d+,/)
-    {
-      if ($layout_csv == $layout_csv_properties)
-      {
-        ($index,$dummy,$property,@data) = split (',', $line) ;
-        $property =~ s/^\s+// ;
-        $property =~ s/\s+$// ;
-
-        $property =~ s/.*Google.*/Google/i ;
-        $property =~ s/.*Microsoft.*/Microsoft/i ;
-        $property =~ s/.*FACEBOOK.*/Facebook/i ;
-        $property =~ s/.*Yahoo.*/Yahoo/i ;
-        $property =~ s/.*Amazon.*/Amazon/i ;
-        $property =~ s/.*Apple.*/Apple/i ;
-        $property =~ s/.*AOL.*/AOL/i ;
-        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
-        $property =~ s/.*Tencent.*/Tencent/i ;
-        $property =~ s/.*Baidu.*/Baidu/i ;
-        $property =~ s/.*CBS.*/CBS/i ;
-
-        $id = $property ;
-      }
-      else
-      {
-        ($index,$region,@data) = split (',', $line) ;
-        $region =~ s/^\s+// ;
-        $region =~ s/\s+$// ;
-        $id = $region_codes {$region} ;
-      }
-
-      if ($update_only {$id} == 0)
-      {
-        $do_not_update {$id}++ ;
-        next ;
-      }
-
-      for ($m = 0 ; $m <= $#months ; $m++)
-      {
-        $yyyymm = $months [$m] ;
-        $months {$yyyymm} ++ ;
-        $yyyymm_id = "$yyyymm,$id" ;
-        $data = $data [$m] * $multiplier ;
-
-        if (! defined $data {$yyyymm_id})
-        {
-          $updates_found = $true ;
-          print "New data found: $yyyymm_id = $data\n" ;
-          $data {$yyyymm_id} = $data ;
-        }
-      }
-    }
-  }
-
-  $ignored = join ', ', sort keys %do_not_update ;
-  print "\nEntities ignored:\n$ignored\n\n" ;
-
-  if (! $updates_found)
-  { print "No new updates found\n" ; }
-  else
-  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; 
}
-
-  return ($updates_found) ;
-}
-
-sub ReadDataReachPerRegion
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) 
= @_ ;
-
-  undef %months ;
-  undef %data ;
-  undef @regions ;
-
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
-
-  $lines = 0 ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-
-    if ($lines++ == 0)
-    { @regions = @data ; next ; }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $region      = $regions [$field_ndx] ;
-      $region_code = $region_codes {$region} ;
-
-      $data      = $data [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$region_code"} = $data ;
-      # print "Old data $yyyymm,$region_code = $data\n" ;
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1, $layout_csv, @regions) ;
-  return if ! $updates_found ;
-
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $region_name (@regions)
-  { $line_out .= ",$region_name" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $region_name (@regions)
-    {
-      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub ReadDataVisitorsPerRegion
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, 
$layout_csv) = @_ ;
-
-  undef %months ;
-  undef %data ;
-  undef @regions ;
-
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
-
-  $lines  = 0 ;
-  $metric = 'unique_visitors' ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-    $line = &GetNumberOnly ($line) ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-
-    if ($lines++ == 0)
-    { @regions = @data ; next ; }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $region      = $regions [$field_ndx] ;
-      $region_code = $region_codes {$region} ;
-
-      $data      = $data    [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-
-      # print "Old data $yyyymm,$region = $data\n" ;
-
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$region_code"} = $data ;
-
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1000, $layout_csv, @regions) ;
-  return if ! $updates_found ;
-
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $region_name (@regions)
-  { $line_out .= ",$region_name" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $region_name (@regions)
-    {
-      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub ReadDataVisitorsPerProperty
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, 
$layout_csv) = @_ ;
-
-  undef %months ;
-  undef %data ;
-  undef @properties ;
-
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
-
-  $lines = 0 ;
-  $metric       = 'unique_visitors' ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-    if ($lines++ == 0)
-    { @properties = @data ; next ; }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $property = $properties [$field_ndx] ;
-      $property =~ s/.*Yahoo.*/Yahoo/ ;
-      $data      = $data    [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-
-      # print "Old data $yyyymm,$property = $data\n" ;
-
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$property"} = $data ;
-
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1000, $layout_csv, @properties) ;
-  return if ! $updates_found ;
-
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $property (@properties)
-  { $line_out .= ",$property" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $property (@properties)
-    {
-      $yyyymm_property = "$yyyymm,$property" ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub WriteDataAnalytics
-{
-  open OUT, '>', "c:/MySQL/analytics/analytics_in_comscore.csv" ;
-
-  $metric = 'unique_visitors' ;
-  foreach $yyyymm (sort keys %months)
-  {
-    # store meta data elsewhere
-    # $line = 
"$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n"
 ;
-    foreach $region_code (sort values %region_codes)
-    {
-      $country_code = '-' ;
-      $property     = '-' ;
-      $project      = '-' ;
-      $reach        = $reach_region_code     {"$yyyymm,$region_code"} ;
-      $visitors     = $visitors_region_code  {"$yyyymm,$region_code"} ;
-
-      if (! defined $reach)    { $reach = -1 ; }
-      if (! defined $visitors) { $reach = -1 ; }
-
-      $line = 
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
-      print OUT $line ;
-      print     $line ;
-    }
-
-    foreach $property (sort @properties)
-    {
-      $country_code = '-' ;
-      $region_code  = '-' ;
-      $project      = '-' ;
-      $reach        = '-1' ;
-      $visitors     = $visitors_web_property {"$yyyymm,$property"} ;
-
-      next if ! defined $visitors ;
-
-      $line = 
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
-      print OUT $line ;
-      # print     $line ;
-    }
-  }
-}
-
-sub GetNumberOnly
-{
-  my $line = shift ;
-  $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove 
comma's inside double quotes
-  $line =~ s/"//g ;
-  return $line ;
-}
-
-sub mmm_yyyy2yyyy_mm
-{
-  my @months = @_ ;
-  # Jan -> 01, etc
-  foreach my $month (@months)
-  {
-    my ($mmm,$yyyy) = split ('-', $month) ;
-    for ($m = 0 ; $m <= $#months_short ; $m++)
-    {
-      if ($mmm eq $months_short [$m])
-      { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; }
-    }
-  }
-  return @months ;
-}
-
-sub abort
-{
-  $msg = shift ;
-
-  print  "\nAbort, reason: $msg\n\n" ;
-  exit ;
-}


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to