http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88720

Revision: 88720
Author:   ezachte
Date:     2011-05-24 15:52:01 +0000 (Tue, 24 May 2011)
Log Message:
-----------


Modified Paths:
--------------
    trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
    trunk/wikistats/analytics/create_and_use_db_analytics.txt

Added Paths:
-----------
    trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl

Modified: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-05-24 
15:34:59 UTC (rev 88719)
+++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-05-24 
15:52:01 UTC (rev 88720)
@@ -54,7 +54,7 @@
 # $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
 
   if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
-  { abort ("Specify folder for 'master' csv files as '-m folder', folder for 
'update' csv files as -u folder'") ; }
+  { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 
'update' csv files as -u folder'") ; }
 
   $file_comscore_reach_master     = "excel_out_comscore_reach_regions.csv" ;
   $file_comscore_reach_update     = "*reach*by*region*csv" ;
@@ -111,7 +111,7 @@
   { $update_only {$id} = $true ; }
 
   if (! -e "$dir_analytics/$file_comscore_master")
-  { abort ("File $file_comscore_master not found!") ; }
+  { Abort ("File $file_comscore_master not found!") ; }
 
   $age_all = -M "$dir_analytics/$file_comscore_master" ;
   print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " 
days old: '$file_comscore_master'\n" ;
@@ -438,9 +438,11 @@
       $reach        = $reach_region_code     {"$yyyymm,$region_code"} ;
       $visitors     = $visitors_region_code  {"$yyyymm,$region_code"} ;
 
-      if (! defined $reach)    { $reach = -1 ; }
-      if (! defined $visitors) { $reach = -1 ; }
+      if (! defined $reach)    { $reach    = -1 ; }
+      if (! defined $visitors) { $visitors = -1 ; }
 
+      next if $reach == -1 and $visitors == -1 ;
+
       $line = 
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
       print OUT $line ;
       print     $line ;
@@ -487,7 +489,7 @@
   return @months ;
 }
 
-sub abort
+sub Abort
 {
   $msg = shift ;
 

Added: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl                  
        (rev 0)
+++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl  2011-05-24 
15:52:01 UTC (rev 88720)
@@ -0,0 +1,281 @@
+#!/usr/local/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email [email protected]
+# loosely based on predecessor
+# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
+
+# Functionality:
+# tba
+
+# Parameters:
+# tba
+
+# Output:
+# updated csv file for import in MySQL
+
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
+  use Getopt::Std ;
+
+  $true  = 1 ;
+  $false = 0 ;
+
+  @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
+
+  $file_csv_monthly_data         = "StatisticsMonthly.csv" ;
+  $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
+  $file_csv_analytics_in         = "analytics_in_wikistats.csv" ;
+
+  &ParseArguments ;
+  &ReadStatisticsMonthly ;
+  &WriteMonthlyData ;
+
+  print "\nReady\n\n" ;
+  exit ;
+
+sub ParseArguments
+{
+  my (@options, $arguments) ;
+
+  getopt ("io", \%options) ;
+
+  foreach $arg (sort keys %options)
+  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
+  print ("\nArguments\n$arguments\n") ;
+
+# $options {"i"} = "w:/# out bayes" ;     # EZ test
+# $options {"o"} = "c:/MySQL/analytics" ; # EZ test
+
+  die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
+  die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
+
+  $path_in  = $options {"i"} ;
+  $path_out = $options {"o"} ;
+
+  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
+  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+
+  print "Input  folder: $path_in\n" ;
+  print "Output folder: $path_out\n\n" ;
+
+  $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
+}
+
+sub ReadStatisticsMonthly
+{
+  &ReadStatisticsMonthlyForProject ("wb") ;
+  &ReadStatisticsMonthlyForProject ("wk") ;
+  &ReadStatisticsMonthlyForProject ("wn") ;
+  &ReadStatisticsMonthlyForProject ("wp") ;
+  &ReadStatisticsMonthlyForProject ("wq") ;
+  &ReadStatisticsMonthlyForProject ("ws") ;
+  &ReadStatisticsMonthlyForProject ("wv") ;
+  &ReadStatisticsMonthlyForProject ("wx") ;
+
+# &ReadStatisticsPerBinariesExtensionCommons ;
+}
+
+sub ReadStatisticsMonthlyForProject
+{
+  my $project = shift;
+  $all_projects = "*" ;
+
+  my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
+  my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
+
+  if (! -e $file_csv_in_1)
+  { &Abort ("Input file '$file_csv_in_1' not found") ; }
+  if (! -e $file_csv_in_2)
+  { &Abort ("Input file '$file_csv_in_2' not found") ; }
+
+  my $yyyymm ;
+
+  print "Read '$file_csv_in_1'\n" ;
+  open CSV_IN, '<', $file_csv_in_1 ;
+  while ($line = <CSV_IN>)
+  {
+    chomp $line ;
+    ($language,$date,$counts) = split (',', $line, 3) ;
+    @fields = split (',', $counts) ;
+
+    next if ! &AcceptWiki ($project,$language) ;
+
+    ($month,$day,$year) = split ('\/', $date) ;
+    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+
+    foreach $field (@fields)
+    {
+      if ($field eq '-')
+      { $field = 0 ; }
+    }
+
+    $data = $fields  [0] . ',' . # contributors all time
+            $fields  [1] . ',' . # new contributors
+            'data2,'           . # place holder for more data, to be inserted 
later
+            $fields  [4] . ',' . # articles
+            $fields  [6] . ',' . # articles new per day
+            $fields  [9] . ',' . # larger than 0.5 kB
+            $fields [10] . ',' . # larger than 2.0 kB
+            $fields  [7] . ',' . # mean edits per article
+            $fields  [8] . ',' . # mean bytes per article
+            $fields [11] . ',' . # edits
+            $fields [12] . ',' . # size in bytes
+            $fields [13] . ',' . # size in words
+            $fields [14] . ',' . # links internal
+            $fields [15] . ',' . # links interwiki
+            $fields [16] . ',' . # links images
+            $fields [17] . ',' . # links external
+            $fields [18] ;       # redirects
+
+    $data1 {"$project,$language,$yyyymm"} = $data ;
+  }
+  close CSV_IN ;
+
+  # now read (very) active editors from newer more accurate file (split data 
for reg users and bots, unlike StatisticsMonthly.csv)
+
+  print "Read '$file_csv_in_2'\n" ;
+  open CSV_IN, '<', $file_csv_in_2 ;
+  while ($line = <CSV_IN>)
+  {
+    chomp $line ;
+    ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
+
+    next if ! &AcceptWiki ($project,$language) ;
+
+    if ($reguser_bot ne "R") { next ; }  # R: reg user, B: bot
+    if ($group       ne "A") { next ; }  # A: articles, T: talk pages, O: 
other namespaces
+
+    ($month,$day,$year) = split ('\/', $date) ;
+    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+
+    # data have been collected in WikiCountsProcess.pm and been written in 
WikiCountsOutput.pm
+    # count user with over x edits
+    # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 
1000xSQRT(10), etc
+    # @thresholds = 
(1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000)
 ;
+    $edits_ge_5   = @counts [2] > 0 ? @counts [2] : 0 ;
+    $edits_ge_25  = @counts [4] > 0 ? @counts [4] : 0 ;
+    $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
+    $data2 {"$project,$language,$yyyymm"} = 
"$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
+    # prep string with right amount of comma's
+    if ($data2_default eq '')
+    {
+      $data2_default = $data2 {"$project,$language,$yyyymm"} ;
+      $data2_default =~ s/[^,]+/0/g ;
+    }
+  }
+  close CSV_IN ;
+}
+
+#sub ReadStatisticsPerBinariesExtensionCommons
+#{
+#  my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
+#  my $mmax = -1 ;
+
+#  if (! -e $file_csv_in)
+#  { &Abort ("Input file '$file_csv_in' not found") ; }
+
+#  print "Read '$file_csv_in'\n" ;
+#  open CSV_IN, '<', $file_csv_in ;
+#  while ($line = <CSV_IN>)
+#  {
+#    chomp $line ;
+#    ($language,$date,$counts) = split (',', $line, 3) ;
+
+#    if ($language ne "commons") { next ; }
+
+#    if ($date eq "00/0000")
+#    {
+#      @fields = split (',', $counts) ;
+#      $field_ndx = 0 ;
+#      foreach $field (@fields)
+#      {
+#        $ext_cnt {-1}{$field_ndx} = $field ;
+#      # print "EXT_CNT $field_ndx : $field\n" ;
+#        $field_ndx ++ ;
+#      }
+#      next ;
+#    }
+
+#    ($month,$year) = split ('\/', $date) ;
+#    my $m = &months_since_2000_01 ($year,$month) ;
+#    next if $m < $m_start ;
+
+#    if ($m > $mmax)
+#    { $mmax = $m ; }
+
+#    @fields = split (',', $counts) ;
+#    $field_ndx = 0 ;
+#    foreach $field (@fields)
+#    {
+#      $ext_cnt {$m}{$field_ndx}  = $field ;
+#      $ext_tot {$m}             += $field ;
+#      $field_ndx ++ ;
+#    }
+#  }
+#  close CSV_IN ;
+
+#  %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
+#  @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys 
%ext_cnt_mmax) ;
+
+#  $extcnt = 0 ;
+#  foreach $extndx (@ext_cnt_mmax)
+#  {
+#    # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax 
{$extndx}}\n" ;
+#    push @extndxs, $extndx ;
+#    if ($extcnt++ >= 9) { last ; }
+#  }
+#}
+
+sub WriteMonthlyData
+{
+  my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
+  open CSV_OUT, '>', $file_csv_out ;
+  foreach $project_wiki_month (sort keys %data1)
+  {
+    $data1 = $data1 {$project_wiki_month} ;
+    $data2 = $data2 {$project_wiki_month} ;
+    if ($data2 eq '')
+    {
+      print "Editor data missing for $project_wiki_month\n" ;
+      $data2 = $data2_default ;
+    }
+    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor 
fields follow each other
+    print CSV_OUT "$project_wiki_month,$data1\n" ;
+  }
+  close CSV_OUT ;
+}
+
+sub AcceptWiki
+{
+  my ($project,$language) = @_ ;
+
+  return $false if $language eq 'commons' and $project ne 'wx' ; # commons 
also in wikipedia csv files (bug, hard to cleanup, just skip)
+  return $false if $language eq 'sr'      and $project eq 'wn' ; # ignore 
insane bot spam on
+  return $false if $language =~ 
/mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i
 ;
+
+  return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) 
but site not, tlh=Klignon, ru-sib=Siberian
+  return $false if $project eq 'wk' and ($language eq "als" or $language eq 
"tlh") ;
+
+  return $true ;
+}
+
+sub Abort
+{
+  my $msg = shift ;
+  print "$msg\nExecution aborted." ;
+  # to do: log also to file
+  exit ;
+}
+
+

Modified: trunk/wikistats/analytics/create_and_use_db_analytics.txt
===================================================================
--- trunk/wikistats/analytics/create_and_use_db_analytics.txt   2011-05-24 
15:34:59 UTC (rev 88719)
+++ trunk/wikistats/analytics/create_and_use_db_analytics.txt   2011-05-24 
15:52:01 UTC (rev 88720)
@@ -1,23 +1,23 @@
 -- invoke this file with "mysql --user=root --password=[...] < 
create_and_use_db_analytics.txt"
 
-/* Create database and two tables from scratch */
-
+-- Create database and two tables from scratch 
 DROP DATABASE IF EXISTS `analytics` ;
 CREATE DATABASE `analytics` ;
+
 USE `analytics` ;
 
 CREATE TABLE `comscore` (
 -- meta data (mostly for auditing, may not be sent over API on default) 
-/* 
-   store meta dat elsewhere,tbd
-  `id`             int auto_increment NOT NULL, 
-  `generated`      timestamp,
-  `source`         char (20),
-  `server`         char (20),
-  `script_name`    char (30),
-  `script_version` char (8),
-  `user`           char (20),
-*/
+-- not used right now, store meta dat elsewhere,tbd, kept for brainstorm 
+--
+--`id`             int auto_increment NOT NULL, 
+--`generated`      timestamp,
+--`source`         char (20),
+--`server`         char (20),
+--`script_name`    char (30),
+--`script_version` char (8),
+--`user`           char (20),
+--
 -- analytics data  
   `yyyymm`         char (7), 
   `country_code`   char (3), 
@@ -36,18 +36,46 @@
   PRIMARY KEY (language_code,region_code)
 ) ;  
 
+CREATE TABLE `wikistats` (
+-- analytics data  
+  `yyyymm`               char (7), 
+  `project`              char (2),
+  `wiki`                 char (15),
+  `editors_all_time`     decimal (10)  DEFAULT '-1', 
+  `editors_new`          decimal (7)   DEFAULT '-1', 
+  `editors_ge_5`         decimal (7)   DEFAULT '-1', 
+  `editors_ge_25`        decimal (7)   DEFAULT '-1', 
+  `editors_ge_100`       decimal (7)   DEFAULT '-1', 
+  `articles`             decimal (12)  DEFAULT '-1', 
+  `articles_new_per_day` decimal (9)   DEFAULT '-1', 
+  `articles_over_bytes_500`  decimal (12)   DEFAULT '-1', 
+  `articles_over_bytes_2000` decimal (12)   DEFAULT '-1', 
+  `edits_per_article`    decimal (9,1) DEFAULT '-1', 
+  `bytes_per_article`    decimal (9,1) DEFAULT '-1', 
+  `edits`                decimal (12)  DEFAULT '-1', 
+  `size_in_bytes`        decimal (15)  DEFAULT '-1', 
+  `size_in_words`        decimal (15)  DEFAULT '-1', 
+  `links_internal`       decimal (15)  DEFAULT '-1', 
+  `links_interwiki`      decimal (15)  DEFAULT '-1', 
+  `links_image`          decimal (15)  DEFAULT '-1', 
+  `links_external`       decimal (15)  DEFAULT '-1', 
+  `redirects`            decimal (15)  DEFAULT '-1', 
+  PRIMARY KEY (yyyymm,project,wiki)
+) ;
+
 -- SHOW TABLES ;
 -- DESCRIBE comscore ;
 -- DESCRIBE comscore_regions ;
+   DESCRIBE wikistats ;
 
-/* Database Manipulation */
-/* Obviously in real world this is a separate script */
+-- Database Manipulation
+-- Obviously in real world this is a separate script
 
-
 -- show contents (debugging only) 
 -- SELECT * 
 --      FROM comscore_regions ;
 
+
 LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv' 
      INTO TABLE comscore_regions 
      FIELDS TERMINATED BY ',' 
@@ -60,6 +88,11 @@
      OPTIONALLY ENCLOSED BY '"' 
      (yyyymm,country_code,region_code,property,project,reach,visitors) ;
 
+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_wikistats.csv' 
+     INTO TABLE wikistats 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     
(project,wiki,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 ;
 -- show contents (debugging only) 
    SELECT * 
        FROM comscore 
@@ -68,10 +101,16 @@
        FIELDS TERMINATED BY ',' ; 
 
 -- export all relevant non-meta data from comsCore's reach by region (and 
comScore treats India and China as regions in this context)
+SELECT *
+       FROM wikistats 
+       WHERE (project = 'wp') AND (wiki = 'en') AND (yyyymm BETWEEN '2010-06' 
AND '2011-05') 
+       ORDER BY project,wiki,yyyymm
+       INTO OUTFILE 'c:/MySQL/analytics/analytics_out_wikistats_test1.csv' 
+       FIELDS TERMINATED BY ',' ; 
+-- export all relevant non-meta data from comsCore's reach by region (and 
comScore treats India and China as regions in this context)
 SELECT yyyymm,region_name,reach
        FROM comscore LEFT JOIN comscore_regions ON comscore.region_code = 
comscore_regions.region_code AND comscore_regions.language_code = 'en'
        WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05') 
        ORDER BY yyyymm,region_name 
        INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv' 
        FIELDS TERMINATED BY ',' ; 
-       


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to