http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88720
Revision: 88720
Author: ezachte
Date: 2011-05-24 15:52:01 +0000 (Tue, 24 May 2011)
Log Message:
-----------
Modified Paths:
--------------
trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
trunk/wikistats/analytics/create_and_use_db_analytics.txt
Added Paths:
-----------
trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
Modified: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-05-24
15:34:59 UTC (rev 88719)
+++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-05-24
15:52:01 UTC (rev 88720)
@@ -54,7 +54,7 @@
# $dir_comscore_updates = "W:/@ Report Card/Data" ; # EZ test only
if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
- { abort ("Specify folder for 'master' csv files as '-m folder', folder for
'update' csv files as -u folder'") ; }
+ { Abort ("Specify folder for 'master' csv files as '-m folder', folder for
'update' csv files as -u folder'") ; }
$file_comscore_reach_master = "excel_out_comscore_reach_regions.csv" ;
$file_comscore_reach_update = "*reach*by*region*csv" ;
@@ -111,7 +111,7 @@
{ $update_only {$id} = $true ; }
if (! -e "$dir_analytics/$file_comscore_master")
- { abort ("File $file_comscore_master not found!") ; }
+ { Abort ("File $file_comscore_master not found!") ; }
$age_all = -M "$dir_analytics/$file_comscore_master" ;
print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . "
days old: '$file_comscore_master'\n" ;
@@ -438,9 +438,11 @@
$reach = $reach_region_code {"$yyyymm,$region_code"} ;
$visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
- if (! defined $reach) { $reach = -1 ; }
- if (! defined $visitors) { $reach = -1 ; }
+ if (! defined $reach) { $reach = -1 ; }
+ if (! defined $visitors) { $visitors = -1 ; }
+ next if $reach == -1 and $visitors == -1 ;
+
$line =
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
print OUT $line ;
print $line ;
@@ -487,7 +489,7 @@
return @months ;
}
-sub abort
+sub Abort
{
$msg = shift ;
Added: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
(rev 0)
+++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl 2011-05-24
15:52:01 UTC (rev 88720)
@@ -0,0 +1,281 @@
+#!/usr/local/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email [email protected]
+# loosely based on predecessor
+#
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
+
+# Functionality:
+# tba
+
+# Parameters:
+# tba
+
+# Output:
+# updated csv file for import in MySQL
+
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
+ use Getopt::Std ;
+
+ $true = 1 ;
+ $false = 0 ;
+
+ @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
+
+ $file_csv_monthly_data = "StatisticsMonthly.csv" ;
+ $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
+ $file_csv_analytics_in = "analytics_in_wikistats.csv" ;
+
+ &ParseArguments ;
+ &ReadStatisticsMonthly ;
+ &WriteMonthlyData ;
+
+ print "\nReady\n\n" ;
+ exit ;
+
+sub ParseArguments
+{
+ my (@options, $arguments) ;
+
+ getopt ("io", \%options) ;
+
+ foreach $arg (sort keys %options)
+ { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
+ print ("\nArguments\n$arguments\n") ;
+
+# $options {"i"} = "w:/# out bayes" ; # EZ test
+# $options {"o"} = "c:/MySQL/analytics" ; # EZ test
+
+ die ("Specify input folder for projectcounts files as: -i path") if (!
defined ($options {"i"})) ;
+ die ("Specify output folder as: -o path'") if (!
defined ($options {"o"})) ;
+
+ $path_in = $options {"i"} ;
+ $path_out = $options {"o"} ;
+
+ die "Input folder '$path_in' does not exist" if (! -d $path_in) ;
+ die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+
+ print "Input folder: $path_in\n" ;
+ print "Output folder: $path_out\n\n" ;
+
+ $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
+}
+
+sub ReadStatisticsMonthly
+{
+ &ReadStatisticsMonthlyForProject ("wb") ;
+ &ReadStatisticsMonthlyForProject ("wk") ;
+ &ReadStatisticsMonthlyForProject ("wn") ;
+ &ReadStatisticsMonthlyForProject ("wp") ;
+ &ReadStatisticsMonthlyForProject ("wq") ;
+ &ReadStatisticsMonthlyForProject ("ws") ;
+ &ReadStatisticsMonthlyForProject ("wv") ;
+ &ReadStatisticsMonthlyForProject ("wx") ;
+
+# &ReadStatisticsPerBinariesExtensionCommons ;
+}
+
+sub ReadStatisticsMonthlyForProject
+{
+ my $project = shift;
+ $all_projects = "*" ;
+
+ my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
+ my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
+
+ if (! -e $file_csv_in_1)
+ { &Abort ("Input file '$file_csv_in_1' not found") ; }
+ if (! -e $file_csv_in_2)
+ { &Abort ("Input file '$file_csv_in_2' not found") ; }
+
+ my $yyyymm ;
+
+ print "Read '$file_csv_in_1'\n" ;
+ open CSV_IN, '<', $file_csv_in_1 ;
+ while ($line = <CSV_IN>)
+ {
+ chomp $line ;
+ ($language,$date,$counts) = split (',', $line, 3) ;
+ @fields = split (',', $counts) ;
+
+ next if ! &AcceptWiki ($project,$language) ;
+
+ ($month,$day,$year) = split ('\/', $date) ;
+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+
+ foreach $field (@fields)
+ {
+ if ($field eq '-')
+ { $field = 0 ; }
+ }
+
+ $data = $fields [0] . ',' . # contributors all time
+ $fields [1] . ',' . # new contributors
+ 'data2,' . # place holder for more data, to be inserted
later
+ $fields [4] . ',' . # articles
+ $fields [6] . ',' . # articles new per day
+ $fields [9] . ',' . # larger than 0.5 kB
+ $fields [10] . ',' . # larger than 2.0 kB
+ $fields [7] . ',' . # mean edits per article
+ $fields [8] . ',' . # mean bytes per article
+ $fields [11] . ',' . # edits
+ $fields [12] . ',' . # size in bytes
+ $fields [13] . ',' . # size in words
+ $fields [14] . ',' . # links internal
+ $fields [15] . ',' . # links interwiki
+ $fields [16] . ',' . # links images
+ $fields [17] . ',' . # links external
+ $fields [18] ; # redirects
+
+ $data1 {"$project,$language,$yyyymm"} = $data ;
+ }
+ close CSV_IN ;
+
+ # now read (very) active editors from newer more accurate file (split data
for reg users and bots, unlike StatisticsMonthly.csv)
+
+ print "Read '$file_csv_in_2'\n" ;
+ open CSV_IN, '<', $file_csv_in_2 ;
+ while ($line = <CSV_IN>)
+ {
+ chomp $line ;
+ ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
+
+ next if ! &AcceptWiki ($project,$language) ;
+
+ if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot
+ if ($group ne "A") { next ; } # A: articles, T: talk pages, O:
other namespaces
+
+ ($month,$day,$year) = split ('\/', $date) ;
+ $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+
+ # data have been collected in WikiCountsProcess.pm and been written in
WikiCountsOutput.pm
+ # count user with over x edits
+ # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10),
1000xSQRT(10), etc
+ # @thresholds =
(1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000)
;
+ $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ;
+ $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ;
+ $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
+ $data2 {"$project,$language,$yyyymm"} =
"$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
+ # prep string with right amount of comma's
+ if ($data2_default eq '')
+ {
+ $data2_default = $data2 {"$project,$language,$yyyymm"} ;
+ $data2_default =~ s/[^,]+/0/g ;
+ }
+ }
+ close CSV_IN ;
+}
+
+#sub ReadStatisticsPerBinariesExtensionCommons
+#{
+# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
+# my $mmax = -1 ;
+
+# if (! -e $file_csv_in)
+# { &Abort ("Input file '$file_csv_in' not found") ; }
+
+# print "Read '$file_csv_in'\n" ;
+# open CSV_IN, '<', $file_csv_in ;
+# while ($line = <CSV_IN>)
+# {
+# chomp $line ;
+# ($language,$date,$counts) = split (',', $line, 3) ;
+
+# if ($language ne "commons") { next ; }
+
+# if ($date eq "00/0000")
+# {
+# @fields = split (',', $counts) ;
+# $field_ndx = 0 ;
+# foreach $field (@fields)
+# {
+# $ext_cnt {-1}{$field_ndx} = $field ;
+# # print "EXT_CNT $field_ndx : $field\n" ;
+# $field_ndx ++ ;
+# }
+# next ;
+# }
+
+# ($month,$year) = split ('\/', $date) ;
+# my $m = &months_since_2000_01 ($year,$month) ;
+# next if $m < $m_start ;
+
+# if ($m > $mmax)
+# { $mmax = $m ; }
+
+# @fields = split (',', $counts) ;
+# $field_ndx = 0 ;
+# foreach $field (@fields)
+# {
+# $ext_cnt {$m}{$field_ndx} = $field ;
+# $ext_tot {$m} += $field ;
+# $field_ndx ++ ;
+# }
+# }
+# close CSV_IN ;
+
+# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
+# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys
%ext_cnt_mmax) ;
+
+# $extcnt = 0 ;
+# foreach $extndx (@ext_cnt_mmax)
+# {
+# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax
{$extndx}}\n" ;
+# push @extndxs, $extndx ;
+# if ($extcnt++ >= 9) { last ; }
+# }
+#}
+
+sub WriteMonthlyData
+{
+ my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
+ open CSV_OUT, '>', $file_csv_out ;
+ foreach $project_wiki_month (sort keys %data1)
+ {
+ $data1 = $data1 {$project_wiki_month} ;
+ $data2 = $data2 {$project_wiki_month} ;
+ if ($data2 eq '')
+ {
+ print "Editor data missing for $project_wiki_month\n" ;
+ $data2 = $data2_default ;
+ }
+ $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor
fields follow each other
+ print CSV_OUT "$project_wiki_month,$data1\n" ;
+ }
+ close CSV_OUT ;
+}
+
+sub AcceptWiki
+{
+ my ($project,$language) = @_ ;
+
+ return $false if $language eq 'commons' and $project ne 'wx' ; # commons
also in wikipedia csv files (bug, hard to cleanup, just skip)
+ return $false if $language eq 'sr' and $project eq 'wn' ; # ignore
insane bot spam on
+ return $false if $language =~
/mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i
;
+
+ return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?)
but site not, tlh=Klignon, ru-sib=Siberian
+ return $false if $project eq 'wk' and ($language eq "als" or $language eq
"tlh") ;
+
+ return $true ;
+}
+
+sub Abort
+{
+ my $msg = shift ;
+ print "$msg\nExecution aborted." ;
+ # to do: log also to file
+ exit ;
+}
+
+
Modified: trunk/wikistats/analytics/create_and_use_db_analytics.txt
===================================================================
--- trunk/wikistats/analytics/create_and_use_db_analytics.txt 2011-05-24
15:34:59 UTC (rev 88719)
+++ trunk/wikistats/analytics/create_and_use_db_analytics.txt 2011-05-24
15:52:01 UTC (rev 88720)
@@ -1,23 +1,23 @@
-- invoke this file with "mysql --user=root --password=[...] <
create_and_use_db_analytics.txt"
-/* Create database and two tables from scratch */
-
+-- Create database and two tables from scratch
DROP DATABASE IF EXISTS `analytics` ;
CREATE DATABASE `analytics` ;
+
USE `analytics` ;
CREATE TABLE `comscore` (
-- meta data (mostly for auditing, may not be sent over API on default)
-/*
- store meta dat elsewhere,tbd
- `id` int auto_increment NOT NULL,
- `generated` timestamp,
- `source` char (20),
- `server` char (20),
- `script_name` char (30),
- `script_version` char (8),
- `user` char (20),
-*/
+-- not used right now, store meta dat elsewhere,tbd, kept for brainstorm
+--
+--`id` int auto_increment NOT NULL,
+--`generated` timestamp,
+--`source` char (20),
+--`server` char (20),
+--`script_name` char (30),
+--`script_version` char (8),
+--`user` char (20),
+--
-- analytics data
`yyyymm` char (7),
`country_code` char (3),
@@ -36,18 +36,46 @@
PRIMARY KEY (language_code,region_code)
) ;
+CREATE TABLE `wikistats` (
+-- analytics data
+ `yyyymm` char (7),
+ `project` char (2),
+ `wiki` char (15),
+ `editors_all_time` decimal (10) DEFAULT '-1',
+ `editors_new` decimal (7) DEFAULT '-1',
+ `editors_ge_5` decimal (7) DEFAULT '-1',
+ `editors_ge_25` decimal (7) DEFAULT '-1',
+ `editors_ge_100` decimal (7) DEFAULT '-1',
+ `articles` decimal (12) DEFAULT '-1',
+ `articles_new_per_day` decimal (9) DEFAULT '-1',
+ `articles_over_bytes_500` decimal (12) DEFAULT '-1',
+ `articles_over_bytes_2000` decimal (12) DEFAULT '-1',
+ `edits_per_article` decimal (9,1) DEFAULT '-1',
+ `bytes_per_article` decimal (9,1) DEFAULT '-1',
+ `edits` decimal (12) DEFAULT '-1',
+ `size_in_bytes` decimal (15) DEFAULT '-1',
+ `size_in_words` decimal (15) DEFAULT '-1',
+ `links_internal` decimal (15) DEFAULT '-1',
+ `links_interwiki` decimal (15) DEFAULT '-1',
+ `links_image` decimal (15) DEFAULT '-1',
+ `links_external` decimal (15) DEFAULT '-1',
+ `redirects` decimal (15) DEFAULT '-1',
+ PRIMARY KEY (yyyymm,project,wiki)
+) ;
+
-- SHOW TABLES ;
-- DESCRIBE comscore ;
-- DESCRIBE comscore_regions ;
+ DESCRIBE wikistats ;
-/* Database Manipulation */
-/* Obviously in real world this is a separate script */
+-- Database Manipulation
+-- Obviously in real world this is a separate script
-
-- show contents (debugging only)
-- SELECT *
-- FROM comscore_regions ;
+
LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_comscore_regions.csv'
INTO TABLE comscore_regions
FIELDS TERMINATED BY ','
@@ -60,6 +88,11 @@
OPTIONALLY ENCLOSED BY '"'
(yyyymm,country_code,region_code,property,project,reach,visitors) ;
+LOAD DATA INFILE 'c:/MySQL/analytics/analytics_in_wikistats.csv'
+ INTO TABLE wikistats
+ FIELDS TERMINATED BY ','
+ OPTIONALLY ENCLOSED BY '"'
+
(project,wiki,yyyymm,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
;
-- show contents (debugging only)
SELECT *
FROM comscore
@@ -68,10 +101,16 @@
FIELDS TERMINATED BY ',' ;
-- export all relevant non-meta data from comsCore's reach by region (and
comScore treats India and China as regions in this context)
+SELECT *
+ FROM wikistats
+ WHERE (project = 'wp') AND (wiki = 'en') AND (yyyymm BETWEEN '2010-06'
AND '2011-05')
+ ORDER BY project,wiki,yyyymm
+ INTO OUTFILE 'c:/MySQL/analytics/analytics_out_wikistats_test1.csv'
+ FIELDS TERMINATED BY ',' ;
+-- export all relevant non-meta data from comsCore's reach by region (and
comScore treats India and China as regions in this context)
SELECT yyyymm,region_name,reach
FROM comscore LEFT JOIN comscore_regions ON comscore.region_code =
comscore_regions.region_code AND comscore_regions.language_code = 'en'
WHERE (region_name != '') AND (yyyymm BETWEEN '2010-06' AND '2011-05')
ORDER BY yyyymm,region_name
INTO OUTFILE 'c:/MySQL/analytics/analytics_out_comscore_reach.csv'
FIELDS TERMINATED BY ',' ;
-
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs