Milimetric has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/220958

Change subject: New script: find Migrations patterns
......................................................................

New script: find Migrations patterns

Change-Id: I5460da06614c748530fbaf83986a30bf0aee2f7d
---
A dammit.lt/readme.txt
M dumps/perl/WikiCountsFindMigrationPatterns.pl
A squids/readme.txt
3 files changed, 718 insertions(+), 297 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wikistats 
refs/changes/58/220958/1

diff --git a/dammit.lt/readme.txt b/dammit.lt/readme.txt
new file mode 100644
index 0000000..411f07f
--- /dev/null
+++ b/dammit.lt/readme.txt
@@ -0,0 +1,11 @@
+Documentation (few lines):
+https://www.mediawiki.org/wiki/Analytics/Wikistats/TrafficReports
+
+Sitemap for monthly page views, based on dammit projectcounts files:
+http://stats.wikimedia.org/EN/TablesPageViewsSitemap.htm
+
+See also:
+http://stats.wikimedia.org/cgi-bin/search_portal.pl?search=monthly+page+views+color+coded
+
+
+
diff --git a/dumps/perl/WikiCountsFindMigrationPatterns.pl 
b/dumps/perl/WikiCountsFindMigrationPatterns.pl
index d3b0564..c55eb17 100644
--- a/dumps/perl/WikiCountsFindMigrationPatterns.pl
+++ b/dumps/perl/WikiCountsFindMigrationPatterns.pl
@@ -1,5 +1,7 @@
 #!/usr/bin/perl
 
+# =~
+
   #to do check $ns against 'content' namespaces for that wiki
 
   use Time::Local ;
@@ -7,7 +9,7 @@
 
   # ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time) 
[6] ;
   $yyyy = (localtime(time)) [5] + 1900 ;
-  for ($n = 2001 ; $n <= $yyyy ; $n++)
+  for ($n = 2001 ; $n < $yyyy ; $n++)  # report up to, but not including, the 
(incomplete) current year
   { push @periods, $n ; }
 
   $| = 1; # Flush output
@@ -16,14 +18,23 @@
   our $true = 1 ;
   our $false = 0 ;
 
+  # copy aggregated lines to stdout?
+  our $trace   = $true ;
+  our $notrace = $false ;
+
   our $absolute = 1 ;
   our $relative = 0 ;
 
-  our $sort_by_frequency = 1 ;
-  our $sort_alphabetical = 0 ;
+  our $sort_by_frequency = 0 ;
+  our $sort_alphabetical = 1 ;
 
-  our $threshold_perc_dominant_project = 66 ; # only when edit exceeds this 
threshold on some project, that project will be considered main project for 
that user for that period
-  our $threshold_active_enough = 25 ;
+  our $threshold_perc_dominant_project = 60 ; # only when edit exceeds this 
threshold on some project, that project will be considered main project for 
that user for that period
+  our $threshold_active_enough = 5 ;
+
+  our $skip_users_before_logging = 50000 ;
+  our $first_lines_only          = 0 ;
+  if ($first_lines_only > 0)
+  { $skip_users_before_logging = 0 ; }
 
   our %names ;
   $names {'wb'} = 'Wikibooks' ;
@@ -37,22 +48,37 @@
   $names {'wx'} = 'Other projects' ;
   $names {'co'} = 'Commons' ;
   $names {'wd'} = 'Wikidata' ;
-  
+
+  our %editors_migrating_in ;
+  our $users_upd_counts ; # used for sparse logging (only every Nth user is logged)
+  our $period_max ;       # used for reporting
+
   my %options ;
   getopt ("c", \%options) ;
   $path_csv  = $options {'c'} ;
 
-  $file_in              = 
"$path_csv/EditsPerUserMonthNamespaceAllWikisSortedByUser.csv" ;
-  $file_out_yyyymm      = 
"$path_csv/EditsPerUserAllWikisSortedByUserAggregatedMonthly.csv" ;
-  $file_out_yyyy        = 
"$path_csv/EditsPerUserAllWikisSortedByUserAggregatedYearly.csv" ;
-  $file_out_yyyy_migr   = 
"$path_csv/EditsPerUserAllWikisSortedByUserAggregatedYearlyMigrations.csv" ;
-  $file_out_yyyy_matrix = 
"$path_csv/EditsPerUserAllWikisSortedByUserAggregatedYearlyMigrationsMatrix.csv"
 ;
+  if (-d "d:/\@wikimedia/") # test code on local machine
+  { $path_csv = "d:/\@wikimedia/\# out stat1/csv_mw" ; }
+
+  $file_in                       = 
"$path_csv/EditsPerUserMonthNamespaceAllWikisSortedByUserByPeriod.tsv" ;
+  $file_bots                     = "$path_csv/BotsAllProjects.csv" ;
+
+  $file_out_projects_yyyymm      = 
"$path_csv/EditsPerUserPerProjectAggregatedMonthly.tsv" ;
+  $file_out_projects_yyyy        = 
"$path_csv/EditsPerUserPerProjectAgregatedYearly.tsv" ;
+  $file_out_projects_yyyy_matrix = 
"$path_csv/EditsPerUserPerProjectMigrationsPerYearMatrix-Edits$threshold_active_enough-Perc$threshold_perc_dominant_project.csv"
 ;
+
+  $file_out_wikis_yyyymm         = 
"$path_csv/EditsPerUserPerWikiAggregatedMonthlyWp.tsv" ;
+  $file_out_wikis_yyyy           = 
"$path_csv/EditsPerUserPerWikiAggregatedYearlyWp.tsv" ;
+  $file_out_wikis_yyyy_matrix    = 
"$path_csv/EditsPerUserPerWikiMigrationsPerYearMatrix-Edits$threshold_active_enough-Perc$threshold_perc_dominant_project.csv"
 ;
 
   die "Specify path to csv files as: -c [path]" if ! -d $path_csv ;
+  die "Input file '$file_in' not found" if ! -e $file_in ;
   print "Path to csv files: $path_csv\n" ;
-  
- &Aggregate ;
-  &FindMigrations ;
+
+# &Aggregate ;
+# &FindMigrations ('projects') ;
+  &FindMigrations ('wikis') ;
+  &WriteOutput ('wikis') ;;
 
   print "\nReady\n" ;
   exit ;
@@ -62,358 +88,731 @@
   my $user_prev = '' ;
   my $yyyymm_prev = '' ;
   my $yyyy_prev = '' ;
-      
+
   # read merged editor activity for all wikis
   # user, yyyy-mm, project, language, namespace, edits
-  open  CSV_IN, '<', $file_in || die "Could not open $file_in" ;
-  binmode CSV_IN ;
+  open  TSV_IN, '<', $file_in || die "Could not open $file_in" ;
+  binmode TSV_IN ;
+
+  &ReadBots ;
 
   # output one line per user per month, with total edits per project
-  open  CSV_OUT_YYYYMM, '>', $file_out_yyyymm || die "Could not open 
$file_out_yyyymm" ;
-  binmode CSV_OUT_YYYYMM ;
-  print CSV_OUT_YYYYMM "user,period,edits wb,edits wk,edits wn,edits wo,edits 
wp,edits,wq,edits ws,edits wv,edits wx\n" ;
-  
-  # output one line per user per year with total edits per project
-  open  CSV_OUT_YYYY, '>', $file_out_yyyy || die "Could not open 
$file_out_yyyy" ;
-  binmode CSV_OUT_YYYY ;
-  print CSV_OUT_YYYY "user,period,edits wb,edits wk,edits wn,edits wo,edits 
wp,edits,wq,edits ws,edits wv,edits wx\n" ;
-  
-  while ($line = <CSV_IN>) 
-  {
-    chomp $line ;        
-    ($user,$yyyymm,$proj,$lang,$ns,$edits) = split (',', $line) ;        
-    if ($lang eq 'commons')
-    { $proj = 'co' ; }
-    if ($lang eq 'wikidata')
-    { $proj = 'wd' ; }
+  open (my $fh_out_projects_yyyymm, '>', $file_out_projects_yyyymm || die 
"Could not open $file_out_projects_yyyymm") ;
+  binmode  $fh_out_projects_yyyymm ;
+  print    $fh_out_projects_yyyymm "user,period,edits wb,edits wk,edits 
wn,edits wo,edits wp,edits,wq,edits ws,edits wv,edits wx\n" ;
 
-    next if $ns != 0 and $ns != 6 ; # to do check $ns against 'content' 
namespaces for that wiki
-   
+  # output one line per user per year with total edits per project
+  open (my $fh_out_projects_yyyy, '>', $file_out_projects_yyyy || die "Could 
not open $file_out_projects_yyyy") ;
+  binmode  $fh_out_projects_yyyy ;
+  print    $fh_out_projects_yyyy "user,period,edits wb,edits wk,edits wn,edits 
wo,edits wp,edits,wq,edits ws,edits wv,edits wx\n" ;
+
+  # output one line per user per month, with total edits per wiki
+  open (my $fh_out_wikis_yyyymm, '>', $file_out_wikis_yyyymm || die "Could not 
open $file_out_wikis_yyyymm") ;
+  binmode  $fh_out_wikis_yyyymm ;
+  print    $fh_out_wikis_yyyymm "user\tperiod\tmain wiki\tperc edits on main 
wiki\ttotal edits\tedits per wiki\n" ;
+
+  # output one line per user per year with total edits per wiki
+  open (my $fh_out_wikis_yyyy, '>', $file_out_wikis_yyyy || die "Could not 
open $file_out_wikis_yyyy") ;
+  binmode  $fh_out_wikis_yyyy ;
+  print    $fh_out_wikis_yyyy "user\tperiod\tmain wiki\tperc edits on main 
wiki\ttotal edits\tedits per wiki\n" ;
+
+  my $lines_in ;
+  while ($line = <TSV_IN>)
+  {
+    chomp $line ;
+    ($user,undef,$yyyymm,$project,$wiki,$ns,$edits) = split ("\t", $line) ;
+
+    next if $bots {$project}{$wiki}{$user} ;
+
+    last if (($first_lines_only) && ($lines_in++ > $first_lines_only)) ;
+
+    if ($lines_in++ % 10000 == 0)
+    { print "$user\n" ; }
+    # debug:
+# if ($user eq 'Jimbo_Wales')
+# { $a = 1 ; }
+    if ($wiki eq 'commons')
+    { $project = 'co' ; }
+    if ($wiki eq 'wikidata')
+    { $project = 'wd' ; }
+
+    next if $ns != 0 ; # and $ns != 6 ; # to do check $ns against 'content' 
namespaces for that wiki
     $yyyy = substr ($yyyymm,0,4) ;
 
-    if ($user_prev ne '') # not on first line
-    {
-      if (($user ne $user_prev) || # if all edits for this user for this month 
are collected, send to output
-          ($yyyymm ne $yyyymm_prev))
-      {
-        print CSV_OUT_YYYYMM "$user_prev,$yyyymm_prev," . 
-                            (0 + $yyyymm {'tot'}) . ',' .     
-                            (0 + $yyyymm {'wb'}) . ',' .  
-                            (0 + $yyyymm {'wk'}) . ',' .     
-                            (0 + $yyyymm {'wn'}) . ',' .     
-                            (0 + $yyyymm {'wo'}) . ',' .     
-                            (0 + $yyyymm {'wp'}) . ',' .     
-                            (0 + $yyyymm {'wq'}) . ',' .     
-                            (0 + $yyyymm {'ws'}) . ',' .     
-                            (0 + $yyyymm {'wv'}) . ',' .     
-                            (0 + $yyyymm {'wx'}) . ',' .     
-                            (0 + $yyyymm {'co'}) . ',' .     
-                            (0 + $yyyymm {'wd'}) . "\n" ;
-       foreach $key (keys %yyyymm) { $yyyymm {$key} = 0 ; }                 
-      }
+# next if substr ($yyyymm,5,2) ne '01' ;
+# next if $project ne 'wp' ;
+# next if $wiki ne 'en' ;
+ ## next if $yyyy != 2004 ;
 
-      if (($user ne $user_prev) || # if all edits for this user for this year 
are collected, send to output
-          ($yyyy ne $yyyy_prev))
-      {
-        print CSV_OUT_YYYY   "$user_prev,$yyyy_prev," . 
-                            (0 + $yyyy   {'tot'}) . ',' .     
-                            (0 + $yyyy   {'wb'}) . ',' .     
-                            (0 + $yyyy   {'wk'}) . ',' .     
-                            (0 + $yyyy   {'wn'}) . ',' .     
-                            (0 + $yyyy   {'wo'}) . ',' .     
-                            (0 + $yyyy   {'wp'}) . ',' .     
-                            (0 + $yyyy   {'wq'}) . ',' .     
-                            (0 + $yyyy   {'ws'}) . ',' .     
-                            (0 + $yyyy   {'wv'}) . ',' .     
-                            (0 + $yyyy   {'wx'}) . ',' .     
-                            (0 + $yyyy   {'co'}) . ',' .     
-                            (0 + $yyyy   {'wd'}) . "\n" ;   
-       foreach $key (keys %yyyy) { $yyyy {$key} = 0 ; }                     
-      }
+    if (($user ne $user_prev) || # if all edits for this user for this month 
are collected, send to output
+        ($yyyymm ne $yyyymm_prev))
+    {
+      &WriteProjectCounts ($notrace, $fh_out_projects_yyyymm, $user_prev, 
$yyyymm_prev, \%yyyymm_projects) ;
+      &WriteWikiCounts    ($notrace, $fh_out_wikis_yyyymm,    $user_prev, 
$yyyymm_prev, \%yyyymm_wikis) ;
+      undef %yyyymm_projects ;
+      undef %yyyymm_wikis ;
+    }
+
+    if (($user ne $user_prev) || # if all edits for this user for this year 
are collected, send to output
+        ($yyyy ne $yyyy_prev))
+    {
+      &WriteProjectCounts ($notrace, $fh_out_projects_yyyy, $user_prev, 
$yyyy_prev, \%yyyy_projects) ;
+      &WriteWikiCounts    ($notrace, $fh_out_wikis_yyyy,    $user_prev, 
$yyyy_prev, \%yyyy_wikis) ;
+      undef %yyyy_projects ;
+      undef %yyyy_wikis ;
     }
 
     $user_prev   = $user ;
-    $yyyymm_prev = $yyyymm ;  
-    $yyyy_prev   = $yyyy ;  
+    $yyyymm_prev = $yyyymm ;
+    $yyyy_prev   = $yyyy ;
 
-    # aggregate per month/year, per project and all projects 
-    $yyyymm {'tot'} += $edits ;
-    $yyyy   {'tot'} += $edits ;
-    $yyyymm {$proj} += $edits ;
-    $yyyy   {$proj} += $edits ;
-  
-    # debug:  
+    # aggregate per month/year, per project and all projects
+    $yyyymm_projects {'tot'} += $edits ;
+    $yyyy_projects   {'tot'} += $edits ;
+    $yyyymm_projects {$project} += $edits ;
+    $yyyy_projects   {$project} += $edits ;
+
+    # aggregate per month/year, per wiki and all wikis
+    if ($project eq 'wp')
+    {
+      $yyyymm_wikis {'tot'} += $edits ;
+      $yyyy_wikis   {'tot'} += $edits ;
+      $yyyymm_wikis {$wiki} += $edits ;
+      $yyyy_wikis   {$wiki} += $edits ;
+    }
+
+    # debug:
     # print "$user $yyyymm $yyyy $edits $lang\n" ;
   }
 
-  # send totals to output for last user
-  print CSV_OUT_YYYYMM "$user_prev,$yyyymm_prev," . 
-                       (0 + $yyyymm {'tot'}) . ',' .     
-                              (0 + $yyyymm {'wb'}) . ',' .     
-                       (0 + $yyyymm {'wk'}) . ',' .     
-                       (0 + $yyyymm {'wn'}) . ',' .     
-                       (0 + $yyyymm {'wo'}) . ',' .     
-                       (0 + $yyyymm {'wp'}) . ',' .     
-                       (0 + $yyyymm {'wq'}) . ',' .     
-                       (0 + $yyyymm {'ws'}) . ',' .     
-                       (0 + $yyyymm {'wv'}) . ',' .     
-                       (0 + $yyyymm {'wx'}) . ',' .     
-                       (0 + $yyyymm {'co'}) . ',' .     
-                       (0 + $yyyymm {'wd'}) . "\n" ;   
-  print CSV_OUT_YYYYMM "$user_prev,$yyyy_prev," . 
-                       (0 + $yyyy   {'tot'}) . ',' .     
-                              (0 + $yyyy   {'wb'}) . ',' .     
-                       (0 + $yyyy   {'wk'}) . ',' .     
-                       (0 + $yyyy   {'wn'}) . ',' .     
-                       (0 + $yyyy   {'wo'}) . ',' .     
-                       (0 + $yyyy   {'wp'}) . ',' .     
-                       (0 + $yyyy   {'wq'}) . ',' .     
-                       (0 + $yyyy   {'ws'}) . ',' .     
-                       (0 + $yyyy   {'wv'}) . ',' .     
-                       (0 + $yyyy   {'wx'}) . ',' .     
-                       (0 + $yyyy   {'co'}) . ',' .     
-                       (0 + $yyyy   {'wd'}) . "\n" ;   
+  print "\nAll input processed\n\n" ;
+  &WriteProjectCounts ($notrace, $fh_out_projects_yyyymm, $user_prev, 
$yyyymm_prev, \%yyyymm_projects) ;
+  &WriteProjectCounts ($notrace, $fh_out_projects_yyyy,   $user_prev, 
$yyyy_prev,   \%yyyy_projects) ;
+  &WriteWikiCounts    ($notrace, $fh_out_wikis_yyyymm,    $user_prev, 
$yyyymm_prev, \%yyyymm_wikis) ;
+  &WriteWikiCounts    ($notrace, $fh_out_wikis_yyyy,      $user_prev, 
$yyyy_prev,   \%yyyy_wikis) ;
 
-  close CSV_IN ;
-  close CSV_OUT_YYYYMM ;
-  close CSV_OUT_YYYY ;
+  close TSV_IN ;
+}
+
+sub WriteProjectCounts # writes one tab-separated line for one user and period: main project, its share of edits, total edits, then edits per project
+{
+  my ($trace, $filehandle, $user, $period, $hash_userdata) = @_ ; # $trace: also echo the line to stdout; $hash_userdata: ref to hash of edit counts keyed by project code, plus 'tot'
+  my %userdata = %$hash_userdata ; # local copy of the counts
+
+  return if $user eq '' ; # empty user name: nothing collected yet (first line of input)
+
+  my ($edits_max, $project_main, $perc_main, $project, $line, $key) ; # NOTE(review): $key is never used in this sub
+
+  $project_main = 'xx' ; # placeholder, kept when no listed project has more than 0 edits
+  $edits_max = 0 ;
+  foreach $project (qw (wb wk wn wo wp wq ws wv wx co wd))
+  { # find the project with the most edits; the strict '>' below means the first code in the list wins a tie
+    if ($userdata {$project} > $edits_max)
+    {
+      $edits_max = $userdata {$project} ;
+      $project_main = $project ;
+    }
+  }
+
+  if ($userdata {'tot'} == 0)
+  { # guard: a zero total would make the percentage below divide by zero
+    print "total edits zero for user '$user' period $period" ;
+    exit ; # NOTE(review): this ends the whole script, and the message above has no trailing newline
+  }
+  $perc_main = sprintf ("%.0f", 100 * $edits_max / $userdata {'tot'}) ; # share of the main project, rounded to a whole percent
+  $line =  "$user\t$period\t" . # NOTE(review): tab-separated with main project and percentage columns, unlike the comma-separated header printed by the caller - confirm
+           $project_main . "\t" .
+           $perc_main . "\t" .
+           (0 + $userdata {'tot'}) . "\t" . # '0 +' makes a missing count print as 0
+           (0 + $userdata {'wb'}) . "\t" .
+           (0 + $userdata {'wk'}) . "\t"        .
+           (0 + $userdata {'wn'}) . "\t"        .
+           (0 + $userdata {'wo'}) . "\t"        .
+           (0 + $userdata {'wp'}) . "\t"        .
+           (0 + $userdata {'wq'}) . "\t"        .
+           (0 + $userdata {'ws'}) . "\t"        .
+           (0 + $userdata {'wv'}) . "\t"        .
+           (0 + $userdata {'wx'}) . "\t"        .
+           (0 + $userdata {'co'}) . "\t"        .
+           (0 + $userdata {'wd'}) ;
+
+  if ($trace) # optional copy of the line to stdout
+  { print "$line\n" ; }
+  print $filehandle "$line\n" ;
+}
+
+
+sub WriteWikiCounts # writes one tab-separated line for one user and period: main wiki, its share of edits, total edits, then 'wiki:edits' pairs joined by '|'
+{
+  my ($trace, $filehandle, $user, $period, $hash_userdata) = @_ ; # $trace: also echo the line to stdout; $hash_userdata: ref to hash of edit counts keyed by wiki code, plus 'tot'
+  my %userdata = %$hash_userdata ; # local copy of the counts
+
+  return if $user eq '' ; # empty user name: nothing collected yet (first line of input)
+
+  my ($edits_max, $wiki_main, $perc_main, $wiki, $line, $key) ; # NOTE(review): $key is never used in this sub
+
+  $wiki_main = '?' ; # placeholder, kept when the loop below finds no wiki with more than 0 edits
+  $edits_max = 0 ;
+  foreach $wiki (@wikis) # NOTE(review): @wikis is not filled anywhere in the visible part of this patch - confirm it is populated, otherwise main wiki stays '?' with 0 percent
+  {
+    if ($userdata {$wiki} > $edits_max)
+    {
+      $edits_max = $userdata {$wiki} ;
+      $wiki_main = $wiki ;
+    }
+  }
+
+  return if $userdata {'tot'} == 0 ; # user did not edit on Wikipedia (also avoids dividing by zero below)
+
+  @wikis_this_user = sort {$userdata {$b} <=> $userdata {$a}} keys %userdata ; # all keys (including 'tot'), highest edit count first; NOTE(review): not declared with my, so a package variable
+
+  $perc_main = sprintf ("%.0f", 100 * $edits_max / $userdata {'tot'}) ; # share of the main wiki, rounded to a whole percent
+  $line =  "$user\t$period\t" .
+           $wiki_main . "\t" .
+           $perc_main . "\t" .
+           $userdata {'tot'} . "\t" ;
+
+  foreach $wiki (@wikis_this_user)
+  {
+    next if $wiki eq 'tot' ; # 'tot' is the sum, not a wiki
+    last if $userdata {$wiki} == 0 ; # list is sorted descending, so only zero counts can follow
+    $line .= "$wiki:" . $userdata {$wiki} . "|" ;
+  }
+
+  $line =~ s/\|$// ; # drop the trailing '|' separator
+
+  if ($trace) # optional copy of the line to stdout
+  { print "$line\n" ; }
+  print $filehandle "$line\n" ;
 }
 
 sub FindMigrations
 {
-  open  CSV_IN_YYYY, '<', $file_out_yyyy || die "Could not open 
$file_out_yyyy" ;
+  my ($mode) = @_ ;
+
+  if ($mode eq 'projects')
+  {
+    $name_workspace = 'project' ;
+    open  CSV_IN_YYYY,    '<', $file_out_projects_yyyy        || die "Could 
not open $file_out_projects_yyyy" ;
+    open  CSV_OUT_MATRIX, '>', $file_out_projects_yyyy_matrix || die "Could 
not open $file_out_projects_yyyy_matrix" ;
+  }
+  else
+  {
+    $name_workspace = 'wiki' ;
+    open  CSV_IN_YYYY,    '<', $file_out_wikis_yyyy        || die "Could not 
open $file_out_wikis_yyyy" ;
+    open  CSV_OUT_MATRIX, '>', $file_out_wikis_yyyy_matrix || die "Could not 
open $file_out_wikis_yyyy_matrix" ;
+  }
+
   binmode CSV_IN_YYYY ;
-  
-  open  CSV_OUT_YYYY, '>', $file_out_yyyy_migr || die "Could not open 
$file_out_yyyy_migr" ;
-  binmode CSV_OUT_YYYY ;
-  
-  open  CSV_OUT_MATRIX, '>', $file_out_yyyy_matrix || die "Could not open 
$file_out_yyyy_matrix" ;
   binmode CSV_OUT_MATRIX ;
 
+  my $user_prev = '' ;
+  my (%workspace_main, %perc_main, %total_edits, $lines_read) ;
+  my $periods_active_enough = 0 ;
+
+  my $line = <CSV_IN_YYYY> ; # skip header line
   while ($line = <CSV_IN_YYYY>)
   {
-    chomp $line ;        
-    
-    my ($user,$period,$total,$wb,$wk,$wn,$wo,$wp,$wq,$ws,$wv,$wx,$co,$wd) = 
split (',', $line) ;        
+    last if (($first_lines_only) && ($lines_migrations++ > $first_lines_only)) 
;
 
+    chomp $line ;
+  # my 
($user,$period,$workspace_main,$perc_main,$total,$wb,$wk,$wn,$wo,$wp,$wq,$ws,$wv,$wx,$co,$wd)
 = split (',', $line) ; # for projects
+    my ($user,$period,$workspace_main,$perc_main,$total) = split ("\t", $line) 
; # works for projects and wikis
 
-    if ($total < $threshold_active_enough)
+    $workspaces_main {$workspace_main}++ ;
+
+    if ($period gt $period_max)
+    { $period_max = $period ; }
+
+    if (($user ne $user_prev) && ($user_prev ne ''))
     {
-      $active_below_treshold {$period} ++ ;
-      next ;       
-    }      
-  
-    # calc distribution of edits per project for this user 
-    $perc {'wb'} = sprintf ('%.0f', 100 * $wb / $total) ;
-    $perc {'wk'} = sprintf ('%.0f', 100 * $wk / $total) ;
-    $perc {'wn'} = sprintf ('%.0f', 100 * $wn / $total) ;
-    $perc {'wo'} = sprintf ('%.0f', 100 * $wo / $total) ;
-    $perc {'wp'} = sprintf ('%.0f', 100 * $wp / $total) ;
-    $perc {'wq'} = sprintf ('%.0f', 100 * $wq / $total) ;
-    $perc {'ws'} = sprintf ('%.0f', 100 * $ws / $total) ;
-    $perc {'wv'} = sprintf ('%.0f', 100 * $wv / $total) ;
-    $perc {'wx'} = sprintf ('%.0f', 100 * $wx / $total) ;
-    $perc {'co'} = sprintf ('%.0f', 100 * $co / $total) ;
-    $perc {'wd'} = sprintf ('%.0f', 100 * $wd / $total) ;
+      &UpdCountsForThisUser ($user_prev, $periods, \%workspace_main, 
\%perc_main, \%total_edits) ;
 
-    # scan all projects for this user and period until main project found
-    # on main project (when found) count user as 'stayed on same project' or 
'migrated'
-    $main_project_found = $false ;
-    foreach $proj (qw (wb wk wn wo wp wq ws wv wx co wd))
-    {
-      # reset for new user
-      if ($user ne $user_prev)
-      {
-       $proj_prev   = '' ;
-       $perc_prev   = 0 ;
-       $total_prev  = 0 ;
-       $period_prev = 0 ;
-        $main_project_found_prev = $main_project_found ;
-      }
-
-      # if there is one project which received most edits
-      # compare with previous project which received most edits
-      # if different project count as 'migration'
-      if ($perc {$proj} >= $threshold_perc_dominant_project)   
-      { 
-       $active_above_treshold {'all'} {$period} ++ ;
-       $active_above_treshold {$proj} {$period} ++ ;
-
-       $perc = $perc {$proj} ;
-       # print "$user,$period,$proj ($perc\%}}\%\n" ;      
-        
-       # next period for same user ?
-       if (($user eq $user_prev) && ($proj_prev ne '')) 
-        {      
-          # user migrated to other main project ?  
-         if ($proj ne $proj_prev)    
-          {
-           $active_above_treshold_incoming {$proj} {$period} ++ ;
-           $active_above_treshold_leaving  {$proj} {$period} ++ ;
-           
-            # log migration
-           print "$user,$period,$proj_prev ($perc_prev\% of $total_prev) -> 
$proj ($perc\% of $total)})\n" ;   
-           
-           $path = "$proj_prev->$proj" ;       
-           $migrations {"$period,$path"} ++ ;
-           $migrations_total {$path} ++ ; # used as column sort in reporting
-          } 
-         else
-         { $active_above_treshold_continuing {$proj} {$period} ++ ; }
-        }
-
-       # mark this project as dominant for next iteration(s)
-       $proj_prev   = $proj ;
-       $perc_prev   = $perc ;
-       $total_prev  = $total ;
-       $period_prev = $period ;
-       
-       # main project has been found -> stop scan
-        $main_project_found = $true ;
-       last ;
-      }
-
-      $user_prev = $user ;
+      undef %workspace_main ;
+      undef %perc_main ;
+      undef %total_edits ;
+      undef $periods_active_enough ;
+      $periods = 0 ;
     }
 
-    # if not main project found this period, but it was found on previous 
period, count user as 'drop-out'
-    if (! $main_project_found)
-    {
-      if ($main_project_found_prev)
-      {
-        $gone_or_dropped_below_treshold {$proj} {$period} ++ ;
-      }              
-    }
-    $main_project_found_prev = $main_project_found ;
-  }  
+    $workspace_main {$period} = $workspace_main ;
+    $perc_main      {$period} = $perc_main ;
+    $total_edits    {$period} = $total ;
+    $periods ++ ;
 
-  print CSV_OUT_MATRIX "Only editors with at least $threshold_active_enough 
namespace 0 edits in a year are examined here for migration behavior\n" ;
-  print CSV_OUT_MATRIX "An editor is considered to have a 'main project of 
interest' when at least threshold_perc_dominant_project\% of ns 0 edits are on 
that project\n" ;
-  print CSV_OUT_MATRIX "An editor is considered to have 'migrated' when this 
main project differs in consecutive qualifying (enough edits) periods\n" ;
-  print CSV_OUT_MATRIX "An editor can migrate from one project to another and 
migrate back later on\n" ;
-
-  &WriteMigrationsPerPath ($absolute, $sort_alphabetical) ;
-  &WriteMigrationsPerPath ($absolute, $sort_by_frequency) ;
-
-  &WriteMigrationsPerProject ($absolute) ; 
-  &WriteMigrationsPerProject ($relative) ; 
+    $user_prev = $user ;
+  }
+  &UpdCountsForThisUser ($user_prev, $periods, \%workspace_main, \%perc_main, 
\%total_edits) ;
 }
 
- 
+sub UpdCountsForThisUser # classifies each period of one user (once / new / staying / migrated / last qualifying) and updates the script-wide counters; prints a trace for a sample of users
+{
+  my 
($username,$periods,$hash_workspace_main,$hash_perc_main,$hash_total_edits) = 
@_ ; # the three hash refs are keyed by period; NOTE(review): $periods is not used in this sub
+
+  my %workspace_main = %$hash_workspace_main ; # main workspace per period
+  my %perc_main      = %$hash_perc_main ; # percentage of edits on that main workspace, per period
+  my %total_edits    = %$hash_total_edits ; # total edits per period
+
+  my ($log_migrations_lines,$userline,$loglines,
+      
$path,$flag,$flag_prev,$period,$total,$total_prev,$perc,$perc_prev,$period_prev,$workspace,$workspace_prev,
+      %qualifying_period, $qualifying_periods, $first_qualifying_period, 
$last_qualifying_period) ;
+
+  foreach $period (sort keys %total_edits) # pass 1: mark qualifying periods and remember the first and the last one
+  {
+    $perc      = $perc_main   {$period} ;
+    $total     = $total_edits {$period} ;
+
+    if (($total >= $threshold_active_enough) && ($perc >= 
$threshold_perc_dominant_project))
+    { # qualifies: enough edits in total and a large enough share on the main workspace
+      $qualifying_period {$period} ++ ;
+      $qualifying_periods ++ ;
+      $qualifying_periods_overall ++ ; # not declared or reset in this sub, so it accumulates over all users
+      $last_qualifying_period = $period ;
+
+      if ($first_qualifying_period eq '')
+      { $first_qualifying_period = $period ; }
+    }
+    else
+    {
+      if ($total >= $threshold_active_enough) # enough edits, but share on main workspace below threshold
+      { $non_qualifying_periods_overall ++ ; }
+    }
+  }
+
+  foreach $period (sort keys %total_edits) # pass 2: classify each period, update counters, build the trace line
+  {
+    $flag = '-' ;
+
+    if ($qualifying_period {$period})
+    {
+      $workspace = $workspace_main {$period} ;
+      $perc      = $perc_main   {$period} ;
+      $total     = $total_edits {$period} ;
+
+      $editors {'all'} {$period} ++ ; # qualifying editors per period, all workspaces together
+      $editors {$workspace} {$period} ++ ; # qualifying editors per period, per main workspace
+
+      if ($qualifying_periods == 1)
+      {
+        $editors_once {$workspace} {$period} ++ ; # matrix metric
+        $flag = '1' ; # this user has only one qualifying period
+      }
+      # user new or back after no or too few edits?
+      # if (($period_prev eq '') || ($period > $period_prev + 1))
+      elsif ($period eq $first_qualifying_period) # ($flag_prev eq '') || 
($flag_prev =~ /[ep]/))
+      {
+        $editors_new {$workspace} {$period} ++ ; # matrix metric
+        $flag = 'N' ; # new: first of several qualifying periods
+      }
+      # a later qualifying period: compare with the previous qualifying one (not necessarily the period directly before)
+      else
+      {
+        # did main workspace stay the same in those years?
+        if ($workspace eq $workspace_prev)
+        {
+          $editors_staying {$workspace} {$period} ++ ; # matrix metric
+          $flag = 'S' ; # staying
+        }
+        else
+        {
+          # user migrated to other main workspace
+          $editors_migrating_in  {$workspace}      {$period}      ++ ;  # 
matrix metric
+          $editors_migrating_out {$workspace_prev} {$period_prev} ++ ;  # 
matrix metric
+          $flag = 'M' ; # migrated
+
+          # log migration
+          if (((++ $log_migrations_lines) % 1) == 0) # NOTE(review): modulo 1 is always 0, so every migration is logged
+          { $loglines .= sprintf ("%30s",$user) . 
"\t$user,$period,$workspace_prev ($perc_prev\% of $total_prev) -> " . # NOTE(review): uses $user, not the $username parameter of this sub - confirm
+                                                  "$workspace ($perc\% of 
$total)})\n" ; }
+
+          $path     = "$workspace_prev->$workspace" ;
+          $path_out = "$workspace_prev->" ; # NOTE(review): $path_out and $path_in are not in the my list above, so they are package variables
+          $path_in  = "->$workspace" ;
+
+          $migrations {"$period,$path"} ++ ;
+          $migrations_total {$path} ++ ; # used as column sort in reporting
+
+          $migrations_in  {"$period,$path_in"}  ++ ; # arrival is counted in the current period
+          $migrations_out {"$period_prev,$path_out"} ++ ; # departure is counted in the previous qualifying period
+         }
+      }
+
+      if ($period eq $last_qualifying_period)
+      {
+        $editors_lost {$workspace_prev} {$period} ++ ; # matrix metric; NOTE(review): $workspace_prev still holds the previous qualifying period's workspace here - confirm $workspace was not intended
+        $flag = 'X' ; # X = last qualifying period, overrides earlier set flag 
(for trace only)
+      }
+
+      $period_prev     = $period ; # remember this qualifying period for the next comparison
+      $workspace_prev  = $workspace ;
+      $perc_prev       = $perc ;
+      $total_prev      = $total ;
+    }
+    elsif ($total < $threshold_active_enough) # NOTE(review): $total is not reloaded for non-qualifying periods in this loop, so this tests a stale value
+    { $flag = "e" ; } # not enough edits
+    else
+    { $flag = "p" ; } # not enough percentage edits on main project
+
+    $userline .= 
"$period:$flag:${workspace_main{$period}}:${perc_main{$period}}\%[ed:${total_edits{$period}}],
 " ;
+
+    $flag_prev       = $flag ; # NOTE(review): $flag_prev is only referenced in commented-out code
+  }
+
+  $userline =~ s/,\s*$// ; # drop the trailing comma and whitespace
+  if (($skip_users_before_logging == 0) || ($users_upd_counts++ % 
$skip_users_before_logging == 0)) # trace every user when the setting is 0, else only every Nth user
+  { print sprintf ("%30s",$username) . "\t$userline\n$loglines\n\n" ; }
+}
+
+sub WriteOutput # emits the explanatory texts via print_comment, then calls the per-path and per-workspace report writers
+{
+  my ($workspace) = @_ ; # passed on unchanged to both writers below; the visible caller passes 'wikis'
+
+# &print_comment ("Only editors with at least $threshold_active_enough 
namespace 0 edits in a year are examined here for migration behavior\n") ;
+# &print_comment ("An editor is considered to have a 'main project of 
interest' when at least $threshold_perc_dominant_project\% of ns 0 edits are on 
that project\n") ;
+
+  &print_comment ("This report is about a subset of editors who contributed 
substantially to one or more Wikimedia wikis and most of those edits to one 
$name_workspace\n") ; # $name_workspace is set in FindMigrations ('project' or 'wiki')
+  &print_comment ("Edits are page changes in 'content' namespaces") ; # NOTE(review): unlike the call above this text has no trailing \n - check how print_comment (not visible here) terminates lines
+  &print_comment ("Editors qualify when they made at least 
$threshold_active_enough edits in a year, and at least 
$threshold_perc_dominant_project\% of those edits in one ('main') project\n") ;
+  &print_comment ("An editor is considered to have 'migrated' when this main 
project differs in consecutive qualifying (= enough edits) periods") ;
+  &print_comment ("An editor can migrate from one project to another and 
migrate back later on") ;
+  &print_comment ("Each editor is counted at most once a year, on their 'main' 
$name_workspace only") ;
+
+  &WriteMigrationsPerPath ($absolute, $sort_by_frequency, $workspace) ; # only the by-frequency ordering is requested here
+# &WriteMigrationsPerPath ($absolute, $sort_by_frequency) ;
+
+  &WriteMigrationsPerWorkspace ($workspace) ; # defined outside the visible part of this patch
+}
+
+
+# Write two tables, each via print_comment / print_columns:
+# 1 migrations per path ('from->to'): a header row, a totals row, and one row per period
+# 2 editors migrating in / out per workspace, as percentage of $editors {workspace} {period}
+# Arguments:
+#   $absolute   - overwritten with $true below, so only absolute numbers are reported
+#   $sort_order - $sort_alphabetical sorts paths by name, any other value by descending total
+#   $workspace  - received from caller; reused as loop variable in the second table
+# Side effects: sets globals $msg_sort, @sequence_paths, @sequence_workspaces
+# Both tables are cut off after 20 columns (paths resp. workspaces), first period is skipped
 sub WriteMigrationsPerPath
 {
-  my ($absolute, $sort_order)  = @_ ;
+  my ($absolute, $sort_order, $workspace)  = @_ ;
 
   $absolute = $true ; # not sure yet about how to calc best relative 
migrations -> relative not yet implemented
 
-  if ($absolute)
-  { print CSV_OUT_MATRIX "Absolute migrations\n\n" ; }
-  else
-  { print CSV_OUT_MATRIX "Relative migrations (percentage of editors above 
threshold $threshold_perc_dominant_project in that period (migrating + non 
migrating)\n\n" ; }
+  my $line ;
 
+  # NOTE(review): both branches build @sequence_workspaces the same way, only @sequence_paths differs
   if ($sort_order == $sort_alphabetical)
-  { 
-    print CSV_OUT_MATRIX "Migration paths ordered alphabetically\n" ; 
-    @sequence_paths = sort {$migrations_total {$b} <=>  $migrations_total 
{$a}} keys %migrations_total ; 
+  {
+    $msg_sort = "paths ordered alphabetically" ;
+    @sequence_paths = sort {$a cmp $b} keys %migrations_total ;
+    @sequence_workspaces = sort {$editors {$b} {$period_max} <=> $editors {$a} 
{$period_max}} keys %workspaces_main ;
   }
   else
-  { 
-    print CSV_OUT_MATRIX "Migration paths ordered by frequency of 
occurrence\n" ; 
-    @sequence_paths = sort {$a cmp $b} keys %migrations_total ; 
+  {
+    $msg_sort = "paths ordered by frequency of occurrence" ;
+    @sequence_paths = sort {$migrations_total {$b} <=>  $migrations_total 
{$a}} keys %migrations_total ;
+    @sequence_workspaces = sort {$editors {$b} {$period_max} <=> $editors {$a} 
{$period_max}} keys %workspaces_main ;
   }
 
-  # print headers   
+  # NOTE(review): legend is added when $namespace eq 'project', while 
+  # WriteMigrationsPerWorkspace tests $workspace eq 'projects' - confirm which is intended
+  # the else branch can not be reached as long as $absolute is forced to $true above
+  if ($absolute)
+  {
+    $line = "\n\nMigrations per path (= from $name_workspace xx -> to 
$name_workspace yy), as absolute numbers, $msg_sort\n" ;
+    if ($namespace eq 'project')
+    { $line .= "wb:wikibooks, wk:wiktionary, wn:wikinews, wo:wikivoyage, 
wp:wikipedia, wq:wikiquote, ws:wikisource, wv:wikiversity, co:commons, 
wd:wikidata, wx:other projects\n" ; }
+  }
+  else
+  {
+    $line = "\n\nMigrations per path (= from $name_workspace xx -> to 
$name_workspace yy), as relative numbers, $msg_sort\n" ;
+    if ($namespace eq 'project')
+    { $line .= "wb:wikibooks, wk:wiktionary, wn:wikinews, wo:wikivoyage, 
wp:wikipedia, wq:wikiquote, ws:wikisource, wv:wikiversity, co:commons, 
wd:wikidata, wx:other projects\n" ; }
+    $line .= "Relative migrations (percentage of editors above threshold 
$threshold_perc_dominant_project in that period (migrating + non migrating)" ;
+  }
+  &print_comment ($line) ;
+
+  # share of user-periods which were not counted as qualifying, only shown when any were counted at all
+  if ($qualifying_periods_overall + $non_qualifying_periods_overall > 0 )
+  { $line = sprintf ("%.1f", 100 - 100 * $qualifying_periods_overall / 
($qualifying_periods_overall + $non_qualifying_periods_overall)) .
+            "\% user-periods were discarded where user had enough edits but no 
'main' $name_workspace (> $threshold_perc_dominant_project\% edits)\n" ;
+    &print_comment ($line) ;
+  }
+
+  # print headers
   $line = "," ;
+  $column = 0 ;
   foreach $path (@sequence_paths)
-  { $line .= "$path," ; }
-  print CSV_OUT_MATRIX "$line\n" ;
-  $line =~ s/,/\t/g ;
-  print "$line\n" ;
+  {
+    $line .= ralign ($path) . ',' ;
+    last if ++$column >= 20 ;
+  }
+  &print_columns ($line) ;
+
+  # print totals
+  $line = "total," ;
+  $column = 0 ;
+  foreach $path (@sequence_paths)
+  {
+    $line .= ralign ($migrations_total {$path}) . ',' ;
+    last if ++$column >= 20 ;
+  }
+  &print_columns ($line) ;
 
   # print per period for each significant migration pair (from->to) number of 
migrations
-  foreach $period (sort keys %periods) 
-  { 
+  foreach $period (@periods)
+  {
+    next if $period == $periods [0] ;
     $line = "$period," ;
+    $column = 0 ;
     foreach $path (@sequence_paths)
-    { $line .= $migrations {"$period,$path"} . "," ; }
-    print CSV_OUT_MATRIX "$line\n" ;
-    $line =~ s/,/\t/g ;
-    print "$line\n" ;
+    {
+      $line .= ralign ($migrations {"$period,$path"}) . "," ;
+      last if ++$column >= 20 ;
+    }
+    &print_columns ($line) ;
   }
-  print CSV_OUT_MATRIX "\n" ;
-}  
 
-sub WriteMigrationsPerProject
+  # second table: per workspace two columns, percentage migrating in and percentage migrating out
+  &print_comment ("\nEditors who migrated in or out - as percentage of total 
editors who qualified for that year and $name_workspace\n\n") ;
+
+  # print headers
+  $line = "," ;
+  $column = 0 ;
+  foreach $path (@sequence_workspaces)
+  {
+    $line .= ralign ("$path in") . ',' . ralign ("$path out") . ',' ;
+    last if ++$column >= 20 ;
+  }
+  &print_columns ($line) ;
+
+#  # print totals
+#  $line = "total," ;
+#  $column = 0 ;
+#  foreach $path (@sequence_workspaces)
+#  {
+#    $line .= ralign ($migrations_total {$path}) . ',' ;
+#    last if ++$column >= 20 ;
+#  }
+#  &print_columns ($line) ;
+
+  # NOTE(review): comment below was copied from the first table, this loop prints 
+  # per period and per workspace the percentages from %migrations_in and %migrations_out
+  # print per period for each significant migration pair (from->to) number of 
migrations
+  foreach $period (@periods)
+  {
+    next if $period == $periods [0] ;
+    $line = "$period," ;
+    $column = 0 ;
+    foreach $workspace (@sequence_workspaces)
+    {
+    #  $migrations_in  = $migrations_in  {"$period,->$workspace"} ;
+    #  $migrations_out = $migrations_out {"$period,$workspace->"} ;
+    # if ($migrations_in  eq '') { $migrations_in  = '-' ; }
+    # if ($migrations_out eq '') { $migrations_out = '-' ; }
+
+      $perc_in  = &percent ($editors {$workspace} {$period},$migrations_in  
{"$period,->$workspace"}) ;
+      $perc_out = &percent ($editors {$workspace} {$period},$migrations_out 
{"$period,$workspace->"}) ;
+
+      $line .= ralign ($perc_in) . ',' . ralign ($perc_out) . ',' ;
+      last if ++$column >= 20 ;
+    }
+    &print_columns ($line) ;
+  }
+
+  &print_comment ("\n") ;
+}
+
+# Write one report section per workspace: for every period the number of editors
+# that qualified one year only / were new / stayed / migrated in / total / migrated out / were lost,
+# first as absolute counts, then as percentage of all qualifying editors in that period
+# ($editors {'all'} {$period}).
+# Argument $workspace: 'projects' selects the fixed list of project codes,
+# any other value selects the workspaces with most editors in $period_max.
+# Side effect: sets global @sequence_workspaces.
+# Output goes to stdout and CSV_OUT_MATRIX (directly, and via print_comment / print_columns).
+sub WriteMigrationsPerWorkspace
 {
-  my ($absolute) = @_ ;
+  my ($workspace) = @_ ;
 
-  print "\n\n" ;
-  print CSV_OUT_MATRIX "\n\n," ;
 
-  print CSV_OUT_MATRIX "wb:wikibooks, wk:wiktionary, wn:wikinews, 
wo:wikivoyage, wp:wikipedia, wq:wikiquote, ws:wikisource, wv:wikiversity, 
co:commons, wd:wikidata, wx:other projects\n" ;
+  &print_comment ("\n") ;
 
-  print CSV_OUT_MATRIX "Editors who did most editing on one project (> 
threshold_perc_dominant_project\%), per project\n" ; 
-  if ($absolute)
-  { print CSV_OUT_MATRIX "Percentages show how many editors came from other 
projects, as share of total editors on this project\n\n" ; }
+# if ($absolute)
+# { print CSV_OUT_MATRIX "Percentages show how many editors came from other 
workspaces, as share of total editors on this workspace\n\n" ; }
+#  else
+#  { print CSV_OUT_MATRIX "Percentages show how many editors came from other 
workspaces, as share of total editors overall\n\n" ; }
+
+  # NOTE(review): WriteMigrationsPerPath tests $namespace eq 'project' for the same purpose - confirm
+  if ($workspace eq 'projects')
+  { @sequence_workspaces = qw (wb wk wn wo wp wq ws wv co wd wx) ; }
   else
-  { print CSV_OUT_MATRIX "Percentages show how many editors came from other 
projects, as share of total editors overall\n\n" ; }
+  {
+    @sequence_workspaces = sort {$editors {$b} {$period_max} <=> $editors {$a} 
{$period_max}} keys %workspaces_main ;
+    # only shorten the list: assigning a larger last index would add undefined 
+    # entries, and each of those would yield an empty report section
+    $#sequence_workspaces = 200 if $#sequence_workspaces > 200 ;  # show at most 201 top workspaces
+  }
 
-  @sequence_projects = qw (wb wk wn wo wp wq ws wv co wd wx) ;
+# @sequence_workspaces = qw (wk wp co wd) ;
 
-  print "1 per mil = 0.1 percent = 0.1% = 1%%\n\n" ;
+  &print_comment ("Edits are page changes in 'content' namespaces") ;
+  &print_comment ("Editors qualify when they made at least 
$threshold_active_enough edits in a year, and at least 
$threshold_perc_dominant_project\% of those edits in one ('main') project\n") ;
+  &print_comment ("Columns:\n") ;
+  &print_comment ("* one year only = editors who only qualified in one year") ;
+  &print_comment ("* new = editors who never edited earlier, or edited but did 
not qualify in earlier years, and qualified again in later year") ;
+  &print_comment ("* staying = editors who qualified in earlier year, and kept 
their focus on this $name_workspace") ;
+  &print_comment ("* migrating in = editors who qualified in earlier year, and 
changed their focus to this $name_workspace") ;
+  &print_comment ("* total = one year only + new + staying + migrating in") ;
+  &print_comment ("* migrating out = editors who qualified in this year, and 
changed their focus in next qualifying year to other $name_workspace") ;
+  &print_comment ("* last year = editors who qualified in this year, and 
stopped editing or did not qualify in later years") ;
 
- # print headers  
-  $line = "proj->," ;  
-  foreach $proj (@sequence_projects)
-# { $line .= &ralign ($proj) . ',' . &ralign ($proj) . ',' ; }
-  { $line .= $names {$proj} . " ($proj),,," ; }
-  print CSV_OUT_MATRIX "$line\n" ;
-  $line =~ s/,/\t/g ;
-  print "$line\n" ;
+# # print headers
+#  $line = "proj->," ;
+#  foreach $project (@sequence_workspaces)
+## { $line .= &ralign ($project) . ',' . &ralign ($project) . ',' ; }
+#  { $line .= $names {$project} . " ($project),,," ; }
+#  print CSV_OUT_MATRIX "$line\n" ;                                            
             absolute
 
-  $line = "period," ;  
-  foreach $proj (@sequence_projects)
-  { $line .= &ralign ('abs') . ',' . &ralign ('rel %%') . ',' . &ralign ('rel2 
%%') . ',,' ; }
-  print CSV_OUT_MATRIX "$line\n" ;
-  $line =~ s/,/\t/g ;
-  print "$line\n" ;
+#  $line =~ s/,/\t/g ;
+#  print "$line\n" ;
+
+#  $line = "period," ;
+#  foreach $project (@sequence_workspaces)
+#  { $line .= &ralign ('abs') . ',' . &ralign ('rel %%') . ',' . &ralign 
('rel2 %%') . ',,' ; }
+#  print CSV_OUT_MATRIX "$line\n" ;
+#  $line =~ s/,/\t/g ;
+#  print "$line\n" ;
 
   # print data per period per project
-  foreach $period (sort keys %periods) 
-  { 
-    $line = "$period," ;
-    foreach $proj (@sequence_projects)
-    { 
-      $active_incoming = $active_above_treshold_incoming {$proj} {$period} ;
-      $active_project  = $active_above_treshold {$proj} {$period} ; 
-      if ($absolute) # calc perc as relative to total editors on this project
-      { $active_total = $active_above_treshold {'all'} {$period} ; }
-      else
-      { $active_total = $active_project ; }
-
-      $perc = '-' ;
-
-      if ($active_total > 0)
-      { 
-       # $perc1 = ralign (sprintf ("%.0f", 1000 * $active_project  / 
$active_total) . '%%') ;
-       # $perc2 = ralign (sprintf ("%.0f", 1000 * $active_incoming / 
$active_total) . '%%') ;
-        $perc1 = ralign (sprintf ("%.0f", 1000 * $active_project  / 
$active_total)) ;
-        $perc2 = ralign (sprintf ("%.0f", 1000 * $active_incoming / 
$active_total)) ;
-      }
-      # if (! $absolute)
-      # { $perc = &ralign (sprintf ("%.0f", $perc1) . '%%') ; }
-
-      if ($active_project eq '')
-      { $active_project = '-' ; }
-      $line .= &ralign ($active_project) . ",$perc1,$perc2,," ; 
-    }
+  foreach $project (@sequence_workspaces)
+  {
+    # section title: full name from %names plus workspace code
+    $line = "\n\n" . $names {$project} . " ($project)\n" ;
     print CSV_OUT_MATRIX "$line\n" ;
     $line =~ s/,/\t/g ;
     print "$line\n" ;
+
+    # three header rows: group banner (absolute / relative), column names, second line of column names
+    &print_columns ("period,absolute 
->-------------------------------------------------------------<- 
absolute,,,,,,,,relative->-------------------------------------------------------------<-relative")
 ;
+    $line = "," .  ralign ('one yr') . "," .  ralign ('new') . ',' . ralign 
('staying') . ',' . ralign ('migr') . ',' . ralign ('total') . ',' . ralign 
("migr") . ',' . ralign ("last") .
+            ",," . ralign ('one yr') . ',' . ralign ('new') . ',' . ralign 
('staying') . ',' . ralign ('migr') . ',' . ralign ('total') . ',' . ralign 
("migr") . ',' . ralign ("last") ;
+    &print_columns ($line) ;
+
+    $line = "," . ralign ('only') . ",,," .   ralign ('in') . ',,' . ralign 
('out') . ',' . ralign ('year') . ',' .
+             "," . ralign ('only') . ",,," . ralign ('in') . ',,' . ralign 
('out') . ',' . ralign ('year')  ;
+    &print_columns ($line) ;
+
+    foreach $period (@periods)
+    {
+      # skip first period, no relative changes
+      # next if $period == $periods [0] ;
+
+      # denominator for all percentages in this row
+      $editors_total_this_period = $editors {'all'} {$period} ;
+
+      $count_total          = &dash   ($editors               {$project} 
{$period}) ;
+      $count_new            = &dash   ($editors_new           {$project} 
{$period}) ;
+      $count_once           = &dash   ($editors_once          {$project} 
{$period}) ;
+
+      $percent_total         = &percent 
($editors_total_this_period,$count_total) ;
+      $percent_new           = &percent 
($editors_total_this_period,$count_new) ;
+      $percent_once          = &percent 
($editors_total_this_period,$count_once) ;
+
+      # staying / lost / migrating need a previous period, so first period shows dashes only
+      if ($period > $periods [0])
+      {
+        $count_staying        = &dash   ($editors_staying       {$project} 
{$period}) ;
+        $count_lost           = &dash   ($editors_lost          {$project} 
{$period}) ;
+        $count_migrating_in   = &dash   ($editors_migrating_in  {$project} 
{$period}) ;
+        $count_migrating_out  = &dash   ($editors_migrating_out {$project} 
{$period}) ;
+
+        $percent_staying       = &percent 
($editors_total_this_period,$count_staying) ;
+        $percent_lost          = &percent 
($editors_total_this_period,$count_lost) ;
+        $percent_migrating_in  = &percent 
($editors_total_this_period,$count_migrating_in) ;
+        $percent_migrating_out = &percent 
($editors_total_this_period,$count_migrating_out) ;
+
+        # do not present lost editors on last year shown (which is last 
complete year)
+        # some may return later in current year
+        if ($period eq $periods [$#periods])
+        {
+          $dash = ralign ('-') ;
+          $count_lost   = $dash ;
+          $percent_lost = $dash ;
+        }
+      }
+      else
+      {
+        $dash = ralign ('-') ;
+
+        $count_staying        = $dash ;
+        $count_lost           = $dash ;
+        $count_migrating_in   = $dash ;
+        $count_migrating_out  = $dash ;
+
+        # trailing spaces line the dash up with the percentages, which end on spaces as well
+        $dash = ralign ('-    ') ;
+
+        $percent_staying       = $dash ;
+        $percent_lost          = $dash ;
+        $percent_migrating_in  = $dash ;
+        $percent_migrating_out = $dash ;
+      }
+
+      &print_columns 
("$period,$count_once,$count_new,$count_staying,$count_migrating_in,$count_total,$count_migrating_out,$count_lost,,"
 .
+                               
"$percent_once,$percent_new,$percent_staying,$percent_migrating_in,$percent_total,$percent_migrating_out,$percent_lost")
 ;
+    }
   }
-} 
+}
+
+# Read the bots file named by global $file_bots.
+# Each line holds three comma separated fields: project, wiki, and a '|' separated list of bot names.
+# Fills globals:
+#   %bots  - $bots {project} {wiki} {bot} is incremented for each bot name found
+#   %wikis - $wikis {wiki} is incremented for each line
+#   @wikis - the keys of %wikis
+# Dies when the file does not exist or can not be opened.
+sub ReadBots
+{
+  die "Bots file '$file_bots' not found" if ! -e $file_bots ;
+
+  # file can exist and still be unreadable, do not continue silently with empty bot lists
+  open CSV_BOTS, '<', $file_bots or die "Bots file '$file_bots' could not be opened: $!" ;
+
+  while ($line = <CSV_BOTS>)
+  {
+    chomp $line ;
+    next if $line =~ /^\s*$/ ; # empty line would register an empty wiki code
+
+    ($project,$wiki,$bots) = split (',', $line) ;
+
+    $wikis {$wiki} ++ ;
+    @bots = split ('\|', $bots) ;
+    foreach $bot (@bots)
+    { $bots {$project} {$wiki} {$bot} ++ ; }
+  }
+
+  close CSV_BOTS ;
+
+  @wikis = keys %wikis ; # for iterating over all wiki codes
+}
 
+# Return the first argument right aligned in a field of 6 positions (sprintf "%6s").
+# Longer values are returned in full, sprintf does not truncate.
 sub ralign
 {
-  return (sprintf ("%6s", shift)) ;    
+  return (sprintf ("%6s", shift)) ;
+}
+
+# Format $part as percentage of $total (not as permil), as a right aligned table cell.
+# Arguments: ($total, $part)
+# Returns a string produced by ralign:
+#   total not above zero      -> dash
+#   less than 0.1 percent     -> dash
+#   less than 1 percent       -> two decimals
+#   otherwise                 -> one decimal, except 100.0 which is shown as 100
+# Trailing spaces are part of the result, these keep the decimal points lined up within a column.
+sub percent
+{
+  my ($total,$part) = @_ ;
+  my $percent ; # lexical, so callers' globals are left alone
+
+  $part = 0 if ! defined $part ; # no count for this cell is the same as zero
+
+  if ($total > 0)
+  {
+    $percent = 100 * ($part/$total) ;
+    if ($percent < 0.1)
+    { $percent = '-  ' ; }
+    elsif ($percent < 1)
+    { $percent = sprintf ("%.2f", $percent) . ' ' ; }
+    else
+    { $percent = sprintf ("%.1f", $percent) . '  ' ; }
+  }
+  else
+  { $percent = ' -  ' ; }
+
+  # real regexp with escaped dot, anchored: only exactly 100.0 is shortened,
+  # a value like 1100.0 is no longer shown as 100
+  if ($percent =~ /^100\.0/)
+  { $percent = '100  ' ; }
+
+  # values without decimal point get two extra spaces, to stay in line with values that have one
+  if ($percent =~ /\./)
+  { return ralign ($percent) ; }
+  else
+  { return ralign ($percent . '  ') ; }
+}
+
+# Table cell for a count: an empty value is shown as '-', anything else as is,
+# in both cases right aligned by ralign.
+sub dash
+{
+  my ($cell) = @_ ;
+
+  return ralign ('-') if $cell eq '' ;
+
+  return ralign ($cell) ;
+}
+
+# Print a line of free text to stdout and to the csv file (CSV_OUT_MATRIX).
+# Stdout receives the text unchanged.
+# For the csv file a text which contains a comma is stripped of one trailing newline 
+# and enclosed in double quotes, so that it stays one single field.
+sub print_comment
+{
+  my $comment = shift ;
+
+  print "$comment\n" ;
+
+  if ($comment =~ /,/)
+  {
+    chomp $comment ;
+    $comment =~ s/"/""/g ; # inside a quoted csv field a double quote is written twice
+    $comment = "\"$comment\"" ;
+  }
+
+  print CSV_OUT_MATRIX "$comment\n" ;
+}
+
+# Print one row of comma separated cells:
+# as is to the csv file (CSV_OUT_MATRIX), with tabs instead of commas to stdout.
+sub print_columns
+{
+  my ($row) = @_ ;
+
+  # csv file first, row untouched
+  print CSV_OUT_MATRIX $row . "\n" ;
+
+  # console copy, every comma becomes a tab
+  (my $tabbed = $row) =~ s/,/\t/g ;
+  print $tabbed . "\n" ;
 }
diff --git a/squids/readme.txt b/squids/readme.txt
new file mode 100644
index 0000000..0a65544
--- /dev/null
+++ b/squids/readme.txt
@@ -0,0 +1,11 @@
+Documentation on squid scripts and output:
+https://www.mediawiki.org/wiki/Analytics/Wikistats/TrafficReports
+
+Sitemap on geo-based traffic reports: 
+http://stats.wikimedia.org/wikimedia/squids/SquidReportsCountriesLanguagesVisitsEdits.htm
+
+Server requests (breakdown of traffic per type, origin, etc)
+http://stats.wikimedia.org/cgi-bin/search_portal.pl?search=server+requests+breakdown
+
+UDP message loss per server group:
+http://stats.wikimedia.org/wikimedia/squids/SquidDataMonthlyPerSquidSet.htm

-- 
To view, visit https://gerrit.wikimedia.org/r/220958
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5460da06614c748530fbaf83986a30bf0aee2f7d
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikistats
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to