Erik Zachte has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/64598


Change subject: Collect and use countable namespaces via api
......................................................................

Collect and use countable namespaces via api

Change-Id: Ia0dafcf239c5c6b9754ba14727394f9cac3dbb49
---
M dumps/bash/collect_countable_namespaces.sh
M dumps/perl/WikiCounts.pl
M dumps/perl/WikiCountsArguments.pm
M dumps/perl/WikiCountsInput.pm
M dumps/perl/WikiCountsScanNamespacesWithContent.pl
5 files changed, 148 insertions(+), 43 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wikistats refs/changes/98/64598/1

diff --git a/dumps/bash/collect_countable_namespaces.sh 
b/dumps/bash/collect_countable_namespaces.sh
index 3895dc7..f17c5f0 100755
--- a/dumps/bash/collect_countable_namespaces.sh
+++ b/dumps/bash/collect_countable_namespaces.sh
@@ -4,9 +4,13 @@
 
 wikistats=/a/wikistats_git
 perl=$wikistats/dumps/perl
+perl=/home/ezachte/wikistats/dumps/perl # tests
 csv=$wikistats/dumps/csv
 
 clear
+
+cd $csv/csv_mw
+cp StatisticsContentNamespacesExtraNamespaces.csv 
StatisticsContentNamespacesExtraNamespaces.bak
 
 cd $perl
 perl WikiCountsScanNamespacesWithContent.pl -c $csv
@@ -15,4 +19,7 @@
 grep "project" StatisticsContentNamespaces.csv >  
StatisticsContentNamespacesExtraNamespaces.csv # first line with headers
 grep "0|"      StatisticsContentNamespaces.csv >> 
StatisticsContentNamespacesExtraNamespaces.csv
 
+date >> StatisticsContentNamespacesExtraNamespacesDiff.csv
+diff StatisticsContentNamespacesExtraNamespaces.csv 
StatisticsContentNamespacesExtraNamespaces.bak >> 
StatisticsContentNamespacesExtraNamespacesDiff.csv
+
 
diff --git a/dumps/perl/WikiCounts.pl b/dumps/perl/WikiCounts.pl
index 7aab65d..df1e4a7 100755
--- a/dumps/perl/WikiCounts.pl
+++ b/dumps/perl/WikiCounts.pl
@@ -93,6 +93,8 @@
   $nohashes = "skip" ;
   $log_enabled = $false ;
   $skip_on_dumpdate = $false ;
+  
+  $base_content_namespaces_on_api = $false ; # use predefined 'content' 
(=countable) namespaces, or base this on api result ? 
 
   $weekly_plotdata = $false ;
   if ($weekly_plotdata)
diff --git a/dumps/perl/WikiCountsArguments.pm 
b/dumps/perl/WikiCountsArguments.pm
index aba2b95..ea8b2a0 100644
--- a/dumps/perl/WikiCountsArguments.pm
+++ b/dumps/perl/WikiCountsArguments.pm
@@ -449,7 +449,6 @@
     $file_csv_monthly_editors       = $path_out . 
"StatisticsMonthlyEditors.csv" ;
     $file_csv_namespace_stats       = $path_out . "StatisticsPerNamespace.csv" 
;
     $file_csv_namespace_edit_stats  = $path_out . 
"StatisticsEditsPerNamespace.csv" ;
-    $file_csv_content_namespaces    = $path_out . 
"StatisticsContentNamespaces.csv" ;
     $file_csv_users_activity_spread = $path_out . 
"StatisticsUserActivitySpread.csv" ;
     $file_csv_weekly_stats          = $path_out . "StatisticsWeekly.csv" ;
     $file_csv_active_users          = $path_out . "StatisticsActiveUsers.csv" ;
@@ -490,6 +489,9 @@
 
     $file_html_timelines            = $path_out . "Timelines" . uc ($language) 
. ".htm" ;
     $file_html_timelines_skipped    = $path_out . "TimelinesSkipped" . uc 
($language) . ".htm" ;
+    
+    $file_csv_content_namespaces    = $path_out . 
"StatisticsContentNamespaces.csv" ; 
+    $file_csv_content_namespaces    =~ s/csv_\w\w/csv_mw/ ; # collected in 
global dir by job 'collect_countable_namespaces.sh'
   }
 
   if ($testmode)
diff --git a/dumps/perl/WikiCountsInput.pm b/dumps/perl/WikiCountsInput.pm
index 50245a3..512f171 100644
--- a/dumps/perl/WikiCountsInput.pm
+++ b/dumps/perl/WikiCountsInput.pm
@@ -29,15 +29,20 @@
 {
   my ($language, $namespace) = @_ ;
 
-# if changed, also change WikiReportsOutputTables, find in code string 
'NameSpaceArticle'
-  if ($language eq 'strategy')
-  { return (($namespace == 0) || ($namespace == 106)) ; }
-  elsif ($language eq 'commons')
-  { return (($namespace == 0) || ($namespace == 6) || ($namespace == 14)) ; } 
# + file and category namespaces
-  elsif ($mode_ws) # wikisource wikis
-  { return (($namespace == 0) || ($namespace == 102) || ($namespace == 104) || 
($namespace == 106)) ; } # + author, page, index (`100 = portal)
-  else
-  { return ($namespace == 0) ; }
+  if ($base_content_namespaces_on_api)
+  { return ($content_namespace {$namespace} != 0 ) ; }
+  else 
+  {
+  # if changed, also change WikiReportsOutputTables, find in code string 
'NameSpaceArticle'
+    if ($language eq 'strategy')
+    { return (($namespace == 0) || ($namespace == 106)) ; }
+    elsif ($language eq 'commons')
+    { return (($namespace == 0) || ($namespace == 6) || ($namespace == 14)) ; 
} # + file and category namespaces
+    elsif ($mode_ws) # wikisource wikis
+    { return (($namespace == 0) || ($namespace == 102) || ($namespace == 104) 
|| ($namespace == 106)) ; } # + author, page, index (`100 = portal)
+    else
+    { return ($namespace == 0) ; }
+  }  
 }
 
 sub ReadInputXml
@@ -3025,25 +3030,36 @@
 
 sub GetContentNamespaces
 {
-  my ($project,$language) = @_ ;
+  if (! $base_content_namespaces_on_api)
+  {  
+    &LogT ("Use fixed namespaces for counting, not namespaces collected via 
api\n\n") ; 
+    return ;
+  }
+  &LogT ("GetContentNamespaces\n") ;
+       
+  my ($mode,$language) = @_ ;
+  my $project ;
 
-     if ($project eq 'wb') { $project = 'wikibooks' ; }
-  elsif ($project eq 'wk') { $project = 'wiktionary' ; }
-  elsif ($project eq 'wn') { $project = 'wikinews' ; }
-  elsif ($project eq 'wo') { $project = 'wikivoyage' ; }
-  elsif ($project eq 'wp') { $project = 'wikipedia' ; }
-  elsif ($project eq 'wq') { $project = 'wikiquote' ; }
-  elsif ($project eq 'ws') { $project = 'wikisource' ; }
-  elsif ($project eq 'wv') { $project = 'wikiversity' ; }
-  elsif ($project eq 'wx') {
-                             if ($language eq 'species')
-                             { $project = 'wikipedia' ; }
-                             else
-                             { $project = 'wikimedia' ; }
-                           }
-  elsif ($project eq 'wm') { return ; }
-  else                     { die "GetContentNamespaces: invalid project code 
$project" ; }
-
+# disable code to use api directly, always read namespaces from file
+if (0)
+{
+  if    ($mode eq 'wb') { $project = 'wikibooks' ; }
+  elsif ($mode eq 'wk') { $project = 'wiktionary' ; }
+  elsif ($mode eq 'wn') { $project = 'wikinews' ; }
+  elsif ($mode eq 'wo') { $project = 'wikivoyage' ; }
+  elsif ($mode eq 'wp') { $project = 'wikipedia' ; }
+  elsif ($mode eq 'wq') { $project = 'wikiquote' ; }
+  elsif ($mode eq 'ws') { $project = 'wikisource' ; }
+  elsif ($mode eq 'wv') { $project = 'wikiversity' ; }
+  elsif ($mode eq 'wx') {
+                          if ($language eq 'species')
+                          { $project = 'wikipedia' ; }
+                          else
+                          { $project = 'wikimedia' ; }
+                        }
+  elsif ($mode eq 'wm') { return ; }
+  else                  { abort "GetContentNamespaces: invalid mode $mode" ; }
+  
   my $url = 
"http://$language.$project.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces";
 ;
   # &LogT ("\n\nFetch $url'\n\n") ;
   print "$url\n" ;
@@ -3064,11 +3080,41 @@
       $content_namespace {$id} = $true ;
     }
 
+    $content_namespaces_api = $content_namespaces ;
+
     if ($content_namespaces eq '')
     {
       $content_namespaces = "0" ;
       &LogT ("No countable namespaces found via API! Assume namespace 0 is 
only countable namespace.") ;
     }
+    # force extra content namespaces which may not have been defined in api, 
but always were countable
+    if ($project eq 'wikisource')
+    {  
+      if ($content_namespaces !~ /102/)
+      { $content_namespaces   .= "\|102" ; }
+      if ($content_namespaces !~ /104/)
+      { $content_namespaces   .= "\|104" ; }
+      if ($content_namespaces !~ /106/)
+      { $content_namespaces   .= "\|106" ; }
+    }
+    if ($project eq 'wikispecial')
+    {  
+      if ($language eq 'strategy')
+      {
+        if ($content_namespaces !~ /106/)
+        { $content_namespaces   .= "\|106" ; }
+      }
+      if ($language eq 'commons')
+      {
+        if ($content_namespaces !~ /6/)
+       { $content_namespaces   .= "\|6" ; }
+        if ($content_namespaces !~ /14/)
+       { $content_namespaces   .= "\|14" ; }
+      }
+    }
+
+    if ($content_namespaces ne $content_namespaces_api)
+    { &LogT ("Content namespaces patched: $content_namespaces_api -> 
$content_namespaces\n") ; }
 
     $content_namespaces =~ s/\|$// ;
     # &LogT ("Content namespaces for language $language: 
$content_namespaces\n") ;
@@ -3077,22 +3123,37 @@
     push @csv, "$language,$content_namespaces" ;
     &WriteFileCsv ($file_csv_content_namespaces) ;
   }
-  else
-  {
-    &ReadFileCsvOnly ($file_csv_content_namespaces) ;
-    $line = $csv [0] ;
-  # $line = "xx,0|1|2\n" ; # test
-    chomp $line ;
+}  
+# else
+# {
+    if (! -e $file_csv_content_namespaces)
+    { abort ("Namespaces file not found: '$file_csv_content_namespaces'. Run 
'collect_countable_namespaces.sh'\n") ; }
 
-    if ($line ne '')
+    $line_content_namespaces = '' ;
+    open FILE_NS, "<", $file_csv_content_namespaces ;
+    while ($line = <FILE_NS>)
     {
-      &LogT ("Reuse content namespaces from previous run: line='$line'\n") ;
-      my ($language,$content_namespaces) = split (',', $line) ;
-      foreach $id (split "\|", $content_namespaces)
-      { $content_namespace {$id} = $true ; }
-      &LogT ("Content namespaces for language $language: 
$content_namespaces\n") ;
+      if ($line =~ /^$mode\,$language\,/)
+      {
+        chomp ($line) ;
+        $line_content_namespaces = $line ;
+        last ;
+      }
     }
-  }
+    close FILE_NS ;
+
+  # $line_content_namespaces = "xx,0|1|2\n" ; # test
+    if ($line_content_namespaces ne '')
+    {
+      &LogT ("Read content namespaces from file 
'$file_csv_content_namespaces': line='$line_content_namespaces'\n") ;
+      my ($mode,$language,$content_namespaces) = split (',', 
$line_content_namespaces) ;
+      foreach $id (split '\|', $content_namespaces)
+      { $content_namespace {$id} = $true ; }
+      &LogT ("Content namespaces for language $language: " . join (',', sort 
keys %content_namespace) . "\n") ;
+    }
+    else
+    { abort("No entry found in namespaces file '$file_csv_content_namespaces' 
for $mode $language. Run 'collect_countable_namespaces.sh'\n") ; }
+# }
 }
 
 sub FetchWebPage
diff --git a/dumps/perl/WikiCountsScanNamespacesWithContent.pl 
b/dumps/perl/WikiCountsScanNamespacesWithContent.pl
index dec96f1..b013510 100644
--- a/dumps/perl/WikiCountsScanNamespacesWithContent.pl
+++ b/dumps/perl/WikiCountsScanNamespacesWithContent.pl
@@ -22,6 +22,7 @@
   $file_namespaces = "$path_csv/csv_mw/StatisticsContentNamespaces.csv" ;
   $file_run_stats  = "StatisticsLog.csv" ;
 
+  # read previous content (so it will be preserved when api call fails) 
   if (-e $file_namespaces)
   {
     open CSV_IN, '<', $file_namespaces || die "Can't open $file_namespaces" ;
@@ -35,7 +36,7 @@
     }
     close CSV_IN ;
   }
-
+  
   &GetNamespaces ('wb','wikibooks') ;
   &GetNamespaces ('wk','wiktionary') ;
   &GetNamespaces ('wn','wikinews') ;
@@ -46,6 +47,7 @@
   &GetNamespaces ('wv','wikiversity') ;
   &GetNamespaces ('wx','wikimedia') ;
 
+  &ForceExtraContentNamespaces ;
   &SaveNamespaces ;
   print "\nReady\n\n" ;
   exit ;
@@ -76,6 +78,7 @@
       elsif ($lang eq 'sources')    { $url = 'wikisource.org' ; }
       elsif ($lang eq 'mediawiki')  { $url = 'www.mediawiki.org' ; }
       elsif ($lang eq 'foundation') { $url = 'wikimediafoundation.org' ; }
+      elsif ($lang eq 'wikidata')   { $url = 'www.wikidata.org' ; }
     }
     $url .= "/w/api.php?action=query&meta=siteinfo&siprop=namespaces" ;
 
@@ -98,13 +101,43 @@
       if ($ns =~ /^\d+$/)
       { $namespaces {"$proj_code,$lang"} .= "$ns\|" ; }
     }
+
     $namespaces {"$proj_code,$lang"} =~ s/\|$// ;
     print "$proj_code,$lang," . $namespaces {"$proj_code,$lang"} . "\n" ;
     # return if $lines++ > 3 ;
   }
-  &SaveNamespaces ;
 }
 
+sub ForceExtraContentNamespaces
+{
+  # Purpose: add namespaces that were always treated as countable, even when
+  # the api does not report them as content namespaces, then rewrite every
+  # entry of the global hash %namespaces as a numerically sorted list joined
+  # with '|'.
+  #
+  # Input/output: global %namespaces, key "<project code>,<language>",
+  # value "id|id|...". No arguments, no meaningful return value.
+  #
+  # Forced namespaces:
+  #   all keys starting with 'ws,' : 102, 104, 106
+  #   'wx,strategy'                : 106
+  #   'wx,commons'                 : 6, 14
+  my @forced_for_wikisource = (102, 104, 106) ;
+  my %forced_for_key = ('wx,strategy' => [106], 'wx,commons' => [6, 14]) ;
+
+  # The two special wikis get an entry even when none was collected (the
+  # previous code created one as well, by appending to a missing hash element).
+  foreach my $key (keys %forced_for_key)
+  { $namespaces {$key} = '' if ! defined $namespaces {$key} ; }
+
+  foreach my $key (sort keys %namespaces)
+  {
+    my @forced ;
+    if ($key =~ /^ws,/)
+    { @forced = @forced_for_wikisource ; }
+    elsif (exists $forced_for_key {$key})
+    { @forced = @{$forced_for_key {$key}} ; }
+
+    # Compare whole ids instead of substrings: a pattern like /102/ would
+    # also accept '1102', and /\|6/ would accept '|60' but miss a leading '6'.
+    # Empty fields are dropped, so an entry that started out empty no longer
+    # ends up with a leading '|' (e.g. '|106').
+    my %seen ;
+    my @ids = grep { $_ ne '' && ! $seen {$_}++ }
+              (split ('\|', $namespaces {$key}), @forced) ;
+
+    $namespaces {$key} = join ('|', sort {$a <=> $b} @ids) ;
+  }
+}
 
 sub SaveNamespaces
 {

-- 
To view, visit https://gerrit.wikimedia.org/r/64598
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia0dafcf239c5c6b9754ba14727394f9cac3dbb49
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikistats
Gerrit-Branch: master
Gerrit-Owner: Erik Zachte <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to