Erik Zachte has uploaded a new change for review.
https://gerrit.wikimedia.org/r/64598
Change subject: Collect and use countable namespaces via api
......................................................................
Collect and use countable namespaces via api
Change-Id: Ia0dafcf239c5c6b9754ba14727394f9cac3dbb49
---
M dumps/bash/collect_countable_namespaces.sh
M dumps/perl/WikiCounts.pl
M dumps/perl/WikiCountsArguments.pm
M dumps/perl/WikiCountsInput.pm
M dumps/perl/WikiCountsScanNamespacesWithContent.pl
5 files changed, 148 insertions(+), 43 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/wikistats refs/changes/98/64598/1
diff --git a/dumps/bash/collect_countable_namespaces.sh b/dumps/bash/collect_countable_namespaces.sh
index 3895dc7..f17c5f0 100755
--- a/dumps/bash/collect_countable_namespaces.sh
+++ b/dumps/bash/collect_countable_namespaces.sh
@@ -4,9 +4,13 @@
wikistats=/a/wikistats_git
perl=$wikistats/dumps/perl
+perl=/home/ezachte/wikistats/dumps/perl # tests
csv=$wikistats/dumps/csv
clear
+
+cd $csv/csv_mw
+cp StatisticsContentNamespacesExtraNamespaces.csv StatisticsContentNamespacesExtraNamespaces.bak
cd $perl
perl WikiCountsScanNamespacesWithContent.pl -c $csv
@@ -15,4 +19,7 @@
grep "project" StatisticsContentNamespaces.csv > StatisticsContentNamespacesExtraNamespaces.csv # first line with headers
grep "0|" StatisticsContentNamespaces.csv >> StatisticsContentNamespacesExtraNamespaces.csv
+date >> StatisticsContentNamespacesExtraNamespacesDiff.csv
+diff StatisticsContentNamespacesExtraNamespaces.csv StatisticsContentNamespacesExtraNamespaces.bak >> StatisticsContentNamespacesExtraNamespacesDiff.csv
+
diff --git a/dumps/perl/WikiCounts.pl b/dumps/perl/WikiCounts.pl
index 7aab65d..df1e4a7 100755
--- a/dumps/perl/WikiCounts.pl
+++ b/dumps/perl/WikiCounts.pl
@@ -93,6 +93,8 @@
$nohashes = "skip" ;
$log_enabled = $false ;
$skip_on_dumpdate = $false ;
+
+ $base_content_namespaces_on_api = $false ; # use predefined 'content' (=countable) namespaces, or base this on api result ?
$weekly_plotdata = $false ;
if ($weekly_plotdata)
diff --git a/dumps/perl/WikiCountsArguments.pm b/dumps/perl/WikiCountsArguments.pm
index aba2b95..ea8b2a0 100644
--- a/dumps/perl/WikiCountsArguments.pm
+++ b/dumps/perl/WikiCountsArguments.pm
@@ -449,7 +449,6 @@
$file_csv_monthly_editors = $path_out .
"StatisticsMonthlyEditors.csv" ;
$file_csv_namespace_stats = $path_out . "StatisticsPerNamespace.csv"
;
$file_csv_namespace_edit_stats = $path_out .
"StatisticsEditsPerNamespace.csv" ;
- $file_csv_content_namespaces = $path_out .
"StatisticsContentNamespaces.csv" ;
$file_csv_users_activity_spread = $path_out .
"StatisticsUserActivitySpread.csv" ;
$file_csv_weekly_stats = $path_out . "StatisticsWeekly.csv" ;
$file_csv_active_users = $path_out . "StatisticsActiveUsers.csv" ;
@@ -490,6 +489,9 @@
$file_html_timelines = $path_out . "Timelines" . uc ($language)
. ".htm" ;
$file_html_timelines_skipped = $path_out . "TimelinesSkipped" . uc
($language) . ".htm" ;
+
+ $file_csv_content_namespaces = $path_out .
"StatisticsContentNamespaces.csv" ;
+ $file_csv_content_namespaces =~ s/csv_\w\w/csv_mw/ ; # collected in global dir by job 'collect_countable_namespaces.sh'
}
if ($testmode)
diff --git a/dumps/perl/WikiCountsInput.pm b/dumps/perl/WikiCountsInput.pm
index 50245a3..512f171 100644
--- a/dumps/perl/WikiCountsInput.pm
+++ b/dumps/perl/WikiCountsInput.pm
@@ -29,15 +29,20 @@
{
my ($language, $namespace) = @_ ;
-# if changed, also change WikiReportsOutputTables, find in code string
'NameSpaceArticle'
- if ($language eq 'strategy')
- { return (($namespace == 0) || ($namespace == 106)) ; }
- elsif ($language eq 'commons')
- { return (($namespace == 0) || ($namespace == 6) || ($namespace == 14)) ; }
# + file and category namespaces
- elsif ($mode_ws) # wikisource wikis
- { return (($namespace == 0) || ($namespace == 102) || ($namespace == 104) ||
($namespace == 106)) ; } # + author, page, index (`100 = portal)
- else
- { return ($namespace == 0) ; }
+ if ($base_content_namespaces_on_api)
+ { return ($content_namespace {$namespace} != 0 ) ; }
+ else
+ {
+ # if changed, also change WikiReportsOutputTables, find in code string
'NameSpaceArticle'
+ if ($language eq 'strategy')
+ { return (($namespace == 0) || ($namespace == 106)) ; }
+ elsif ($language eq 'commons')
+ { return (($namespace == 0) || ($namespace == 6) || ($namespace == 14)) ;
} # + file and category namespaces
+ elsif ($mode_ws) # wikisource wikis
+ { return (($namespace == 0) || ($namespace == 102) || ($namespace == 104)
|| ($namespace == 106)) ; } # + author, page, index (`100 = portal)
+ else
+ { return ($namespace == 0) ; }
+ }
}
sub ReadInputXml
@@ -3025,25 +3030,36 @@
sub GetContentNamespaces
{
- my ($project,$language) = @_ ;
+ if (! $base_content_namespaces_on_api)
+ {
+ &LogT ("Use fixed namespaces for counting, not namespaces collected via
api\n\n") ;
+ return ;
+ }
+ &LogT ("GetContentNamespaces\n") ;
+
+ my ($mode,$language) = @_ ;
+ my $project ;
- if ($project eq 'wb') { $project = 'wikibooks' ; }
- elsif ($project eq 'wk') { $project = 'wiktionary' ; }
- elsif ($project eq 'wn') { $project = 'wikinews' ; }
- elsif ($project eq 'wo') { $project = 'wikivoyage' ; }
- elsif ($project eq 'wp') { $project = 'wikipedia' ; }
- elsif ($project eq 'wq') { $project = 'wikiquote' ; }
- elsif ($project eq 'ws') { $project = 'wikisource' ; }
- elsif ($project eq 'wv') { $project = 'wikiversity' ; }
- elsif ($project eq 'wx') {
- if ($language eq 'species')
- { $project = 'wikipedia' ; }
- else
- { $project = 'wikimedia' ; }
- }
- elsif ($project eq 'wm') { return ; }
- else { die "GetContentNamespaces: invalid project code
$project" ; }
-
+# disable code to use api directly, always read namespaces from file
+if (0)
+{
+ if ($mode eq 'wb') { $project = 'wikibooks' ; }
+ elsif ($mode eq 'wk') { $project = 'wiktionary' ; }
+ elsif ($mode eq 'wn') { $project = 'wikinews' ; }
+ elsif ($mode eq 'wo') { $project = 'wikivoyage' ; }
+ elsif ($mode eq 'wp') { $project = 'wikipedia' ; }
+ elsif ($mode eq 'wq') { $project = 'wikiquote' ; }
+ elsif ($mode eq 'ws') { $project = 'wikisource' ; }
+ elsif ($mode eq 'wv') { $project = 'wikiversity' ; }
+ elsif ($mode eq 'wx') {
+ if ($language eq 'species')
+ { $project = 'wikipedia' ; }
+ else
+ { $project = 'wikimedia' ; }
+ }
+ elsif ($mode eq 'wm') { return ; }
+ else { abort "GetContentNamespaces: invalid mode $mode" ; }
+
my $url =
"http://$language.$project.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces"
;
# &LogT ("\n\nFetch $url'\n\n") ;
print "$url\n" ;
@@ -3064,11 +3080,41 @@
$content_namespace {$id} = $true ;
}
+ $content_namespaces_api = $content_namespaces ;
+
if ($content_namespaces eq '')
{
$content_namespaces = "0" ;
&LogT ("No countable namespaces found via API! Assume namespace 0 is
only countable namespace.") ;
}
+ # force extra content namespaces which may not have been defined in api,
but always were countable
+ if ($project eq 'wikisource')
+ {
+ if ($content_namespaces !~ /102/)
+ { $content_namespaces .= "\|102" ; }
+ if ($content_namespaces !~ /104/)
+ { $content_namespaces .= "\|104" ; }
+ if ($content_namespaces !~ /106/)
+ { $content_namespaces .= "\|106" ; }
+ }
+ if ($project eq 'wikispecial')
+ {
+ if ($language eq 'strategy')
+ {
+ if ($content_namespaces !~ /106/)
+ { $content_namespaces .= "\|106" ; }
+ }
+ if ($language eq 'commons')
+ {
+ if ($content_namespaces !~ /6/)
+ { $content_namespaces .= "\|6" ; }
+ if ($content_namespaces !~ /14/)
+ { $content_namespaces .= "\|14" ; }
+ }
+ }
+
+ if ($content_namespaces ne $content_namespaces_api)
+ { &LogT ("Content namespaces patched: $content_namespaces_api ->
$content_namespaces\n") ; }
$content_namespaces =~ s/\|$// ;
# &LogT ("Content namespaces for language $language:
$content_namespaces\n") ;
@@ -3077,22 +3123,37 @@
push @csv, "$language,$content_namespaces" ;
&WriteFileCsv ($file_csv_content_namespaces) ;
}
- else
- {
- &ReadFileCsvOnly ($file_csv_content_namespaces) ;
- $line = $csv [0] ;
- # $line = "xx,0|1|2\n" ; # test
- chomp $line ;
+}
+# else
+# {
+ if (! -e $file_csv_content_namespaces)
+ { abort ("Namespaces file not found: '$file_csv_content_namespaces'. Run
'collect_countable_namespaces.sh'\n") ; }
- if ($line ne '')
+ $line_content_namespaces = '' ;
+ open FILE_NS, "<", $file_csv_content_namespaces ;
+ while ($line = <FILE_NS>)
{
- &LogT ("Reuse content namespaces from previous run: line='$line'\n") ;
- my ($language,$content_namespaces) = split (',', $line) ;
- foreach $id (split "\|", $content_namespaces)
- { $content_namespace {$id} = $true ; }
- &LogT ("Content namespaces for language $language:
$content_namespaces\n") ;
+ if ($line =~ /^$mode\,$language\,/)
+ {
+ chomp ($line) ;
+ $line_content_namespaces = $line ;
+ last ;
+ }
}
- }
+ close FILE_NS ;
+
+ # $line_content_namespaces = "xx,0|1|2\n" ; # test
+ if ($line_content_namespaces ne '')
+ {
+ &LogT ("Read content namespaces from file
'$file_csv_content_namespaces': line='$line_content_namespaces'\n") ;
+ my ($mode,$language,$content_namespaces) = split (',',
$line_content_namespaces) ;
+ foreach $id (split '\|', $content_namespaces)
+ { $content_namespace {$id} = $true ; }
+ &LogT ("Content namespaces for language $language: " . join (',', sort
keys %content_namespace) . "\n") ;
+ }
+ else
+ { abort("No entry found in namespaces file '$file_csv_content_namespaces'
for $mode $language. Run 'collect_countable_namespaces.sh'\n") ; }
+# }
}
sub FetchWebPage
diff --git a/dumps/perl/WikiCountsScanNamespacesWithContent.pl b/dumps/perl/WikiCountsScanNamespacesWithContent.pl
index dec96f1..b013510 100644
--- a/dumps/perl/WikiCountsScanNamespacesWithContent.pl
+++ b/dumps/perl/WikiCountsScanNamespacesWithContent.pl
@@ -22,6 +22,7 @@
$file_namespaces = "$path_csv/csv_mw/StatisticsContentNamespaces.csv" ;
$file_run_stats = "StatisticsLog.csv" ;
+ # read previous content (so it will be preserved when api call fails)
if (-e $file_namespaces)
{
open CSV_IN, '<', $file_namespaces || die "Can't open $file_namespaces" ;
@@ -35,7 +36,7 @@
}
close CSV_IN ;
}
-
+
&GetNamespaces ('wb','wikibooks') ;
&GetNamespaces ('wk','wiktionary') ;
&GetNamespaces ('wn','wikinews') ;
@@ -46,6 +47,7 @@
&GetNamespaces ('wv','wikiversity') ;
&GetNamespaces ('wx','wikimedia') ;
+ &ForceExtraContentNamespaces ;
&SaveNamespaces ;
print "\nReady\n\n" ;
exit ;
@@ -76,6 +78,7 @@
elsif ($lang eq 'sources') { $url = 'wikisource.org' ; }
elsif ($lang eq 'mediawiki') { $url = 'www.mediawiki.org' ; }
elsif ($lang eq 'foundation') { $url = 'wikimediafoundation.org' ; }
+ elsif ($lang eq 'wikidata') { $url = 'www.wikidata.org' ; }
}
$url .= "/w/api.php?action=query&meta=siteinfo&siprop=namespaces" ;
@@ -98,13 +101,43 @@
if ($ns =~ /^\d+$/)
{ $namespaces {"$proj_code,$lang"} .= "$ns\|" ; }
}
+
$namespaces {"$proj_code,$lang"} =~ s/\|$// ;
print "$proj_code,$lang," . $namespaces {"$proj_code,$lang"} . "\n" ;
# return if $lines++ > 3 ;
}
- &SaveNamespaces ;
}
+sub ForceExtraContentNamespaces
+{
+ # force extra content namespaces which may not have been defined in api, but always were countable
+ foreach $key (sort keys %namespaces)
+ {
+ if ($key =~ /^ws,/)
+ {
+ if ($namespaces {$key} !~ /102/)
+ { $namespaces {$key} .= "\|102" ; }
+ if ($namespaces {$key} !~ /104/)
+ { $namespaces {$key} .= "\|104" ; }
+ if ($namespaces {$key} !~ /106/)
+ { $namespaces {$key} .= "\|106" ; }
+ }
+ }
+
+ if ($namespaces {"wx,strategy"} !~ /106/)
+ { $namespaces {"wx,strategy"} .= "\|106" ; }
+ if ($namespaces {"wx,commons"} !~ /\|6/)
+ { $namespaces {"wx,commons"} .= "\|6" ; }
+ if ($namespaces {"wx,commons"} !~ /\|14/)
+ { $namespaces {"wx,commons"} .= "\|14" ; }
+
+ foreach $key (sort keys %namespaces)
+ {
+ @namespaces = split ('\|', $namespaces {$key}) ;
+ @namespaces = sort {$a <=> $b} @namespaces ;
+ $namespaces {$key} = join ('|', @namespaces) ;
+ }
+}
sub SaveNamespaces
{
--
To view, visit https://gerrit.wikimedia.org/r/64598
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia0dafcf239c5c6b9754ba14727394f9cac3dbb49
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikistats
Gerrit-Branch: master
Gerrit-Owner: Erik Zachte <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits