John Vandenberg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/172104

Change subject: New WikiStats module
......................................................................

New WikiStats module

wikistats.wmflabs.org provides aggregate statistics for families of
wikis.  Pywikibot currently fetches a subset of the data using XML
to create a hard-coded list of languages by size.

This new module provides raw access to all of the data stored in
wikistats, and allows it to be fetched using the smaller CSV format;
however, the XML format is also supported and is used on Python 2
when the unicodecsv module is not installed.
E.g. for wiktionaries, the XML is 300 KB vs. 26 KB for CSV.

Change-Id: Id0070092d2337c9fc86b01e2103999c6dcea42fa
---
A pywikibot/data/wikistats.py
M setup.py
A tests/wikistats_tests.py
3 files changed, 256 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/04/172104/1

diff --git a/pywikibot/data/wikistats.py b/pywikibot/data/wikistats.py
new file mode 100644
index 0000000..4ff7f1a
--- /dev/null
+++ b/pywikibot/data/wikistats.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8  -*-
+"""Objects representing WikiStats API."""
+#
+# (C) Pywikibot team, 2014
+#
+# Distributed under the terms of the MIT license.
+
+import sys
+
+if sys.version_info[0] > 2:
+    import csv
+else:
+    try:
+        import unicodecsv as csv
+    except ImportError:
+        print('wikistats: package unicodecsv is needed for Python 2; '
+              'fallback is larger XML data.')
+        csv = None
+
+import pywikibot
+
+from io import BytesIO, StringIO
+
+from pywikibot.comms import threadedhttp
+
+
class WikiStats(object):

    """
    Light wrapper around WikiStats data, caching responses and data.

    The methods accept a pywikibot family name as the WikiStats table name,
    mapping the names before calling the WikiStats API.
    """

    # Map pywikibot family names to WikiStats table names.
    families = {
        'anarchopedia': 'anarchopedias',
        'wikibooks':    'wikibooks',
        'wikinews':     'wikinews',
        'wikipedia':    'wikipedias',
        'wikiquote':    'wikiquotes',
        'wikisource':   'wikisources',
        'wikiversity':  'wikiversity',
        'wikivoyage':   'wikivoyage',
        'wiktionary':   'wiktionaries',
    }

    # Identifiers accepted by fetch() without a warning:
    # family names and WikiStats table names.
    known_keys = list(families.keys()) + list(families.values())

    def __init__(self, url='https://wikistats.wmflabs.org/'):
        """Constructor.

        @param url: base URL of the WikiStats site
        @type url: basestring
        """
        self.url = url
        self._raw = {}  # raw response cache: {format: {table: bytes}}
        self.data = {}

    def fetch(self, table, format="xml"):
        """
        Fetch data from WikiStats.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        @rtype: bytes
        """
        # rstrip the base url; the default ends with '/' and would
        # otherwise produce '//api.php' in the request URL.
        url = self.url.rstrip('/') + '/api.php?action=dump&table=%s&format=%s'

        if table not in self.known_keys:
            pywikibot.warning('WikiStats unknown table %s' % table)

        # Translate a pywikibot family name into the WikiStats table name.
        table = self.families.get(table, table)

        http = threadedhttp.Http()
        response = http.request(uri=url % (table, format))
        # threadedhttp returns the exception object on failure.
        if isinstance(response, Exception):
            raise response
        # response is a (headers, body) pair; the body is raw bytes.
        return response[1]

    def raw_cached(self, table, format):
        """
        Return raw data for a table, fetching and caching it if needed.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        @rtype: bytes
        """
        cache = self._raw.setdefault(format, {})
        if table not in cache:
            cache[table] = self.fetch(table, format)
        return cache[table]

    def csv(self, table):
        """
        Fetch and parse CSV for a table.

        @param table: table of data to fetch
        @type table: basestring
        @rtype: list of dict
        @raise NotImplementedError: no csv parser is available
            (Python 2 without the unicodecsv package installed).
        """
        if csv is None:
            # Fail with a clear message instead of an AttributeError
            # on the None placeholder set at import time.
            raise NotImplementedError(
                'A csv parser is not available; install unicodecsv '
                'or use the xml format.')

        data = self.raw_cached(table, 'csv')

        # The stdlib csv module (Python 3) wants text;
        # unicodecsv (Python 2) wants bytes.
        if sys.version_info[0] > 2:
            f = StringIO(data.decode('utf8'))
        else:
            f = BytesIO(data)

        return list(csv.DictReader(f))

    def xml(self, table):
        """
        Fetch and parse XML for a table.

        @param table: table of data to fetch
        @type table: basestring
        @rtype: list of dict
        """
        # cElementTree was removed in Python 3.9; plain ElementTree
        # transparently uses the C accelerator where available.
        from xml.etree import ElementTree

        data = self.raw_cached(table, 'xml')

        tree = ElementTree.parse(BytesIO(data))

        # Each <row> element is a site; each <field name="..."> child
        # is one attribute of that site.
        return [dict((field.get('name'), field.text)
                     for field in row.findall('field'))
                for row in tree.findall('row')]

    def get(self, table, format=None):
        """
        Get a list of a table of data using format.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv', or None to autoselect.
        @rtype: list
        """
        if format is None:
            # Prefer the much smaller csv dump when a parser is available.
            format = 'csv' if csv else 'xml'
        # NOTE: an explicit format='xml' was previously ignored whenever
        # a csv parser was importable; honour the requested format.
        if format == 'csv':
            return self.csv(table)
        return self.xml(table)

    def get_dict(self, table, format=None):
        """
        Get dictionary of a table of data using format.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv', or None to autoselect.
        @rtype: dict, keyed by the site's language prefix
        """
        return dict((data['prefix'], data)
                    for data in self.get(table, format))

    def sorted(self, table, key):
        """
        Reverse numerical sort of data.

        @param table: name of table of data
        @param key: numerical key, such as id, total, good
        """
        return sorted(self.get(table),
                      key=lambda d: int(d[key]),
                      reverse=True)

    def languages_by_size(self, table):
        """Return ordered list of languages by size from WikiStats."""
        # This assumes they appear in order of size in the WikiStats dump.
        return [d['prefix'] for d in self.get(table)]
diff --git a/setup.py b/setup.py
index 4fdacda..564ad08 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,9 @@
     'mwparserfromhell': ['mwparserfromhell>=0.3.3']
 }
 
+if sys.version_info[0] == 2:
+    extra_deps['wikistats-csv'] = ['unicodecsv']
+
 script_deps = {
     'script_wui.py': ['irc', 'lunatic-python', 'crontab'],
     # Note: None of the 'lunatic-python' repos on github support MS Windows.
diff --git a/tests/wikistats_tests.py b/tests/wikistats_tests.py
new file mode 100644
index 0000000..c9e8532
--- /dev/null
+++ b/tests/wikistats_tests.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8  -*-
+"""Test cases for the WikiStats dataset."""
+#
+# (C) Pywikibot team, 2014
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import sys
+
+from pywikibot.data.wikistats import WikiStats, csv
+
+from tests.aspects import unittest, TestCase
+
+# Python 3 removed the basestring type; emulate it as a tuple so the
+# isinstance assertion in test_sort works on both major versions.
+if sys.version_info[0] == 3:
+    basestring = (str, )
+
+
class WikiStatsTestCase(TestCase):

    """Test WikiStats dump."""

    # These tests fetch live data from wikistats.wmflabs.org.
    net = True

    def test_sort(self):
        """Test sorted() and languages_by_size() ordering."""
        ws = WikiStats()
        data = ws.sorted('wikipedia', 'total')
        top = data[0]
        self.assertIn('prefix', top)
        self.assertIn('total', top)
        self.assertEqual(top['prefix'], 'en')
        # Values are parsed as strings, not numbers.
        self.assertIsInstance(top['total'], basestring)
        self.assertEqual(ws.languages_by_size('wikipedia')[0], 'en')
        self.assertEqual(ws.languages_by_size('wikisource')[0], 'fr')

    def test_csv(self):
        """Test fetching the wikipedia table in csv format."""
        if not csv:
            raise unittest.SkipTest('unicodecsv not installed.')
        ws = WikiStats()
        data = ws.get_dict('wikipedia', 'csv')
        self.assertIsInstance(data, dict)
        self.assertIn('en', data)
        self.assertIn('ht', data)
        self.assertGreater(int(data['en']['total']), 4000000)
        # Removed dead statement 'data = ws.get_dict' which assigned the
        # bound method without calling it.

    def test_xml(self):
        """Test fetching the wikisource table in xml format."""
        ws = WikiStats()
        data = ws.get_dict('wikisource', 'xml')
        self.assertIsInstance(data, dict)
        self.assertIn('en', data)
        self.assertIn('id', data)
        self.assertGreater(int(data['fr']['total']), 1600000)
+
+
+if __name__ == '__main__':
+    # unittest.main() signals completion by raising SystemExit;
+    # swallow it so direct execution exits quietly.
+    try:
+        unittest.main()
+    except SystemExit:
+        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/172104
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id0070092d2337c9fc86b01e2103999c6dcea42fa
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to