jenkins-bot has submitted this change and it was merged.

Change subject: New WikiStats module
......................................................................


New WikiStats module

wikistats.wmflabs.org provides aggregate statistics for families of
wikis.  Pywikibot currently fetches a subset of the data using XML
to create a hard-coded list of languages by size for the multi-lang
WMF families and family 'anarchopedias'.

This new module provides raw access to all of the data stored in
wikistats, and allows it to be fetched using the smaller csv format;
however, the XML format is also supported, and is used for Python 2
when the unicodecsv module is not installed.
For example, for wiktionaries the XML is 300 KB vs 26 KB for the csv.

Change-Id: Id0070092d2337c9fc86b01e2103999c6dcea42fa
---
A pywikibot/data/wikistats.py
M setup.py
A tests/wikistats_tests.py
3 files changed, 305 insertions(+), 0 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  XZise: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/data/wikistats.py b/pywikibot/data/wikistats.py
new file mode 100644
index 0000000..125eabc
--- /dev/null
+++ b/pywikibot/data/wikistats.py
@@ -0,0 +1,240 @@
+# -*- coding: utf-8  -*-
+"""Objects representing WikiStats API."""
+#
+# (C) Pywikibot team, 2014
+#
+# Distributed under the terms of the MIT license.
+
+import sys
+
+from io import BytesIO, StringIO
+
+import pywikibot
+
+if sys.version_info[0] > 2:
+    import csv
+else:
+    try:
+        import unicodecsv as csv
+    except ImportError:
+        pywikibot.warning(
+            'WikiStats: unicodecsv package required for using csv in Python 2;'
+            ' falling back to using the larger XML datasets.')
+        csv = None
+
+from pywikibot.comms import threadedhttp
+
+
class WikiStats(object):

    """
    Light wrapper around WikiStats data, caching responses and data.

    The methods accept a Pywikibot family name as the WikiStats table name,
    mapping the names before calling the WikiStats API.
    """

    # Map Pywikibot family names to WikiStats table names.
    FAMILY_MAPPING = {
        'anarchopedia': 'anarchopedias',
        'wikipedia':    'wikipedias',
        'wikiquote':    'wikiquotes',
        'wikisource':   'wikisources',
        'wiktionary':   'wiktionaries',
    }

    # Table of individual (non multi-language) MediaWiki sites.
    MISC_SITES_TABLE = 'mediawikis'

    # Multi-language families operated by the Wikimedia Foundation.
    WMF_MULTILANG_TABLES = set([
        'wikipedias', 'wiktionaries', 'wikisources', 'wikinews',
        'wikibooks', 'wikiquotes', 'wikivoyage', 'wikiversity',
    ])

    # Multi-language families not operated by the WMF.
    OTHER_MULTILANG_TABLES = set([
        'uncyclomedia',
        'anarchopedias',
        'rodovid',
        'wikifur',
        'wikitravel',
        'scoutwiki',
        'opensuse',
        'metapedias',
        'lxde',
        'pardus',
        'gentoo',
    ])

    OTHER_TABLES = set([
        # Farms
        'wikia',
        'wikkii',
        'wikisite',
        'editthis',
        'orain',
        'shoutwiki',
        'referata',

        # Single purpose/manager sets
        'wmspecials',
        'gamepedias',
        'w3cwikis',
        'neoseeker',
        'sourceforge',
    ])

    # All valid WikiStats table names.
    ALL_TABLES = (set([MISC_SITES_TABLE]) | WMF_MULTILANG_TABLES |
                  OTHER_MULTILANG_TABLES | OTHER_TABLES)

    # All keys accepted by the methods: family names and table names.
    ALL_KEYS = set(FAMILY_MAPPING.keys()) | ALL_TABLES

    def __init__(self, url='https://wikistats.wmflabs.org/'):
        """
        Constructor.

        @param url: Base URL of the WikiStats host.
        @type url: basestring
        """
        self.url = url
        self._raw = {}   # raw response bytes, keyed by format then table
        self._data = {}  # parsed rows, keyed by format then table

    def fetch(self, table, format="xml"):
        """
        Fetch data from WikiStats.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        @rtype: bytes
        """
        # rstrip('/') avoids '//api.php' when self.url has a trailing slash.
        url = self.url.rstrip('/') + '/api.php?action=dump&table=%s&format=%s'

        if table not in self.ALL_KEYS:
            pywikibot.warning('WikiStats unknown table %s' % table)

        # Translate a Pywikibot family name into a WikiStats table name.
        if table in self.FAMILY_MAPPING:
            table = self.FAMILY_MAPPING[table]

        o = threadedhttp.Http()
        r = o.request(uri=url % (table, format))
        # threadedhttp returns the exception object instead of raising it.
        if isinstance(r, Exception):
            raise r
        # r is a (response, content) pair; content is the raw bytes.
        return r[1]

    def raw_cached(self, table, format):
        """
        Cache raw data.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv'.
        @rtype: bytes
        """
        if format not in self._raw:
            self._raw[format] = {}
        if table in self._raw[format]:
            return self._raw[format][table]

        data = self.fetch(table, format)

        self._raw[format][table] = data
        return data

    def csv(self, table):
        """
        Fetch and parse CSV for a table.

        @param table: table of data to fetch
        @type table: basestring
        @rtype: list of dict, one per site, in dump order
        """
        if table in self._data.setdefault('csv', {}):
            return self._data['csv'][table]

        data = self.raw_cached(table, 'csv')

        # Python 3 csv requires text; Python 2 unicodecsv requires bytes.
        if sys.version_info[0] > 2:
            f = StringIO(data.decode('utf8'))
        else:
            f = BytesIO(data)

        reader = csv.DictReader(f)

        data = [site for site in reader]

        self._data['csv'][table] = data

        return data

    def xml(self, table):
        """
        Fetch and parse XML for a table.

        @param table: table of data to fetch
        @type table: basestring
        @rtype: list of dict, one per site, in dump order
        """
        if table in self._data.setdefault('xml', {}):
            return self._data['xml'][table]

        # cElementTree was removed in Python 3.9; ElementTree is available
        # on both Python 2 and 3 and is C-accelerated on Python 3.
        from xml.etree import ElementTree

        data = self.raw_cached(table, 'xml')

        f = BytesIO(data)
        tree = ElementTree.parse(f)

        data = []

        # Each <row> element is a site; its <field> children are columns.
        for row in tree.findall('row'):
            site = {}

            for field in row.findall('field'):
                site[field.get('name')] = field.text

            data.append(site)

        self._data['xml'][table] = data

        return data

    def get(self, table, format=None):
        """
        Get a list of a table of data using format.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv', or None to autoselect.
        @rtype: list
        """
        # Honour an explicit format request; only autoselect csv when the
        # caller passed None and a csv parser is available.  (Previously an
        # explicit 'xml' request was ignored whenever csv was importable.)
        if format == 'csv' or (format is None and csv):
            return self.csv(table)
        return self.xml(table)

    def get_dict(self, table, format=None):
        """
        Get dictionary of a table of data using format.

        @param table: table of data to fetch
        @type table: basestring
        @param format: Format of data to use
        @type format: 'xml' or 'csv', or None to autoselect.
        @rtype: dict, keyed by the site's language prefix
        """
        return dict((data['prefix'], data)
                    for data in self.get(table, format))

    def sorted(self, table, key):
        """
        Reverse numerical sort of data.

        @param table: name of table of data
        @param key: numerical key, such as id, total, good
        @rtype: list, largest value first
        """
        return sorted(self.get(table),
                      key=lambda d: int(d[key]),
                      reverse=True)

    def languages_by_size(self, table):
        """ Return ordered list of languages by size from WikiStats. """
        # This assumes they appear in order of size in the WikiStats dump.
        return [d['prefix'] for d in self.get(table)]
diff --git a/setup.py b/setup.py
index 4fdacda..564ad08 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,9 @@
     'mwparserfromhell': ['mwparserfromhell>=0.3.3']
 }
 
+if sys.version_info[0] == 2:
+    extra_deps['wikistats-csv'] = ['unicodecsv']
+
 script_deps = {
     'script_wui.py': ['irc', 'lunatic-python', 'crontab'],
     # Note: None of the 'lunatic-python' repos on github support MS Windows.
diff --git a/tests/wikistats_tests.py b/tests/wikistats_tests.py
new file mode 100644
index 0000000..c9e8532
--- /dev/null
+++ b/tests/wikistats_tests.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8  -*-
+"""Test cases for the WikiStats dataset."""
+#
+# (C) Pywikibot team, 2014
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import sys
+
+from pywikibot.data.wikistats import WikiStats, csv
+
+from tests.aspects import unittest, TestCase
+
+if sys.version_info[0] == 3:
+    basestring = (str, )
+
+
class WikiStatsTestCase(TestCase):

    """Test WikiStats dump."""

    net = True

    def test_sort(self):
        """Check reverse numerical sorting of the wikipedia table."""
        ws = WikiStats()
        data = ws.sorted('wikipedia', 'total')
        top = data[0]
        self.assertIn('prefix', top)
        self.assertIn('total', top)
        self.assertEqual(top['prefix'], 'en')
        # Values are not converted; they remain strings after parsing.
        self.assertIsInstance(top['total'], basestring)
        self.assertEqual(ws.languages_by_size('wikipedia')[0], 'en')
        self.assertEqual(ws.languages_by_size('wikisource')[0], 'fr')

    def test_csv(self):
        """Check the csv dump of the wikipedia table."""
        if not csv:
            raise unittest.SkipTest('unicodecsv not installed.')
        ws = WikiStats()
        data = ws.get_dict('wikipedia', 'csv')
        self.assertIsInstance(data, dict)
        self.assertIn('en', data)
        self.assertIn('ht', data)
        self.assertGreater(int(data['en']['total']), 4000000)

    def test_xml(self):
        """Check the xml dump of the wikisource table."""
        ws = WikiStats()
        data = ws.get_dict('wikisource', 'xml')
        self.assertIsInstance(data, dict)
        self.assertIn('en', data)
        self.assertIn('id', data)
        self.assertGreater(int(data['fr']['total']), 1600000)
+
+
+if __name__ == '__main__':
+    try:
+        unittest.main()
+    except SystemExit:
+        # unittest.main() calls sys.exit() when the run completes;
+        # suppress it so direct execution ends without a traceback.
+        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/172104
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id0070092d2337c9fc86b01e2103999c6dcea42fa
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to