Eranroz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/276562

Change subject: Performance improvements for WikidataQuery gen
......................................................................

Performance improvements for WikidataQuery gen

WikidataQueryPageGenerator fetches the sitelink for each page individually,
which requires many web requests and is very inefficient.
Instead we use a batch query requesting only the data we need (sitelinks).

Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee
---
M pywikibot/pagegenerators.py
1 file changed, 20 insertions(+), 12 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/62/276562/1

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 6a59a1d..b0ef07c 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -40,10 +40,11 @@
     deprecated_args,
     redirect_func,
     issue_deprecation_warning,
+    itergroup,
     DequeGenerator,
     intersect_generators,
     IteratorNextMixin,
-    filter_unique,
+    filter_unique
 )
 
 from pywikibot import date, config, i18n, xmlreader
@@ -2593,11 +2594,12 @@
             yield pywikibot.Page(pywikibot.Link(fd(month, day), site))
 
 
-def WikidataQueryPageGenerator(query, site=None):
+def WikidataQueryPageGenerator(query, site=None, groupsize=50):
     """Generate pages that result from the given WikidataQuery.
 
     @param query: the WikidataQuery query string.
     @param site: Site for generator results.
+    @param groupsize: Number of entities to fetch from repository in each 
batch.
     @type site: L{pywikibot.site.BaseSite}
 
     """
@@ -2613,17 +2615,23 @@
     data = wd_query.query(wd_queryset)
 
     pywikibot.output(u'retrieved %d items' % data[u'status'][u'items'])
-    for item in data[u'items']:
-        page = pywikibot.ItemPage(repo, u'Q{0}'.format(item))
-        if isinstance(site, pywikibot.site.DataSite):
-            yield page
-            continue
 
-        try:
-            link = page.getSitelink(site)
-        except pywikibot.NoPage:
-            continue
-        yield pywikibot.Page(pywikibot.Link(link, site))
+    if isinstance(site, pywikibot.site.DataSite):
+        for item in data[u'items']:
+            page = pywikibot.ItemPage(repo, u'Q{0}'.format(item))
+            yield page
+    else:
+        for sublist in itergroup(data[u'items'], groupsize):
+            req = {'ids': [u'Q{0}'.format(item) for item in sublist],
+                   'sitefilter': site.dbName(),
+                   'action': 'wbgetentities',
+                   'props': 'sitelinks'}
+
+            wbrequest = repo._simple_request(**req)
+            wbdata = wbrequest.submit()
+            for entity in wbdata['entities'].values():
+                if 'sitelinks' in entity and site.dbName() in 
entity['sitelinks']:
+                    yield pywikibot.Page(site, 
entity['sitelinks'][site.dbName()]['title'])
 
 
 def WikidataSPARQLPageGenerator(query, site=None, item_name='item', 
endpoint=None):

-- 
To view, visit https://gerrit.wikimedia.org/r/276562
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Eranroz <eranro...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to