Eranroz has uploaded a new change for review. https://gerrit.wikimedia.org/r/276562
Change subject: Performance improvements for WikidataQuery gen ...................................................................... Performance improvements for WikidataQuery gen WikidataQueryPageGenerator requests for each page its sitelink, which requires many web requests and is very inefficient. Instead we use a batch query requesting only the data we need (sitelinks). Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee --- M pywikibot/pagegenerators.py 1 file changed, 20 insertions(+), 12 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/62/276562/1 diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index 6a59a1d..b0ef07c 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -40,10 +40,11 @@ deprecated_args, redirect_func, issue_deprecation_warning, + itergroup, DequeGenerator, intersect_generators, IteratorNextMixin, - filter_unique, + filter_unique ) from pywikibot import date, config, i18n, xmlreader @@ -2593,11 +2594,12 @@ yield pywikibot.Page(pywikibot.Link(fd(month, day), site)) -def WikidataQueryPageGenerator(query, site=None): +def WikidataQueryPageGenerator(query, site=None, groupsize=50): """Generate pages that result from the given WikidataQuery. @param query: the WikidataQuery query string. @param site: Site for generator results. + @param groupsize: Number of entities to fetch from repository in each batch. 
@type site: L{pywikibot.site.BaseSite} """ @@ -2613,17 +2615,23 @@ data = wd_query.query(wd_queryset) pywikibot.output(u'retrieved %d items' % data[u'status'][u'items']) - for item in data[u'items']: - page = pywikibot.ItemPage(repo, u'Q{0}'.format(item)) - if isinstance(site, pywikibot.site.DataSite): - yield page - continue - try: - link = page.getSitelink(site) - except pywikibot.NoPage: - continue - yield pywikibot.Page(pywikibot.Link(link, site)) + if isinstance(site, pywikibot.site.DataSite): + for item in data[u'items']: + page = pywikibot.ItemPage(repo, u'Q{0}'.format(item)) + yield page + else: + for sublist in itergroup(data[u'items'], groupsize): + req = {'ids': [u'Q{0}'.format(item) for item in sublist], + 'sitefilter': site.dbName(), + 'action': 'wbgetentities', + 'props': 'sitelinks'} + + wbrequest = repo._simple_request(**req) + wbdata = wbrequest.submit() + for entity in wbdata['entities'].values(): + if 'sitelinks' in entity and site.dbName() in entity['sitelinks']: + yield pywikibot.Page(site, entity['sitelinks'][site.dbName()]['title']) def WikidataSPARQLPageGenerator(query, site=None, item_name='item', endpoint=None): -- To view, visit https://gerrit.wikimedia.org/r/276562 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Eranroz <eranro...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits