Eranroz has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/329666 )
Change subject: Adding support for importing labels ...................................................................... Adding support for importing labels New capability of harvesting labels for wikibase-item claims. Adding -harvest_labels argument enables to import label for existing claim that has missing label in the site language. Bug: T154313 Change-Id: Ie6fb250f70ba09d72a93e81b25abefb2e01ddcee --- M scripts/harvest_template.py 1 file changed, 96 insertions(+), 54 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/66/329666/1 diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index 569b25f..d15d8d6 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -13,6 +13,11 @@ This will work on all pages that transclude the template in the article namespace +* python pwb.py harvest_template [generators] -harvest_labels -template:"..." \ + template_parameter PID [template_parameter PID] + +This will set labels to target entity connected through PID using template field. + These command line parameters can be used to specify which pages to work on: &params; @@ -61,7 +66,7 @@ """A bot to add Wikidata claims.""" - def __init__(self, generator, templateTitle, fields): + def __init__(self, generator, templateTitle, harvestLabels, fields): """ Constructor. @@ -78,6 +83,7 @@ self.fields = fields self.cacheSources() self.templateTitles = self.getTemplateSynonyms(self.templateTitle) + self.harvest_labels = harvestLabels def getTemplateSynonyms(self, title): """Fetch redirects of the title, so we can check against them.""" @@ -132,7 +138,7 @@ raise KeyboardInterrupt self.current_page = page item.get() - if set(self.fields.values()) <= set(item.claims.keys()): + if not self.harvest_labels and set(self.fields.values()) <= set(item.claims.keys()): pywikibot.output('%s item %s has claims for all properties. ' 'Skipping.' 
% (page, item.title())) return @@ -159,60 +165,93 @@ # This field contains something useful for us if field in self.fields: - # Check if the property isn't already set - claim = pywikibot.Claim(self.repo, self.fields[field]) - if claim.getID() in item.get().get('claims'): - pywikibot.output( - 'A claim for %s already exists. Skipping.' - % claim.getID()) - # TODO: Implement smarter approach to merging - # harvested values with existing claims esp. - # without overwriting humans unintentionally. - else: - if claim.type == 'wikibase-item': - # Try to extract a valid page - match = re.search(pywikibot.link_regex, value) - if not match: - pywikibot.output( - '%s field %s value %s is not a ' - 'wikilink. Skipping.' - % (claim.getID(), field, value)) - continue + self.harvest_field(field, value, item, page) - link_text = match.group(1) - linked_item = self._template_link_target(item, link_text) - if not linked_item: - continue + def harvest_field(self, field, value, item, page): + # Check if the property isn't already set + claim = pywikibot.Claim(self.repo, self.fields[field]) + if claim.getID() in item.get().get('claims'): + # special case - importing labels + if self.harvest_labels and claim.type == 'wikibase-item': + self.harvest_label(item.claims[claim.getID()], page.site.lang, value) - claim.setTarget(linked_item) - elif claim.type in ('string', 'external-id'): - claim.setTarget(value.strip()) - elif claim.type == 'commonsMedia': - commonssite = pywikibot.Site("commons", "commons") - imagelink = pywikibot.Link(value, source=commonssite, - defaultNamespace=6) - image = pywikibot.FilePage(imagelink) - if image.isRedirectPage(): - image = pywikibot.FilePage(image.getRedirectTarget()) - if not image.exists(): - pywikibot.output( - '[[%s]] doesn\'t exist so I can\'t link to it' - % (image.title(),)) - continue - claim.setTarget(image) - else: - pywikibot.output( - '%s is not a supported datatype.' 
- % claim.type) - continue + pywikibot.output( + 'A claim for %s already exists. Skipping.' + % claim.getID()) + # TODO: Implement smarter approach to merging + # harvested values with existing claims esp. + # without overwriting humans unintentionally. - pywikibot.output('Adding %s --> %s' - % (claim.getID(), claim.getTarget())) - item.addClaim(claim) - # A generator might yield pages from multiple sites - source = self.getSource(page.site) - if source: - claim.addSource(source, bot=True) + else: + if claim.type == 'wikibase-item': + # Try to extract a valid page + match = re.search(pywikibot.link_regex, value) + if not match: + pywikibot.output( + '%s field %s value %s is not a ' + 'wikilink. Skipping.' + % (claim.getID(), field, value)) + return + + link_text = match.group(1) + linked_item = self._template_link_target(item, link_text) + if not linked_item: + return + + claim.setTarget(linked_item) + elif claim.type in ('string', 'external-id'): + claim.setTarget(value.strip()) + elif claim.type == 'commonsMedia': + commonssite = pywikibot.Site("commons", "commons") + imagelink = pywikibot.Link(value, source=commonssite, + defaultNamespace=6) + image = pywikibot.FilePage(imagelink) + if image.isRedirectPage(): + image = pywikibot.FilePage(image.getRedirectTarget()) + if not image.exists(): + pywikibot.output( + '[[%s]] doesn\'t exist so I can\'t link to it' + % (image.title(),)) + return + claim.setTarget(image) + else: + pywikibot.output( + '%s is not a supported datatype.' 
+ % claim.type) + return + + pywikibot.output('Adding %s --> %s' + % (claim.getID(), claim.getTarget())) + item.addClaim(claim) + # A generator might yield pages from multiple sites + source = self.getSource(page.site) + if source: + claim.addSource(source, bot=True) + + def harvest_label(self, existing_claim, lang, value): + if len(existing_claim) != 1: + return # skip when there are multiple possible target entities + + target_entity = existing_claim[0].target + target_labels = target_entity.get()['labels'] + if lang in target_labels: + # skip if label already exist + return + + match = re.search(pywikibot.link_regex, value) + if match: + new_label = match.group(1) + else: + # make sure it is not complex wiki code + if re.search('[{\[\]<]', value): + pywikibot.output('Fail to parse value: %s' % value) + return + new_label = value + readable_label = '(%s)' % target_labels['en'] if 'en' in target_labels else '' + pywikibot.output( + 'Adding [%s] label %s to %s %s' + % (lang, new_label, target_entity.getID(), readable_label)) + target_entity.editLabels({lang: new_label}) def main(*args): @@ -226,6 +265,7 @@ """ commandline_arguments = list() template_title = u'' + harvest_labels = False # Process global args and prepare generator args parser local_args = pywikibot.handle_args(args) @@ -238,6 +278,8 @@ u'Please enter the template to work on:') else: template_title = arg[10:] + elif arg.startswith('-harvest_labels'): + harvest_labels = True elif gen.handleArg(arg): if arg.startswith(u'-transcludes:'): template_title = arg[13:] @@ -260,7 +302,7 @@ gen.handleArg(u'-transcludes:' + template_title) generator = gen.getCombinedGenerator() - bot = HarvestRobot(generator, template_title, fields) + bot = HarvestRobot(generator, template_title, harvest_labels, fields) bot.run() if __name__ == "__main__": -- To view, visit https://gerrit.wikimedia.org/r/329666 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: 
Ie6fb250f70ba09d72a93e81b25abefb2e01ddcee Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Eranroz <eranro...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits