Ejegg has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/371586 )
Change subject: [WIP] harvest multiple values from one parameter ...................................................................... [WIP] harvest multiple values from one parameter Does not work with -islink FIXME: refactor lame claim/claims codepaths, work with claim types besides wikibase-item Bug: T87689 Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0 --- M scripts/harvest_template.py 1 file changed, 54 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/86/371586/1 diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index 5d99364..5959270 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -71,6 +71,15 @@ page won't be skipped if the item already has that property but there is not the new value. + python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \ + -template:"Infobox musical artist" current_members P527 -exists:p -multi + + will import band members from the "current_members" parameter of "Infobox + musical artist" on English Wikipedia as Wikidata property "P527" (has part). + This will only extract multiple band members if each is linked, and will not + add duplicate claims for the same member. + + TODO: 'multi' implies at least exists:p - set that automatically? 
""" # # (C) Multichill, Amir, 2013 @@ -109,8 +118,9 @@ """Class holding options for a param-property pair.""" availableOptions = { - 'islink': False, 'exists': '', + 'islink': False, + 'multi': False, } @@ -130,11 +140,14 @@ @type fields: dict @keyword islink: Whether non-linked values should be treated as links @type islink: bool + @keyword multi: Whether multiple values should be extracted from a single parameter + @type multi: bool """ self.availableOptions.update({ 'always': True, 'exists': '', 'islink': False, + 'multi': False, }) super(HarvestRobot, self).__init__(**kwargs) self.generator = generator @@ -220,7 +233,8 @@ raise KeyboardInterrupt self.current_page = page item.get() - if set(val[0] for val in self.fields.values()) <= set( + any_multi = any('exists' in val[1].options for val in self.fields.values()) + if not any_multi and set(val[0] for val in self.fields.values()) <= set( item.claims.keys()): pywikibot.output('%s item %s has claims for all properties. ' 'Skipping.' % (page, item.title())) @@ -253,25 +267,43 @@ # This field contains something useful for us prop, options = self.fields[field] claim = pywikibot.Claim(self.repo, prop) + claims = [] # FIXME: this is a horrid way to do multiples if claim.type == 'wikibase-item': - # Try to extract a valid page - match = pywikibot.link_regex.search(value) - if match: - link_text = match.group(1) - else: - if self._get_option_with_fallback(options, 'islink'): - link_text = value - else: + if self._get_option_with_fallback(options, 'multi'): + matches = pywikibot.link_regex.findall(value) + if matches: + for match in matches: + link_text = match[0] + linked_item = self._template_link_target(item, link_text) + if not linked_item: + continue + claim.setTarget(linked_item) + claims.append(claim) + claim = pywikibot.Claim(self.repo, prop) + if len(claims) == 0: pywikibot.output( - '%s field %s value %s is not a wikilink. ' + '%s field %s value %s contains no wikilinks to data items. ' 'Skipping.' 
% (claim.getID(), field, value)) continue + else: + # Try to extract a valid page + match = pywikibot.link_regex.search(value) + if match: + link_text = match.group(1) + else: + if self._get_option_with_fallback(options, 'islink'): + link_text = value + else: + pywikibot.output( + '%s field %s value %s is not a wikilink. ' + 'Skipping.' % (claim.getID(), field, value)) + continue - linked_item = self._template_link_target(item, link_text) - if not linked_item: - continue + linked_item = self._template_link_target(item, link_text) + if not linked_item: + continue - claim.setTarget(linked_item) + claim.setTarget(linked_item) elif claim.type in ('string', 'external-id'): claim.setTarget(value.strip()) elif claim.type == 'url': @@ -297,10 +329,13 @@ % claim.type) continue - # A generator might yield pages from multiple sites - self.user_add_claim_unless_exists( - item, claim, self._get_option_with_fallback('exists'), - pywikibot.output, page.site) + if len(claims) == 0: + claims.append(claim) + for add_claim in claims: + # A generator might yield pages from multiple sites + self.user_add_claim_unless_exists( + item, add_claim, self._get_option_with_fallback(options, 'exists'), + pywikibot.output, page.site) def main(*args): -- To view, visit https://gerrit.wikimedia.org/r/371586 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Ejegg <ej...@ejegg.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits