[MediaWiki-commits] [Gerrit] pywikibot/core[master]: Adding support for importing labels

Eranroz (Code Review) Fri, 30 Dec 2016 03:07:39 -0800

Eranroz has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/329666 )


Change subject: Adding support for importing labels
......................................................................

Adding support for importing labels

New capability: harvesting labels for wikibase-item claims.
Adding the -harvest_labels argument enables importing a label for the target
item of an existing claim when that item has no label in the site language.

Bug: T154313
Change-Id: Ie6fb250f70ba09d72a93e81b25abefb2e01ddcee
---
M scripts/harvest_template.py
1 file changed, 96 insertions(+), 54 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/66/329666/1

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 569b25f..d15d8d6 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -13,6 +13,11 @@
 This will work on all pages that transclude the template in the article
 namespace
 
+* python pwb.py harvest_template [generators] -harvest_labels -template:"..." \
+    template_parameter PID [template_parameter PID]
+
+This will set labels on the target entity connected through PID using the template field.
+
 These command line parameters can be used to specify which pages to work on:
 
 &params;
@@ -61,7 +66,7 @@
 
     """A bot to add Wikidata claims."""
 
-    def __init__(self, generator, templateTitle, fields):
+    def __init__(self, generator, templateTitle, harvestLabels, fields):
         """
         Constructor.
 
@@ -78,6 +83,7 @@
         self.fields = fields
         self.cacheSources()
         self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
+        self.harvest_labels = harvestLabels
 
     def getTemplateSynonyms(self, title):
         """Fetch redirects of the title, so we can check against them."""
@@ -132,7 +138,7 @@
             raise KeyboardInterrupt
         self.current_page = page
         item.get()
-        if set(self.fields.values()) <= set(item.claims.keys()):
+        if not self.harvest_labels and set(self.fields.values()) <= 
set(item.claims.keys()):
             pywikibot.output('%s item %s has claims for all properties. '
                              'Skipping.' % (page, item.title()))
             return
@@ -159,60 +165,93 @@
 
                     # This field contains something useful for us
                     if field in self.fields:
-                        # Check if the property isn't already set
-                        claim = pywikibot.Claim(self.repo, self.fields[field])
-                        if claim.getID() in item.get().get('claims'):
-                            pywikibot.output(
-                                'A claim for %s already exists. Skipping.'
-                                % claim.getID())
-                            # TODO: Implement smarter approach to merging
-                            # harvested values with existing claims esp.
-                            # without overwriting humans unintentionally.
-                        else:
-                            if claim.type == 'wikibase-item':
-                                # Try to extract a valid page
-                                match = re.search(pywikibot.link_regex, value)
-                                if not match:
-                                    pywikibot.output(
-                                        '%s field %s value %s is not a '
-                                        'wikilink. Skipping.'
-                                        % (claim.getID(), field, value))
-                                    continue
+                        self.harvest_field(field, value, item, page)
 
-                                link_text = match.group(1)
-                                linked_item = self._template_link_target(item, 
link_text)
-                                if not linked_item:
-                                    continue
+    def harvest_field(self, field, value, item, page):
+        # Check if the property isn't already set
+        claim = pywikibot.Claim(self.repo, self.fields[field])
+        if claim.getID() in item.get().get('claims'):
+            # special case - importing labels
+            if self.harvest_labels and claim.type == 'wikibase-item':
+                self.harvest_label(item.claims[claim.getID()], page.site.lang, 
value)
 
-                                claim.setTarget(linked_item)
-                            elif claim.type in ('string', 'external-id'):
-                                claim.setTarget(value.strip())
-                            elif claim.type == 'commonsMedia':
-                                commonssite = pywikibot.Site("commons", 
"commons")
-                                imagelink = pywikibot.Link(value, 
source=commonssite,
-                                                           defaultNamespace=6)
-                                image = pywikibot.FilePage(imagelink)
-                                if image.isRedirectPage():
-                                    image = 
pywikibot.FilePage(image.getRedirectTarget())
-                                if not image.exists():
-                                    pywikibot.output(
-                                        '[[%s]] doesn\'t exist so I can\'t 
link to it'
-                                        % (image.title(),))
-                                    continue
-                                claim.setTarget(image)
-                            else:
-                                pywikibot.output(
-                                    '%s is not a supported datatype.'
-                                    % claim.type)
-                                continue
+            pywikibot.output(
+                'A claim for %s already exists. Skipping.'
+                % claim.getID())
+            # TODO: Implement smarter approach to merging
+            # harvested values with existing claims esp.
+            # without overwriting humans unintentionally.
 
-                            pywikibot.output('Adding %s --> %s'
-                                             % (claim.getID(), 
claim.getTarget()))
-                            item.addClaim(claim)
-                            # A generator might yield pages from multiple sites
-                            source = self.getSource(page.site)
-                            if source:
-                                claim.addSource(source, bot=True)
+        else:
+            if claim.type == 'wikibase-item':
+                # Try to extract a valid page
+                match = re.search(pywikibot.link_regex, value)
+                if not match:
+                    pywikibot.output(
+                        '%s field %s value %s is not a '
+                        'wikilink. Skipping.'
+                        % (claim.getID(), field, value))
+                    return
+
+                link_text = match.group(1)
+                linked_item = self._template_link_target(item, link_text)
+                if not linked_item:
+                    return
+
+                claim.setTarget(linked_item)
+            elif claim.type in ('string', 'external-id'):
+                claim.setTarget(value.strip())
+            elif claim.type == 'commonsMedia':
+                commonssite = pywikibot.Site("commons", "commons")
+                imagelink = pywikibot.Link(value, source=commonssite,
+                                           defaultNamespace=6)
+                image = pywikibot.FilePage(imagelink)
+                if image.isRedirectPage():
+                    image = pywikibot.FilePage(image.getRedirectTarget())
+                if not image.exists():
+                    pywikibot.output(
+                        '[[%s]] doesn\'t exist so I can\'t link to it'
+                        % (image.title(),))
+                    return
+                claim.setTarget(image)
+            else:
+                pywikibot.output(
+                    '%s is not a supported datatype.'
+                    % claim.type)
+                return
+
+            pywikibot.output('Adding %s --> %s'
+                             % (claim.getID(), claim.getTarget()))
+            item.addClaim(claim)
+            # A generator might yield pages from multiple sites
+            source = self.getSource(page.site)
+            if source:
+                claim.addSource(source, bot=True)
+
+    def harvest_label(self, existing_claim, lang, value):
+        if len(existing_claim) != 1:
+            return  # skip when there are multiple possible target entities
+
+        target_entity = existing_claim[0].target
+        target_labels = target_entity.get()['labels']
+        if lang in target_labels:
+            # skip if label already exist
+            return
+
+        match = re.search(pywikibot.link_regex, value)
+        if match:
+            new_label = match.group(1)
+        else:
+            # make sure it is not complex wiki code
+            if re.search('[{\[\]<]', value):
+                pywikibot.output('Fail to parse value: %s' % value)
+                return
+            new_label = value
+        readable_label = '(%s)' % target_labels['en'] if 'en' in target_labels 
else ''
+        pywikibot.output(
+            'Adding [%s] label %s to %s %s'
+            % (lang, new_label, target_entity.getID(), readable_label))
+        target_entity.editLabels({lang: new_label})
 
 
 def main(*args):
@@ -226,6 +265,7 @@
     """
     commandline_arguments = list()
     template_title = u''
+    harvest_labels = False
 
     # Process global args and prepare generator args parser
     local_args = pywikibot.handle_args(args)
@@ -238,6 +278,8 @@
                     u'Please enter the template to work on:')
             else:
                 template_title = arg[10:]
+        elif arg.startswith('-harvest_labels'):
+            harvest_labels = True
         elif gen.handleArg(arg):
             if arg.startswith(u'-transcludes:'):
                 template_title = arg[13:]
@@ -260,7 +302,7 @@
         gen.handleArg(u'-transcludes:' + template_title)
         generator = gen.getCombinedGenerator()
 
-    bot = HarvestRobot(generator, template_title, fields)
+    bot = HarvestRobot(generator, template_title, harvest_labels, fields)
     bot.run()
 
 if __name__ == "__main__":

-- 
To view, visit https://gerrit.wikimedia.org/r/329666
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie6fb250f70ba09d72a93e81b25abefb2e01ddcee
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Eranroz <eranro...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] pywikibot/core[master]: Adding support for importing labels

Reply via email to