[MediaWiki-commits] [Gerrit] Normalise data_ingestion script - change (pywikibot/core)

John Vandenberg (Code Review) Sat, 17 Jan 2015 22:50:26 -0800

John Vandenberg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/185666


Change subject: Normalise data_ingestion script
......................................................................

Normalise data_ingestion script

Photo subclasses FilePage
DataIngestionBot subclasses Bot

Commented out parts of data_ingestion now integrated into
the script.

Bug: T70611
Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
---
M scripts/data_ingestion.py
M scripts/flickrripper.py
M tests/data_ingestion_tests.py
M tests/script_tests.py
M tox.ini
5 files changed, 148 insertions(+), 168 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/66/185666/1

diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py
index 72e22f5..43a7d0a 100755
--- a/scripts/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -1,6 +1,16 @@
 #!/usr/bin/python
 # -*- coding: utf-8  -*-
-"""A generic bot to do data ingestion (batch uploading) to Commons."""
+r"""
+A generic bot to do data ingestion (batch uploading).
+
+usage: data_ingestion.py -csvdir:local_dir/ -page:"config_page"
+
+e.g. python pwb.py scripts/data_ingestion.py \
+            -family:test -lang:test \
+            -csvdir:tests/data \
+            -page:"User:John Vandenberg/data ingestion test template"
+
+"""
 #
 # (C) Pywikibot team, 2013
 #
@@ -9,17 +19,27 @@
 __version__ = '$Id$'
 #
 
-import posixpath
-import hashlib
 import base64
-import sys
+import collections
+import csv
+import hashlib
 import io
+import os
+import sys
+
+import posixpath
 
 import pywikibot
+
+from pywikibot import pagegenerators
+from pywikibot.tools import deprecated, deprecated_args
+
 # TODO: nosetests3 fails on 'import <other_script>', which is used by many
 # of our scripts, but only data_ingestion is directly imported (not via pwb).
 # https://github.com/nose-devs/nose/issues/839
 from scripts import upload
+
+from scripts.flickrripper import cleanUpTitle
 
 if sys.version_info[0] > 2:
     from urllib.parse import urlparse
@@ -29,20 +49,23 @@
     from urllib import urlopen
 
 
-class Photo(object):
+class Photo(pywikibot.FilePage):
 
-    """
-    Represents a Photo (or other file), with metadata, to upload to Commons.
+    """Represents a Photo (or other file), with metadata, to be uploaded."""
 
-    The constructor takes two parameters: URL (string) and metadata (dict with
-    str:str key:value pairs) that can be referred to from the title & template
-    generation.
+    def __init__(self, URL, metadata, site=None):
+        """
+        Constructor.
 
+        @param URL: URL of photo
+        @type URL: str
+        @param metadata: metadata about the photo that can be referred to
+            from the title & template
+        @type metadata: dict
+        @param site: target site
+        @type site: APISite
 
-    """
-
-    def __init__(self, URL, metadata):
-        """Constructor."""
+        """
         self.URL = URL
         self.metadata = metadata
         self.metadata["_url"] = URL
@@ -52,6 +75,12 @@
         if ext == filename:
             self.metadata["_ext"] = ext = None
         self.contents = None
+
+        if not site:
+            site = pywikibot.Site(u'commons', u'commons')
+
+        # default title
+        super(Photo, self).__init__(site, self.getTitle('%(_filename)s.%(_ext)s'))
 
     def downloadPhoto(self):
         """
@@ -64,8 +93,8 @@
             self.contents = io.BytesIO(imageFile)
         return self.contents
 
-    def findDuplicateImages(self,
-                            site=pywikibot.Site(u'commons', u'commons')):
+    @deprecated_args(site=None)
+    def findDuplicateImages(self):
         """
         Find duplicates of the photo.
 
@@ -76,6 +105,7 @@
         """
         hashObject = hashlib.sha1()
         hashObject.update(self.downloadPhoto().getvalue())
+        site = self.site
         return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
 
     def getTitle(self, fmt):
@@ -87,7 +117,7 @@
         @return: formatted string
         @rtype: unicode
         """
-        return fmt % self.metadata
+        return cleanUpTitle(fmt % self.metadata, underscore=False)
 
     def getDescription(self, template, extraparams={}):
         """Generate a description for a file."""
@@ -108,28 +138,45 @@
         return value.replace("|", "{{!}}")
 
 
-def CSVReader(fileobj, urlcolumn, *args, **kwargs):
+class CSVReader(collections.Iterator):
+
     """CSV reader."""
-    import csv
-    reader = csv.DictReader(fileobj, *args, **kwargs)
 
-    for line in reader:
-        yield Photo(line[urlcolumn], line)
+    def __init__(self, fileobj, urlcolumn='url', site=None, *args, **kwargs):
+        """Constructor."""
+        self.fileobj = fileobj
+        self.urlcolumn = urlcolumn
+        self.site = site
+        self.reader = iter(csv.DictReader(fileobj, *args, **kwargs))
+
+    def __next__(self):
+        """Iterator."""
+        line = next(self.reader)
+        return Photo(line[self.urlcolumn], line, site=self.site)
+    next = __next__
 
 
-class DataIngestionBot:
+class DataIngestionBot(pywikibot.Bot):
 
     """Data ingestion bot."""
 
-    def __init__(self, reader, titlefmt, pagefmt,
+    @deprecated_args(reader='generator')
+    def __init__(self, generator, titlefmt, pagefmt,
                  site=pywikibot.Site(u'commons', u'commons')):
-        self.reader = reader
+        """Constructor."""
+        super(DataIngestionBot, self).__init__(generator=generator)
         self.titlefmt = titlefmt
         self.pagefmt = pagefmt
-        self.site = site
 
-    def _doUpload(self, photo):
-        duplicates = photo.findDuplicateImages(self.site)
+        if site:
+            self.site = site
+
+        # deprecated
+        self.reader = generator
+
+    def treat(self, photo):
+        """Process each page."""
+        duplicates = photo.findDuplicateImages()
         if duplicates:
             pywikibot.output(u"Skipping duplicate of %r" % duplicates)
             return duplicates[0]
@@ -149,178 +196,106 @@
 
         return title
 
+    @deprecated
     def doSingle(self):
+        """Process one page."""
         return self._doUpload(next(self.reader))
 
-    def run(self):
-        for photo in self.reader:
-            self._doUpload(photo)
-
-if __name__ == "__main__":
-    reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url')
-    bot = DataIngestionBot(
-        reader,
-        "%(name)s - %(set)s.%(_ext)s", ":user:valhallasw/test_template",
-        pywikibot.Site('test', 'test'))
-    bot.run()
-
-'''
-class DataIngestionBot:
-    def __init__(self, configurationPage):
+    @classmethod
+    def parseConfigurationPage(cls, configurationPage):
         """
+        Parse a Page which contains the configuration.
 
+        @param configurationPage: page with configuration
+        @type configurationPage: L{pywikibot.Page}
         """
-        self.site = configurationPage.site()
-        self.configuration = self.parseConfigurationPage(configurationPage)
-
-    def parseConfigurationPage(self, configurationPage):
-        """
-        Expects a pywikibot.page object "configurationPage" which contains the configuration
-        """
-        configuration  = {}
+        configuration = {}
         # Set a bunch of defaults
-        configuration['csvDialect']=u'excel'
-        configuration['csvDelimiter']=';'
-        configuration['csvEncoding']=u'Windows-1252' #FIXME: Encoding hell
+        configuration['csvDialect'] = u'excel'
+        configuration['csvDelimiter'] = ';'
+        configuration['csvEncoding'] = u'Windows-1252'  # FIXME: Encoding hell
 
         templates = configurationPage.templatesWithParams()
         for (template, params) in templates:
-            if template == u'Data ingestion':
+            if template.title(withNamespace=False) == u'Data ingestion':
                 for param in params:
                     (field, sep, value) = param.partition(u'=')
 
                     # Remove leading or trailing spaces
                     field = field.strip()
                     value = value.strip()
+                    if not value:
+                        value = None
                     configuration[field] = value
-        print(configuration)
+
         return configuration
 
-
-    def downloadPhoto(self, photoUrl=''):
+    @classmethod
+    def from_config_page(cls, configuration_page, data_dir):
         """
-        Download the photo and store it in a io.BytesIO object.
+        Create a Data Ingestion Bot from a config page.
 
-        TODO: Add exception handling
+        @param configuration_page: page with configuration
+        @type configuration_page: L{pywikibot.Page}
+        @param data_dir: path of data files
+        @type data_dir: str
         """
-        imageFile = urlopen(photoUrl).read()
-        return io.BytesIO(imageFile)
+        configuration = cls.parseConfigurationPage(configuration_page)
 
-    def findDuplicateImages(self, photo=None, site=pywikibot.Site(u'commons', u'commons')):
-        """
-        Takes the photo, calculates the SHA1 hash and asks the MediaWiki api for a list of duplicates.
+        print(configuration)
 
-        TODO: Add exception handling, fix site thing
-        """
-        hashObject = hashlib.sha1()
-        hashObject.update(photo.getvalue())
-        return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
+        filename = os.path.join(data_dir, configuration['csvFile'])
 
-    def getTitle(self, metadata):
-        """
-        Build a title.
-        Have titleFormat to indicate how the title would look.
-        We need to be able to strip off stuff if it's too long. configuration.get('maxTitleLength')
-        """
+        data = open(filename)
 
-        #FIXME: Make this configurable.
-        title = self.configuration.get('titleFormat') % metadata
+        generator = CSVReader(data, site=configuration_page.site,
+                              dialect=configuration['csvDialect'],
+                              delimiter=str(configuration['csvDelimiter']))
 
-        description = metadata.get(u'dc:title')
-        identifier = metadata.get(u'dc:identifier')
+        return cls(generator,
+                   configuration['titleFormat'],
+                   configuration['formattingTemplate'])
 
-        if len(description) > 120:
-            description = description[0 : 120]
-
-        title = u'%s - %s.jpg' % (description, identifier)
-
-        return flickrripper.cleanUpTitle(title)
-
-    def cleanDate(self, field):
-        """
-        A function to do date clean up.
-        """
-        # Empty, make it really empty
-        if field == u'-':
-            return u''
-        # TODO: Circa
-        # TODO: Period
-
-        return field
-
-    def cleanEmptyField(self, field):
-        return field
-
-    def procesFile(self, metadata):
-        # FIXME: Do some metadata enrichment
-        #metadata = getEuropeanaMetadata(metadata)
-
-        fileLocation = metadata.get(self.configuration.get('sourceFileField'))
-
-        photo = self.downloadPhoto(fileLocation)
-        duplicates = self.findDuplicateImages(photo)
-
-        # We don't want to upload dupes
-        if duplicates:
-            pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
-            # The file is at Commons so return True
-            return True
-
-        # FIXME: Do some checking to see if the title already exists
-
-        title = self.getTitle(metadata)
-        description = self.getDescription(metadata)
-
-
-        pywikibot.output(u'Preparing upload for %s.' % title)
-        pywikibot.output(description)
-
-        bot = upload.UploadRobot(url=fileLocation, description=description, useFilename=title, keepFilename=True, verifyDescription=False, targetSite = self.site)
-        bot.run()
-
-    def processCSV(self):
-        database = {}
-
-        reader = csv.DictReader(open(self.configuration.get('csvFile'), "rb"), dialect=self.configuration.get('csvDialect'), delimiter=self.configuration.csvDelimiter)
-        # FIXME : Encoding problems https://docs.python.org/2/library/csv.html#csv-examples
-        for row in reader:
-            self.metadataCSV(row)
-            self.processFile(metadata)
-
-    def run(self):
-        """
-        Do crap
-        """
-        if not self.configuration.get('sourceFormat'):
-            pywikibot.output(u'The field "sourceFormat" is not set')
-            return False
-
-        if self.configuration.get('sourceFormat') == u'csv':
-            self.processCSV()
-        else:
-            pywikibot.output(u'%s is not a supported source format')
 
 def main():
-    generator = None;
+    """
+    Process command line arguments and invoke bot.
 
+    If args is an empty list, sys.argv is used.
+
+    @param args: command line arguments
+    @type args: list of unicode
+    """
     # Process global args and prepare generator args parser
     local_args = pywikibot.handleArgs()
     genFactory = pagegenerators.GeneratorFactory()
+    csv_dir = None
 
     for arg in local_args:
-        genFactory.handleArg(arg)
+        if arg.startswith('-csvdir:'):
+            csv_dir = arg[8:]
+        else:
+            genFactory.handleArg(arg)
 
     generator = genFactory.getCombinedGenerator()
-    if not generator:
-        return False
+
+    if not generator or not csv_dir:
+        pywikibot.showHelp()
+        return
 
     for page in generator:
-        bot  = DataIngestionBot(page)
-        bot.run()
+        bot = None
+        try:
+            page.get()
+        except pywikibot.NoPage:
+            pywikibot.error('%s does not exist' % page)
+            continue
+        try:
+            bot = DataIngestionBot.from_config_page(page, csv_dir)
+            bot.run()
+        finally:
+            if bot and bot.generator and bot.generator.fileobj:
+                bot.generator.fileobj.close()
 
 if __name__ == "__main__":
-    try:
-        main()
-    finally:
-        print("All done!")
-'''
+    main()
diff --git a/scripts/flickrripper.py b/scripts/flickrripper.py
index 7a4fc9e..c8e9fc4 100644
--- a/scripts/flickrripper.py
+++ b/scripts/flickrripper.py
@@ -209,7 +209,7 @@
         return u'%s - %s - %s.jpg' % (title, project, username)
 
 
-def cleanUpTitle(title):
+def cleanUpTitle(title, underscore=True):
     """ Clean up the title of a potential MediaWiki page.
 
     Otherwise the title of the page might not be allowed by the software.
@@ -233,6 +233,8 @@
     title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
     title = title.replace(u" ", u"_")
     title = title.strip(u"_")
+    if not underscore:
+        title = title.replace(u"_", u" ")
     return title
 
 
diff --git a/tests/data_ingestion_tests.py b/tests/data_ingestion_tests.py
index b62a999..ced9447 100644
--- a/tests/data_ingestion_tests.py
+++ b/tests/data_ingestion_tests.py
@@ -33,8 +33,8 @@
                                                   'author': 'KDE artists | Silstor',
                                                   'license': 'LGPL',
                                                   'set': 'Crystal SVG icon set',
-                                                  'name': 'Sound icon'}
-                                        )
+                                                  'name': 'Sound icon'},
+                                        site=self.get_site('commons'))
 
     def test_downloadPhoto(self):
         """Test download from http://upload.wikimedia.org/.""";
@@ -66,12 +66,16 @@
 
     """Test CSVReader class."""
 
-    net = False
+    family = 'commons'
+    code = 'commons'
+
+    dry = True
 
     def setUp(self):
         super(TestCSVReader, self).setUp()
         with open(os.path.join(_data_dir, 'csv_ingestion.csv')) as fileobj:
-            self.iterator = data_ingestion.CSVReader(fileobj, 'url')
+            self.iterator = data_ingestion.CSVReader(fileobj, 'url',
+                                                     site=self.get_site())
             self.obj = next(self.iterator)
 
     def test_PhotoURL(self):
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 9a9c88f..10f7cca 100644
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -302,7 +302,6 @@
                 test_name = 'test_' + script_name + '_help'
             dct[test_name] = test_execution(script_name, ['-help'])
             if script_name in ['version',
-                               'data_ingestion',  # bug 68611
                                'script_wui',      # Failing on travis-ci
                                ] + failed_dep_script_list:
                 dct[test_name] = unittest.expectedFailure(dct[test_name])
@@ -325,7 +324,6 @@
                                             no_args_expected_results)
             if script_name in ['catall',          # stdout user interaction
                                'checkimages',     # bug 68613
-                               'data_ingestion',  # bug 68611
                                'flickrripper',    # Requires a flickr api key
                                'lonelypages',     # uses exit code 1
                                'script_wui',      # Error on any user except DrTrigonBot
diff --git a/tox.ini b/tox.ini
index ae60d99..9f0306e 100644
--- a/tox.ini
+++ b/tox.ini
@@ -68,6 +68,7 @@
     scripts/clean_sandbox.py \
     scripts/commonscat.py \
     scripts/coordinate_import.py \
+    scripts/data_ingestion.py \
     scripts/delete.py \
     scripts/flickrripper.py \
     scripts/harvest_template.py \

-- 
To view, visit https://gerrit.wikimedia.org/r/185666
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Normalise data_ingestion script - change (pywikibot/core)

Reply via email to