John Vandenberg has uploaded a new change for review. https://gerrit.wikimedia.org/r/185666
Change subject: Normalise data_ingestion script ...................................................................... Normalise data_ingestion script Photo subclasses FilePage DataIngestionBot subclasses Bot Commented out parts of data_ingestion now integrated into the script. Bug: T70611 Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b --- M scripts/data_ingestion.py M scripts/flickrripper.py M tests/data_ingestion_tests.py M tests/script_tests.py M tox.ini 5 files changed, 148 insertions(+), 168 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/66/185666/1 diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py index 72e22f5..43a7d0a 100755 --- a/scripts/data_ingestion.py +++ b/scripts/data_ingestion.py @@ -1,6 +1,16 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -"""A generic bot to do data ingestion (batch uploading) to Commons.""" +r""" +A generic bot to do data ingestion (batch uploading). + +usage: data_ingestion.py -csvdir:local_dir/ -page:"config_page" + +e.g. python pwb.py scripts/data_ingestion.py \ + -family:test -lang:test \ + -csvdir:tests/data \ + -page:"User:John Vandenberg/data ingestion test template" + +""" # # (C) Pywikibot team, 2013 # @@ -9,17 +19,27 @@ __version__ = '$Id$' # -import posixpath -import hashlib import base64 -import sys +import collections +import csv +import hashlib import io +import os +import sys + +import posixpath import pywikibot + +from pywikibot import pagegenerators +from pywikibot.tools import deprecated, deprecated_args + # TODO: nosetests3 fails on 'import <other_script>', which is used by many # of our scripts, but only data_ingestion is directly imported (not via pwb). # https://github.com/nose-devs/nose/issues/839 from scripts import upload + +from scripts.flickrripper import cleanUpTitle if sys.version_info[0] > 2: from urllib.parse import urlparse @@ -29,20 +49,23 @@ from urllib import urlopen -class Photo(object): +class Photo(pywikibot.FilePage): - """ - Represents a Photo (or other file), with metadata, to upload to Commons. + """Represents a Photo (or other file), with metadata, to be uploaded.""" - The constructor takes two parameters: URL (string) and metadata (dict with - str:str key:value pairs) that can be referred to from the title & template - generation. + def __init__(self, URL, metadata, site=None): + """ + Constructor. + @param URL: URL of photo + @type URL: str + @param metadata: metadata about the photo that can be referred to + from the title & template + @type metadata: dict + @param site: target site + @type site: APISite - """ - - def __init__(self, URL, metadata): - """Constructor.""" + """ self.URL = URL self.metadata = metadata self.metadata["_url"] = URL @@ -52,6 +75,12 @@ if ext == filename: self.metadata["_ext"] = ext = None self.contents = None + + if not site: + site = pywikibot.Site(u'commons', u'commons') + + # default title + super(Photo, self).__init__(site, self.getTitle('%(_filename)s.%(_ext)s')) def downloadPhoto(self): """ @@ -64,8 +93,8 @@ self.contents = io.BytesIO(imageFile) return self.contents - def findDuplicateImages(self, - site=pywikibot.Site(u'commons', u'commons')): + @deprecated_args(site=None) + def findDuplicateImages(self): """ Find duplicates of the photo. @@ -76,6 +105,7 @@ """ hashObject = hashlib.sha1() hashObject.update(self.downloadPhoto().getvalue()) + site = self.site return site.getFilesFromAnHash(base64.b16encode(hashObject.digest())) def getTitle(self, fmt): @@ -87,7 +117,7 @@ @return: formatted string @rtype: unicode """ - return fmt % self.metadata + return cleanUpTitle(fmt % self.metadata, underscore=False) def getDescription(self, template, extraparams={}): """Generate a description for a file.""" @@ -108,28 +138,45 @@ return value.replace("|", "{{!}}") -def CSVReader(fileobj, urlcolumn, *args, **kwargs): +class CSVReader(collections.Iterator): + """CSV reader.""" - import csv - reader = csv.DictReader(fileobj, *args, **kwargs) - for line in reader: - yield Photo(line[urlcolumn], line) + def __init__(self, fileobj, urlcolumn='url', site=None, *args, **kwargs): + """Constructor.""" + self.fileobj = fileobj + self.urlcolumn = urlcolumn + self.site = site + self.reader = iter(csv.DictReader(fileobj, *args, **kwargs)) + + def __next__(self): + """Iterator.""" + line = next(self.reader) + return Photo(line[self.urlcolumn], line, site=self.site) + next = __next__ -class DataIngestionBot: +class DataIngestionBot(pywikibot.Bot): """Data ingestion bot.""" - def __init__(self, reader, titlefmt, pagefmt, + @deprecated_args(reader='generator') + def __init__(self, generator, titlefmt, pagefmt, site=pywikibot.Site(u'commons', u'commons')): - self.reader = reader + """Constructor.""" + super(DataIngestionBot, self).__init__(generator=generator) self.titlefmt = titlefmt self.pagefmt = pagefmt - self.site = site - def _doUpload(self, photo): - duplicates = photo.findDuplicateImages(self.site) + if site: + self.site = site + + # deprecated + self.reader = generator + + def treat(self, photo): + """Process each page.""" + duplicates = photo.findDuplicateImages() if duplicates: pywikibot.output(u"Skipping duplicate of %r" % duplicates) return duplicates[0] @@ -149,178 +196,106 @@ return title + @deprecated def doSingle(self): + """Process one page.""" return self._doUpload(next(self.reader)) - def run(self): - for photo in self.reader: - self._doUpload(photo) - -if __name__ == "__main__": - reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url') - bot = DataIngestionBot( - reader, - "%(name)s - %(set)s.%(_ext)s", ":user:valhallasw/test_template", - pywikibot.Site('test', 'test')) - bot.run() - -''' -class DataIngestionBot: - def __init__(self, configurationPage): + @classmethod + def parseConfigurationPage(cls, configurationPage): """ + Parse a Page which contains the configuration. + @param configurationPage: page with configuration + @type configurationPage: L{pywikibot.Page} """ - self.site = configurationPage.site() - self.configuration = self.parseConfigurationPage(configurationPage) - - def parseConfigurationPage(self, configurationPage): - """ - Expects a pywikibot.page object "configurationPage" which contains the configuration - """ - configuration = {} + configuration = {} # Set a bunch of defaults - configuration['csvDialect']=u'excel' - configuration['csvDelimiter']=';' - configuration['csvEncoding']=u'Windows-1252' #FIXME: Encoding hell + configuration['csvDialect'] = u'excel' + configuration['csvDelimiter'] = ';' + configuration['csvEncoding'] = u'Windows-1252' # FIXME: Encoding hell templates = configurationPage.templatesWithParams() for (template, params) in templates: - if template == u'Data ingestion': + if template.title(withNamespace=False) == u'Data ingestion': for param in params: (field, sep, value) = param.partition(u'=') # Remove leading or trailing spaces field = field.strip() value = value.strip() + if not value: + value = None configuration[field] = value - print(configuration) + return configuration - - def downloadPhoto(self, photoUrl=''): + @classmethod + def from_config_page(cls, configuration_page, data_dir): """ - Download the photo and store it in a io.BytesIO object. + Create a Data Ingestion Bot from a config page. - TODO: Add exception handling + @param configuration_page: page with configuration + @type configuration_page: L{pywikibot.Page} + @param data_dir: path of data files + @type data_dir: str """ - imageFile = urlopen(photoUrl).read() - return io.BytesIO(imageFile) + configuration = cls.parseConfigurationPage(configuration_page) - def findDuplicateImages(self, photo=None, site=pywikibot.Site(u'commons', u'commons')): - """ - Takes the photo, calculates the SHA1 hash and asks the MediaWiki api for a list of duplicates. + print(configuration) - TODO: Add exception handling, fix site thing - """ - hashObject = hashlib.sha1() - hashObject.update(photo.getvalue()) - return site.getFilesFromAnHash(base64.b16encode(hashObject.digest())) + filename = os.path.join(data_dir, configuration['csvFile']) - def getTitle(self, metadata): - """ - Build a title. - Have titleFormat to indicate how the title would look. - We need to be able to strip off stuff if it's too long. configuration.get('maxTitleLength') - """ + data = open(filename) - #FIXME: Make this configurable. - title = self.configuration.get('titleFormat') % metadata + generator = CSVReader(data, site=configuration_page.site, + dialect=configuration['csvDialect'], + delimiter=str(configuration['csvDelimiter'])) - description = metadata.get(u'dc:title') - identifier = metadata.get(u'dc:identifier') + return cls(generator, + configuration['titleFormat'], + configuration['formattingTemplate']) - if len(description) > 120: - description = description[0 : 120] - - title = u'%s - %s.jpg' % (description, identifier) - - return flickrripper.cleanUpTitle(title) - - def cleanDate(self, field): - """ - A function to do date clean up. - """ - # Empty, make it really empty - if field == u'-': - return u'' - # TODO: Circa - # TODO: Period - - return field - - def cleanEmptyField(self, field): - return field - - def procesFile(self, metadata): - # FIXME: Do some metadata enrichment - #metadata = getEuropeanaMetadata(metadata) - - fileLocation = metadata.get(self.configuration.get('sourceFileField')) - - photo = self.downloadPhoto(fileLocation) - duplicates = self.findDuplicateImages(photo) - - # We don't want to upload dupes - if duplicates: - pywikibot.output(u'Found duplicate image at %s' % duplicates.pop()) - # The file is at Commons so return True - return True - - # FIXME: Do some checking to see if the title already exists - - title = self.getTitle(metadata) - description = self.getDescription(metadata) - - - pywikibot.output(u'Preparing upload for %s.' % title) - pywikibot.output(description) - - bot = upload.UploadRobot(url=fileLocation, description=description, useFilename=title, keepFilename=True, verifyDescription=False, targetSite = self.site) - bot.run() - - def processCSV(self): - database = {} - - reader = csv.DictReader(open(self.configuration.get('csvFile'), "rb"), dialect=self.configuration.get('csvDialect'), delimiter=self.configuration.csvDelimiter) - # FIXME : Encoding problems https://docs.python.org/2/library/csv.html#csv-examples - for row in reader: - self.metadataCSV(row) - self.processFile(metadata) - - def run(self): - """ - Do crap - """ - if not self.configuration.get('sourceFormat'): - pywikibot.output(u'The field "sourceFormat" is not set') - return False - - if self.configuration.get('sourceFormat') == u'csv': - self.processCSV() - else: - pywikibot.output(u'%s is not a supported source format') def main(): - generator = None; + """ + Process command line arguments and invoke bot. + If args is an empty list, sys.argv is used. + + @param args: command line arguments + @type args: list of unicode + """ # Process global args and prepare generator args parser local_args = pywikibot.handleArgs() genFactory = pagegenerators.GeneratorFactory() + csv_dir = None for arg in local_args: - genFactory.handleArg(arg) + if arg.startswith('-csvdir:'): + csv_dir = arg[8:] + else: + genFactory.handleArg(arg) generator = genFactory.getCombinedGenerator() - if not generator: - return False + + if not generator or not csv_dir: + pywikibot.showHelp() + return for page in generator: - bot = DataIngestionBot(page) - bot.run() + bot = None + try: + page.get() + except pywikibot.NoPage: + pywikibot.error('%s does not exist' % page) + continue + try: + bot = DataIngestionBot.from_config_page(page, csv_dir) + bot.run() + finally: + if bot and bot.generator and bot.generator.fileobj: + bot.generator.fileobj.close() if __name__ == "__main__": - try: - main() - finally: - print("All done!") -''' + main() diff --git a/scripts/flickrripper.py b/scripts/flickrripper.py index 7a4fc9e..c8e9fc4 100644 --- a/scripts/flickrripper.py +++ b/scripts/flickrripper.py @@ -209,7 +209,7 @@ return u'%s - %s - %s.jpg' % (title, project, username) -def cleanUpTitle(title): +def cleanUpTitle(title, underscore=True): """ Clean up the title of a potential MediaWiki page. Otherwise the title of the page might not be allowed by the software. @@ -233,6 +233,8 @@ title = re.sub(u"[-,^]([.]|$)", u"\\1", title) title = title.replace(u" ", u"_") title = title.strip(u"_") + if not underscore: + title = title.replace(u"_", u" ") return title diff --git a/tests/data_ingestion_tests.py b/tests/data_ingestion_tests.py index b62a999..ced9447 100644 --- a/tests/data_ingestion_tests.py +++ b/tests/data_ingestion_tests.py @@ -33,8 +33,8 @@ 'author': 'KDE artists | Silstor', 'license': 'LGPL', 'set': 'Crystal SVG icon set', - 'name': 'Sound icon'} - ) + 'name': 'Sound icon'}, + site=self.get_site('commons')) def test_downloadPhoto(self): """Test download from http://upload.wikimedia.org/.""" @@ -66,12 +66,16 @@ """Test CSVReader class.""" - net = False + family = 'commons' + code = 'commons' + + dry = True def setUp(self): super(TestCSVReader, self).setUp() with open(os.path.join(_data_dir, 'csv_ingestion.csv')) as fileobj: - self.iterator = data_ingestion.CSVReader(fileobj, 'url') + self.iterator = data_ingestion.CSVReader(fileobj, 'url', + site=self.get_site()) self.obj = next(self.iterator) def test_PhotoURL(self): diff --git a/tests/script_tests.py b/tests/script_tests.py index 9a9c88f..10f7cca 100644 --- a/tests/script_tests.py +++ b/tests/script_tests.py @@ -302,7 +302,6 @@ test_name = 'test_' + script_name + '_help' dct[test_name] = test_execution(script_name, ['-help']) if script_name in ['version', - 'data_ingestion', # bug 68611 'script_wui', # Failing on travis-ci ] + failed_dep_script_list: dct[test_name] = unittest.expectedFailure(dct[test_name]) @@ -325,7 +324,6 @@ no_args_expected_results) if script_name in ['catall', # stdout user interaction 'checkimages', # bug 68613 - 'data_ingestion', # bug 68611 'flickrripper', # Requires a flickr api key 'lonelypages', # uses exit code 1 'script_wui', # Error on any user except DrTrigonBot diff --git a/tox.ini b/tox.ini index ae60d99..9f0306e 100644 --- a/tox.ini +++ b/tox.ini @@ -68,6 +68,7 @@ scripts/clean_sandbox.py \ scripts/commonscat.py \ scripts/coordinate_import.py \ + scripts/data_ingestion.py \ scripts/delete.py \ scripts/flickrripper.py \ scripts/harvest_template.py \ -- To view, visit https://gerrit.wikimedia.org/r/185666 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: John Vandenberg <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
