jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/370481 )
Change subject: Make scripts dealing with the sparql source field deal with sparql harvested data ...................................................................... Make scripts dealing with the sparql source field deal with sparql harvested data Affected scripts: * Reporting unused monuments * Categorizing images (Option three, see if the list contains Commonscat) * Add coordinates to article (used in edit summary) Bug: T171300 Change-Id: I6197f30c318e11611d24c29a8ca9c45dc1134a80 --- M erfgoedbot/add_coord_to_articles.py M erfgoedbot/categorize_images.py A erfgoedbot/common.py M erfgoedbot/unused_monument_images.py A tests/test_common.py 5 files changed, 152 insertions(+), 31 deletions(-) Approvals: Jean-Frédéric: Looks good to me, approved jenkins-bot: Verified diff --git a/erfgoedbot/add_coord_to_articles.py b/erfgoedbot/add_coord_to_articles.py index 297729b..5a9950e 100644 --- a/erfgoedbot/add_coord_to_articles.py +++ b/erfgoedbot/add_coord_to_articles.py @@ -20,6 +20,7 @@ import pywikibot import monuments_config as mconfig +import common as common from database_connection import ( close_database_connection, connect_to_monuments_database, @@ -122,7 +123,7 @@ aMonument.article = redirTitle if (pageId): if not hasCoordinates(pageId, lang, cursorWiki): - addCoords(countrycode, lang, aMonument, coordconfig) + addCoords(countrycode, lang, aMonument, coordconfig, countryconfig) def getMonumentsWithCoordinates(countrycode, lang, cursor): @@ -230,7 +231,7 @@ return (pageNs, pageTitle) -def addCoords(countrycode, lang, monument, coordconfig): +def addCoords(countrycode, lang, monument, coordconfig, countryconfig): ''' Add the coordinates to article. ''' @@ -269,11 +270,13 @@ newtext = re.sub(catStart, replacementText, newtext, replCount, flags=re.IGNORECASE) if text != newtext: - wikilist = u'' - matchWikipage = re.search("title=(.+?)&", monument.source) - if (matchWikipage and matchWikipage.group(1)): - wikilist = matchWikipage.group(1) - comment = u'Adding template %s based on [[%s]], # %s' % (coordTemplate, wikilist, monument.id) + try: + source_link = common.get_source_link( + monument.source, + countryconfig.get('type')) + except ValueError: + source_link = '' + comment = u'Adding template %s based on %s, # %s' % (coordTemplate, source_link, monument.id) pywikibot.showDiff(text, newtext) modPage = pywikibot.input(u'Modify page: %s ([y]/n) ?' % (monument.article)) if (modPage.lower == 'y' or modPage == ''): diff --git a/erfgoedbot/categorize_images.py b/erfgoedbot/categorize_images.py index e6e7fef..2566416 100644 --- a/erfgoedbot/categorize_images.py +++ b/erfgoedbot/categorize_images.py @@ -18,13 +18,13 @@ ''' import json import os -import re import pywikibot from pywikibot import pagegenerators from pywikibot import textlib import monuments_config as mconfig +import common as common from database_connection import ( close_database_connection, connect_to_monuments_database @@ -53,7 +53,9 @@ return json.load(open(json_file, 'r')) -def categorizeImage(countrycode, lang, commonsTemplateName, commonsCategoryBase, commonsCatTemplates, page, conn, cursor): +def categorizeImage( + countrycode, lang, commonsTemplateName, commonsCategoryBase, + commonsCatTemplates, page, conn, cursor, harvest_type): pywikibot.log(u'Working on: %s' % page.title()) site = pywikibot.Site(u'commons', u'commons') commonsTemplate = pywikibot.Page(site, 'Template:%s' % commonsTemplateName) @@ -96,7 +98,7 @@ u'Monument with id %s not in monuments database' % (monumentId, )) return False - (newcats, categorisation_method) = get_new_categories(monumentId, monData, lang, commonsCatTemplates) + (newcats, categorisation_method) = get_new_categories(monumentId, monData, lang, commonsCatTemplates, harvest_type) # See if one of the three options worked if newcats: @@ -124,7 +126,7 @@ return monumentId -def get_new_categories(monumentId, monData, lang, commonsCatTemplates): +def get_new_categories(monumentId, monData, lang, commonsCatTemplates, harvest_type): (monumentName, monumentCommonscat, monumentArticleTitle, monumentSource, project) = monData commons_site = pywikibot.Site(u'commons', u'commons') @@ -174,7 +176,7 @@ monumentId, str(e))) # Option three is to see if the list contains Commonscat links (whole list) - if not newcats: + if not newcats and harvest_type != 'sparql': monumentList = getList(lang, project, monumentSource) # print monumentList if not monumentList: @@ -259,16 +261,15 @@ def getList(lang, project, monumentSource): - ''' - Get listpage - ''' + """Get the listpage, if not harvested from a sparql query.""" if monumentSource: - regex = u'^(https:)?//%s.%s.org/w/index.php\?title=(.+?)&' % ( - lang, project) - match = re.search(regex, monumentSource) - if not match: + try: + page_title, found_site = common.get_source_page(monumentSource) + except ValueError: return False - page_title = match.group(2) + if (project, lang) != found_site: + return False + site = pywikibot.Site(lang, project) return pywikibot.Page(site, page_title) else: @@ -405,6 +406,7 @@ site = pywikibot.Site(u'commons', u'commons') generator = None commonsTemplate = countryconfig.get('commonsTemplate') + harvest_type = countryconfig.get('type') if overridecat: commonsCategoryBase = pywikibot.Category(site, "%s:%s" % (site.namespace(14), overridecat)) @@ -421,7 +423,8 @@ success = False if not totalImages >= 10000: success = categorizeImage( - countrycode, lang, commonsTemplate, commonsCategoryBase, commonsCatTemplates, page, conn, cursor) + countrycode, lang, commonsTemplate, commonsCategoryBase, + commonsCatTemplates, page, conn, cursor, harvest_type) if success: categorizedImages += 1 diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py new file mode 100644 index 0000000..a6f0369 --- /dev/null +++ b/erfgoedbot/common.py @@ -0,0 +1,60 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Support library of commonly shared functions.""" + +import re + + +def get_source_link(source, harvest_type=None, label=None): + """ + Format the source as an appropriate wiki link. + + Requires the target page to be on the same wiki unless it is a sparql + harvest. Links to Wikidata are always prefixed. + + @param source: the source value from the SQL table + @param harvest_type: the type of harvest from which the source was + extracted. E.g. "sparql" + @param label: Optional label to use for the link + """ + try: + page_title, (project, lang) = get_source_page(source, harvest_type) + except ValueError: + raise + if project == 'wikidata': + page_title = ':d:{0}'.format(page_title) + + if label: + return '[[{0}|{1}]]'.format(page_title, label) + return '[[{0}]]'.format(page_title) + + +def get_source_page(source, harvest_type=None): + """ + Retrieve the wikipage and site from the source field. + + Note that the returned site tuple may not be a valid pywikibot site. E.g. + commons is ('wikimedia', 'commons') rather than ('commons', 'commons'). + + @param source: the source value from the SQL table + @harvest_type: the type of harvest from which the source was extracted. + e.g. "sparql" + """ + site = None + page_name = None + if harvest_type == 'sparql': + site = ('wikidata', 'www') + page_name = source.split('/')[-1] + else: + supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] + pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format( + '|'.join(supported_sites)) + m = re.search(pattern, source) + try: + site = (m.group(2), m.group(1)) + page_name = m.group(3) + except AttributeError: + raise ValueError( + u'Could not find source list ({0})'.format(source)) + + return (page_name, site) diff --git a/erfgoedbot/unused_monument_images.py b/erfgoedbot/unused_monument_images.py index 951588d..16e6ebe 100644 --- a/erfgoedbot/unused_monument_images.py +++ b/erfgoedbot/unused_monument_images.py @@ -11,11 +11,10 @@ python unused_monument_images.py -countrycode:XX -langcode:YY ''' -import re - import pywikibot import monuments_config as mconfig +import common as common from database_connection import ( close_database_connection, connect_to_monuments_database, @@ -72,23 +71,22 @@ monumentId = monumentId.upper() if monumentId in withoutPhoto: - m = re.search( - '^[^\?]+\?title\=(.+?)&', withoutPhoto.get(monumentId)) try: - wikiSourceList = m.group(1) - except AttributeError: + source_link = common.get_source_link( + withoutPhoto.get(monumentId), + countryconfig.get('type'), + monumentId) + except ValueError: pywikibot.warning( u'Could not find wikiSourceList for %s (%s)' % ( monumentId, withoutPhoto.get(monumentId))) continue imageName = photos.get(catSortKey) # pywikibot.output(u'Key %s returned a result' % (monumentId,)) - # pywikibot.output(wikiSourceList) # pywikibot.output(imageName) if totalImages <= maxImages: - text += \ - u'File:%s|[[%s|%s]]\n' % ( - unicode(imageName, 'utf-8'), wikiSourceList, monumentId) + text += u'File:{0}|{1}\n'.format( + unicode(imageName, 'utf-8'), source_link) totalImages += 1 except ValueError: pywikibot.warning(u'Got value error for %s' % (monumentId,)) diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..126eeb7 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,57 @@ +"""Unit tests for common.""" + +import unittest +import mock +from erfgoedbot import common + + +class TestGetSourcePage(unittest.TestCase): + + def test_getSourcePage_wikipedia(self): + source = '//en.wikipedia.org/w/index.php?title=foo&oldid=123' + result = common.get_source_page(source) + self.assertEquals(result, ('foo', ('wikipedia', 'en'))) + + def test_getSourcePage_wikipedia_urlencode(self): + source = '//ka.wikipedia.org/w/index.php?title=%E1%83%95%E1%83%98%E1%83%99&oldid=3179801' + result = common.get_source_page(source) + self.assertEquals( + result, ('%E1%83%95%E1%83%98%E1%83%99', ('wikipedia', 'ka'))) + + def test_getSourcePage_wikivoyage(self): + source = '//ru.wikivoyage.org/w/index.php?title=foo&oldid=123' + result = common.get_source_page(source) + self.assertEquals(result, ('foo', ('wikivoyage', 'ru'))) + + def test_getSourcePage_sparql(self): + source = 'http://www.wikidata.org/entity/Q123' + result = common.get_source_page(source, 'sparql') + self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + + +class TestGetSourceLink(unittest.TestCase): + + def setUp(self): + patcher = mock.patch('erfgoedbot.common.get_source_page') + self.mock_get_source = patcher.start() + self.addCleanup(patcher.stop) + + def test_getSourcePage_wikipedia(self): + self.mock_get_source.return_value = ('foo', ('wikipedia', 'en')) + result = common.get_source_link('a link') + self.assertEquals(result, '[[foo]]') + + def test_getSourcePage_wikipedia_label(self): + self.mock_get_source.return_value = ('foo', ('wikipedia', 'en')) + result = common.get_source_link('a link', label='bar') + self.assertEquals(result, '[[foo|bar]]') + + def test_getSourcePage_sparql(self): + self.mock_get_source.return_value = ('Q123', ('wikidata', 'www')) + result = common.get_source_link('a link', 'sparql') + self.assertEquals(result, '[[:d:Q123]]') + + def test_getSourcePage_sparql_label(self): + self.mock_get_source.return_value = ('Q123', ('wikidata', 'www')) + result = common.get_source_link('a link', 'sparql', 'bar') + self.assertEquals(result, '[[:d:Q123|bar]]') -- To view, visit https://gerrit.wikimedia.org/r/370481 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I6197f30c318e11611d24c29a8ca9c45dc1134a80 Gerrit-PatchSet: 2 Gerrit-Project: labs/tools/heritage Gerrit-Branch: wikidata Gerrit-Owner: Lokal Profil <lokal.pro...@gmail.com> Gerrit-Reviewer: Jean-Frédéric <jeanfrederic.w...@gmail.com> Gerrit-Reviewer: Lokal Profil <lokal.pro...@gmail.com> Gerrit-Reviewer: Multichill <maar...@mdammers.nl> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits