[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Make scripts dealing with the sparql source field deal with sparql harvested data

jenkins-bot (Code Review) Wed, 09 Aug 2017 16:48:07 -0700

jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/370481 )


Change subject: Make scripts dealing with the sparql source field deal with sparql harvested data
......................................................................


Make scripts dealing with the sparql source field deal with sparql harvested data

Affected scripts:
* Reporting unused monuments
* Categorizing images (Option three, see if the list contains Commonscat)
* Add coordinates to article (used in edit summary)

Bug: T171300
Change-Id: I6197f30c318e11611d24c29a8ca9c45dc1134a80
---
M erfgoedbot/add_coord_to_articles.py
M erfgoedbot/categorize_images.py
A erfgoedbot/common.py
M erfgoedbot/unused_monument_images.py
A tests/test_common.py
5 files changed, 152 insertions(+), 31 deletions(-)

Approvals:
  Jean-Frédéric: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/erfgoedbot/add_coord_to_articles.py 
b/erfgoedbot/add_coord_to_articles.py
index 297729b..5a9950e 100644
--- a/erfgoedbot/add_coord_to_articles.py
+++ b/erfgoedbot/add_coord_to_articles.py
@@ -20,6 +20,7 @@
 import pywikibot
 
 import monuments_config as mconfig
+import common as common
 from database_connection import (
     close_database_connection,
     connect_to_monuments_database,
@@ -122,7 +123,7 @@
             aMonument.article = redirTitle
         if (pageId):
             if not hasCoordinates(pageId, lang, cursorWiki):
-                addCoords(countrycode, lang, aMonument, coordconfig)
+                addCoords(countrycode, lang, aMonument, coordconfig, 
countryconfig)
 
 
 def getMonumentsWithCoordinates(countrycode, lang, cursor):
@@ -230,7 +231,7 @@
         return (pageNs, pageTitle)
 
 
-def addCoords(countrycode, lang, monument, coordconfig):
+def addCoords(countrycode, lang, monument, coordconfig, countryconfig):
     '''
     Add the coordinates to article.
     '''
@@ -269,11 +270,13 @@
         newtext = re.sub(catStart, replacementText, newtext, replCount, 
flags=re.IGNORECASE)
 
         if text != newtext:
-            wikilist = u''
-            matchWikipage = re.search("title=(.+?)&", monument.source)
-            if (matchWikipage and matchWikipage.group(1)):
-                wikilist = matchWikipage.group(1)
-            comment = u'Adding template %s based on [[%s]], # %s' % 
(coordTemplate, wikilist, monument.id)
+            try:
+                source_link = common.get_source_link(
+                    monument.source,
+                    countryconfig.get('type'))
+            except ValueError:
+                source_link = ''
+            comment = u'Adding template %s based on %s, # %s' % 
(coordTemplate, source_link, monument.id)
             pywikibot.showDiff(text, newtext)
             modPage = pywikibot.input(u'Modify page: %s ([y]/n) ?' % 
(monument.article))
             if (modPage.lower == 'y' or modPage == ''):
diff --git a/erfgoedbot/categorize_images.py b/erfgoedbot/categorize_images.py
index e6e7fef..2566416 100644
--- a/erfgoedbot/categorize_images.py
+++ b/erfgoedbot/categorize_images.py
@@ -18,13 +18,13 @@
 '''
 import json
 import os
-import re
 
 import pywikibot
 from pywikibot import pagegenerators
 from pywikibot import textlib
 
 import monuments_config as mconfig
+import common as common
 from database_connection import (
     close_database_connection,
     connect_to_monuments_database
@@ -53,7 +53,9 @@
     return json.load(open(json_file, 'r'))
 
 
-def categorizeImage(countrycode, lang, commonsTemplateName, 
commonsCategoryBase, commonsCatTemplates, page, conn, cursor):
+def categorizeImage(
+        countrycode, lang, commonsTemplateName, commonsCategoryBase,
+        commonsCatTemplates, page, conn, cursor, harvest_type):
     pywikibot.log(u'Working on: %s' % page.title())
     site = pywikibot.Site(u'commons', u'commons')
     commonsTemplate = pywikibot.Page(site, 'Template:%s' % commonsTemplateName)
@@ -96,7 +98,7 @@
             u'Monument with id %s not in monuments database' % (monumentId, ))
         return False
 
-    (newcats, categorisation_method) = get_new_categories(monumentId, monData, 
lang, commonsCatTemplates)
+    (newcats, categorisation_method) = get_new_categories(monumentId, monData, 
lang, commonsCatTemplates, harvest_type)
 
     # See if one of the three options worked
     if newcats:
@@ -124,7 +126,7 @@
     return monumentId
 
 
-def get_new_categories(monumentId, monData, lang, commonsCatTemplates):
+def get_new_categories(monumentId, monData, lang, commonsCatTemplates, 
harvest_type):
     (monumentName, monumentCommonscat,
      monumentArticleTitle, monumentSource, project) = monData
     commons_site = pywikibot.Site(u'commons', u'commons')
@@ -174,7 +176,7 @@
                     monumentId, str(e)))
 
     # Option three is to see if the list contains Commonscat links (whole list)
-    if not newcats:
+    if not newcats and harvest_type != 'sparql':
         monumentList = getList(lang, project, monumentSource)
         # print monumentList
         if not monumentList:
@@ -259,16 +261,15 @@
 
 
 def getList(lang, project, monumentSource):
-    '''
-    Get listpage
-    '''
+    """Get the listpage, if not harvested from a sparql query."""
     if monumentSource:
-        regex = u'^(https:)?//%s.%s.org/w/index.php\?title=(.+?)&' % (
-            lang, project)
-        match = re.search(regex, monumentSource)
-        if not match:
+        try:
+            page_title, found_site = common.get_source_page(monumentSource)
+        except ValueError:
             return False
-        page_title = match.group(2)
+        if (project, lang) != found_site:
+            return False
+
         site = pywikibot.Site(lang, project)
         return pywikibot.Page(site, page_title)
     else:
@@ -405,6 +406,7 @@
     site = pywikibot.Site(u'commons', u'commons')
     generator = None
     commonsTemplate = countryconfig.get('commonsTemplate')
+    harvest_type = countryconfig.get('type')
 
     if overridecat:
         commonsCategoryBase = pywikibot.Category(site, "%s:%s" % 
(site.namespace(14), overridecat))
@@ -421,7 +423,8 @@
         success = False
         if not totalImages >= 10000:
             success = categorizeImage(
-                countrycode, lang, commonsTemplate, commonsCategoryBase, 
commonsCatTemplates, page, conn, cursor)
+                countrycode, lang, commonsTemplate, commonsCategoryBase,
+                commonsCatTemplates, page, conn, cursor, harvest_type)
         if success:
             categorizedImages += 1
 
diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py
new file mode 100644
index 0000000..a6f0369
--- /dev/null
+++ b/erfgoedbot/common.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""Support library of commonly shared functions."""
+
+import re
+
+
+def get_source_link(source, harvest_type=None, label=None):
+    """
+    Format the source as an appropriate wiki link.
+
+    Requires the target page to be on the same wiki unless it is a sparql
+    harvest. Links to Wikidata are always prefixed.
+
+    @param source: the source value from the SQL table
+    @param harvest_type: the type of harvest from which the source was
+        extracted. E.g. "sparql"
+    @param label: Optional label to use for the link
+    """
+    try:
+        page_title, (project, lang) = get_source_page(source, harvest_type)
+    except ValueError:
+        raise
+    if project == 'wikidata':
+        page_title = ':d:{0}'.format(page_title)
+
+    if label:
+        return '[[{0}|{1}]]'.format(page_title, label)
+    return '[[{0}]]'.format(page_title)
+
+
+def get_source_page(source, harvest_type=None):
+    """
+    Retrieve the wikipage and site from the source field.
+
+    Note that the returned site tuple may not be a valid pywikibot site. E.g.
+    commons is ('wikimedia', 'commons') rather than ('commons', 'commons').
+
+    @param source: the source value from the SQL table
+    @harvest_type: the type of harvest from which the source was extracted.
+        e.g. "sparql"
+    """
+    site = None
+    page_name = None
+    if harvest_type == 'sparql':
+        site = ('wikidata', 'www')
+        page_name = source.split('/')[-1]
+    else:
+        supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
+        pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format(
+            '|'.join(supported_sites))
+        m = re.search(pattern, source)
+        try:
+            site = (m.group(2), m.group(1))
+            page_name = m.group(3)
+        except AttributeError:
+            raise ValueError(
+                u'Could not find source list ({0})'.format(source))
+
+    return (page_name, site)
diff --git a/erfgoedbot/unused_monument_images.py 
b/erfgoedbot/unused_monument_images.py
index 951588d..16e6ebe 100644
--- a/erfgoedbot/unused_monument_images.py
+++ b/erfgoedbot/unused_monument_images.py
@@ -11,11 +11,10 @@
 python unused_monument_images.py -countrycode:XX -langcode:YY
 
 '''
-import re
-
 import pywikibot
 
 import monuments_config as mconfig
+import common as common
 from database_connection import (
     close_database_connection,
     connect_to_monuments_database,
@@ -72,23 +71,22 @@
                 monumentId = monumentId.upper()
 
             if monumentId in withoutPhoto:
-                m = re.search(
-                    '^[^\?]+\?title\=(.+?)&', withoutPhoto.get(monumentId))
                 try:
-                    wikiSourceList = m.group(1)
-                except AttributeError:
+                    source_link = common.get_source_link(
+                        withoutPhoto.get(monumentId),
+                        countryconfig.get('type'),
+                        monumentId)
+                except ValueError:
                     pywikibot.warning(
                         u'Could not find wikiSourceList for %s (%s)' % (
                             monumentId, withoutPhoto.get(monumentId)))
                     continue
                 imageName = photos.get(catSortKey)
                 # pywikibot.output(u'Key %s returned a result' % (monumentId,))
-                # pywikibot.output(wikiSourceList)
                 # pywikibot.output(imageName)
                 if totalImages <= maxImages:
-                    text += \
-                        u'File:%s|[[%s|%s]]\n' % (
-                            unicode(imageName, 'utf-8'), wikiSourceList, 
monumentId)
+                    text += u'File:{0}|{1}\n'.format(
+                        unicode(imageName, 'utf-8'), source_link)
                 totalImages += 1
         except ValueError:
             pywikibot.warning(u'Got value error for %s' % (monumentId,))
diff --git a/tests/test_common.py b/tests/test_common.py
new file mode 100644
index 0000000..126eeb7
--- /dev/null
+++ b/tests/test_common.py
@@ -0,0 +1,57 @@
+"""Unit tests for common."""
+
+import unittest
+import mock
+from erfgoedbot import common
+
+
+class TestGetSourcePage(unittest.TestCase):
+
+    def test_getSourcePage_wikipedia(self):
+        source = '//en.wikipedia.org/w/index.php?title=foo&oldid=123'
+        result = common.get_source_page(source)
+        self.assertEquals(result, ('foo', ('wikipedia', 'en')))
+
+    def test_getSourcePage_wikipedia_urlencode(self):
+        source = 
'//ka.wikipedia.org/w/index.php?title=%E1%83%95%E1%83%98%E1%83%99&oldid=3179801'
+        result = common.get_source_page(source)
+        self.assertEquals(
+            result, ('%E1%83%95%E1%83%98%E1%83%99', ('wikipedia', 'ka')))
+
+    def test_getSourcePage_wikivoyage(self):
+        source = '//ru.wikivoyage.org/w/index.php?title=foo&oldid=123'
+        result = common.get_source_page(source)
+        self.assertEquals(result, ('foo', ('wikivoyage', 'ru')))
+
+    def test_getSourcePage_sparql(self):
+        source = 'http://www.wikidata.org/entity/Q123'
+        result = common.get_source_page(source, 'sparql')
+        self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+
+class TestGetSourceLink(unittest.TestCase):
+
+    def setUp(self):
+        patcher = mock.patch('erfgoedbot.common.get_source_page')
+        self.mock_get_source = patcher.start()
+        self.addCleanup(patcher.stop)
+
+    def test_getSourcePage_wikipedia(self):
+        self.mock_get_source.return_value = ('foo', ('wikipedia', 'en'))
+        result = common.get_source_link('a link')
+        self.assertEquals(result, '[[foo]]')
+
+    def test_getSourcePage_wikipedia_label(self):
+        self.mock_get_source.return_value = ('foo', ('wikipedia', 'en'))
+        result = common.get_source_link('a link', label='bar')
+        self.assertEquals(result, '[[foo|bar]]')
+
+    def test_getSourcePage_sparql(self):
+        self.mock_get_source.return_value = ('Q123', ('wikidata', 'www'))
+        result = common.get_source_link('a link', 'sparql')
+        self.assertEquals(result, '[[:d:Q123]]')
+
+    def test_getSourcePage_sparql_label(self):
+        self.mock_get_source.return_value = ('Q123', ('wikidata', 'www'))
+        result = common.get_source_link('a link', 'sparql', 'bar')
+        self.assertEquals(result, '[[:d:Q123|bar]]')

-- 
To view, visit https://gerrit.wikimedia.org/r/370481
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6197f30c318e11611d24c29a8ca9c45dc1134a80
Gerrit-PatchSet: 2
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: wikidata
Gerrit-Owner: Lokal Profil <lokal.pro...@gmail.com>
Gerrit-Reviewer: Jean-Frédéric <jeanfrederic.w...@gmail.com>
Gerrit-Reviewer: Lokal Profil <lokal.pro...@gmail.com>
Gerrit-Reviewer: Multichill <maar...@mdammers.nl>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Make scripts dealing with the sparql source field deal with sparql harvested data

Reply via email to