Lokal Profil has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/364840 )

Change subject: Proof of concept to harvest Wikidata into monuments database
......................................................................

Proof of concept to harvest Wikidata into monuments database

Bug: T165988
Change-Id: I3b4d656a3f71f436507b27eabcf38623e2bd23ec
---
M erfgoedbot/monuments_config/nl-wd_nl.json
A erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/custom_assertions.py
M tests/test_monuments_config.py
5 files changed, 136 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage refs/changes/40/364840/1

diff --git a/erfgoedbot/monuments_config/nl-wd_nl.json b/erfgoedbot/monuments_config/nl-wd_nl.json
index 8678b14..61d8216 100644
--- a/erfgoedbot/monuments_config/nl-wd_nl.json
+++ b/erfgoedbot/monuments_config/nl-wd_nl.json
@@ -11,7 +11,7 @@
     "unusedImagesPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse Erfgoed Inventarisatie/Ongebruikte foto's",
     "imagesWithoutIdPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse Erfgoed Inventarisatie/Foto's zonder id",
     "missingCommonscatPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse Erfgoed Inventarisatie/Missende commonscat links",
-    "sparql": "{ ?item wdt:P359 [] } UNION { ?item wdt:P1435 wd:Q916333 } UNION { ?item wdt:P1435 wd:Q13423591 } UNION { ?item wdt:P1435 wd:Q17698911 }",
+    "sparql": "{ ?item wdt:P359 ?id } UNION { ?item wdt:P1435 wd:Q916333 } UNION { ?item wdt:P1435 wd:Q13423591 } UNION { ?item wdt:P1435 wd:Q17698911 }",
     "sql_lang": "Dutch  # Wikidata",
     "sql_country": "Netherlands",
     "sql_data": {
diff --git a/erfgoedbot/template/wikidata_query.sparql b/erfgoedbot/template/wikidata_query.sparql
new file mode 100644
index 0000000..dd6dd79
--- /dev/null
+++ b/erfgoedbot/template/wikidata_query.sparql
@@ -0,0 +1,13 @@
+# MonumentsDB harvest
+SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
+  # Make it properties and filter out end time
+
+  %(select_statement)s
+
+  OPTIONAL { ?item wdt:P131 ?admin } .
+  OPTIONAL { ?item wdt:P18  ?image } .
+  OPTIONAL { ?item wdt:P373 ?commonscat } .
+  OPTIONAL { ?item wdt:P969 ?address } .
+  OPTIONAL { ?item wdt:P625 ?coordinate } .
+  SERVICE wikibase:label { bd:serviceParam wikibase:language "%(lang)s" }
+  }
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py
index 0e49d4a..23e4670 100755
--- a/erfgoedbot/update_database.py
+++ b/erfgoedbot/update_database.py
@@ -11,10 +11,13 @@
 python update_database.py -countrycode:XX -langcode:YY
 
 '''
+import os
 import warnings
 import datetime
+import urlparse
 
 import pywikibot
+import pywikibot.data.sparql
 from pywikibot import pagegenerators
 
 import monuments_config as mconfig
@@ -243,6 +246,55 @@
     return contents
 
 
+def process_monument_wikidata(params, countryconfig, conn, cursor):
+    """Process a single instance of a wikidata sparql result."""
+    if params['itemLabel']:
+        params['name'] = params['itemLabel'].value
+
+    if params['image']:
+        params['image'] = urlparse.unquote(params['image'].value).split('/')[-1]
+
+    if params['adminLabel']:
+        params['admin'] = params['adminLabel'].value
+
+    params['source'] = params['item'].value
+    params['wd_item'] = params['item'].getID()
+
+    if params['coordinate']:
+        params['lat'], params['lon'] = params['coordinate'].value[len('Point('):-1].split(' ')
+
+    del params['coordinate']
+    del params['adminLabel']
+    del params['itemLabel']
+    del params['item']
+
+    kill_list = []
+    for key, value in params.items():
+        if not value:
+            kill_list.append(key)
+    for key in kill_list:
+        del params[key]
+
+    query = u"""REPLACE INTO `%s`(""" % (countryconfig.get('table'),)
+
+    first_query = u''
+    second_query = u''
+    delimiter = u''
+    value_list = []
+    for key, value in params.items():
+        first_query += delimiter + u"""`%s`""" % (key,)
+        second_query += delimiter + u"""%s"""
+        value_list.append(value)
+        delimiter = u', '
+
+    query += first_query + u""") VALUES ("""
+
+    query += second_query + u""")"""
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        cursor.execute(query, value_list)
+
+
 def processMonument(params, source, countryconfig, conn, cursor, sourcePage,
                     headerDefaults, unknownFields):
     """Process a single instance of a monument row template."""
@@ -351,6 +403,14 @@
 
 def processCountry(countryconfig, conn, cursor, fullUpdate, daysBack):
     """Process all the monuments of one country."""
+    if countryconfig.get('type') == 'sparql':
+        process_country_wikidata(countryconfig, conn, cursor)
+    else:
+        process_country_list(countryconfig, conn, cursor, fullUpdate, daysBack)
+
+
+def process_country_list(countryconfig, conn, cursor, fullUpdate, daysBack):
+    """Process all the monuments of one country using row templates."""
     site = pywikibot.Site(countryconfig.get('lang'), countryconfig.get('project'))
     rowTemplate = pywikibot.Page(
         site, u'%s:%s' % (site.namespace(10), countryconfig.get('rowTemplate')))
@@ -392,6 +452,36 @@
             'Could not update field statistics. Details below:\n{}'.format(e))
 
 
+def load_wikidata_template_sparql():
+    """Fetch the SPARQL template for a wikidata config."""
+    filename = 'wikidata_query.sparql'
+    with open(os.path.join(get_template_dir(), filename), 'r') as f:
+        sparql = f.read()
+    return sparql
+
+
+def get_template_dir():
+    """Return the absolute path of the template directory."""
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), 'template')
+
+
+def process_country_wikidata(countryconfig, conn, cursor):
+    """Process all the monuments of one country using sparql."""
+    sparql_select = countryconfig.get('sparql')
+    sparql_template = load_wikidata_template_sparql()
+
+    sparql_query = sparql_template % dict(
+        select_statement=sparql_select,
+        lang=countryconfig.get('lang')
+    )
+    # print sparql_query
+    sq = pywikibot.data.sparql.SparqlQuery()
+    query_result = sq.select(sparql_query, full_data=True)
+    for resultitem in query_result:
+        process_monument_wikidata(resultitem, countryconfig, conn, cursor)
+
+
 def main():
     """The main loop."""
     # First find out what to work on
@@ -426,14 +516,14 @@
                 u'I have no config for countrycode "%s" in language "%s"' % (
                     countrycode, lang))
             return False
+
         pywikibot.log(
             u'Working on countrycode "%s" in language "%s"' % (
                 countrycode, lang))
-
         try:
-            processCountry(
-                mconfig.countries.get((countrycode, lang)), conn, cursor,
-                fullUpdate, daysBack)
+            countryconfig = mconfig.countries.get((countrycode, lang))
+            processCountry(countryconfig, conn, cursor,
+                           fullUpdate, daysBack)
         except Exception, e:
             pywikibot.error(
                 u"Unknown error occurred when processing country "
diff --git a/tests/custom_assertions.py b/tests/custom_assertions.py
index 6f40acc..e68b9a1 100644
--- a/tests/custom_assertions.py
+++ b/tests/custom_assertions.py
@@ -21,6 +21,19 @@
             diff = set(first) - set(second)
             raise AssertionError(error_msg % ', '.join(diff))
 
+    def assert_all_in_string(self, first, text, msg=None):
+        """Assert every item of first occurs in text, else list failures in msg."""
+        error_msg = u'[%%s] not found in %s' % text
+        if msg:
+            if not self.longMessage:
+                # msg itself specifies where to append failing
+                error_msg = msg
+            else:
+                error_msg = u'%s: %s' % (msg, error_msg)
+        not_found = filter(lambda s: s not in text, first)
+        if not_found:
+            raise AssertionError(error_msg % ', '.join(not_found))
+
     def assert_is_ascii(self, text, msg=None):
         """Assert that a string is ascii."""
         error_msg = u'"%s" not ascii' % text
diff --git a/tests/test_monuments_config.py b/tests/test_monuments_config.py
index b074be9..fdd5353 100644
--- a/tests/test_monuments_config.py
+++ b/tests/test_monuments_config.py
@@ -57,10 +57,11 @@
                 required = required_all + required_base_sql
                 optional = optional_base
             else:
+                required = required_all + required_sql
                 if data.get('type') == 'sparql':
-                    required = required_all + required_base_sparql + required_sql
+                    required += required_base_sparql
                 else:
-                    required = required_all + required_base_sql + required_sql
+                    required += required_base_sql
                 optional = optional_base + optional_sql
             self.assertIsInstance(data, dict, msg=self.label)
             self.assert_all_in(required, data.keys(), msg=self.label)
@@ -194,3 +195,15 @@
                 if data.get(template):
                     self.assertNotIn('_', data.get(template), msg=self.label)
                     self.assertNotIn(':', data.get(template), msg=self.label)
+
+    def test_monuments_config_valid_sparql(self):
+        """Ensure that the sparql query delivers ?item and ?id."""
+        # TODO: should ensure primkey entries are present in field
+        for key, data in config.countries.iteritems():
+            if data.get('type') != 'sparql':
+                continue
+
+            self.set_label(key)
+            required_selects = ['?item', '?id']
+            self.assert_all_in_string(
+                required_selects, data.get('sparql'), msg=self.label)

-- 
To view, visit https://gerrit.wikimedia.org/r/364840
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3b4d656a3f71f436507b27eabcf38623e2bd23ec
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Jean-Frédéric <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to