Lokal Profil has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/364840 )
Change subject: Proof of concept to harvest Wikidata into monuments database
......................................................................
Proof of concept to harvest Wikidata into monuments database
Bug: T165988
Change-Id: I3b4d656a3f71f436507b27eabcf38623e2bd23ec
---
M erfgoedbot/monuments_config/nl-wd_nl.json
A erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/custom_assertions.py
M tests/test_monuments_config.py
5 files changed, 136 insertions(+), 7 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage
refs/changes/40/364840/1
diff --git a/erfgoedbot/monuments_config/nl-wd_nl.json
b/erfgoedbot/monuments_config/nl-wd_nl.json
index 8678b14..61d8216 100644
--- a/erfgoedbot/monuments_config/nl-wd_nl.json
+++ b/erfgoedbot/monuments_config/nl-wd_nl.json
@@ -11,7 +11,7 @@
"unusedImagesPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse Erfgoed
Inventarisatie/Ongebruikte foto's",
"imagesWithoutIdPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse Erfgoed
Inventarisatie/Foto's zonder id",
"missingCommonscatPage": "Wikipedia:Wikiproject/Erfgoed/Nederlandse
Erfgoed Inventarisatie/Missende commonscat links",
- "sparql": "{ ?item wdt:P359 [] } UNION { ?item wdt:P1435 wd:Q916333 }
UNION { ?item wdt:P1435 wd:Q13423591 } UNION { ?item wdt:P1435 wd:Q17698911 }",
+ "sparql": "{ ?item wdt:P359 ?id } UNION { ?item wdt:P1435 wd:Q916333 }
UNION { ?item wdt:P1435 wd:Q13423591 } UNION { ?item wdt:P1435 wd:Q17698911 }",
"sql_lang": "Dutch # Wikidata",
"sql_country": "Netherlands",
"sql_data": {
diff --git a/erfgoedbot/template/wikidata_query.sparql
b/erfgoedbot/template/wikidata_query.sparql
new file mode 100644
index 0000000..dd6dd79
--- /dev/null
+++ b/erfgoedbot/template/wikidata_query.sparql
@@ -0,0 +1,13 @@
+# MonumentsDB harvest
+SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat
?address ?coordinate WHERE {
+ # Make it properties and filter out end time
+
+ %(select_statement)s
+
+ OPTIONAL { ?item wdt:P131 ?admin } .
+ OPTIONAL { ?item wdt:P18 ?image } .
+ OPTIONAL { ?item wdt:P373 ?commonscat } .
+ OPTIONAL { ?item wdt:P969 ?address } .
+ OPTIONAL { ?item wdt:P625 ?coordinate } .
+ SERVICE wikibase:label { bd:serviceParam wikibase:language "%(lang)s" }
+ }
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py
index 0e49d4a..23e4670 100755
--- a/erfgoedbot/update_database.py
+++ b/erfgoedbot/update_database.py
@@ -11,10 +11,13 @@
python update_database.py -countrycode:XX -langcode:YY
'''
+import os
import warnings
import datetime
+import urlparse
import pywikibot
+import pywikibot.data.sparql
from pywikibot import pagegenerators
import monuments_config as mconfig
@@ -243,6 +246,55 @@
return contents
def process_monument_wikidata(params, countryconfig, conn, cursor):
    """Process a single instance of a wikidata sparql result.

    Flattens one result row into plain column values and writes it to the
    table named by the country config using a REPLACE INTO statement.

    params is modified in place. It must contain the keys 'item',
    'itemLabel', 'adminLabel', 'image' and 'coordinate'; truthy entries
    among these are read through their ``.value`` attribute and 'item'
    must also provide ``getID()``.
    countryconfig is only consulted for its 'table' entry.
    conn is accepted for symmetry with the other processors and not used.
    cursor is the database cursor the statement is executed on.
    """
    if params['itemLabel']:
        params['name'] = params['itemLabel'].value

    if params['image']:
        # Keep only the file name: last path segment of the decoded URL.
        params['image'] = urlparse.unquote(
            params['image'].value).split('/')[-1]

    if params['adminLabel']:
        params['admin'] = params['adminLabel'].value

    params['source'] = params['item'].value
    params['wd_item'] = params['item'].getID()

    if params['coordinate']:
        # WKT point literals read "Point(<longitude> <latitude>)", i.e. the
        # longitude comes first.
        params['lon'], params['lat'] = (
            params['coordinate'].value[len('Point('):-1].split(' '))

    # These entries only served to derive the values above.
    for helper_key in ('coordinate', 'adminLabel', 'itemLabel', 'item'):
        del params[helper_key]

    # Drop empty entries so that only populated columns are written.
    for key in [key for key, value in params.items() if not value]:
        del params[key]

    columns = []
    value_list = []
    for key, value in params.items():
        columns.append(u"""`%s`""" % (key,))
        value_list.append(value)

    # One %s placeholder per column; the values are passed separately so
    # that the database driver does the escaping.
    query = u"""REPLACE INTO `%s`(%s) VALUES (%s)""" % (
        countryconfig.get('table'),
        u', '.join(columns),
        u', '.join([u"""%s"""] * len(columns)))

    # Warnings issued while executing are captured and not inspected.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        cursor.execute(query, value_list)
+
+
def processMonument(params, source, countryconfig, conn, cursor, sourcePage,
headerDefaults, unknownFields):
"""Process a single instance of a monument row template."""
@@ -351,6 +403,14 @@
def processCountry(countryconfig, conn, cursor, fullUpdate, daysBack):
    """Process all the monuments of one country.

    Configs whose 'type' is 'sparql' are handed to the sparql based
    processor; every other config goes through the row template processor.
    """
    uses_sparql = countryconfig.get('type') == 'sparql'
    if not uses_sparql:
        process_country_list(
            countryconfig, conn, cursor, fullUpdate, daysBack)
        return
    # fullUpdate and daysBack are not passed on for sparql configs.
    process_country_wikidata(countryconfig, conn, cursor)
+
+
+def process_country_list(countryconfig, conn, cursor, fullUpdate, daysBack):
+ """Process all the monuments of one country using row templates."""
site = pywikibot.Site(countryconfig.get('lang'),
countryconfig.get('project'))
rowTemplate = pywikibot.Page(
site, u'%s:%s' % (site.namespace(10),
countryconfig.get('rowTemplate')))
@@ -392,6 +452,36 @@
'Could not update field statistics. Details below:\n{}'.format(e))
def load_wikidata_template_sparql():
    """Read the SPARQL query template and return its content as a string."""
    template_path = os.path.join(get_template_dir(), 'wikidata_query.sparql')
    with open(template_path, 'r') as template_file:
        return template_file.read()
+
+
def get_template_dir():
    """Return the absolute path of the 'template' directory.

    The directory is resolved relative to the location of this module.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, 'template')
+
+
def process_country_wikidata(countryconfig, conn, cursor):
    """Process all the monuments of one country using sparql.

    Fills the SPARQL template with the 'sparql' and 'lang' entries of
    countryconfig, runs the query and hands every result row to
    process_monument_wikidata together with conn and cursor.

    If the query yields no result object an error is logged and nothing is
    written.
    """
    sparql_query = load_wikidata_template_sparql() % dict(
        select_statement=countryconfig.get('sparql'),
        lang=countryconfig.get('lang')
    )

    sq = pywikibot.data.sparql.SparqlQuery()
    query_result = sq.select(sparql_query, full_data=True)
    if query_result is None:
        # NOTE(review): pywikibot documents select() as returning None when
        # the query fails - confirm for the deployed pywikibot version.
        pywikibot.error(
            u'SPARQL query for table "%s" returned no result; '
            u'nothing was updated' % (countryconfig.get('table'),))
        return

    for resultitem in query_result:
        process_monument_wikidata(resultitem, countryconfig, conn, cursor)
+
+
def main():
"""The main loop."""
# First find out what to work on
@@ -426,14 +516,14 @@
u'I have no config for countrycode "%s" in language "%s"' % (
countrycode, lang))
return False
+
pywikibot.log(
u'Working on countrycode "%s" in language "%s"' % (
countrycode, lang))
-
try:
- processCountry(
- mconfig.countries.get((countrycode, lang)), conn, cursor,
- fullUpdate, daysBack)
+ countryconfig = mconfig.countries.get((countrycode, lang))
+ processCountry(countryconfig, conn, cursor,
+ fullUpdate, daysBack)
except Exception, e:
pywikibot.error(
u"Unknown error occurred when processing country "
diff --git a/tests/custom_assertions.py b/tests/custom_assertions.py
index 6f40acc..e68b9a1 100644
--- a/tests/custom_assertions.py
+++ b/tests/custom_assertions.py
@@ -21,6 +21,19 @@
diff = set(first) - set(second)
raise AssertionError(error_msg % ', '.join(diff))
def assert_all_in_string(self, first, text, msg=None):
    """Test that all first are in text, else append failing to msg.

    first is an iterable of strings that must each occur in text.
    msg is optional: with self.longMessage set it is prefixed to the
    default message, otherwise it is used as a format string that
    receives the comma separated list of missing entries.

    Raises AssertionError if at least one entry of first is missing.
    """
    # A list (not a lazy filter object) so the emptiness check below also
    # holds on Python 3.
    not_found = [entry for entry in first if entry not in text]
    if not not_found:
        return

    failing = ', '.join(not_found)
    # Format exactly once: text may itself contain percent signs.
    error_msg = u'[%s] not found in %s' % (failing, text)
    if msg:
        if not self.longMessage:
            # msg itself specifies where to append failing
            error_msg = msg % failing
        else:
            error_msg = u'%s: %s' % (msg, error_msg)
    raise AssertionError(error_msg)
+
def assert_is_ascii(self, text, msg=None):
"""Assert that a string is ascii."""
error_msg = u'"%s" not ascii' % text
diff --git a/tests/test_monuments_config.py b/tests/test_monuments_config.py
index b074be9..fdd5353 100644
--- a/tests/test_monuments_config.py
+++ b/tests/test_monuments_config.py
@@ -57,10 +57,11 @@
required = required_all + required_base_sql
optional = optional_base
else:
+ required = required_all + required_sql
if data.get('type') == 'sparql':
- required = required_all + required_base_sparql +
required_sql
+ required += required_base_sparql
else:
- required = required_all + required_base_sql + required_sql
+ required += required_base_sql
optional = optional_base + optional_sql
self.assertIsInstance(data, dict, msg=self.label)
self.assert_all_in(required, data.keys(), msg=self.label)
@@ -194,3 +195,15 @@
if data.get(template):
self.assertNotIn('_', data.get(template), msg=self.label)
self.assertNotIn(':', data.get(template), msg=self.label)
+
def test_monuments_config_valid_sparql(self):
    """Check that each sparql config mentions both ?item and ?id."""
    # TODO: should ensure primkey entries are present in field
    required_selects = ['?item', '?id']
    for key, data in config.countries.iteritems():
        # Only configs of type 'sparql' are checked.
        if data.get('type') == 'sparql':
            self.set_label(key)
            self.assert_all_in_string(
                required_selects, data.get('sparql'), msg=self.label)
--
To view, visit https://gerrit.wikimedia.org/r/364840
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3b4d656a3f71f436507b27eabcf38623e2bd23ec
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Jean-Frédéric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits