Lokal Profil has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/370775 )

Change subject: Harvest monument_article via sparql
......................................................................

Harvest monument_article via sparql

Bug: T172842
Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
---
M erfgoedbot/common.py
M erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/test_common.py
4 files changed, 45 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage refs/changes/75/370775/1

diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py
index a6f0369..333409c 100644
--- a/erfgoedbot/common.py
+++ b/erfgoedbot/common.py
@@ -29,6 +29,19 @@
     return '[[{0}]]'.format(page_title)
 
 
+def get_page_from_url(url):
+    """
+    Retrieve the wikipage and site from a page or entity url.
+    """
+    supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
+    pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format(
+        '|'.join(supported_sites))
+    m = re.search(pattern, url)
+    site = (m.group(2), m.group(1))
+    page_name = m.group(4)
+    return (page_name, site)
+
+
 def get_source_page(source, harvest_type=None):
     """
     Retrieve the wikipage and site from the source field.
@@ -43,8 +56,11 @@
     site = None
     page_name = None
     if harvest_type == 'sparql':
-        site = ('wikidata', 'www')
-        page_name = source.split('/')[-1]
+        try:
+            return get_page_from_url(source)
+        except AttributeError:
+            raise ValueError(
+                u'Could not find source list ({0})'.format(source))
     else:
         supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
         pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format(
diff --git a/erfgoedbot/template/wikidata_query.sparql b/erfgoedbot/template/wikidata_query.sparql
index 6a3878d..266e6ea 100644
--- a/erfgoedbot/template/wikidata_query.sparql
+++ b/erfgoedbot/template/wikidata_query.sparql
@@ -1,9 +1,11 @@
 # MonumentsDB harvest
-SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
+SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
   # Make it properties and filter out end time
 
   %(select_statement)s .
 
+  OPTIONAL { ?monument_article schema:about ?item;
+                               schema:isPartOf <https://%(lang)s.%(project)s.org/>; } .
   OPTIONAL { ?item wdt:P131 ?admin } .
   OPTIONAL { ?item wdt:P18  ?image } .
   OPTIONAL { ?item wdt:P373 ?commonscat } .
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py
index 23e4670..7d86e93 100755
--- a/erfgoedbot/update_database.py
+++ b/erfgoedbot/update_database.py
@@ -21,6 +21,7 @@
 from pywikibot import pagegenerators
 
 import monuments_config as mconfig
+import common as common
 from converters import (
     extractWikilink,
     extract_elements_from_template_param,
@@ -257,6 +258,9 @@
     if params['adminLabel']:
         params['admin'] = params['adminLabel'].value
 
+    if params['monument_article']:
+        params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value)
+
     params['source'] = params['item'].value
     params['wd_item'] = params['item'].getID()
 
@@ -473,7 +477,8 @@
 
     sparql_query = sparql_template % dict(
         select_statement=sparql_select,
-        lang=countryconfig.get('lang')
+        lang=countryconfig.get('lang'),
+        project=countryconfig.get('project')
     )
     # print sparql_query
     sq = pywikibot.data.sparql.SparqlQuery()
diff --git a/tests/test_common.py b/tests/test_common.py
index 126eeb7..2f016c0 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -29,6 +29,24 @@
         self.assertEquals(result, ('Q123', ('wikidata', 'www')))
 
 
+class TestGetPageFromUrl(unittest.TestCase):
+
+    def test_get_page_from_url_entity(self):
+        source = 'http://www.wikidata.org/entity/Q123'
+        result = common.get_page_from_url(source)
+        self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+    def test_get_page_from_url_page(self):
+        source = 'http://www.wikidata.org/wiki/Q123'
+        result = common.get_page_from_url(source)
+        self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+    def test_get_page_from_url_wikipedia(self):
+        source = 'http://en.wikipedia.org/entity/foo'
+        result = common.get_page_from_url(source)
+        self.assertEquals(result, ('foo', ('wikipedia', 'en')))
+
+
 class TestGetSourceLink(unittest.TestCase):
 
     def setUp(self):

-- 
To view, visit https://gerrit.wikimedia.org/r/370775
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: wikidata
Gerrit-Owner: Lokal Profil <lokal.pro...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to