[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Harvest monument_article via sparql

2017-08-09 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/370775 )

Change subject: Harvest monument_article via sparql
..


Harvest monument_article via sparql

Bug: T172842
Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
---
M erfgoedbot/common.py
M erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/test_common.py
4 files changed, 45 insertions(+), 4 deletions(-)

Approvals:
  Jean-Frédéric: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py
index a6f0369..333409c 100644
--- a/erfgoedbot/common.py
+++ b/erfgoedbot/common.py
@@ -29,6 +29,19 @@
 return '[[{0}]]'.format(page_title)
 
 
+def get_page_from_url(url):
+"""
+Retrieve the wikipage and site from a page or entity url.
+"""
+supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
+pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format(
+'|'.join(supported_sites))
+m = re.search(pattern, url)
+site = (m.group(2), m.group(1))
+page_name = m.group(4)
+return (page_name, site)
+
+
 def get_source_page(source, harvest_type=None):
 """
 Retrieve the wikipage and site from the source field.
@@ -43,8 +56,11 @@
 site = None
 page_name = None
 if harvest_type == 'sparql':
-site = ('wikidata', 'www')
-page_name = source.split('/')[-1]
+try:
+return get_page_from_url(source)
+except AttributeError:
+raise ValueError(
+u'Could not find source list ({0})'.format(source))
 else:
 supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
 pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format(
diff --git a/erfgoedbot/template/wikidata_query.sparql 
b/erfgoedbot/template/wikidata_query.sparql
index 6a3878d..266e6ea 100644
--- a/erfgoedbot/template/wikidata_query.sparql
+++ b/erfgoedbot/template/wikidata_query.sparql
@@ -1,9 +1,11 @@
 # MonumentsDB harvest
-SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
+SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
   # Make it properties and filter out end time
 
   %(select_statement)s .
 
+  OPTIONAL { ?monument_article schema:about ?item;
+   schema:isPartOf <https://%(lang)s.%(project)s.org/>; } .
   OPTIONAL { ?item wdt:P131 ?admin } .
   OPTIONAL { ?item wdt:P18  ?image } .
   OPTIONAL { ?item wdt:P373 ?commonscat } .
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py
index 23e4670..7d86e93 100755
--- a/erfgoedbot/update_database.py
+++ b/erfgoedbot/update_database.py
@@ -21,6 +21,7 @@
 from pywikibot import pagegenerators
 
 import monuments_config as mconfig
+import common as common
 from converters import (
 extractWikilink,
 extract_elements_from_template_param,
@@ -257,6 +258,9 @@
 if params['adminLabel']:
 params['admin'] = params['adminLabel'].value
 
+if params['monument_article']:
+params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value)
+
 params['source'] = params['item'].value
 params['wd_item'] = params['item'].getID()
 
@@ -473,7 +477,8 @@
 
 sparql_query = sparql_template % dict(
 select_statement=sparql_select,
-lang=countryconfig.get('lang')
+lang=countryconfig.get('lang'),
+project=countryconfig.get('project')
 )
 # print sparql_query
 sq = pywikibot.data.sparql.SparqlQuery()
diff --git a/tests/test_common.py b/tests/test_common.py
index 126eeb7..2f016c0 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -29,6 +29,24 @@
 self.assertEquals(result, ('Q123', ('wikidata', 'www')))
 
 
+class TestGetPageFromUrl(unittest.TestCase):
+
+def test_get_page_from_url_entity(self):
+source = 'http://www.wikidata.org/entity/Q123'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+def test_get_page_from_url_page(self):
+source = 'http://www.wikidata.org/wiki/Q123'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+def test_get_page_from_url_wikipedia(self):
+source = 'http://en.wikipedia.org/entity/foo'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('foo', ('wikipedia', 'en')))
+
+
 class TestGetSourceLink(unittest.TestCase):
 
 def setUp(self):

-- 
To view, visit https://gerrit.wikimedia.org/r/370775
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage

[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Harvest monument_article via sparql

2017-08-08 Thread Lokal Profil (Code Review)
Lokal Profil has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/370775 )

Change subject: Harvest monument_article via sparql
..

Harvest monument_article via sparql

Bug: T172842
Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
---
M erfgoedbot/common.py
M erfgoedbot/template/wikidata_query.sparql
M erfgoedbot/update_database.py
M tests/test_common.py
4 files changed, 45 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage 
refs/changes/75/370775/1

diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py
index a6f0369..333409c 100644
--- a/erfgoedbot/common.py
+++ b/erfgoedbot/common.py
@@ -29,6 +29,19 @@
 return '[[{0}]]'.format(page_title)
 
 
+def get_page_from_url(url):
+"""
+Retrieve the wikipage and site from a page or entity url.
+"""
+supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
+pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format(
+'|'.join(supported_sites))
+m = re.search(pattern, url)
+site = (m.group(2), m.group(1))
+page_name = m.group(4)
+return (page_name, site)
+
+
 def get_source_page(source, harvest_type=None):
 """
 Retrieve the wikipage and site from the source field.
@@ -43,8 +56,11 @@
 site = None
 page_name = None
 if harvest_type == 'sparql':
-site = ('wikidata', 'www')
-page_name = source.split('/')[-1]
+try:
+return get_page_from_url(source)
+except AttributeError:
+raise ValueError(
+u'Could not find source list ({0})'.format(source))
 else:
 supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia']
 pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format(
diff --git a/erfgoedbot/template/wikidata_query.sparql 
b/erfgoedbot/template/wikidata_query.sparql
index 6a3878d..266e6ea 100644
--- a/erfgoedbot/template/wikidata_query.sparql
+++ b/erfgoedbot/template/wikidata_query.sparql
@@ -1,9 +1,11 @@
 # MonumentsDB harvest
-SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
+SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE {
   # Make it properties and filter out end time
 
   %(select_statement)s .
 
+  OPTIONAL { ?monument_article schema:about ?item;
+   schema:isPartOf <https://%(lang)s.%(project)s.org/>; } .
   OPTIONAL { ?item wdt:P131 ?admin } .
   OPTIONAL { ?item wdt:P18  ?image } .
   OPTIONAL { ?item wdt:P373 ?commonscat } .
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py
index 23e4670..7d86e93 100755
--- a/erfgoedbot/update_database.py
+++ b/erfgoedbot/update_database.py
@@ -21,6 +21,7 @@
 from pywikibot import pagegenerators
 
 import monuments_config as mconfig
+import common as common
 from converters import (
 extractWikilink,
 extract_elements_from_template_param,
@@ -257,6 +258,9 @@
 if params['adminLabel']:
 params['admin'] = params['adminLabel'].value
 
+if params['monument_article']:
+params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value)
+
 params['source'] = params['item'].value
 params['wd_item'] = params['item'].getID()
 
@@ -473,7 +477,8 @@
 
 sparql_query = sparql_template % dict(
 select_statement=sparql_select,
-lang=countryconfig.get('lang')
+lang=countryconfig.get('lang'),
+project=countryconfig.get('project')
 )
 # print sparql_query
 sq = pywikibot.data.sparql.SparqlQuery()
diff --git a/tests/test_common.py b/tests/test_common.py
index 126eeb7..2f016c0 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -29,6 +29,24 @@
 self.assertEquals(result, ('Q123', ('wikidata', 'www')))
 
 
+class TestGetPageFromUrl(unittest.TestCase):
+
+def test_get_page_from_url_entity(self):
+source = 'http://www.wikidata.org/entity/Q123'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+def test_get_page_from_url_page(self):
+source = 'http://www.wikidata.org/wiki/Q123'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('Q123', ('wikidata', 'www')))
+
+def test_get_page_from_url_wikipedia(self):
+source = 'http://en.wikipedia.org/entity/foo'
+result = common.get_page_from_url(source)
+self.assertEquals(result, ('foo', ('wikipedia', 'en')))
+
+
 class TestGetSourceLink(unittest.TestCase):
 
 def setUp(self):

-- 
To view, visit https://gerrit.wikimedia.org/r/370775
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage