[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Harvest monument_article via sparql
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/370775 ) Change subject: Harvest monument_article via sparql .. Harvest monument_article via sparql Bug: T172842 Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5 --- M erfgoedbot/common.py M erfgoedbot/template/wikidata_query.sparql M erfgoedbot/update_database.py M tests/test_common.py 4 files changed, 45 insertions(+), 4 deletions(-) Approvals: Jean-Frédéric: Looks good to me, approved jenkins-bot: Verified diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py index a6f0369..333409c 100644 --- a/erfgoedbot/common.py +++ b/erfgoedbot/common.py @@ -29,6 +29,19 @@ return '[[{0}]]'.format(page_title) +def get_page_from_url(url): +""" +Retrieve the wikipage and site from a page or entity url. +""" +supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] +pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format( +'|'.join(supported_sites)) +m = re.search(pattern, url) +site = (m.group(2), m.group(1)) +page_name = m.group(4) +return (page_name, site) + + def get_source_page(source, harvest_type=None): """ Retrieve the wikipage and site from the source field. 
@@ -43,8 +56,11 @@ site = None page_name = None if harvest_type == 'sparql': -site = ('wikidata', 'www') -page_name = source.split('/')[-1] +try: +return get_page_from_url(source) +except AttributeError: +raise ValueError( +u'Could not find source list ({0})'.format(source)) else: supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format( diff --git a/erfgoedbot/template/wikidata_query.sparql b/erfgoedbot/template/wikidata_query.sparql index 6a3878d..266e6ea 100644 --- a/erfgoedbot/template/wikidata_query.sparql +++ b/erfgoedbot/template/wikidata_query.sparql @@ -1,9 +1,11 @@ # MonumentsDB harvest -SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { +SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { # Make it properties and filter out end time %(select_statement)s . + OPTIONAL { ?monument_article schema:about ?item; + schema:isPartOf <https://%(lang)s.%(project)s.org/> ; } . OPTIONAL { ?item wdt:P131 ?admin } . OPTIONAL { ?item wdt:P18 ?image } . OPTIONAL { ?item wdt:P373 ?commonscat } . 
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py index 23e4670..7d86e93 100755 --- a/erfgoedbot/update_database.py +++ b/erfgoedbot/update_database.py @@ -21,6 +21,7 @@ from pywikibot import pagegenerators import monuments_config as mconfig +import common as common from converters import ( extractWikilink, extract_elements_from_template_param, @@ -257,6 +258,9 @@ if params['adminLabel']: params['admin'] = params['adminLabel'].value +if params['monument_article']: +params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value) + params['source'] = params['item'].value params['wd_item'] = params['item'].getID() @@ -473,7 +477,8 @@ sparql_query = sparql_template % dict( select_statement=sparql_select, -lang=countryconfig.get('lang') +lang=countryconfig.get('lang'), +project=countryconfig.get('project') ) # print sparql_query sq = pywikibot.data.sparql.SparqlQuery() diff --git a/tests/test_common.py b/tests/test_common.py index 126eeb7..2f016c0 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -29,6 +29,24 @@ self.assertEquals(result, ('Q123', ('wikidata', 'www'))) +class TestGetPageFromUrl(unittest.TestCase): + +def test_get_page_from_url_entity(self): +source = 'http://www.wikidata.org/entity/Q123' +result = common.get_page_from_url(source) +self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + +def test_get_page_from_url_page(self): +source = 'http://www.wikidata.org/wiki/Q123' +result = common.get_page_from_url(source) +self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + +def test_get_page_from_url_wikipedia(self): +source = 'http://en.wikipedia.org/entity/foo' +result = common.get_page_from_url(source) +self.assertEquals(result, ('foo', ('wikipedia', 'en'))) + + class TestGetSourceLink(unittest.TestCase): def setUp(self): -- To view, visit https://gerrit.wikimedia.org/r/370775 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: 
I43b33b24e94e4ba40ed16add53c72a6fd6e967a5 Gerrit-PatchSet: 1 Gerrit-Project: labs/tools/heritage
[MediaWiki-commits] [Gerrit] labs...heritage[wikidata]: Harvest monument_article via sparql
Lokal Profil has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/370775 ) Change subject: Harvest monument_article via sparql .. Harvest monument_article via sparql Bug: T172842 Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5 --- M erfgoedbot/common.py M erfgoedbot/template/wikidata_query.sparql M erfgoedbot/update_database.py M tests/test_common.py 4 files changed, 45 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage refs/changes/75/370775/1 diff --git a/erfgoedbot/common.py b/erfgoedbot/common.py index a6f0369..333409c 100644 --- a/erfgoedbot/common.py +++ b/erfgoedbot/common.py @@ -29,6 +29,19 @@ return '[[{0}]]'.format(page_title) +def get_page_from_url(url): +""" +Retrieve the wikipage and site from a page or entity url. +""" +supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] +pattern = '\/\/(.+?)\.({0})\.org\/(wiki|entity)\/(.+?)$'.format( +'|'.join(supported_sites)) +m = re.search(pattern, url) +site = (m.group(2), m.group(1)) +page_name = m.group(4) +return (page_name, site) + + def get_source_page(source, harvest_type=None): """ Retrieve the wikipage and site from the source field. 
@@ -43,8 +56,11 @@ site = None page_name = None if harvest_type == 'sparql': -site = ('wikidata', 'www') -page_name = source.split('/')[-1] +try: +return get_page_from_url(source) +except AttributeError: +raise ValueError( +u'Could not find source list ({0})'.format(source)) else: supported_sites = ['wikipedia', 'wikivoyage', 'wikidata', 'wikimedia'] pattern = '\/\/(.+?)\.({0})\.org\/w\/index\.php\?title=(.+?)&'.format( diff --git a/erfgoedbot/template/wikidata_query.sparql b/erfgoedbot/template/wikidata_query.sparql index 6a3878d..266e6ea 100644 --- a/erfgoedbot/template/wikidata_query.sparql +++ b/erfgoedbot/template/wikidata_query.sparql @@ -1,9 +1,11 @@ # MonumentsDB harvest -SELECT DISTINCT ?item ?itemLabel ?id ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { +SELECT DISTINCT ?item ?itemLabel ?id ?monument_article ?admin ?adminLabel ?image ?commonscat ?address ?coordinate WHERE { # Make it properties and filter out end time %(select_statement)s . + OPTIONAL { ?monument_article schema:about ?item; + schema:isPartOf <https://%(lang)s.%(project)s.org/> ; } . OPTIONAL { ?item wdt:P131 ?admin } . OPTIONAL { ?item wdt:P18 ?image } . OPTIONAL { ?item wdt:P373 ?commonscat } . 
diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py index 23e4670..7d86e93 100755 --- a/erfgoedbot/update_database.py +++ b/erfgoedbot/update_database.py @@ -21,6 +21,7 @@ from pywikibot import pagegenerators import monuments_config as mconfig +import common as common from converters import ( extractWikilink, extract_elements_from_template_param, @@ -257,6 +258,9 @@ if params['adminLabel']: params['admin'] = params['adminLabel'].value +if params['monument_article']: +params['monument_article'], _site = common.get_page_from_url(params['monument_article'].value) + params['source'] = params['item'].value params['wd_item'] = params['item'].getID() @@ -473,7 +477,8 @@ sparql_query = sparql_template % dict( select_statement=sparql_select, -lang=countryconfig.get('lang') +lang=countryconfig.get('lang'), +project=countryconfig.get('project') ) # print sparql_query sq = pywikibot.data.sparql.SparqlQuery() diff --git a/tests/test_common.py b/tests/test_common.py index 126eeb7..2f016c0 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -29,6 +29,24 @@ self.assertEquals(result, ('Q123', ('wikidata', 'www'))) +class TestGetPageFromUrl(unittest.TestCase): + +def test_get_page_from_url_entity(self): +source = 'http://www.wikidata.org/entity/Q123' +result = common.get_page_from_url(source) +self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + +def test_get_page_from_url_page(self): +source = 'http://www.wikidata.org/wiki/Q123' +result = common.get_page_from_url(source) +self.assertEquals(result, ('Q123', ('wikidata', 'www'))) + +def test_get_page_from_url_wikipedia(self): +source = 'http://en.wikipedia.org/entity/foo' +result = common.get_page_from_url(source) +self.assertEquals(result, ('foo', ('wikipedia', 'en'))) + + class TestGetSourceLink(unittest.TestCase): def setUp(self): -- To view, visit https://gerrit.wikimedia.org/r/370775 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange 
Gerrit-Change-Id: I43b33b24e94e4ba40ed16add53c72a6fd6e967a5 Gerrit-PatchSet: 1 Gerrit-Project: labs/tools/heritage