jenkins-bot has submitted this change and it was merged.
Change subject: Make categorisation project aware
......................................................................
Make categorisation project aware
Also:
* Fixed an issue where non-category sitelinks on wikidata
could be added as a category.
* Added a test for the wikidata method
* Removed unused method and variable.
Not done:
* Could not successfully mock NoPage triggers in the wikidata
method. Attempt left commented out.
Bug:T111618
Change-Id: Ie1804ec682b284fcbe6cf154b3455992d9112fd5
---
M erfgoedbot/categorize_images.py
M tests/test_categorization.py
2 files changed, 141 insertions(+), 72 deletions(-)
Approvals:
Jean-Frédéric: Looks good to me, approved
jenkins-bot: Verified
diff --git a/erfgoedbot/categorize_images.py b/erfgoedbot/categorize_images.py
index 47ca122..95f75e1 100644
--- a/erfgoedbot/categorize_images.py
+++ b/erfgoedbot/categorize_images.py
@@ -41,7 +41,7 @@
pass
# Contains the commonscat templates for most Wikipedia's (taken from
ex-commonscat.py)
-commonscatTemplates = {
+wikipedia_commonscat_templates = {
'_default': (u'Commonscat', []),
'af': (u'CommonsKategorie', [u'commonscat']),
'an': (u'Commonscat', [u'Commons cat']),
@@ -269,7 +269,7 @@
def get_new_categories(monumentId, monData, lang, commonsCatTemplates):
(monumentName, monumentCommonscat,
- monumentArticleTitle, monumentSource) = monData
+ monumentArticleTitle, monumentSource, project) = monData
commons_site = pywikibot.Site(u'commons', u'commons')
newcats = []
# First try to add a category based on the commonscat field in the list
@@ -285,12 +285,13 @@
except pywikibot.exceptions.InvalidTitle:
pywikibot.warning(
u'Incorrect category title %s' % (monumentCommonscat,))
+
# Option two is to use the article about the monument and see if it has
# Commonscat links
if not newcats:
monumentArticle = None
if monumentArticleTitle:
- project_site = pywikibot.Site(lang, u'wikipedia')
+ project_site = pywikibot.Site(lang, project)
monumentArticle = pywikibot.Page(project_site,
monumentArticleTitle)
if monumentArticle:
try:
@@ -312,9 +313,10 @@
except pywikibot.exceptions.Error as e:
pywikibot.error(u'Error occured with monument %s: %s' % (
monumentId, str(e)))
+
# Option three is to see if the list contains Commonscat links (whole list)
if not newcats:
- monumentList = getList(lang, monumentSource)
+ monumentList = getList(lang, project, monumentSource)
# print monumentList
if not monumentList:
return False
@@ -378,10 +380,11 @@
def getMonData(countrycode, lang, monumentId, conn, cursor):
- '''
- Get monument name and source from db
- '''
- query = u"""SELECT `name`, `commonscat`, `monument_article`, `source` FROM
monuments_all WHERE (country=%s AND lang=%s AND id=%s) LIMIT 1"""
+ """Get monument name and source from db."""
+ query = u"SELECT `name`, `commonscat`, `monument_article`, `source`,
`project` " \
+ u"FROM monuments_all " \
+ u"WHERE (country=%s AND lang=%s AND id=%s) " \
+ u"LIMIT 1"
cursor.execute(query, (countrycode, lang, monumentId))
@@ -393,38 +396,18 @@
return False
-def getArticle(lang, monumentName):
- '''
- Get monument article page from wikilink at monumentName
- '''
-
- if monumentName:
- regex = u'^\[\[(.+?)(\||\])'
-
- match = re.search(regex, monumentName)
- if not match:
- return False
-
- page_title = match.group(1)
- site = pywikibot.Site(lang, u'wikipedia')
-
- return pywikibot.Page(site, page_title)
- else:
- return False
-
-
-def getList(lang, monumentSource):
+def getList(lang, project, monumentSource):
'''
Get listpage
'''
if monumentSource:
- regex = u'^(https:)?//%s.wikipedia.org/w/index.php\?title=(.+?)&' % (
- lang,)
+ regex = u'^(https:)?//%s.%s.org/w/index.php\?title=(.+?)&' % (
+ lang, project)
match = re.search(regex, monumentSource)
if not match:
return False
page_title = match.group(2)
- site = pywikibot.Site(lang, u'wikipedia')
+ site = pywikibot.Site(lang, project)
return pywikibot.Page(site, page_title)
else:
return False
@@ -504,12 +487,13 @@
def get_Commons_category_via_Wikidata(page):
- '''
+ """
Get Commons Category from the linked Wikidata item and P373.
- Raises: NoCommonsCatFromWikidataItemException if either there is no linked
item
- or it does not bear P373 or a sitelink to Commons
- '''
+ Raises: NoCommonsCatFromWikidataItemException if either there is no linked
+ item or it does not bear P373 or a sitelink to a category page on
+ Commons.
+ """
try:
data_item = page.data_item()
claims = data_item.get()['claims']
@@ -517,8 +501,12 @@
return 'Category:' + claims['P373'][0].getTarget()
else:
commons_site = pywikibot.Site(u'commons', u'commons')
- return data_item.getSitelink(commons_site)
- except (pywikibot.NoPage, KeyError):
+ commons_page = data_item.getSitelink(commons_site)
+ if commons_page.startswith('Category:'):
+ return commons_page
+ else:
+ raise NoCommonsCatFromWikidataItemException(page)
+ except (pywikibot.NoPage, KeyError, NoCommonsCatFromWikidataItemException):
raise NoCommonsCatFromWikidataItemException(page)
@@ -544,7 +532,6 @@
site = pywikibot.Site(u'commons', u'commons')
generator = None
- genFactory = pagegenerators.GeneratorFactory()
commonsTemplate = countryconfig.get('commonsTemplate')
if overridecat:
@@ -614,16 +601,20 @@
page.put(newtext=output, comment=comment)
-def getCommonscatTemplates(lang=None):
- '''Get the template name in a language. Expects the language code.
- Return as tuple containing the primary template and it's alternatives
+def getCommonscatTemplates(lang=None, project=None):
+ """
+ Get the template name in a language on a project.
- '''
+ Expects the language code and project.
+ Return as tuple containing the primary template and it's alternatives
+ """
+ project = project or u'wikipedia' # default to wikipedia
+
result = []
- if lang in commonscatTemplates:
- (prim, backups) = commonscatTemplates[lang]
+ if project == u'wikipedia' and lang in wikipedia_commonscat_templates:
+ (prim, backups) = wikipedia_commonscat_templates[lang]
else:
- (prim, backups) = commonscatTemplates[u'_default']
+ (prim, backups) = wikipedia_commonscat_templates[u'_default']
result.append(prim)
result = result + backups
return result
@@ -654,16 +645,19 @@
return False
pywikibot.log(
u'Working on countrycode "%s" in language "%s"' % (countrycode,
lang))
- commonsCatTemplates = getCommonscatTemplates(lang)
+ countryconfig = mconfig.countries.get((countrycode, lang))
+ commonsCatTemplates = getCommonscatTemplates(
+ lang, countryconfig.get('project'))
# print commonsCatTemplates
- processCountry(countrycode, lang, mconfig.countries.get(
- (countrycode, lang)), commonsCatTemplates, conn, cursor,
overridecat=overridecat)
+ processCountry(countrycode, lang, countryconfig, commonsCatTemplates,
+ conn, cursor, overridecat=overridecat)
else:
statistics = []
for (countrycode, lang), countryconfig in
mconfig.countries.iteritems():
pywikibot.log(
u'Working on countrycode "%s" in language "%s"' %
(countrycode, lang))
- commonsCatTemplates = getCommonscatTemplates(lang)
+ commonsCatTemplates = getCommonscatTemplates(
+ lang, countryconfig.get('project'))
result = processCountry(
countrycode, lang, countryconfig, commonsCatTemplates, conn,
cursor)
if result:
diff --git a/tests/test_categorization.py b/tests/test_categorization.py
index e9eb67e..93e4d1e 100644
--- a/tests/test_categorization.py
+++ b/tests/test_categorization.py
@@ -1,12 +1,10 @@
+"""Unit tests for categorize_images."""
+# from pywikibot.site import APISite
+# from pywikibot.exceptions import NoPage
import unittest
-from mock import Mock
-from erfgoedbot.categorize_images import \
- replace_default_cat_with_new_categories_in_image_text, \
- filter_out_categories_to_add, \
- remove_base_category_from_categories_to_add_if_present, \
- deduplicate_categories, \
- NoCategoryToAddException
+import mock
+from erfgoedbot import categorize_images
class TestReplaceCategories(unittest.TestCase):
@@ -14,15 +12,16 @@
"""Test the replace_default_cat_with_new_categories_in_image_text
method."""
def setUp(self):
- self.mock_old_category = Mock()
+ self.mock_old_category = mock.Mock()
self.mock_old_category.title.return_value = "A"
- self.mock_old_category.__str__ = Mock()
+ self.mock_old_category.__str__ = mock.Mock()
self.mock_old_category.__str__.return_value = 'A'
def
test_replace_categories_with_no_category_present_should_add_new_category(self):
old_text = ""
new_categories = ["B"]
- new_text =
replace_default_cat_with_new_categories_in_image_text(old_text,
self.mock_old_category, new_categories)
+ new_text =
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+ old_text, self.mock_old_category, new_categories)
expected_text = u'[[Category:B]]'
self.assertEquals(new_text, expected_text)
@@ -31,34 +30,36 @@
[[Category:A]]
"""
new_categories = ["B"]
- new_text =
replace_default_cat_with_new_categories_in_image_text(old_text,
self.mock_old_category, new_categories)
+ new_text =
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+ old_text, self.mock_old_category, new_categories)
expected_text = u'[[Category:B]]'
self.assertEquals(new_text, expected_text)
def
test_replace_categories_with_no_categories_to_add_raise_Exception(self):
old_text = ""
new_categories = []
- with self.assertRaises(NoCategoryToAddException):
- new_text =
replace_default_cat_with_new_categories_in_image_text(old_text,
self.mock_old_category, new_categories)
+ with self.assertRaises(categorize_images.NoCategoryToAddException):
+
categorize_images.replace_default_cat_with_new_categories_in_image_text(
+ old_text, self.mock_old_category, new_categories)
class TestDeduplicateCategories(unittest.TestCase):
def test_deduplicate_categories(self):
new_categories = ['A', 'B', 'A']
- result = deduplicate_categories(new_categories)
+ result = categorize_images.deduplicate_categories(new_categories)
expected = ['A', 'B']
self.assertItemsEqual(result, expected)
def test_deduplicate_categories_nothing_to_deduplicate(self):
new_categories = ['A', 'B']
- result = deduplicate_categories(new_categories)
+ result = categorize_images.deduplicate_categories(new_categories)
expected = ['A', 'B']
self.assertItemsEqual(result, expected)
def test_deduplicate_categories_no_categories(self):
new_categories = []
- result = deduplicate_categories(new_categories)
+ result = categorize_images.deduplicate_categories(new_categories)
expected = []
self.assertItemsEqual(result, expected)
@@ -66,11 +67,13 @@
class TestRemoveBaseCategory(unittest.TestCase):
def test_remove_category_with_category_present(self):
- result = remove_base_category_from_categories_to_add_if_present(['A',
'B'], 'A')
+ result =
categorize_images.remove_base_category_from_categories_to_add_if_present(
+ ['A', 'B'], 'A')
self.assertItemsEqual(result, ['B'])
def test_remove_category_without_category_present(self):
- result = remove_base_category_from_categories_to_add_if_present(['B',
'C'], 'A')
+ result =
categorize_images.remove_base_category_from_categories_to_add_if_present(
+ ['B', 'C'], 'A')
self.assertItemsEqual(result, ['B', 'C'])
@@ -79,23 +82,95 @@
def test_filter_categories_all_present(self):
new_categories = ['A', 'B']
current_categories = ['A', 'B', 'C']
- result = filter_out_categories_to_add(new_categories,
current_categories)
+ result = categorize_images.filter_out_categories_to_add(
+ new_categories, current_categories)
self.assertEquals(result, [])
def test_filter_out_some_categories_present(self):
new_categories = ['A', 'B', 'C']
current_categories = ['A', 'B']
- result = filter_out_categories_to_add(new_categories,
current_categories)
+ result = categorize_images.filter_out_categories_to_add(
+ new_categories, current_categories)
self.assertEquals(result, ['C'])
def test_filter_out_with_no_categories_among_present(self):
new_categories = ['A', 'B', 'C']
current_categories = ['D', 'E']
- result = filter_out_categories_to_add(new_categories,
current_categories)
+ result = categorize_images.filter_out_categories_to_add(
+ new_categories, current_categories)
self.assertItemsEqual(result, ['A', 'B', 'C'])
def test_filter_out_with_no_categories_present(self):
new_categories = ['A', 'B', 'C']
current_categories = []
- result = filter_out_categories_to_add(new_categories,
current_categories)
+ result = categorize_images.filter_out_categories_to_add(
+ new_categories, current_categories)
self.assertItemsEqual(result, ['A', 'B', 'C'])
+
+
+class TestGetCommonsCategoryViaWikidata(unittest.TestCase):
+
+ """Tests the get_Commons_category_via_Wikidata method."""
+
+ def setUp(self):
+ self.mock_page = mock.create_autospec(
+ categorize_images.pywikibot.Page)
+ self.mock_data_item = mock.create_autospec(
+ categorize_images.pywikibot.ItemPage)
+ self.mock_claim = mock.create_autospec(
+ categorize_images.pywikibot.Claim)
+
+ self.mock_claim.getTarget.return_value = u'A category'
+ self.mock_data_item.get.return_value = {
+ u'claims': {},
+ u'sitelinks': {},
+ }
+ self.claims = {u'P373': [self.mock_claim, ]}
+ self.page_sitelink = u'Some page'
+ self.category_sitelink = u'Category:Some category'
+
+ # def test_get_Commons_category_via_Wikidata_no_data_item(self):
+ # self.mock_page.data_item.side_effect = NoPage
+ # with
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+ #
categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+
+ # def test_get_Commons_category_via_Wikidata_no_claim_or_sitelink(self):
+ # expected_site = APISite("commons", "commons")
+ # self.mock_page.data_item.return_value = self.mock_data_item
+ # self.mock_data_item.getSitelink.side_effect = NoPage
+ # with
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+ #
categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+ # self.mock_data_item.getSitelink.assert_called_once_with(
+ # expected_site)
+
+ def test_get_Commons_category_via_Wikidata_with_claim(self):
+ self.mock_page.data_item.return_value = self.mock_data_item
+ self.mock_data_item.get.return_value[u'claims'] = self.claims
+ expected = u'Category:A category'
+ self.assertEquals(
+
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+ expected)
+
+ def test_get_Commons_category_via_Wikidata_with_page_sitelink(self):
+ self.mock_page.data_item.return_value = self.mock_data_item
+ self.mock_data_item.getSitelink.return_value = self.page_sitelink
+ with
self.assertRaises(categorize_images.NoCommonsCatFromWikidataItemException):
+ categorize_images.get_Commons_category_via_Wikidata(self.mock_page)
+
+ def test_get_Commons_category_via_Wikidata_with_category_sitelink(self):
+ self.mock_page.data_item.return_value = self.mock_data_item
+ self.mock_data_item.getSitelink.return_value = self.category_sitelink
+ expected = u'Category:Some category'
+ self.assertEquals(
+
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+ expected)
+
+ def
test_get_Commons_category_via_Wikidata_with_claim_and_category_sitelink(self):
+ """Ensure claim value is chosen over sitelink value."""
+ self.mock_page.data_item.return_value = self.mock_data_item
+ self.mock_data_item.get.return_value[u'claims'] = self.claims
+ self.mock_data_item.getSitelink.return_value = self.category_sitelink
+ expected = u'Category:A category'
+ self.assertEquals(
+
categorize_images.get_Commons_category_via_Wikidata(self.mock_page),
+ expected)
--
To view, visit https://gerrit.wikimedia.org/r/299273
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie1804ec682b284fcbe6cf154b3455992d9112fd5
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Jean-Frédéric <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits