Dachary has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/311841

Change subject: bot: handle ambiguous names in search_entity
......................................................................

bot: handle ambiguous names in search_entity

In some cases there is no other way but to hardcode the item id because
the labels are ambiguous and there is no sure way to figure out which
one is relevant in the context of the FLOSS project.

When there are two items and one of them is a disambiguation page, just
ignore the disambiguation page.

Change-Id: I14b895a6a67dcb7da6246c489d10f1b77e1d5777
Signed-off-by: Loic Dachary <l...@dachary.org>
---
M FLOSSbot/bot.py
M tests/test_bot.py
2 files changed, 87 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/bots/FLOSSbot refs/changes/41/311841/1

diff --git a/FLOSSbot/bot.py b/FLOSSbot/bot.py
index 6381027..66eff53 100644
--- a/FLOSSbot/bot.py
+++ b/FLOSSbot/bot.py
@@ -95,16 +95,56 @@
             self.entities[type][name] = found
         return found
 
+    #
+    # Hardcode the desired wikidata item when there are
+    # multiple items with the same English label and no
+    # trivial way to disambiguate them.
+    #
+    authoritative = {
+        'wikidata': {
+            'git': 'Q186055',
+            'Fossil': 'Q1439431',
+        },
+        'test': {
+        },
+    }
+
     def search_entity(self, site, name, **kwargs):
-        found = None
+        if name in Bot.authoritative[site.code]:
+            candidate = pywikibot.ItemPage(
+                site, Bot.authoritative[site.code][name], 0)
+            if candidate.get()['labels']['en'] == name:
+                return candidate
+        candidates = []
         for p in site.search_entities(name, 'en', **kwargs):
-            if p['label'] == name:
+            # log.debug("looking for entity " + name + ", found " + str(p))
+            if p.get('label') == name:
                 if kwargs['type'] == 'property':
-                    found = p
+                    candidates.append(p)
                 else:
-                    found = pywikibot.ItemPage(site, p['id'], 0)
-                break
-        return found
+                    candidates.append(pywikibot.ItemPage(site, p['id'], 0))
+        if len(candidates) == 0:
+            return None
+        elif len(candidates) > 1 and kwargs['type'] == 'item':
+            found = []
+            for candidate in candidates:
+                item = candidate.get()
+                ok = True
+                for instance_of in item['claims'].get(self.P_instance_of, []):
+                    if (instance_of.getTarget() ==
+                            self.Q_Wikimedia_disambiguation_page):
+                        log.debug("ignore disambiguation page " +
+                                  candidate.getID() + " for " + name)
+                        ok = False
+                        break
+                if ok:
+                    found.append(candidate)
+            if len(found) != 1:
+                raise ValueError("found multiple items for " + name +
+                                 " " + str(found))
+            return found[0]
+        else:
+            return candidates[0]
 
     lookup_item = lookup_entity
 
diff --git a/tests/test_bot.py b/tests/test_bot.py
index a9dbba9..d28537e 100644
--- a/tests/test_bot.py
+++ b/tests/test_bot.py
@@ -19,6 +19,7 @@
 import logging
 from datetime import date
 
+import pytest
 import pywikibot
 
 from FLOSSbot.bot import Bot
@@ -105,3 +106,43 @@
         bot.set_retrieved(item, claim, date(1965, 11, 2))
         assert bot.need_verification(claim) is True
         bot.clear_entity_label(item.getID())
+
+    def test_search_entity(self):
+        bot = Bot(argparse.Namespace(
+            test=True,
+            user='FLOSSbotCI',
+        ))
+        name = TestWikidata.random_name()
+        entity = {
+            "labels": {
+                "en": {
+                    "language": "en",
+                    "value": name,
+                }
+            },
+        }
+        first = bot.site.editEntity({'new': 'item'}, entity)
+        first = pywikibot.ItemPage(bot.site, first['entity']['id'], 0)
+        second = bot.site.editEntity({'new': 'item'}, entity)
+        second = pywikibot.ItemPage(bot.site, second['entity']['id'], 0)
+
+        with pytest.raises(ValueError) as e:
+            bot.search_entity(bot.site, name, type='item')
+        assert "found multiple items" in str(e.value)
+
+        claim = pywikibot.Claim(bot.site, bot.P_instance_of, 0)
+        claim.setTarget(bot.Q_Wikimedia_disambiguation_page)
+        first.addClaim(claim)
+
+        found = bot.search_entity(bot.site, name, type='item')
+        assert found.getID() == second.getID()
+
+        bot.site.editEntity({'new': 'item'}, entity)
+
+        with pytest.raises(ValueError) as e:
+            bot.search_entity(bot.site, name, type='item')
+        assert "found multiple items" in str(e.value)
+
+        Bot.authoritative['test'][name] = second.getID()
+        found = bot.search_entity(bot.site, name, type='item')
+        assert found.getID() == second.getID()

-- 
To view, visit https://gerrit.wikimedia.org/r/311841
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I14b895a6a67dcb7da6246c489d10f1b77e1d5777
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/bots/FLOSSbot
Gerrit-Branch: master
Gerrit-Owner: Dachary <l...@dachary.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to