jenkins-bot has submitted this change and it was merged. (https://gerrit.wikimedia.org/r/372064)

Change subject: [IMPR] Exclude links in disambiguation templates from possibilities
......................................................................


[IMPR] Exclude links in disambiguation templates from possibilities

Bug: T118719
Change-Id: I7f6762f9b7ebda4bbf04598fa15ba4f82266c9c6
---
M scripts/solve_disambiguation.py
A tests/solve_disambiguation_tests.py
2 files changed, 130 insertions(+), 7 deletions(-)

Approvals:
  jenkins-bot: Verified
  Xqt: Looks good to me, approved



diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index 54efe23..7169f6d 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -71,7 +71,7 @@
 # (C) Daniel Herding, 2004
 # (C) Andre Engels, 2003-2004
 # (C) WikiWichtel, 2004
-# (C) Pywikibot team, 2003-2017
+# (C) Pywikibot team, 2003-2018
 #
 # Distributed under the terms of the MIT license.
 #
@@ -85,7 +85,7 @@
 import pywikibot
 from pywikibot import editor as editarticle
 from pywikibot.tools import first_lower, first_upper as firstcap
-from pywikibot import pagegenerators, config, i18n
+from pywikibot import pagegenerators, config, i18n, textlib
 from pywikibot.bot import (
     Bot, QuitKeyboardInterrupt,
     StandardOption, HighlightContextOption, ListOption, OutputProxyOption,
@@ -96,6 +96,53 @@
 dn_template = {
     'en': u'{{dn}}',
     'fr': u'{{Lien vers un homonyme}}',
+}
+
+# Regexes of disambiguation template titles to exclude links from
+disamb_templates = {
+    'wikipedia': {
+        'bs': [r'[Čč]vor', r'[Dd]isambig'],
+        'cs': [r'[Rr]ozcestník', r'[Rr]ozcestník[ _]-[ _][^\}]+'],
+        'en': [r'[Dd]isambig-plants', r'[Dd]isambig(uation)?',
+               r'[Dd]isambiguation[ _]cleanup', r'[Gg]eodis',
+               r'[Hh]ndis-cleanup',
+               r'[Ll]etter-Number[ _]Combination[ _]Disambiguation',
+               r'[Mm]il-unit-dis', r'[Nn]umberdis', r'.+?[ _]disambiguation'],
+        'haw': [r'[Hh]uaʻōlelo[ _]puana[ _]like'],
+        'hr': [r'[Rr]azdvojba', r'[Dd]isambig'],
+        'no': [r'[Pp]eker', r'[Ee]tternavn', r'[Dd]isambig',
+               r'[Tt]obokstavsforkortelse', r'[Tt]rebokstavsforkortelse',
+               r'[Ff]lertydig', r'[Pp]ekerside'],
+        'nov': [r'[Dd]esambig'],
+        'qr': [r"[Ss]ut'ichana[ _]qillqa", r'[Dd]isambig', r'SJM'],
+        'rmy': [r'[Dd]udalipen'],
+        'sk': [r'[Dd]isambig', r'[Rr]ozlišovacia[ _]stránka',
+               r'[Dd]isambiguation'],
+        'sr': [r'[Dd]isambig(uation)?', r'ВЗО', r'[Вв]зо', r'[Вв]ишезначна',
+               r'[Вв]ишезначна[ _]одредница', r'[Вв]ишезначност',
+               r'[Vv]išeznačna[ _]odrednica-lat'],
+        'tg': [r'Ибҳомзудоӣ', r'[Dd]isambig', r'Рафъи[ _]ибҳом',
+               r'[Dd]isambiguation'],
+        'tr': [r'[Aa]nlam[ _]ayrım', r'[Dd]isambig', r'[Aa]nlam[ _]ayrımı',
+               r'[Kk]işi[ _]adları[ _]\(anlam[ _]ayrımı\)',
+               r'[Yy]erleşim[ _]yerleri[ _]\(anlam[ _]ayrımı\)',
+               r'[Kk]ısaltmalar[ _]\(anlam[ _]ayrımı\)',
+               r'[Cc]oğrafya[ _]\(anlam[ _]ayrımı\)',
+               r'[Yy]erleşim[ _]yerleri[ _]\(anlam[ _]ayrımı\)',
+               r'[Ss]ayılar[ _]\(anlam[ _]ayrımı\)',
+               r"ABD'deki[ _]iller[ _]\(anlam[ _]ayrımı\)"],
+        'wo': [r'[Bb]okktekki'],
+        'yi': [r'באדייטען'],
+        'zea': [r'[Dd]p', r'[Dd]eurverwiespagina'],
+        'zh-classical': [r'釋義', r'消歧義', r'[Dd]isambig'],
+    },
+    'loveto': {
+        '1911': [r'[Dd]isamb'],
+    },
+    'wowwiki': {
+        'en': [r'[Dd]isambig', r'[Dd]isambig\/quest', r'[Dd]isambig\/quest2',
+               r'[Dd]isambig\/achievement2'],
+    },
 }
 
 # disambiguation page name format for "primary topic" disambiguations
@@ -989,6 +1036,35 @@
                     pywikibot.output(u'Page not saved: %s' % error.args)
         return 'done'
 
+    def get_disambiguation_links(self, disambPage):
+        """Get links from disambPage excluding links from disamb_templates.
+
+        @param disambPage: the disambiguation page
+        @type disambPage: pywikibot.Page
+        @return: list of processed links
+        @rtype: list of str
+
+        """
+        site_disamb_templates = i18n.translate(self.site, disamb_templates)
+        if site_disamb_templates:
+            exceptions = ['nowiki', 'comment', 'category', 'file', 'interwiki']
+            stripped_text = disambPage.text
+            exc_regexes = textlib._get_regexes(exceptions, self.site)
+            for exc in exc_regexes:
+                stripped_text = exc.sub(r'', stripped_text)
+            for template in site_disamb_templates:
+                template_regex = re.compile(
+                    r'\{\{ *(?:' + r':|'.join(self.site.namespaces[10]) +
+                    r':)?' + template + r'\s*(\|[^\}]*)?\}\}'
+                )
+                stripped_text = template_regex.sub(r'', stripped_text)
+            disambPage.text = stripped_text
+            full_text = disambPage.expand_text()
+            links = re.findall(r'\[\[([^\]\|]+)(?:\|[^\]]*|)\]\]', full_text)
+        else:
+            links = disambPage.linkedPages()
+        return links
+
     def findAlternatives(self, disambPage):
         """Extend self.alternatives using correctcap of disambPage.linkedPages.
 
@@ -1013,12 +1089,12 @@
                 try:
                     disambPage2 = pywikibot.Page(
                         pywikibot.Link(disambTitle, self.mysite))
-                    links = disambPage2.linkedPages()
+                    links = self.get_disambiguation_links(disambPage2)
                     links = [correctcap(l, disambPage2.get()) for l in links]
                 except pywikibot.NoPage:
                     pywikibot.output(u"No page at %s, using redirect target."
                                      % disambTitle)
-                    links = disambPage.linkedPages()[:1]
+                    links = self.get_disambiguation_links(disambPage)[:1]
                     links = [correctcap(l, disambPage.get(get_redirect=True))
                              for l in links]
                 self.alternatives += links
@@ -1049,19 +1125,19 @@
                                 primary_topic_format[self.mylang]
                                 % disambPage.title(),
                                 self.mysite))
-                        links = disambPage2.linkedPages()
+                        links = self.get_disambiguation_links(disambPage2)
                         links = [correctcap(l, disambPage2.get())
                                  for l in links]
                     except pywikibot.NoPage:
                         pywikibot.output(
                             'Page does not exist; using first link in page %s.'
                             % disambPage.title())
-                        links = disambPage.linkedPages()[:1]
+                        links = self.get_disambiguation_links(disambPage)[:1]
                         links = [correctcap(l, disambPage.get())
                                  for l in links]
                 else:
                     try:
-                        links = disambPage.linkedPages()
+                        links = self.get_disambiguation_links(disambPage)
                         links = [correctcap(l, disambPage.get())
                                  for l in links]
                     except pywikibot.NoPage:
diff --git a/tests/solve_disambiguation_tests.py b/tests/solve_disambiguation_tests.py
new file mode 100644
index 0000000..9c86bb3
--- /dev/null
+++ b/tests/solve_disambiguation_tests.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""Test solve_disambiguation bot module."""
+#
+# (C) Pywikibot team, 2018
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import absolute_import, unicode_literals
+
+import pywikibot
+
+from scripts.solve_disambiguation import DisambiguationRobot
+
+from tests.aspects import TestCase, unittest
+
+
+class TestGettingDisambigLinks(TestCase):
+    """Test getting disambiguation links."""
+
+    family = 'wikipedia'
+    code = 'en'
+
+    def test_get(self):
+        """Test getting disambiguation links."""
+        page = pywikibot.Page(self.site, 'foo')
+        bot = DisambiguationRobot(None, [], True, False, None, False, False,
+                                  minimum=0)
+        page.text = '* [[Link1]]\n* [[Link2]]'
+        newlinks = bot.get_disambiguation_links(page)
+        links = [
+            pywikibot.Link(self.site, 'Link1'),
+            pywikibot.Link(self.site, 'Link2')]
+        self.assertEqual(newlinks, links)
+
+    def test_get_without_templates(self):
+        """Test excluding links from disamb_templates."""
+        page = pywikibot.Page(self.site, 'foo')
+        bot = DisambiguationRobot(None, [], True, False, None, False, False,
+                                  minimum=0)
+        page.text = '* [[Link1]]\n{{Disambig}}'
+        newlinks = bot.get_disambiguation_links(page)
+        links = [pywikibot.Link(self.site, 'Link1')]
+        self.assertEqual(newlinks, links)
+
+
+if __name__ == '__main__':
+    unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/372064
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7f6762f9b7ebda4bbf04598fa15ba4f82266c9c6
Gerrit-PatchSet: 23
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dvorapa <dvor...@seznam.cz>
Gerrit-Reviewer: Dalba <dalba.w...@gmail.com>
Gerrit-Reviewer: Dvorapa <dvor...@seznam.cz>
Gerrit-Reviewer: Framawiki <framaw...@tools.wmflabs.org>
Gerrit-Reviewer: JAn Dudík <jan.du...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Magul <tomasz.magul...@gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchane...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: Xqt <i...@gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
Pywikibot-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to