Dalba has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/268938

Change subject: solve_disambiguation.py: Also work with redirects of disambiguation pages
......................................................................

solve_disambiguation.py: Also work with redirects of disambiguation pages

To make this happen:
- ReferringPageGeneratorWithIgnore should follow redirects of disambPage.
- The `treat` function should loop over disambPage and its redirects.

While doing the above:
- The `treat` function is too long; break it into two new
    functions: `include` and `inner_treat`.
- Add docstrings for `correctcap`, `treat` and `findAlternatives`.

Bug: T118777
Change-Id: I5918133623f878ad5cbc9bfcc697ba497580f7b3
---
M scripts/solve_disambiguation.py
1 file changed, 269 insertions(+), 208 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/38/268938/1

diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index 5a87738..905e2a3 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -347,8 +347,15 @@
 
 
 def correctcap(link, text):
-    # If text links to a page with title link uncapitalized, uncapitalize link,
-    # otherwise capitalize it
+    """Return the link capitalized/uncapitalized according to the text.
+
+    @param link: link page.
+    @type link: pywikibot.Page
+    @return: uncapitalized link-title if text links to link with an
+        uncapitalized title, otherwise capitalizes link-title.
+    @rtype: str
+
+    """
     linkupper = link.title()
     linklower = first_lower(linkupper)
     if "[[%s]]" % linklower in text or "[[%s|" % linklower in text:
@@ -371,13 +378,13 @@
 
     def __iter__(self):
         # TODO: start yielding before all referring pages have been found
-        refs = [
+        refs = list(set(
             page for page in self.disambPage.getReferences(
-                follow_redirects=False,
+                follow_redirects=True,
                 withTemplateInclusion=False,
                 namespaces=0 if self.main_only else None
             )
-        ]
+        ))
         pywikibot.output(u"Found %d references." % len(refs))
         # Remove ignorables
         if self.disambPage.site.family.name in ignore_title and \
@@ -627,23 +634,47 @@
                                 flags=re.X)
 
     def treat(self, refPage, disambPage):
-        """
-        Treat a page.
+        """Treat a page.
 
-        Parameters:
-            disambPage - The disambiguation page or redirect we don't want
-                anything to link to
-            refPage - A page linking to disambPage
-        Returns False if the user pressed q to completely quit the program.
-        Otherwise, returns True.
+        @param disambPage: the disambiguation page or redirect we don't want
+            anything to link to
+        @type disambPage: pywikibot.Page
+        @param refPage: A page linking to disambPage
+        @type refPage: pywikibot.Page
+        @return: False if the user pressed q to completely quit the program.
+            Otherwise, returns True.
+        @rtype: bool
 
         """
-        # TODO: break this function up into subroutines!
+        #import pdb; pdb.set_trace()
+        inner_treat = self.inner_treat(refPage, disambPage)
+        if inner_treat is None:
+            # Go to next page.
+            return True
+        elif inner_treat is False:
+            return False
+        for disamb_redirect in disambPage.getReferences(redirectsOnly=True):
+            inner_treat = self.inner_treat(refPage, disamb_redirect)
+            if inner_treat is None:
+                return True
+            elif inner_treat is False:
+                return False
+        return True
 
-        self.current_page = refPage
-        include = False
-        unlink_counter = 0
-        new_targets = []
+    def include(self, refPage, disambPage):
+        """Return variable indicating if the page should be included or not.
+
+        @param disambPage: the disambiguation page or redirect we don't want
+            anything to link to
+        @type disambPage: pywikibot.Page
+        @param refPage: A page linking to disambPage
+        @type refPage: pywikibot.Page
+        @return: True if refPage does not contain any of the ignore regexes.
+            "redirect" if refPage IsRedirectPage and user chooses to work
+            on it. False if refPage does not exist or in any other situation.
+        @rtype: bool or str
+
+        """
         try:
             text = refPage.get()
             ignoreReason = self.checkContents(text)
@@ -651,7 +682,7 @@
                 pywikibot.output('\n\nSkipping %s because it contains %s.\n\n'
                                  % (refPage.title(), ignoreReason))
             else:
-                include = True
+                return True, text
         except pywikibot.IsRedirectPage:
             pywikibot.output(u'%s is a redirect to %s'
                              % (refPage.title(), disambPage.title()))
@@ -683,211 +714,240 @@
                             break
                 elif choice == 'c':
                     text = refPage.get(get_redirect=True)
-                    include = "redirect"
+                    return "redirect", text
         except pywikibot.NoPage:
             pywikibot.output(
                 u'Page [[%s]] does not seem to exist?! Skipping.'
                 % refPage.title())
-            include = False
-        if include in (True, "redirect"):
-            # make a backup of the original text so we can show the changes later
-            original_text = text
-            n = 0
-            curpos = 0
-            dn = False
-            edited = False
-            # This loop will run until we have finished the current page
-            while True:
-                m = self.linkR.search(text, pos=curpos)
-                if not m:
-                    if n == 0:
-                        pywikibot.output(u"No changes necessary in %s"
-                                         % refPage.title())
-                        return True
-                    else:
-                        # stop loop and save page
-                        break
-                # Make sure that next time around we will not find this same hit.
-                curpos = m.start() + 1
-                try:
-                    foundlink = pywikibot.Link(m.group('title'),
-                                               disambPage.site)
-                    foundlink.parse()
-                except pywikibot.Error:
-                    continue
-                # ignore interwiki links
-                if foundlink.site != disambPage.site:
-                    continue
-                # Check whether the link found is to disambPage.
-                try:
-                    if foundlink.canonical_title() != disambPage.title():
-                        continue
-                except pywikibot.Error:
-                    # must be a broken link
-                    pywikibot.log(u"Invalid link [[%s]] in page [[%s]]"
-                                  % (m.group('title'), refPage.title()))
-                    continue
-                n += 1
-                # how many bytes should be displayed around the current link
-                context = 60
-                # check if there's a dn-template here already
-                if (self.dnSkip and self.dn_template_str and
-                        self.dn_template_str[:-2] in text[m.end():m.end() +
-                                                          len(self.dn_template_str) + 8]):
-                    continue
+            return False
 
-                edit = EditOption('edit page', 'e', text, m.start(), disambPage.title())
-                context_option = HighlightContextOption(
-                    'more context', 'm', text, 60, start=m.start(), end=m.end())
-                context_option.before_question = True
+    def inner_treat(self, refPage, disambPage):
+        """The treat function loops over this function.
 
-                options = [ListOption(self.alternatives, ''),
-                           ListOption(self.alternatives, 'r'),
-                           StandardOption('skip link', 's'),
-                           edit,
-                           StandardOption('next page', 'n'),
-                           StandardOption('unlink', 'u')]
-                if self.dn_template_str:
-                    # '?', '/' for old choice
-                    options += [AliasOption('tag template %s' % self.dn_template_str,
-                                            ['t', '?', '/'])]
-                options += [context_option]
-                if not edited:
-                    options += [ShowPageOption('show disambiguation page', 'd',
-                                               m.start(), disambPage)]
-                options += [
-                    OutputProxyOption('list', 'l',
-                                      SequenceOutputter(self.alternatives)),
-                    AddAlternativeOption('add new', 'a',
-                                         SequenceOutputter(self.alternatives))]
-                if edited:
-                    options += [StandardOption('save in this form', 'x')]
+        @param disambPage: The disambiguation page or redirect we don't want
+            anything to link to
+        @type disambPage: pywikibot.Page
+        @param refPage: A page linking to disambPage
+        @type refPage: pywikibot.Page
+        @return: False if the user pressed q to completely quit the program.
+            None if the user pressed n to skip this page.
+            Otherwise, returns True.
+        @rtype: bool
 
-                # TODO: Output context on each question
-                answer = pywikibot.input_choice('Option', options,
-                                                default=self.always)
-                if answer == 'x':
-                    assert edited, 'invalid option before editing'
-                    break
-                elif answer == 's':
-                    n -= 1  # TODO what's this for?
-                    continue
-                elif answer == 'e':
-                    text = edit.new_text
-                    edited = True
-                    curpos = 0
-                    continue
-                elif answer == 'n':
-                    # skip this page
-                    if self.primary:
-                        # If run with the -primary argument, skip this
-                        # occurrence next time.
-                        self.primaryIgnoreManager.ignore(refPage)
+        """
+        # TODO: break this function up into subroutines!
+        include, text = self.include(refPage, disambPage)
+        new_targets = []
+        unlink_counter = 0
+        if include not in (True, "redirect"):
+            return True
+        # Backup the original text so we can show the changes later.
+        original_text = text
+        n = 0
+        curpos = 0
+        dn = False
+        edited = False
+        # This loop will run until we have finished the current page
+        while True:
+            m = self.linkR.search(text, pos=curpos)
+            if not m:
+                if n == 0:
+                    # No changes necessary
                     return True
-
-                # The link looks like this:
-                # [[page_title|link_text]]trailing_chars
-                page_title = m.group('title')
-                link_text = m.group('label')
-
-                if not link_text:
-                    # or like this: [[page_title]]trailing_chars
-                    link_text = page_title
-                if m.group('section') is None:
-                    section = ''
                 else:
-                    section = m.group('section')
-                trailing_chars = m.group('linktrail')
-                if trailing_chars:
-                    link_text += trailing_chars
-                if answer == 't':
-                    assert self.dn_template_str
-                    # small chunk of text to search
-                    search_text = text[m.end():m.end() + context]
-                    # figure out where the link (and sentance) ends, put note
-                    # there
-                    end_of_word_match = re.search(r'\s', search_text)
-                    if end_of_word_match:
-                        position_split = end_of_word_match.start(0)
-                    else:
-                        position_split = 0
-                    # insert dab needed template
-                    text = (text[:m.end() + position_split] +
-                            self.dn_template_str +
-                            text[m.end() + position_split:])
-                    dn = True
+                    # stop loop and save page
+                    break
+            # Make sure that next time around we won't find this same hit.
+            curpos = m.start() + 1
+            try:
+                foundlink = pywikibot.Link(m.group('title'),
+                                           disambPage.site)
+                foundlink.parse()
+            except pywikibot.Error:
+                continue
+            # ignore interwiki links
+            if foundlink.site != disambPage.site:
+                continue
+            # Check whether the link found is to disambPage.
+            try:
+                if foundlink.canonical_title() != disambPage.title():
                     continue
-                elif answer == 'u':
-                    # unlink - we remove the section if there's any
-                    text = text[:m.start()] + link_text + text[m.end():]
-                    unlink_counter += 1
-                    continue
-                else:
-                    # Check that no option from above was missed
-                    assert isinstance(answer, tuple), 'only tuple answer left.'
-                    assert answer[0] in ['r', ''], 'only valid tuple answers.'
-                    if answer[0] == 'r':
-                        # we want to throw away the original link text
-                        replaceit = link_text == page_title
-                    elif include == "redirect":
-                        replaceit = True
-                    else:
-                        replaceit = False
+            except pywikibot.Error:
+                # must be a broken link
+                pywikibot.log(u"Invalid link [[%s]] in page [[%s]]"
+                              % (m.group('title'), refPage.title()))
+                continue
+            n += 1
+            # how many bytes should be displayed around the current link
+            context = 60
+            # check if there's a dn-template here already
+            if (self.dnSkip and self.dn_template_str and
+                    self.dn_template_str[:-2] in text[m.end():m.end() +
+                                                      len(self.dn_template_str) + 8]):
+                continue
 
-                    new_page_title = answer[1]
-                    repPl = pywikibot.Page(pywikibot.Link(new_page_title,
-                                                          disambPage.site))
-                    if (new_page_title[0].isupper() or
-                            link_text[0].isupper()):
-                        new_page_title = repPl.title()
-                    else:
-                        new_page_title = repPl.title()
-                        new_page_title = first_lower(new_page_title)
-                    if new_page_title not in new_targets:
-                        new_targets.append(new_page_title)
-                    if replaceit and trailing_chars:
-                        newlink = "[[%s%s]]%s" % (new_page_title,
-                                                  section,
-                                                  trailing_chars)
-                    elif replaceit or (new_page_title == link_text and
-                                       not section):
-                        newlink = "[[%s]]" % new_page_title
-                    # check if we can create a link with trailing characters
-                    # instead of a pipelink
-                    elif (
-                        (len(new_page_title) <= len(link_text)) and
-                        (firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title)) and
-                        (re.sub(self.trailR, '', link_text[len(new_page_title):]) == '') and
-                        (not section)
-                    ):
-                        newlink = "[[%s]]%s" \
-                                  % (link_text[:len(new_page_title)],
-                                     link_text[len(new_page_title):])
-                    else:
-                        newlink = "[[%s%s|%s]]" \
-                                  % (new_page_title, section, link_text)
-                    text = text[:m.start()] + newlink + text[m.end():]
-                    continue
+            edit = EditOption(
+                'edit page', 'e', text, m.start(), disambPage.title()
+            )
+            context_option = HighlightContextOption(
+                'more context', 'm', text, 60, start=m.start(), end=m.end())
+            context_option.before_question = True
 
-                pywikibot.output(text[max(0, m.start() - 30):m.end() + 30])
-            if text == original_text:
-                pywikibot.output(u'\nNo changes have been made:\n')
+            options = [ListOption(self.alternatives, ''),
+                       ListOption(self.alternatives, 'r'),
+                       StandardOption('skip link', 's'),
+                       edit,
+                       StandardOption('next page', 'n'),
+                       StandardOption('unlink', 'u')]
+            if self.dn_template_str:
+                # '?', '/' for old choice
+                options += [AliasOption('tag template %s' % self.dn_template_str,
+                                        ['t', '?', '/'])]
+            options += [context_option]
+            if not edited:
+                options += [ShowPageOption('show disambiguation page', 'd',
+                                           m.start(), disambPage)]
+            options += [
+                OutputProxyOption('list', 'l',
+                                  SequenceOutputter(self.alternatives)),
+                AddAlternativeOption('add new', 'a',
+                                     SequenceOutputter(self.alternatives))]
+            if edited:
+                options += [StandardOption('save in this form', 'x')]
+
+            # TODO: Output context on each question
+            answer = pywikibot.input_choice('Option', options,
+                                            default=self.always)
+            if answer == 'x':
+                assert edited, 'invalid option before editing'
+                break
+            elif answer == 's':
+                n -= 1  # TODO what's this for?
+                continue
+            elif answer == 'e':
+                text = edit.new_text
+                edited = True
+                curpos = 0
+                continue
+            elif answer == 'n':
+                # skip this page
+                if self.primary:
+                    # If run with the -primary argument, skip this
+                    # occurrence next time.
+                    self.primaryIgnoreManager.ignore(refPage)
+                return None
+
+            # The link looks like this:
+            # [[page_title|link_text]]trailing_chars
+            page_title = m.group('title')
+            link_text = m.group('label')
+
+            if not link_text:
+                # or like this: [[page_title]]trailing_chars
+                link_text = page_title
+            if m.group('section') is None:
+                section = ''
             else:
-                pywikibot.output(u'\nThe following changes have been made:\n')
-                pywikibot.showDiff(original_text, text)
-                pywikibot.output(u'')
-                # save the page
-                self.setSummaryMessage(disambPage, new_targets, unlink_counter,
-                                       dn)
-                try:
-                    refPage.put_async(text, summary=self.comment)
-                except pywikibot.LockedPage:
-                    pywikibot.output(u'Page not saved: page is locked')
-                except pywikibot.PageNotSaved as error:
-                    pywikibot.output(u'Page not saved: %s' % error.args)
+                section = m.group('section')
+            trailing_chars = m.group('linktrail')
+            if trailing_chars:
+                link_text += trailing_chars
+            if answer == 't':
+                assert self.dn_template_str
+                # small chunk of text to search
+                search_text = text[m.end():m.end() + context]
+                # figure out where the link (and sentance) ends, put note
+                # there
+                end_of_word_match = re.search(r'\s', search_text)
+                if end_of_word_match:
+                    position_split = end_of_word_match.start(0)
+                else:
+                    position_split = 0
+                # insert dab needed template
+                text = (text[:m.end() + position_split] +
+                        self.dn_template_str +
+                        text[m.end() + position_split:])
+                dn = True
+                continue
+            elif answer == 'u':
+                # unlink - we remove the section if there's any
+                text = text[:m.start()] + link_text + text[m.end():]
+                unlink_counter += 1
+                continue
+            else:
+                # Check that no option from above was missed
+                assert isinstance(answer, tuple), 'only tuple answer left.'
+                assert answer[0] in ['r', ''], 'only valid tuple answers.'
+                if answer[0] == 'r':
+                    # we want to throw away the original link text
+                    replaceit = link_text == page_title
+                elif include == "redirect":
+                    replaceit = True
+                else:
+                    replaceit = False
+
+                new_page_title = answer[1]
+                repPl = pywikibot.Page(pywikibot.Link(new_page_title,
+                                                      disambPage.site))
+                if (new_page_title[0].isupper() or
+                        link_text[0].isupper()):
+                    new_page_title = repPl.title()
+                else:
+                    new_page_title = repPl.title()
+                    new_page_title = first_lower(new_page_title)
+                if new_page_title not in new_targets:
+                    new_targets.append(new_page_title)
+                if replaceit and trailing_chars:
+                    newlink = "[[%s%s]]%s" % (new_page_title,
+                                              section,
+                                              trailing_chars)
+                elif replaceit or (new_page_title == link_text and
+                                   not section):
+                    newlink = "[[%s]]" % new_page_title
+                # check if we can create a link with trailing characters
+                # instead of a pipelink
+                elif (
+                    (len(new_page_title) <= len(link_text)) and
+                    (firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title)) and
+                    (re.sub(self.trailR, '', link_text[len(new_page_title):]) == '') and
+                    (not section)
+                ):
+                    newlink = "[[%s]]%s" \
+                              % (link_text[:len(new_page_title)],
+                                 link_text[len(new_page_title):])
+                else:
+                    newlink = "[[%s%s|%s]]" \
+                              % (new_page_title, section, link_text)
+                text = text[:m.start()] + newlink + text[m.end():]
+                continue
+
+            pywikibot.output(text[max(0, m.start() - 30):m.end() + 30])
+        if text == original_text:
+            pywikibot.output(u'\nNo changes have been made:\n')
+        else:
+            pywikibot.output(u'\nThe following changes have been made:\n')
+            pywikibot.showDiff(original_text, text)
+            pywikibot.output(u'')
+            # save the page
+            self.setSummaryMessage(disambPage, new_targets, unlink_counter,
+                                   dn)
+            try:
+                refPage.put_async(text, summary=self.comment)
+            except pywikibot.LockedPage:
+                pywikibot.output(u'Page not saved: page is locked')
+            except pywikibot.PageNotSaved as error:
+                pywikibot.output(u'Page not saved: %s' % error.args)
         return True
 
     def findAlternatives(self, disambPage):
+        """Extend self.alternatives using correctcap of disambPage.linkedPages.
+
+        @param disambPage: The disambiguation page.
+        @type disambPage: pywikibot.Page
+        @return: True if everything goes fine, False otherwise.
+        @rtype: bool
+
+        """
         if disambPage.isRedirectPage() and not self.primary:
             if (disambPage.site.lang in self.primary_redir_template and
                     self.primary_redir_template[disambPage.site.lang]
@@ -1050,6 +1110,7 @@
                 main_only=self.main_only
             )
             preloadingGen = pagegenerators.PreloadingGenerator(gen)
+
             for refPage in preloadingGen:
                 if not self.primaryIgnoreManager.isIgnored(refPage):
                     try:

-- 
To view, visit https://gerrit.wikimedia.org/r/268938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5918133623f878ad5cbc9bfcc697ba497580f7b3
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dalba <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to