Dalba has uploaded a new change for review.
https://gerrit.wikimedia.org/r/268938
Change subject: solve_disambiguation.py: Also work with redirects of disambiguation pages
......................................................................
solve_disambiguation.py: Also work with redirects of disambiguation pages
To make this happen:
- ReferringPageGeneratorWithIgnore should follow redirects of disambPage.
- The `treat` function should loop over disambPage and its redirects.
While doing the above:
- The `treat` function is too long; break it into two new
functions: `include` and `inner_treat`.
- Add docstrings for `correctcap`, `treat` and `findAlternatives`.
Bug: T118777
Change-Id: I5918133623f878ad5cbc9bfcc697ba497580f7b3
---
M scripts/solve_disambiguation.py
1 file changed, 269 insertions(+), 208 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/38/268938/1
diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index 5a87738..905e2a3 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -347,8 +347,15 @@
def correctcap(link, text):
- # If text links to a page with title link uncapitalized, uncapitalize link,
- # otherwise capitalize it
+ """Return the link capitalized/uncapitalized according to the text.
+
+ @param link: link page.
+ @type link: pywikibot.Page
+ @return: uncapitalized link-title if text links to link with an
+ uncapitalized title, otherwise capitalizes link-title.
+ @rtype: str
+
+ """
linkupper = link.title()
linklower = first_lower(linkupper)
if "[[%s]]" % linklower in text or "[[%s|" % linklower in text:
@@ -371,13 +378,13 @@
def __iter__(self):
# TODO: start yielding before all referring pages have been found
- refs = [
+ refs = list(set(
page for page in self.disambPage.getReferences(
- follow_redirects=False,
+ follow_redirects=True,
withTemplateInclusion=False,
namespaces=0 if self.main_only else None
)
- ]
+ ))
pywikibot.output(u"Found %d references." % len(refs))
# Remove ignorables
if self.disambPage.site.family.name in ignore_title and \
@@ -627,23 +634,47 @@
flags=re.X)
def treat(self, refPage, disambPage):
- """
- Treat a page.
+ """Treat a page.
- Parameters:
- disambPage - The disambiguation page or redirect we don't want
- anything to link to
- refPage - A page linking to disambPage
- Returns False if the user pressed q to completely quit the program.
- Otherwise, returns True.
+ @param disambPage: the disambiguation page or redirect we don't want
+ anything to link to
+ @type disambPage: pywikibot.Page
+ @param refPage: A page linking to disambPage
+ @type refPage: pywikibot.Page
+ @return: False if the user pressed q to completely quit the program.
+ Otherwise, returns True.
+ @rtype: bool
"""
- # TODO: break this function up into subroutines!
+ #import pdb; pdb.set_trace()
+ inner_treat = self.inner_treat(refPage, disambPage)
+ if inner_treat is None:
+ # Go to next page.
+ return True
+ elif inner_treat is False:
+ return False
+ for disamb_redirect in disambPage.getReferences(redirectsOnly=True):
+ inner_treat = self.inner_treat(refPage, disamb_redirect)
+ if inner_treat is None:
+ return True
+ elif inner_treat is False:
+ return False
+ return True
- self.current_page = refPage
- include = False
- unlink_counter = 0
- new_targets = []
+ def include(self, refPage, disambPage):
+ """Return variable indicating if the page should be included or not.
+
+ @param disambPage: the disambiguation page or redirect we don't want
+ anything to link to
+ @type disambPage: pywikibot.Page
+ @param refPage: A page linking to disambPage
+ @type refPage: pywikibot.Page
+ @return: True if refPage does not contain any of the ignore regexes.
+ "redirect" if refPage IsRedirectPage and user chooses to work
+ on it. False if refPage does not exist or in any other situation.
+ @rtype: bool or str
+
+ """
try:
text = refPage.get()
ignoreReason = self.checkContents(text)
@@ -651,7 +682,7 @@
pywikibot.output('\n\nSkipping %s because it contains %s.\n\n'
% (refPage.title(), ignoreReason))
else:
- include = True
+ return True, text
except pywikibot.IsRedirectPage:
pywikibot.output(u'%s is a redirect to %s'
% (refPage.title(), disambPage.title()))
@@ -683,211 +714,240 @@
break
elif choice == 'c':
text = refPage.get(get_redirect=True)
- include = "redirect"
+ return "redirect", text
except pywikibot.NoPage:
pywikibot.output(
u'Page [[%s]] does not seem to exist?! Skipping.'
% refPage.title())
- include = False
- if include in (True, "redirect"):
- # make a backup of the original text so we can show the changes later
- original_text = text
- n = 0
- curpos = 0
- dn = False
- edited = False
- # This loop will run until we have finished the current page
- while True:
- m = self.linkR.search(text, pos=curpos)
- if not m:
- if n == 0:
- pywikibot.output(u"No changes necessary in %s"
- % refPage.title())
- return True
- else:
- # stop loop and save page
- break
- # Make sure that next time around we will not find this same hit.
- curpos = m.start() + 1
- try:
- foundlink = pywikibot.Link(m.group('title'),
- disambPage.site)
- foundlink.parse()
- except pywikibot.Error:
- continue
- # ignore interwiki links
- if foundlink.site != disambPage.site:
- continue
- # Check whether the link found is to disambPage.
- try:
- if foundlink.canonical_title() != disambPage.title():
- continue
- except pywikibot.Error:
- # must be a broken link
- pywikibot.log(u"Invalid link [[%s]] in page [[%s]]"
- % (m.group('title'), refPage.title()))
- continue
- n += 1
- # how many bytes should be displayed around the current link
- context = 60
- # check if there's a dn-template here already
- if (self.dnSkip and self.dn_template_str and
- self.dn_template_str[:-2] in text[m.end():m.end() +
- len(self.dn_template_str) + 8]):
- continue
+ return False
- edit = EditOption('edit page', 'e', text, m.start(), disambPage.title())
- context_option = HighlightContextOption(
- 'more context', 'm', text, 60, start=m.start(), end=m.end())
- context_option.before_question = True
+ def inner_treat(self, refPage, disambPage):
+ """The treat function loops over this function.
- options = [ListOption(self.alternatives, ''),
- ListOption(self.alternatives, 'r'),
- StandardOption('skip link', 's'),
- edit,
- StandardOption('next page', 'n'),
- StandardOption('unlink', 'u')]
- if self.dn_template_str:
- # '?', '/' for old choice
- options += [AliasOption('tag template %s' % self.dn_template_str,
- ['t', '?', '/'])]
- options += [context_option]
- if not edited:
- options += [ShowPageOption('show disambiguation page', 'd',
- m.start(), disambPage)]
- options += [
- OutputProxyOption('list', 'l',
- SequenceOutputter(self.alternatives)),
- AddAlternativeOption('add new', 'a',
- SequenceOutputter(self.alternatives))]
- if edited:
- options += [StandardOption('save in this form', 'x')]
+ @param disambPage: The disambiguation page or redirect we don't want
+ anything to link to
+ @type disambPage: pywikibot.Page
+ @param refPage: A page linking to disambPage
+ @type refPage: pywikibot.Page
+ @return: False if the user pressed q to completely quit the program.
+ None if the user pressed n to skip this page.
+ Otherwise, returns True.
+ @rtype: bool
- # TODO: Output context on each question
- answer = pywikibot.input_choice('Option', options,
- default=self.always)
- if answer == 'x':
- assert edited, 'invalid option before editing'
- break
- elif answer == 's':
- n -= 1 # TODO what's this for?
- continue
- elif answer == 'e':
- text = edit.new_text
- edited = True
- curpos = 0
- continue
- elif answer == 'n':
- # skip this page
- if self.primary:
- # If run with the -primary argument, skip this
- # occurrence next time.
- self.primaryIgnoreManager.ignore(refPage)
+ """
+ # TODO: break this function up into subroutines!
+ include, text = self.include(refPage, disambPage)
+ new_targets = []
+ unlink_counter = 0
+ if include not in (True, "redirect"):
+ return True
+ # Backup the original text so we can show the changes later.
+ original_text = text
+ n = 0
+ curpos = 0
+ dn = False
+ edited = False
+ # This loop will run until we have finished the current page
+ while True:
+ m = self.linkR.search(text, pos=curpos)
+ if not m:
+ if n == 0:
+ # No changes necessary
return True
-
- # The link looks like this:
- # [[page_title|link_text]]trailing_chars
- page_title = m.group('title')
- link_text = m.group('label')
-
- if not link_text:
- # or like this: [[page_title]]trailing_chars
- link_text = page_title
- if m.group('section') is None:
- section = ''
else:
- section = m.group('section')
- trailing_chars = m.group('linktrail')
- if trailing_chars:
- link_text += trailing_chars
- if answer == 't':
- assert self.dn_template_str
- # small chunk of text to search
- search_text = text[m.end():m.end() + context]
- # figure out where the link (and sentance) ends, put note
- # there
- end_of_word_match = re.search(r'\s', search_text)
- if end_of_word_match:
- position_split = end_of_word_match.start(0)
- else:
- position_split = 0
- # insert dab needed template
- text = (text[:m.end() + position_split] +
- self.dn_template_str +
- text[m.end() + position_split:])
- dn = True
+ # stop loop and save page
+ break
+ # Make sure that next time around we won't find this same hit.
+ curpos = m.start() + 1
+ try:
+ foundlink = pywikibot.Link(m.group('title'),
+ disambPage.site)
+ foundlink.parse()
+ except pywikibot.Error:
+ continue
+ # ignore interwiki links
+ if foundlink.site != disambPage.site:
+ continue
+ # Check whether the link found is to disambPage.
+ try:
+ if foundlink.canonical_title() != disambPage.title():
continue
- elif answer == 'u':
- # unlink - we remove the section if there's any
- text = text[:m.start()] + link_text + text[m.end():]
- unlink_counter += 1
- continue
- else:
- # Check that no option from above was missed
- assert isinstance(answer, tuple), 'only tuple answer left.'
- assert answer[0] in ['r', ''], 'only valid tuple answers.'
- if answer[0] == 'r':
- # we want to throw away the original link text
- replaceit = link_text == page_title
- elif include == "redirect":
- replaceit = True
- else:
- replaceit = False
+ except pywikibot.Error:
+ # must be a broken link
+ pywikibot.log(u"Invalid link [[%s]] in page [[%s]]"
+ % (m.group('title'), refPage.title()))
+ continue
+ n += 1
+ # how many bytes should be displayed around the current link
+ context = 60
+ # check if there's a dn-template here already
+ if (self.dnSkip and self.dn_template_str and
+ self.dn_template_str[:-2] in text[m.end():m.end() +
+ len(self.dn_template_str) + 8]):
+ continue
- new_page_title = answer[1]
- repPl = pywikibot.Page(pywikibot.Link(new_page_title,
- disambPage.site))
- if (new_page_title[0].isupper() or
- link_text[0].isupper()):
- new_page_title = repPl.title()
- else:
- new_page_title = repPl.title()
- new_page_title = first_lower(new_page_title)
- if new_page_title not in new_targets:
- new_targets.append(new_page_title)
- if replaceit and trailing_chars:
- newlink = "[[%s%s]]%s" % (new_page_title,
- section,
- trailing_chars)
- elif replaceit or (new_page_title == link_text and
- not section):
- newlink = "[[%s]]" % new_page_title
- # check if we can create a link with trailing characters
- # instead of a pipelink
- elif (
- (len(new_page_title) <= len(link_text)) and
- (firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title)) and
- (re.sub(self.trailR, '', link_text[len(new_page_title):]) == '') and
- (not section)
- ):
- newlink = "[[%s]]%s" \
- % (link_text[:len(new_page_title)],
- link_text[len(new_page_title):])
- else:
- newlink = "[[%s%s|%s]]" \
- % (new_page_title, section, link_text)
- text = text[:m.start()] + newlink + text[m.end():]
- continue
+ edit = EditOption(
+ 'edit page', 'e', text, m.start(), disambPage.title()
+ )
+ context_option = HighlightContextOption(
+ 'more context', 'm', text, 60, start=m.start(), end=m.end())
+ context_option.before_question = True
- pywikibot.output(text[max(0, m.start() - 30):m.end() + 30])
- if text == original_text:
- pywikibot.output(u'\nNo changes have been made:\n')
+ options = [ListOption(self.alternatives, ''),
+ ListOption(self.alternatives, 'r'),
+ StandardOption('skip link', 's'),
+ edit,
+ StandardOption('next page', 'n'),
+ StandardOption('unlink', 'u')]
+ if self.dn_template_str:
+ # '?', '/' for old choice
+ options += [AliasOption('tag template %s' % self.dn_template_str,
+ ['t', '?', '/'])]
+ options += [context_option]
+ if not edited:
+ options += [ShowPageOption('show disambiguation page', 'd',
+ m.start(), disambPage)]
+ options += [
+ OutputProxyOption('list', 'l',
+ SequenceOutputter(self.alternatives)),
+ AddAlternativeOption('add new', 'a',
+ SequenceOutputter(self.alternatives))]
+ if edited:
+ options += [StandardOption('save in this form', 'x')]
+
+ # TODO: Output context on each question
+ answer = pywikibot.input_choice('Option', options,
+ default=self.always)
+ if answer == 'x':
+ assert edited, 'invalid option before editing'
+ break
+ elif answer == 's':
+ n -= 1 # TODO what's this for?
+ continue
+ elif answer == 'e':
+ text = edit.new_text
+ edited = True
+ curpos = 0
+ continue
+ elif answer == 'n':
+ # skip this page
+ if self.primary:
+ # If run with the -primary argument, skip this
+ # occurrence next time.
+ self.primaryIgnoreManager.ignore(refPage)
+ return None
+
+ # The link looks like this:
+ # [[page_title|link_text]]trailing_chars
+ page_title = m.group('title')
+ link_text = m.group('label')
+
+ if not link_text:
+ # or like this: [[page_title]]trailing_chars
+ link_text = page_title
+ if m.group('section') is None:
+ section = ''
else:
- pywikibot.output(u'\nThe following changes have been made:\n')
- pywikibot.showDiff(original_text, text)
- pywikibot.output(u'')
- # save the page
- self.setSummaryMessage(disambPage, new_targets, unlink_counter,
- dn)
- try:
- refPage.put_async(text, summary=self.comment)
- except pywikibot.LockedPage:
- pywikibot.output(u'Page not saved: page is locked')
- except pywikibot.PageNotSaved as error:
- pywikibot.output(u'Page not saved: %s' % error.args)
+ section = m.group('section')
+ trailing_chars = m.group('linktrail')
+ if trailing_chars:
+ link_text += trailing_chars
+ if answer == 't':
+ assert self.dn_template_str
+ # small chunk of text to search
+ search_text = text[m.end():m.end() + context]
+ # figure out where the link (and sentance) ends, put note
+ # there
+ end_of_word_match = re.search(r'\s', search_text)
+ if end_of_word_match:
+ position_split = end_of_word_match.start(0)
+ else:
+ position_split = 0
+ # insert dab needed template
+ text = (text[:m.end() + position_split] +
+ self.dn_template_str +
+ text[m.end() + position_split:])
+ dn = True
+ continue
+ elif answer == 'u':
+ # unlink - we remove the section if there's any
+ text = text[:m.start()] + link_text + text[m.end():]
+ unlink_counter += 1
+ continue
+ else:
+ # Check that no option from above was missed
+ assert isinstance(answer, tuple), 'only tuple answer left.'
+ assert answer[0] in ['r', ''], 'only valid tuple answers.'
+ if answer[0] == 'r':
+ # we want to throw away the original link text
+ replaceit = link_text == page_title
+ elif include == "redirect":
+ replaceit = True
+ else:
+ replaceit = False
+
+ new_page_title = answer[1]
+ repPl = pywikibot.Page(pywikibot.Link(new_page_title,
+ disambPage.site))
+ if (new_page_title[0].isupper() or
+ link_text[0].isupper()):
+ new_page_title = repPl.title()
+ else:
+ new_page_title = repPl.title()
+ new_page_title = first_lower(new_page_title)
+ if new_page_title not in new_targets:
+ new_targets.append(new_page_title)
+ if replaceit and trailing_chars:
+ newlink = "[[%s%s]]%s" % (new_page_title,
+ section,
+ trailing_chars)
+ elif replaceit or (new_page_title == link_text and
+ not section):
+ newlink = "[[%s]]" % new_page_title
+ # check if we can create a link with trailing characters
+ # instead of a pipelink
+ elif (
+ (len(new_page_title) <= len(link_text)) and
+ (firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title)) and
+ (re.sub(self.trailR, '', link_text[len(new_page_title):]) == '') and
+ (not section)
+ ):
+ newlink = "[[%s]]%s" \
+ % (link_text[:len(new_page_title)],
+ link_text[len(new_page_title):])
+ else:
+ newlink = "[[%s%s|%s]]" \
+ % (new_page_title, section, link_text)
+ text = text[:m.start()] + newlink + text[m.end():]
+ continue
+
+ pywikibot.output(text[max(0, m.start() - 30):m.end() + 30])
+ if text == original_text:
+ pywikibot.output(u'\nNo changes have been made:\n')
+ else:
+ pywikibot.output(u'\nThe following changes have been made:\n')
+ pywikibot.showDiff(original_text, text)
+ pywikibot.output(u'')
+ # save the page
+ self.setSummaryMessage(disambPage, new_targets, unlink_counter,
+ dn)
+ try:
+ refPage.put_async(text, summary=self.comment)
+ except pywikibot.LockedPage:
+ pywikibot.output(u'Page not saved: page is locked')
+ except pywikibot.PageNotSaved as error:
+ pywikibot.output(u'Page not saved: %s' % error.args)
return True
def findAlternatives(self, disambPage):
+ """Extend self.alternatives using correctcap of disambPage.linkedPages.
+
+ @param disambPage: The disambiguation page.
+ @type disambPage: pywikibot.Page
+ @return: True if everything goes fine, False otherwise.
+ @rtype: bool
+
+ """
if disambPage.isRedirectPage() and not self.primary:
if (disambPage.site.lang in self.primary_redir_template and
self.primary_redir_template[disambPage.site.lang]
@@ -1050,6 +1110,7 @@
main_only=self.main_only
)
preloadingGen = pagegenerators.PreloadingGenerator(gen)
+
for refPage in preloadingGen:
if not self.primaryIgnoreManager.isIgnored(refPage):
try:
--
To view, visit https://gerrit.wikimedia.org/r/268938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I5918133623f878ad5cbc9bfcc697ba497580f7b3
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dalba <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits