jenkins-bot has submitted this change and it was merged. Change subject: New textlib.replace_links() link replacements ......................................................................
New textlib.replace_links() link replacements To allow for user interaction or more complex replacement structure the 'replace' argument can also be a callable. The fixing_redirects script makes use of the new function. Original-Author: Ricordisamoa <[email protected]> Bug: T61686 Change-Id: I6cea2a502981971ec76df59e7ea6196e9f04f0ba --- M pywikibot/page.py M pywikibot/textlib.py M scripts/fixing_redirects.py M tests/textlib_tests.py 4 files changed, 474 insertions(+), 84 deletions(-) Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified diff --git a/pywikibot/page.py b/pywikibot/page.py index 587e991..1dce342 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -4898,6 +4898,39 @@ link._title = title return link + @classmethod + def create_separated(cls, link, source, default_namespace=0, section=None, + label=None): + """ + Create a new instance but overwrite section or label. + + The returned Link instance is already parsed. + + @param link: The original link text. + @type link: str + @param source: The source of the link. + @type source: Site + @param default_namespace: The namespace this link uses when no namespace + is defined in the link text. + @type default_namespace: int + @param section: The new section replacing the one in link. If None + (default) it doesn't replace it. + @type section: None or str + @param label: The new label replacing the one in link. If None (default) + it doesn't replace it. 
+ """ + link = cls(link, source, default_namespace) + link.parse() + if section: + link._section = section + elif section is not None: + link._section = None + if label: + link._label = label + elif label is not None: + link._label = '' + return link + # Utility functions for parsing page titles diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 462afa2..f75f261 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -16,6 +16,7 @@ __version__ = '$Id$' # +import collections import datetime import re import sys @@ -30,6 +31,7 @@ import pywikibot from pywikibot import config2 as config +from pywikibot.exceptions import InvalidTitle from pywikibot.family import Family from pywikibot.tools import OrderedDict @@ -447,6 +449,219 @@ return marker +def replace_links(text, replace, site=None): + """ + Replace wikilinks selectively. + + The text is searched for a link and on each link it replaces the text + depending on the result for that link. If the result is just None it skips + that link. When it's False it unlinks it and just inserts the label. When it + is a Link instance it'll use the target, section and label from that Link + instance. If it's a string or Page instance it'll use just the target from + the replacement and the section and label from the original link. + + If either the section or label should be used the replacement can be a + function which returns a Link instance and copies the value which should + remain. + + @param text: the text in which to replace links + @type text: basestring + @param replace: either a callable which reacts like described above. + The callable must accept four parameters link, text, groups, rng and + allows for user interaction. The groups are a dict containing 'title', + 'section', 'label' and 'linktrail' and the rng are the start and end + position of the link. 
The 'label' in groups contains everything after + the first pipe which might contain additional data which is used in File + namespace for example. + Alternatively it can be a sequence containing two items where the first + must be a Link or Page and the second has the same meaning as the result + by the callable. It'll convert that into a callable where the first item + (the Link or Page) has to be equal to the found link and in that case it + will apply the second value from the sequence. + @type replace: sequence of pywikibot.Page/pywikibot.Link/str or + callable + @param site: a Site object to use if replace is not a sequence or the link + to be replaced is not a Link or Page instance. + @type site: pywikibot.APISite + """ + def to_link(source): + """Return the link from source when it's a Page otherwise itself.""" + if isinstance(source, pywikibot.Page): + return pywikibot.Page._link + elif isinstance(source, basestring): + return pywikibot.Link(source, site) + else: + return source + + def sequence_replace(link, text, groups, rng): + if replace_items[0] == link: + return replace_items[1] + else: + return None + + def check_replacement_class(replacement): + """Normalize the replacement into a list.""" + # separate checks as basestring is a type in Python 2 + if (not isinstance(replacement, (pywikibot.Page, pywikibot.Link)) and + not isinstance(replacement, basestring)): + raise ValueError('The replacement must be None, False, ' + 'a sequence, a Link or a basestring but ' + 'is "{0}"'.format(type(replacement))) + + if isinstance(replace, collections.Sequence): + if len(replace) != 2: + raise ValueError('When used as a sequence, the "replace" ' + 'argument must contain exactly 2 items.') + replace_items = [to_link(replace[0]), replace[1]] + if not isinstance(replace_items[0], pywikibot.Link): + raise ValueError( + 'The original value must be either basestring, Link or Page ' + 'but is "{0}"'.format(type(replace_items[0]))) + if replace_items[1] is not False and 
replace_items[1] is not None: + check_replacement_class(replace_items[0]) + if (not isinstance(replace_items[1], basestring) and + replace_items[0].site != replace_items[1].site): + raise ValueError('Both pages in the "replace" argument ' + 'must belong to the same site.') + site = replace_items[0].site + replace = sequence_replace + elif site is None: + raise ValueError('If "replace" is not a tuple or list of pages, ' + 'the "site" argument must be provided.') + + linktrail = site.linktrail() + link_pattern = re.compile( + r'\[\[(?P<title>.*?)(#(?P<section>.*?))?(\|(?P<label>.*?))?\]\]' + r'(?P<linktrail>%s)' % linktrail) + extended_label_pattern = re.compile(r'(.*?\]\])({0})'.format(linktrail)) + linktrail = re.compile(linktrail) + curpos = 0 + # This loop will run until we have finished the current page + while True: + m = link_pattern.search(text, pos=curpos) + if not m: + break + # ignore links to sections of the same page + if not m.group('title').strip(): + curpos = m.end() + continue + groups = m.groupdict() + if groups['label'] and '[[' in groups['label']: + # TODO: Work on the link within the label too + # A link within a link, extend the label to the ]] after it + extended_match = extended_label_pattern.search(text, pos=m.end()) + if not extended_match: + # TODO: Unclosed link label, what happens there? 
+ curpos = m.end() + continue + groups['label'] += groups['linktrail'] + extended_match.group(1) + groups['linktrail'] = extended_match.group(2) + end = extended_match.end() + else: + end = m.end() + rng = (m.start(), end) + # Since this point the m variable shouldn't be used as it may not + # contain all contents + del m + try: + link = pywikibot.Link.create_separated( + groups['title'], site, section=groups['section'], + label=groups['label']) + except pywikibot.SiteDefinitionError: + # unrecognized iw prefix + curpos = rng[1] + continue + # ignore interwiki links + if link.site != site: + curpos = rng[1] + continue + + # Check whether the link found should be replaced. + # Either None, False or tuple(Link, bool) + replacement = replace(link, text, groups.copy(), rng) + if replacement is None: + curpos = rng[1] + continue + + # The link looks like this: + # [[page_title|link_text]]trailing_chars + page_title = groups['title'] + link_text = groups['label'] + + if not link_text: + # or like this: [[page_title]]trailing_chars + link_text = page_title + # remove preleading ":" from the link text + if link_text[0] == ':': + link_text = link_text[1:] + trailing_chars = groups['linktrail'] + if trailing_chars: + link_text += trailing_chars + + if replacement is False: + # unlink - we remove the section if there's any + text = text[:rng[0]] + link_text + text[rng[1]:] + # Make sure that next time around we will not find this same hit. 
+ curpos = rng[0] + len(link_text) + continue + + # Verify that it's either Link, Page or basestring + check_replacement_class(replacement) + # Use section and label if it's a Link and not otherwise + if isinstance(replacement, pywikibot.Link): + is_link = True + else: + if isinstance(replacement, pywikibot.Page): + replacement = replacement._link + else: + replacement = pywikibot.Link(replacement, site) + is_link = False + + new_page_title = replacement.canonical_title() + + if is_link: + # Use link's label + link_text = replacement.anchor + if link_text is None: + link_text = new_page_title + must_piped = False + else: + must_piped = True + section = replacement.section + else: + must_piped = True + section = groups['section'] + + if section: + section = '#' + section + else: + section = '' + + # Parse the link text and check if it points to the same page + parsed_link_text = pywikibot.Link(link_text, replacement.site) + try: + parsed_link_text.parse() + except InvalidTitle: + pass + else: + # compare title, but only with parts if linktrail works + if not linktrail.sub('', parsed_link_text.title[len(replacement.title):]): + # TODO: This must also compare everything that was used as a + # prefix (in case insensitive) + must_piped = (not parsed_link_text.title.startswith(replacement.title) or + parsed_link_text.namespace != replacement.namespace) + + if section or must_piped: + newlink = '[[{0}{1}|{2}]]'.format(new_page_title, section, link_text) + else: + newlink = '[[{0}]]{1}'.format(link_text[:len(new_page_title)], + link_text[len(new_page_title):]) + text = text[:rng[0]] + newlink + text[rng[1]:] + # Make sure that next time around we will not find this same hit. 
+ curpos = rng[0] + len(newlink) + return text + + # ----------------------------------------------- # Functions dealing with interwiki language links # ----------------------------------------------- diff --git a/scripts/fixing_redirects.py b/scripts/fixing_redirects.py index 9e13e28..78821f2 100755 --- a/scripts/fixing_redirects.py +++ b/scripts/fixing_redirects.py @@ -23,12 +23,10 @@ __version__ = '$Id$' # -import re import sys import pywikibot from pywikibot import pagegenerators from pywikibot import i18n -from pywikibot.tools import first_lower, first_upper as firstcap # This is required for the text that is shown when you run this script # with the parameter -help. @@ -58,86 +56,6 @@ 'zh': u'Wikipedia:特色条目', } - -def treat(text, linkedPage, targetPage): - """Based on the method of the same name in solve_disambiguation.py.""" - mysite = pywikibot.Site() - linktrail = mysite.linktrail() - - # make a backup of the original text so we can show the changes later - linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?' - r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')') - curpos = 0 - # This loop will run until we have finished the current page - while True: - m = linkR.search(text, pos=curpos) - if not m: - break - # Make sure that next time around we will not find this same hit. - curpos = m.start() + 1 - # ignore interwiki links and links to sections of the same page - if m.group('title').strip() == '' or \ - mysite.isInterwikiLink(m.group('title')): - continue - else: - actualLinkPage = pywikibot.Page(targetPage.site, m.group('title')) - # Check whether the link found is to page. 
- if actualLinkPage != linkedPage: - continue - - choice = 'y' - - # The link looks like this: - # [[page_title|link_text]]trailing_chars - page_title = m.group('title') - link_text = m.group('label') - - if not link_text: - # or like this: [[page_title]]trailing_chars - link_text = page_title - if m.group('section') is None: - section = '' - else: - section = m.group('section') - trailing_chars = m.group('linktrail') - if trailing_chars: - link_text += trailing_chars - - if choice in "uU": - # unlink - we remove the section if there's any - text = text[:m.start()] + link_text + text[m.end():] - continue - replaceit = choice in "rR" - - # remove preleading ":" - if link_text[0] == ':': - link_text = link_text[1:] - if link_text[0].isupper(): - new_page_title = targetPage.title() - else: - new_page_title = first_lower(targetPage.title()) - - # remove preleading ":" - if new_page_title[0] == ':': - new_page_title = new_page_title[1:] - - if replaceit and trailing_chars: - newlink = "[[%s%s]]%s" % (new_page_title, section, trailing_chars) - elif replaceit or (new_page_title == link_text and not section): - newlink = "[[%s]]" % new_page_title - # check if we can create a link with trailing characters instead of a - # pipelink - elif len(new_page_title) <= len(link_text) and \ - firstcap(link_text[:len(new_page_title)]) == \ - firstcap(new_page_title) and \ - re.sub(re.compile(linktrail), '', link_text[len(new_page_title):]) == '' and not section: - newlink = "[[%s]]%s" % (link_text[:len(new_page_title)], - link_text[len(new_page_title):]) - else: - newlink = "[[%s%s|%s]]" % (new_page_title, section, link_text) - text = text[:m.start()] + newlink + text[m.end():] - continue - return text pageCache = [] @@ -176,7 +94,7 @@ # no fix to user namespaces if target.namespace() in [0, 1] and not page2.namespace() in [0, 1]: continue - text = treat(text, page2, target) + text = pywikibot.textlib.replace_links(text, [page2, target]) if text != page.get(): comment = 
i18n.twtranslate(mysite, 'fixing_redirects-fixing') pywikibot.showDiff(page.get(), text) diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 3974fd3..bb7545a 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -16,7 +16,7 @@ import pywikibot import pywikibot.textlib as textlib -from pywikibot import config +from pywikibot import config, UnknownSite from pywikibot.tools import OrderedDict from tests.aspects import unittest, TestCase, DefaultDrySiteTestCase @@ -325,6 +325,230 @@ textlib.extract_templates_and_params) +class TestReplaceLinks(TestCase): + + """Test the replace_links function in textlib.""" + + sites = { + 'wt': { + 'family': 'wiktionary', + 'code': 'en', + }, + 'wp': { + 'family': 'wikipedia', + 'code': 'en', + } + } + + dry = True + + text = ('Hello [[World]], [[how|are]] [[you#section|you]]? Are [[you]] a ' + '[[bug:1337]]?') + + @staticmethod + def _dummy_cache(force=False): + pass + + @classmethod + def setUpClass(cls): + super(TestReplaceLinks, cls).setUpClass() + # make APISite.interwiki work, as long as it doesn't call + # _cache_interwikimap with force=True + for site in cls.sites.values(): + site['site']._cache_interwikimap = cls._dummy_cache + site['site']._iw_sites = dict((iw['family'], (iw['site'], True)) + for iw in cls.sites.values()) + site['site']._iw_sites['bug'] = (UnknownSite('Not a wiki'), + False) + site['site']._iw_sites['en'] = (site['site'], True) + cls.wp_site = cls.get_site('wp') + + def test_replacements_function(self): + """Test a dynamic function as the replacements.""" + def callback(link, text, groups, rng): + self.assertEqual(link.site, self.wp_site) + if link.title == 'World': + return pywikibot.Link('Homeworld', link.site) + elif link.title.lower() == 'you': + return False + self.assertEqual( + textlib.replace_links(self.text, callback, self.wp_site), + 'Hello [[Homeworld]], [[how|are]] you? 
Are you a [[bug:1337]]?') + + def test_replacements_once(self): + """Test dynamic replacement.""" + def callback(link, text, groups, rng): + if link.title.lower() == 'you': + self._count += 1 + if link.section: + return pywikibot.Link( + '{0}#{1}'.format(self._count, link.section), link.site) + else: + return pywikibot.Link('{0}'.format(self._count), link.site) + self._count = 0 # buffer number of found instances + self.assertEqual( + textlib.replace_links(self.text, callback, self.wp_site), + 'Hello [[World]], [[how|are]] [[1#section|1]]? Are [[2]] a ' + '[[bug:1337]]?') + del self._count + + def test_unlink_all(self): + """Test unlinking.""" + def callback(link, text, groups, rng): + self.assertEqual(link.site, self.wp_site) + return False + self.assertEqual( + textlib.replace_links(self.text, callback, self.wp_site), + 'Hello World, are you? Are you a [[bug:1337]]?') + + def test_unlink_some(self): + """Test unlinking only some links.""" + self.assertEqual( + textlib.replace_links(self.text, ('World', False), self.wp_site), + 'Hello World, [[how|are]] [[you#section|you]]? Are [[you]] a ' + '[[bug:1337]]?') + self.assertEqual( + textlib.replace_links('[[User:Namespace|Label]]\n' + '[[User:Namespace#Section|Labelz]]\n' + '[[Nothing]]', + ('User:Namespace', False), + self.wp_site), + 'Label\nLabelz\n[[Nothing]]') + + def test_replace_neighbour(self): + """Test that it replaces two neighbouring links.""" + self.assertEqual( + textlib.replace_links('[[A]][[A]][[C]]', + ('A', 'B'), + self.wp_site), + '[[B|A]][[B|A]][[C]]') + + def test_replacements_simplify(self): + """Test a tuple as a replacement removing the need for a piped link.""" + self.assertEqual( + textlib.replace_links(self.text, + ('how', 'are'), + self.wp_site), + 'Hello [[World]], [[are]] [[you#section|you]]? 
Are [[you]] a ' + '[[bug:1337]]?') + + def test_replace_file(self): + """Test that it respects the namespace.""" + self.assertEqual( + textlib.replace_links( + '[[File:Meh.png|thumb|Description of [[fancy]]]] [[Fancy]]...', + ('File:Meh.png', 'File:Fancy.png'), + self.wp_site), + '[[File:Fancy.png|thumb|Description of [[fancy]]]] [[Fancy]]...') + + def test_replace_strings(self): + """Test if strings can be used.""" + self.assertEqual( + textlib.replace_links(self.text, ('how', 'are'), self.wp_site), + 'Hello [[World]], [[are]] [[you#section|you]]? Are [[you]] a ' + '[[bug:1337]]?') + + def test_replace_invalid_link_text(self): + """Test that it doesn't pipe a link when it's an invalid link.""" + self.assertEqual( + textlib.replace_links('[[Target|Foo:]]', ('Target', 'Foo'), self.wp_site), + '[[Foo|Foo:]]') + + def test_replace_modes(self): + """Test replacing with or without label and section.""" + source_text = '[[Foo#bar|baz]]' + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'Bar'), self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Page(self.wp_site, 'Bar')), + self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Link('Bar', self.wp_site)), + self.wp_site), + '[[Bar]]') + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'Bar#snafu'), self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Page(self.wp_site, 'Bar#snafu')), + self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Link('Bar#snafu', self.wp_site)), + self.wp_site), + '[[Bar#snafu|Bar]]') + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'Bar|foo'), self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Page(self.wp_site, 'Bar|foo')), + self.wp_site), + 
'[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Link('Bar|foo', self.wp_site)), + self.wp_site), + '[[Bar|foo]]') + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'Bar#snafu|foo'), self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Page(self.wp_site, 'Bar#snafu|foo')), + self.wp_site), + '[[Bar#bar|baz]]') + self.assertEqual( + textlib.replace_links(source_text, + ('Foo', pywikibot.Link('Bar#snafu|foo', self.wp_site)), + self.wp_site), + '[[Bar#snafu|foo]]') + + def test_replace_different_case(self): + """Test that it uses piped links when the case is different.""" + source_text = '[[Foo|Bar]] and [[Foo|bar]]' + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'bar'), self.get_site('wp')), + '[[Bar]] and [[bar]]') + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'bar'), self.get_site('wt')), + '[[bar|Bar]] and [[bar]]') + self.assertEqual( + textlib.replace_links(source_text, ('Foo', 'Bar'), self.get_site('wt')), + '[[Bar]] and [[Bar|bar]]') + + @unittest.expectedFailure + def test_label_diff_namespace(self): + """Test that it uses the old label when the new doesn't match.""" + # These tests require to get the actual part which is before the title + # (interwiki and namespace prefixes) which could be then compared + # case insensitive. 
+ self.assertEqual( + textlib.replace_links('[[Image:Foobar]]', ('File:Foobar', 'File:Foo'), self.wp_site), + '[[File:Foo|Image:Foobar]]') + self.assertEqual( + textlib.replace_links('[[en:File:Foobar]]', ('File:Foobar', 'File:Foo'), self.wp_site), + '[[File:Foo|en:File:Foobar]]') + + def test_linktrails(self): + """Test that the linktrails are used or applied.""" + self.assertEqual( + textlib.replace_links('[[Foobar]]', ('Foobar', 'Foo'), self.wp_site), + '[[Foo]]bar') + self.assertEqual( + textlib.replace_links('[[Talk:test]]s', ('Talk:Test', 'Talk:Tests'), self.wp_site), + '[[Talk:tests]]') + self.assertEqual( + textlib.replace_links('[[Talk:test]]s', ('Talk:Test', 'Project:Tests'), self.wp_site), + '[[Project:Tests|Talk:tests]]') + + class TestLocalDigits(TestCase): """Test to verify that local digits are correctly being handled.""" -- To view, visit https://gerrit.wikimedia.org/r/137802 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I6cea2a502981971ec76df59e7ea6196e9f04f0ba Gerrit-PatchSet: 32 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Ricordisamoa <[email protected]> Gerrit-Reviewer: John Vandenberg <[email protected]> Gerrit-Reviewer: Ladsgroup <[email protected]> Gerrit-Reviewer: Merlijn van Deen <[email protected]> Gerrit-Reviewer: Mpaa <[email protected]> Gerrit-Reviewer: Ricordisamoa <[email protected]> Gerrit-Reviewer: XZise <[email protected]> Gerrit-Reviewer: Xqt <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ Pywikibot-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits
