Dalba has uploaded a new change for review.
https://gerrit.wikimedia.org/r/286296
Change subject: textlib: does_text_contain_section should accept anchorencoded values
......................................................................
textlib: does_text_contain_section should accept anchorencoded values
- Define a new function called anchorencode to encode section titles.
- Rewrite the does_text_contain_section to use anchorencode.
- Two tests are not compatible with this new behavior. Remove them.
Specifically, the test marked as 'section header must contain
a link' will not pass any longer because brackets ([]) are
removed from anchorencoded values and it's no longer possible
to detect the difference between 'Wiki_markup' and '[[Wiki_markup]]'.
- Add some new tests.
Bug:T133276
Change-Id: Ieeec466a580c7009f6fc889c76fa7af58e9cf181
---
M pywikibot/textlib.py
M tests/pages/enwiki_help_editing.page
M tests/textlib_tests.py
3 files changed, 80 insertions(+), 15 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/96/286296/1
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index ec7b508..3775a0b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -7,7 +7,7 @@
"""
#
-# (C) Pywikibot team, 2008-2015
+# (C) Pywikibot team, 2008-2016
#
# Distributed under the terms of the MIT license.
#
@@ -20,6 +20,7 @@
import datetime
import re
import sys
+import urllib
if sys.version_info[0] > 2:
from html.parser import HTMLParser
@@ -126,6 +127,15 @@
'gu': u'૦૧૨૩૪૫૬૭૮૯',
'or': u'୦୧୨୩୪୫୬୭୮୯',
}
+
+URL_PROTOCOLS_REGEX = (
+ r'(?:'
+ r'bitcoin:|ftp://|ftps://|geo:|git://|gopher://|http://|https://|'
+ r'irc://|ircs://|magnet:|mailto:|mms://|news:|nntp://|redis://|'
+ r'sftp://|sip:|sips:|sms:|ssh://|svn://|tel:|telnet://|urn:|'
+ r'worldwind://|xmpp:|//'
+ r')'
+)
def to_local_digits(phrase, lang):
@@ -1669,6 +1679,41 @@
# Page parsing functionality
# --------------------------
+
+def anchorencode(string):
+ """Return the anchor encoded string.
+
+ Similar to MediaWiki's {{anchorencode:string}} parser function.
+ This function is an incomplete implementation of MW's
+ Parser::guessSectionNameFromWikiText.
+
+ @param string: the section title
+ @type string: str
+ @return: anchorencoded string
+ @rtype: str
+
+ """
+ # Parser::stripSectionName
+ # Strip internal link markup
+ string = re.sub(r'\[\[:?([^[|]+)\|([^[]+)\]\]', r'\2', string)
+ string = re.sub(r'\[\[:?([^[]+)\|?\]\]', r'\1', string)
+ # Strip external link markup
+ string = re.sub(
+ r'\[' + URL_PROTOCOLS_REGEX + r'([^ ]+?) ([^[]+)\]',
+ r'\2',
+ string, re.I
+ )
+ # Todo: Parser::doQuotes (to handle italics & bold)
+ # Todo: StringUtils::delimiterReplace( '<', '>', '', $text )
+ # Sanitizer::normalizeSectionNameWhitespace
+ string = re.sub(r'[ _]+', r'_', string).strip('_')
+ # Sanitizer::escapeId
+ return urllib.parse.quote(
+ string,
+ safe=':'
+ ).replace('%', '.')
+
+
def does_text_contain_section(pagetext, section):
"""
Determine whether the page text contains the given section title.
@@ -1684,14 +1729,15 @@
@type pagetext: unicode or string
@param section: a section of a page including wikitext markups
@type section: unicode or string
+ @rtype: bool
"""
- # match preceding colon for text links
- section = re.sub(r'\\\[\\\[(\\:)?', r'\[\[\:?', re.escape(section))
- # match underscores and white spaces
- section = re.sub(r'\\?[ _]', '[ _]', section)
- m = re.search("=+[ ']*%s[ ']*=+" % section, pagetext)
- return bool(m)
+ section = anchorencode(section)
+ for m in re.finditer(r"^(=+)[ ']*(.*?)[ ']*\1 *\r?$", pagetext, re.M):
+ group2 = m.group(2)
+ if group2 == section or anchorencode(group2) == section:
+ return True
+ return False
def reformat_ISBNs(text, match_func):
diff --git a/tests/pages/enwiki_help_editing.page b/tests/pages/enwiki_help_editing.page
index fa0f54d..b94064e 100644
--- a/tests/pages/enwiki_help_editing.page
+++ b/tests/pages/enwiki_help_editing.page
@@ -87,6 +87,10 @@
===Tools===
* [[Wikipedia:Text editor support|Text editor support]]
+== آزمون ==
+=== [https://www.mediawiki.org/wiki/Manual:Pywikibot Homepage] ===
+To test anchorencoded non-ASCII titles and external links.
+
===See also===
{{Meta}}
* [[WP:WikiProject|WikiProjects]], if you are writing an article about
something that belongs to a group of topics, check here first!
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index a9be159..f8214b5 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -74,11 +74,13 @@
self.assertNotContains('enwiki_help_editing', 'Minor_Edits',
'section hashes are case-sensitive')
- @unittest.expectedFailure
def testNonAlphabeticalCharactersInSection(self):
"""Test with non-alphabetical chars in section."""
- self.assertContains('enwiki_help_editing', 'Talk_.28discussion.29_pages',
- 'As used in the TOC')
+ self.assertContains(
+ 'enwiki_help_editing',
+ 'Talk_.28discussion.29_pages',
+ 'As used in the TOC'
+ )
self.assertContains('enwiki_help_editing', 'Talk_(discussion)_pages',
'Understood by mediawiki')
@@ -91,18 +93,31 @@
def test_link_in_section(self):
"""Test with link inside section."""
# section is ==[[Wiki markup]]==
- self.assertContains("enwiki_help_editing", u"[[Wiki markup]]", "Link as section header")
+ self.assertContains(
+ "enwiki_help_editing", u"[[Wiki markup]]", "Link as section header"
+ )
self.assertContains('enwiki_help_editing', '[[:Wiki markup]]',
'section header link with preleading colon')
- self.assertNotContains('enwiki_help_editing', 'Wiki markup',
- 'section header must be a link')
# section is ===[[:Help]]ful tips===
self.assertContains('enwiki_help_editing', '[[Help]]ful tips',
'Containing link')
self.assertContains('enwiki_help_editing', '[[:Help]]ful tips',
'Containing link with preleading colon')
- self.assertNotContains('enwiki_help_editing', 'Helpful tips',
- 'section header must contain a link')
+
+ def test_anchorendoded_title(self):
+ """Test anchorencoded title."""
+ self.assertContains('enwiki_help_editing', 'آزمون')
+ self.assertContains(
+ 'enwiki_help_editing',
+ '.D8.A2.D8.B2.D9.85.D9.88.D9.86'
+ )
+ self.assertContains(
+ 'enwiki_help_editing',
+ '[https://www.mediawiki.org/wiki/Manual:Pywikibot Homepage]')
+ self.assertContains(
+ 'enwiki_help_editing',
+ 'Homepage'
+ )
class TestFormatInterwiki(TestCase):
--
To view, visit https://gerrit.wikimedia.org/r/286296
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ieeec466a580c7009f6fc889c76fa7af58e9cf181
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dalba <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits