Dalba has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/286296

Change subject: textlib: does_text_contain_section should accept anchorencoded 
values
......................................................................

textlib: does_text_contain_section should accept anchorencoded values

- Define a new function called anchorencode to encode section titles.
- Rewrite does_text_contain_section to use anchorencode.
- Two tests are not compatible with this new behavior. Remove them.
    Specifically, the test marked as 'section header must contain
    a link' will not pass any longer because brackets ([]) are
    removed from anchorencoded values and it's no longer possible
    to detect the difference between 'Wiki_markup' and '[[Wiki_markup]]'.
- Add some new tests.

Bug:T133276
Change-Id: Ieeec466a580c7009f6fc889c76fa7af58e9cf181
---
M pywikibot/textlib.py
M tests/pages/enwiki_help_editing.page
M tests/textlib_tests.py
3 files changed, 80 insertions(+), 15 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/96/286296/1

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index ec7b508..3775a0b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -7,7 +7,7 @@
 
 """
 #
-# (C) Pywikibot team, 2008-2015
+# (C) Pywikibot team, 2008-2016
 #
 # Distributed under the terms of the MIT license.
 #
@@ -20,6 +20,7 @@
 import datetime
 import re
 import sys
+import urllib
 
 if sys.version_info[0] > 2:
     from html.parser import HTMLParser
@@ -126,6 +127,15 @@
     'gu': u'૦૧૨૩૪૫૬૭૮૯',
     'or': u'୦୧୨୩୪୫୬୭୮୯',
 }
+
+URL_PROTOCOLS_REGEX = (
+    r'(?:'
+    r'bitcoin:|ftp://|ftps://|geo:|git://|gopher://|http://|https://|'
+    r'irc://|ircs://|magnet:|mailto:|mms://|news:|nntp://|redis://|'
+    r'sftp://|sip:|sips:|sms:|ssh://|svn://|tel:|telnet://|urn:|'
+    r'worldwind://|xmpp:|//'
+    r')'
+)
 
 
 def to_local_digits(phrase, lang):
@@ -1669,6 +1679,41 @@
 # Page parsing functionality
 # --------------------------
 
+
+def anchorencode(string):
+    """Return the anchor encoded string.
+
+    Similar to MediaWiki's {{anchorencode:string}} parser function.
+    This function is an incomplete implementation of MW's
+    Parser::guessSectionNameFromWikiText.
+
+    @param string: the section title, possibly containing link markup
+    @type string: str
+    @return: anchorencoded string
+    @rtype: str
+
+    """
+    try:
+        from urllib.parse import quote
+    except ImportError:  # Python 2 has no urllib.parse submodule
+        from urllib import quote
+    # Parser::stripSectionName: strip internal link markup
+    string = re.sub(r'\[\[:?([^[|]+)\|([^[]+)\]\]', r'\2', string)
+    string = re.sub(r'\[\[:?([^[]+)\|?\]\]', r'\1', string)
+    # Strip external link markup; flags must be passed by keyword because
+    # the fourth positional argument of re.sub is count, not flags.
+    string = re.sub(
+        r'\[' + URL_PROTOCOLS_REGEX + r'([^ ]+?) ([^[]+)\]',
+        r'\2', string, flags=re.I
+    )
+    # Todo: Parser::doQuotes (to handle italics & bold)
+    # Todo: StringUtils::delimiterReplace( '<', '>', '', $text )
+    # Sanitizer::normalizeSectionNameWhitespace
+    string = re.sub(r'[ _]+', r'_', string).strip('_')
+    # Sanitizer::escapeId
+    return quote(string, safe=':').replace('%', '.')
+
+
 def does_text_contain_section(pagetext, section):
     """
     Determine whether the page text contains the given section title.
@@ -1684,14 +1729,15 @@
     @type pagetext: unicode or string
     @param section: a section of a page including wikitext markups
     @type section: unicode or string
+    @rtype: bool
 
     """
-    # match preceding colon for text links
-    section = re.sub(r'\\\[\\\[(\\:)?', r'\[\[\:?', re.escape(section))
-    # match underscores and white spaces
-    section = re.sub(r'\\?[ _]', '[ _]', section)
-    m = re.search("=+[ ']*%s[ ']*=+" % section, pagetext)
-    return bool(m)
+    section = anchorencode(section)
+    for m in re.finditer(r"^(=+)[ ']*(.*?)[ ']*\1 *\r?$", pagetext, re.M):
+        group2 = m.group(2)
+        if group2 == section or anchorencode(group2) == section:
+            return True
+    return False
 
 
 def reformat_ISBNs(text, match_func):
diff --git a/tests/pages/enwiki_help_editing.page 
b/tests/pages/enwiki_help_editing.page
index fa0f54d..b94064e 100644
--- a/tests/pages/enwiki_help_editing.page
+++ b/tests/pages/enwiki_help_editing.page
@@ -87,6 +87,10 @@
 ===Tools===
 * [[Wikipedia:Text editor support|Text editor support]]
 
+== آزمون ==
+=== [https://www.mediawiki.org/wiki/Manual:Pywikibot Homepage] ===
+To test anchorencoded non-ASCII titles and external links.
+
 ===See also===
 {{Meta}}
 * [[WP:WikiProject|WikiProjects]], if you are writing an article about 
something that belongs to a group of topics, check here first!
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index a9be159..f8214b5 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -74,11 +74,13 @@
         self.assertNotContains('enwiki_help_editing', 'Minor_Edits',
                                'section hashes are case-sensitive')
 
-    @unittest.expectedFailure
     def testNonAlphabeticalCharactersInSection(self):
         """Test with non-alphabetical chars in section."""
-        self.assertContains('enwiki_help_editing', 
'Talk_.28discussion.29_pages',
-                            'As used in the TOC')
+        self.assertContains(
+            'enwiki_help_editing',
+            'Talk_.28discussion.29_pages',
+            'As used in the TOC'
+        )
         self.assertContains('enwiki_help_editing', 'Talk_(discussion)_pages',
                             'Understood by mediawiki')
 
@@ -91,18 +93,31 @@
     def test_link_in_section(self):
         """Test with link inside section."""
         # section is ==[[Wiki markup]]==
-        self.assertContains("enwiki_help_editing", u"[[Wiki markup]]", "Link 
as section header")
+        self.assertContains(
+            "enwiki_help_editing", u"[[Wiki markup]]", "Link as section header"
+        )
         self.assertContains('enwiki_help_editing', '[[:Wiki markup]]',
                             'section header link with preleading colon')
-        self.assertNotContains('enwiki_help_editing', 'Wiki markup',
-                               'section header must be a link')
         # section is ===[[:Help]]ful tips===
         self.assertContains('enwiki_help_editing', '[[Help]]ful tips',
                             'Containing link')
         self.assertContains('enwiki_help_editing', '[[:Help]]ful tips',
                             'Containing link with preleading colon')
-        self.assertNotContains('enwiki_help_editing', 'Helpful tips',
-                               'section header must contain a link')
+
+    def test_anchorendoded_title(self):
+        """Test anchorencoded title."""
+        self.assertContains('enwiki_help_editing', 'آزمون')
+        self.assertContains(
+            'enwiki_help_editing',
+            '.D8.A2.D8.B2.D9.85.D9.88.D9.86'
+        )
+        self.assertContains(
+            'enwiki_help_editing',
+            '[https://www.mediawiki.org/wiki/Manual:Pywikibot Homepage]')
+        self.assertContains(
+            'enwiki_help_editing',
+            'Homepage'
+        )
 
 
 class TestFormatInterwiki(TestCase):

-- 
To view, visit https://gerrit.wikimedia.org/r/286296
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ieeec466a580c7009f6fc889c76fa7af58e9cf181
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dalba <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to