[MediaWiki-commits] [Gerrit] replaceCategoryLinks() no longer normalizes unmodified categ... - change (pywikibot/core)

Gallaecio (Code Review) Wed, 14 Oct 2015 20:51:13 -0700

Gallaecio has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/246459


Change subject: replaceCategoryLinks() no longer normalizes unmodified category 
links
......................................................................

replaceCategoryLinks() no longer normalizes unmodified category links

replaceCategoryLinks() no longer normalizes unmodified category links. For
example, if a template is used in an existing category link that is not
replaced, this function no longer expands the template. However,
note that replaceCategoryLinks() still normalizes the namespace (e.g.
localizes it if it is in English in a non-English wiki).

Users can optionally pass Category.aslink() the text that they want the
category link to contain, as opposed to the normalized name of the
category. This feature is used by replaceCategoryLinks().

Create textlib.normalize_page_name(), refactoring existing code.

Move html2unicode() from page.py to textlib.py.

Change-Id: I23851646319e5d8b49e9528be908bb885c446ea5
---
M pywikibot/__init__.py
M pywikibot/page.py
M pywikibot/textlib.py
3 files changed, 167 insertions(+), 127 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/59/246459/1

diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index ba3aae3..eeb0ccc 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -84,14 +84,14 @@
     'getCategoryLinks', 'categoryFormat', 'replaceCategoryLinks',
     'removeCategoryLinks', 'removeCategoryLinksAndSeparator',
     'replaceCategoryInPlace', 'compileLinkR', 'extract_templates_and_params',
-    'TimeStripper',
+    'TimeStripper', 'html2unicode',
 )
 
 __all__ = (
     'config', 'ui', 'UnicodeMixin', 'translate',
     'Page', 'FilePage', 'Category', 'Link', 'User',
     'ItemPage', 'PropertyPage', 'Claim',
-    'html2unicode', 'url2unicode', 'unicode2html',
+    'url2unicode', 'unicode2html',
     'stdout', 'output', 'warning', 'error', 'critical', 'debug',
     'exception', 'input_choice', 'input', 'input_yn', 'inputChoice',
     'handle_args', 'handleArgs', 'showHelp', 'ui', 'log',
@@ -660,7 +660,7 @@
     PropertyPage,
     Claim,
 )
-from pywikibot.page import html2unicode, url2unicode, unicode2html
+from pywikibot.page import url2unicode, unicode2html
 
 
 link_regex = re.compile(r'\[\[(?P<title>[^\]|[<>{}]*)(\|.*?)?\]\]')
diff --git a/pywikibot/page.py b/pywikibot/page.py
index cc2c1b5..ec62261 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -26,11 +26,6 @@
 import re
 import sys
 
-try:
-    import unicodedata2 as unicodedata
-except ImportError:
-    import unicodedata
-
 from collections import defaultdict, namedtuple
 from warnings import warn
 
@@ -2336,7 +2331,7 @@
                              % title)
 
     @deprecated_args(forceInterwiki=None, textlink=None, noInterwiki=None)
-    def aslink(self, sortKey=None):
+    def aslink(self, sortKey=None, name_text=None):
         """Return a link to place a page in this Category.
 
         Use this only to generate a "true" category link, not for interwikis
@@ -2348,11 +2343,14 @@
 
         """
         key = sortKey or self.sortKey
+        title_without_section = self.title(withSection=False)
+        if name_text:
+            title_without_section = '%s:%s' % (
+                self.site.namespace(self.namespace()), name_text)
         if key is not None:
-            titleWithSortKey = '%s|%s' % (self.title(withSection=False),
-                                          key)
+            titleWithSortKey = '%s|%s' % (title_without_section, key)
         else:
-            titleWithSortKey = self.title(withSection=False)
+            titleWithSortKey = title_without_section
         return '[[%s]]' % titleWithSortKey
 
     @deprecated_args(startFrom=None, cacheResults=None)
@@ -4719,41 +4717,7 @@
         encodings = [self._source.encoding()] + list(self._source.encodings())
 
         self._text = url2unicode(self._text, encodings=encodings)
-
-        # Clean up the name, it can come from anywhere.
-        # Convert HTML entities to unicode
-        t = html2unicode(self._text)
-
-        # Normalize unicode string to a NFC (composed) format to allow
-        # proper string comparisons to strings output from MediaWiki API.
-        # Due to Python issue 10254, this is not possible on Python 2.6.6
-        # if the string contains combining characters.  See T102461.
-        if (PYTHON_VERSION == (2, 6, 6) and
-                unicodedata.__name__ != 'unicodedata2' and
-                any(unicodedata.combining(c) for c in t)):
-            raise UnicodeError(
-                'Link(%r, %s): combining characters detected, which are '
-                'not supported by Pywikibot on Python 2.6.6. See '
-                'https://phabricator.wikimedia.org/T102461'
-                % (t, self._source))
-        t = unicodedata.normalize('NFC', t)
-
-        # This code was adapted from Title.php : secureAndSplit()
-        #
-        if u'\ufffd' in t:
-            raise pywikibot.Error(
-                "Title contains illegal char (\\uFFFD 'REPLACEMENT 
CHARACTER')")
-
-        # Replace underscores by spaces
-        t = t.replace(u"_", u" ")
-        # replace multiple spaces with a single space
-        while u"  " in t:
-            t = t.replace(u"  ", u" ")
-        # Strip spaces at both ends
-        t = t.strip()
-        # Remove left-to-right and right-to-left markers.
-        t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
-        self._text = t
+        self._text = textlib.normalize_page_name(self._text)
 
         if source_is_page:
             self._text = source.title(withSection=False) + self._text
@@ -5173,83 +5137,6 @@
 
 
 # Utility functions for parsing page titles
-
-
-def html2unicode(text, ignore=None):
-    """Replace HTML entities with equivalent unicode.
-
-    @param ignore: HTML entities to ignore
-    @param ignore: list of int
-
-    @return: unicode
-    """
-    if ignore is None:
-        ignore = []
-    # This regular expression will match any decimal and hexadecimal entity and
-    # also entities that might be named entities.
-    entityR = re.compile(
-        
r'&(?:amp;)?(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
-    # These characters are Html-illegal, but sadly you *can* find some of
-    # these and converting them to chr(decimal) is unsuitable
-    convertIllegalHtmlEntities = {
-        128: 8364,  # €
-        130: 8218,  # ‚
-        131: 402,   # ƒ
-        132: 8222,  # „
-        133: 8230,  # …
-        134: 8224,  # †
-        135: 8225,  # ‡
-        136: 710,   # ˆ
-        137: 8240,  # ‰
-        138: 352,   # Š
-        139: 8249,  # ‹
-        140: 338,   # Œ
-        142: 381,   # Ž
-        145: 8216,  # ‘
-        146: 8217,  # ’
-        147: 8220,  # “
-        148: 8221,  # ”
-        149: 8226,  # •
-        150: 8211,  # –
-        151: 8212,  # —
-        152: 732,   # ˜
-        153: 8482,  # ™
-        154: 353,   # š
-        155: 8250,  # ›
-        156: 339,   # œ
-        158: 382,   # ž
-        159: 376    # Ÿ
-    }
-    # ensuring that illegal &#129; &#141; and &#157, which have no known 
values,
-    # don't get converted to chr(129), chr(141) or chr(157)
-    ignore = set(ignore) | set([129, 141, 157])
-
-    def handle_entity(match):
-        if match.group('decimal'):
-            unicodeCodepoint = int(match.group('decimal'))
-        elif match.group('hex'):
-            unicodeCodepoint = int(match.group('hex'), 16)
-        elif match.group('name'):
-            name = match.group('name')
-            if name in htmlentitydefs.name2codepoint:
-                # We found a known HTML entity.
-                unicodeCodepoint = htmlentitydefs.name2codepoint[name]
-            else:
-                unicodeCodepoint = False
-        try:
-            unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
-        except KeyError:
-            pass
-        if unicodeCodepoint and unicodeCodepoint not in ignore:
-            if unicodeCodepoint > sys.maxunicode:
-                # solve narrow Python 2 build exception (UTF-16)
-                return eval("'\\U{0:08x}'".format(unicodeCodepoint))
-            else:
-                return chr(unicodeCodepoint)
-        else:
-            # Leave the entity unchanged
-            return match.group(0)
-    return entityR.sub(handle_entity, text)
 
 
 def UnicodeToAsciiHtml(s):
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 54d20eb..a884df3 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -21,6 +21,11 @@
 import re
 import sys
 
+try:
+    import unicodedata2 as unicodedata
+except ImportError:
+    import unicodedata
+
 if sys.version_info[0] > 2:
     from html.parser import HTMLParser
     basestring = (str,)
@@ -1120,6 +1125,149 @@
     return text
 
 
+def html2unicode(text, ignore=None):
+    """Replace HTML entities with equivalent unicode.
+
+    @param ignore: HTML entities to ignore
+    @type ignore: list of int
+
+    @return: unicode
+    """
+    if ignore is None:
+        ignore = []
+    # This regular expression will match any decimal and hexadecimal entity and
+    # also entities that might be named entities.
+    entityR = re.compile(
+        
r'&(?:amp;)?(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
+    # These characters are Html-illegal, but sadly you *can* find some of
+    # these and converting them to chr(decimal) is unsuitable
+    convertIllegalHtmlEntities = {
+        128: 8364,  # €
+        130: 8218,  # ‚
+        131: 402,   # ƒ
+        132: 8222,  # „
+        133: 8230,  # …
+        134: 8224,  # †
+        135: 8225,  # ‡
+        136: 710,   # ˆ
+        137: 8240,  # ‰
+        138: 352,   # Š
+        139: 8249,  # ‹
+        140: 338,   # Œ
+        142: 381,   # Ž
+        145: 8216,  # ‘
+        146: 8217,  # ’
+        147: 8220,  # “
+        148: 8221,  # ”
+        149: 8226,  # •
+        150: 8211,  # –
+        151: 8212,  # —
+        152: 732,   # ˜
+        153: 8482,  # ™
+        154: 353,   # š
+        155: 8250,  # ›
+        156: 339,   # œ
+        158: 382,   # ž
+        159: 376    # Ÿ
+    }
+    # ensuring that illegal &#129; &#141; and &#157, which have no known 
values,
+    # don't get converted to chr(129), chr(141) or chr(157)
+    ignore = set(ignore) | set([129, 141, 157])
+
+    def handle_entity(match):
+        if match.group('decimal'):
+            unicodeCodepoint = int(match.group('decimal'))
+        elif match.group('hex'):
+            unicodeCodepoint = int(match.group('hex'), 16)
+        elif match.group('name'):
+            name = match.group('name')
+            if name in htmlentitydefs.name2codepoint:
+                # We found a known HTML entity.
+                unicodeCodepoint = htmlentitydefs.name2codepoint[name]
+            else:
+                unicodeCodepoint = False
+        try:
+            unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
+        except KeyError:
+            pass
+        if unicodeCodepoint and unicodeCodepoint not in ignore:
+            if unicodeCodepoint > sys.maxunicode:
+                # solve narrow Python 2 build exception (UTF-16)
+                return eval("'\\U{0:08x}'".format(unicodeCodepoint))
+            else:
+                return chr(unicodeCodepoint)
+        else:
+            # Leave the entity unchanged
+            return match.group(0)
+    return entityR.sub(handle_entity, text)
+
+
+def normalize_page_name(page_name):
+
+    # Clean up the name, it can come from anywhere.
+    # Convert HTML entities to unicode
+    page_name = html2unicode(page_name)
+
+    # Normalize unicode string to a NFC (composed) format to allow
+    # proper string comparisons to strings output from MediaWiki API.
+    # Due to Python issue 10254, this is not possible on Python 2.6.6
+    # if the string contains combining characters.  See T102461.
+    if (PYTHON_VERSION == (2, 6, 6) and
+            unicodedata.__name__ != 'unicodedata2' and
+            any(unicodedata.combining(character) for character in page_name)):
+        raise UnicodeError(
+            'Link(%r, %s): combining characters detected, which are '
+            'not supported by Pywikibot on Python 2.6.6. See '
+            'https://phabricator.wikimedia.org/T102461'
+            % (page_name, self._source))
+    page_name = unicodedata.normalize('NFC', page_name)
+
+    # This code was adapted from Title.php : secureAndSplit()
+    #
+    if u'\ufffd' in page_name:
+        raise pywikibot.Error(
+            "Title contains illegal char (\\uFFFD 'REPLACEMENT CHARACTER')")
+
+    # Replace underscores by spaces
+    page_name = page_name.replace(u"_", u" ")
+    # replace multiple spaces with a single space
+    while u"  " in page_name:
+        page_name = page_name.replace(u"  ", u" ")
+    # Strip spaces at both ends
+    page_name = page_name.strip()
+    # Remove left-to-right and right-to-left markers.
+    page_name = page_name.replace(u"\u200e", u"").replace(u"\u200f", u"")
+    return page_name
+
+
+def category_name_texts(page_text, site=None):
+    """ Returns a dictionary where keys are the normalized titles of
+    categories found in the specified page text, and values are the actual
+    category title as used in the page, between the : and the ] or |.
+
+    For example, if page_text is "[[Category: my category ]]", the resulting
+    dictionary has "My category" as key and " my category " as value.
+    """
+    wiki_code = {}
+    if site is None:
+        site = pywikibot.Site()
+    category_namespaces = '|'.join(site.category_namespaces())
+    category_regexp = re.compile(
+        r'\[\[\s*(?:%s)\s*:(.*?)\]\]' % category_namespaces, re.I)
+    for match in category_regexp.finditer(page_text):
+        category_wiki_code = match.group(1)
+        normalized_category_name = category_wiki_code
+        if u"{" in normalized_category_name:
+            normalized_category_name = site.expand_text(
+                normalized_category_name)
+        normalized_category_name = normalize_page_name(
+            normalized_category_name)
+        if normalized_category_name != category_wiki_code:
+            wiki_code[normalized_category_name] = \
+                category_wiki_code
+    return wiki_code
+
+
 def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
     """
     Replace all existing category links with new category links.
@@ -1144,12 +1292,14 @@
     iseparator = site.family.interwiki_text_separator
     separatorstripped = separator.strip()
     iseparatorstripped = iseparator.strip()
+    name_texts = {}
     if addOnly:
         s2 = oldtext
     else:
+        name_texts.update(category_name_texts(oldtext, site=site))
         s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker,
                                              separator=separatorstripped)
-    s = categoryFormat(new, insite=site)
+    s = categoryFormat(new, insite=site, name_texts=name_texts)
     if s:
         if site.code in site.family.category_attop:
             newtext = s + separator + s2
@@ -1183,7 +1333,7 @@
     return newtext.strip()
 
 
-def categoryFormat(categories, insite=None):
+def categoryFormat(categories, insite=None, name_texts={}):
     """Return a string containing links to all categories in a list.
 
     'categories' should be a list of Category or Page objects or strings
@@ -1212,7 +1362,10 @@
         # Make sure a category is casted from Page to Category.
         elif not isinstance(category, pywikibot.Category):
             category = pywikibot.Category(category)
-        link = category.aslink()
+        name_text = None
+        if category.title(withNamespace=False) in name_texts:
+            name_text = name_texts[category.title(withNamespace=False)]
+        link = category.aslink(name_text=name_text)
         catLinks.append(link)
 
     if insite.category_on_one_line():

-- 
To view, visit https://gerrit.wikimedia.org/r/246459
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I23851646319e5d8b49e9528be908bb885c446ea5
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Gallaecio <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] replaceCategoryLinks() no longer normalizes unmodified categ... - change (pywikibot/core)

Reply via email to