[MediaWiki-commits] [Gerrit] [FEAT] chars: Generic module for char classes - change (pywikibot/core)

XZise (Code Review) Sun, 24 May 2015 11:00:43 -0700

XZise has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/213332


Change subject: [FEAT] chars: Generic module for char classes
......................................................................

[FEAT] chars: Generic module for char classes

Add the pywikibot.tools.chars module which handles currently only invisible
characters. This is now used by replace (instead of a script specific
implementation) and the PatchManager class uses this module too to replace
invisible characters with placeholders.

Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
---
M pywikibot/diff.py
A pywikibot/tools/chars.py
M scripts/replace.py
A tests/tools_chars_tests.py
4 files changed, 156 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/32/213332/1

diff --git a/pywikibot/diff.py b/pywikibot/diff.py
index 5acc741..1fa8e9b 100644
--- a/pywikibot/diff.py
+++ b/pywikibot/diff.py
@@ -26,6 +26,8 @@
     BeautifulSoup = False
 
 import pywikibot
+from pywikibot.tools import chars
+
 from pywikibot.backports import format_range_unified  # introduced in 2.7.2
 from pywikibot.tools import deprecated_args
 
@@ -221,7 +223,8 @@
     """
 
     @deprecated_args(n='context')
-    def __init__(self, text_a, text_b, context=0, by_letter=False):
+    def __init__(self, text_a, text_b, context=0, by_letter=False,
+                 replace_invisible=False):
         """Constructor.
 
         @param text_a: base text
@@ -233,6 +236,9 @@
         @param by_letter: if text_a and text_b are single lines, comparison can be done
             letter by letter.
         @type by_letter: bool
+        @param replace_invisible: Replace invisible characters like U+200e with
+            the charnumber in brackets (e.g. <200e>).
+        @type replace_invisible: bool
         """
         if '\n' in text_a or '\n' in text_b:
             self.a = text_a.splitlines(1)
@@ -265,6 +271,7 @@
         self.blocks = self.get_blocks()
         self.context = context
         self._super_hunks = self._generate_super_hunks()
+        self._replace_invisible = replace_invisible
 
     def get_blocks(self):
         """Return list with blocks of indexes which compose a and, where applicable, b.
@@ -352,7 +359,10 @@
                 output += extend_context(previous_hunk.a_rng[1], hunk.a_rng[0])
             previous_hunk = hunk
             output += hunk.diff_text
-        return output + extend_context(hunks[-1].a_rng[1], context_range[0][1])
+        output = output + extend_context(hunks[-1].a_rng[1], context_range[0][1])
+        if self._replace_invisible:
+            output = chars.replace_invisible(output)
+        return output
 
     def review_hunks(self):
         """Review hunks."""
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
new file mode 100644
index 0000000..b555cb1
--- /dev/null
+++ b/pywikibot/tools/chars.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8  -*-
+"""Miscellaneous helper functions (not wiki-dependent)."""
+#
+# (C) Pywikibot team, 2008-2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+
+from pywikibot.tools import LazyRegex
+
+
+if sys.version_info[0] > 2:
+    unicode = str
+
+
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
+# differ between Python versions which can be avoided by this static list.
+_category_cf = frozenset([
+    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+    '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e',
+    '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f',
+    '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e',
+    '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064',
+    '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a',
+    '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f',
+    '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd',
+    '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177',
+    '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020',
+    '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025',
+    '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a',
+    '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f',
+    '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034',
+    '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039',
+    '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e',
+    '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043',
+    '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048',
+    '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d',
+    '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052',
+    '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057',
+    '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c',
+    '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061',
+    '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066',
+    '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b',
+    '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070',
+    '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075',
+    '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a',
+    '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f',
+])
+_invisible_chars = frozenset(_category_cf)
+
+# TODO: Is that complex and a lazy regex justified?
+invisible_regex = LazyRegex()
+invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']'
+invisible_regex.flags = 0
+
+
+def contains_invisible(text):
+    """Return True if the text contain any of the invisible characters."""
+    return any(char in _invisible_chars for char in text)
+
+
+def replace_invisible(text):
+    """Replace invisible characters by '<codepoint>'."""
+    def replace(match):
+        match = match.group()
+        if sys.maxunicode < 0x10ffff and len(match) == 2:
+            mask = (1 << 10) - 1
+            assert(ord(match[0]) & ~mask == 0xd800)
+            assert(ord(match[1]) & ~mask == 0xdc00)
+            codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask)
+        else:
+            codepoint = ord(match)
+        return '<{0:x}>'.format(codepoint)
+    return invisible_regex.sub(replace, text)
diff --git a/scripts/replace.py b/scripts/replace.py
index 7cb7764..0cf53c2 100755
--- a/scripts/replace.py
+++ b/scripts/replace.py
@@ -137,7 +137,6 @@
 import re
 import time
 import sys
-import unicodedata
 
 import pywikibot
 from pywikibot import i18n, textlib, pagegenerators, Bot
@@ -145,6 +144,8 @@
 
 # Imports predefined replacements tasks from fixes.py
 from pywikibot import fixes
+
+from pywikibot.tools.chars import contains_invisible
 
 if sys.version_info[0] > 2:
     basestring = (str, )
@@ -667,11 +668,6 @@
     return pattern
 
 
-def contains_format_characters(text):
-    """Return True when there are format characters (e.g. U+200E) in text."""
-    return any(unicodedata.category(char) == 'Cf' for char in text)
-
-
 def main(*args):
     """
     Process command line arguments and invoke bot.
@@ -881,10 +877,10 @@
                                           set_summary)
         for replacement in fix['replacements']:
             summary = None if len(replacement) < 3 else replacement[2]
-            if contains_format_characters(replacement[0]):
+            if contains_invisible(replacement[0]):
                 pywikibot.warning('The old string "{0}" contains formatting '
                                   'characters like U+200E'.format(replacement[0]))
-            if contains_format_characters(replacement[1]):
+            if contains_invisible(replacement[1]):
                 pywikibot.warning('The new string "{0}" contains formatting '
                                   'characters like U+200E'.format(replacement[1]))
             replacements.append(ReplacementListEntry(
diff --git a/tests/tools_chars_tests.py b/tests/tools_chars_tests.py
new file mode 100644
index 0000000..9818c58
--- /dev/null
+++ b/tests/tools_chars_tests.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+"""Test tools.chars package."""
+# -*- coding: utf-8  -*-
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+import unicodedata
+
+from pywikibot.tools import chars
+
+from tests.aspects import unittest, TestCase
+
+
+class CharsTestCase(TestCase):
+
+    """General test case testing the module."""
+
+    net = False
+
+    def test_replace(self):
+        """Test replace_invisible."""
+        self.assertEqual(chars.replace_invisible('Hello world!'), 'Hello world!')
+        self.assertEqual(chars.replace_invisible('\u200eRTL\u200f'), '<200e>RTL<200f>')
+
+    def test_contains(self):
+        """Test contains_invisible."""
+        self.assertFalse(chars.contains_invisible('Hello world!'))
+        self.assertTrue(chars.contains_invisible('\u200eRTL\u200f'))
+
+    def test_category_cf(self):
+        """Test that all characters in _category_cf are actually in Cf."""
+        invalid = {}
+        for char in chars._category_cf:
+            cat = unicodedata.category(char)
+            if cat != 'Cf':
+                invalid[char] = cat
+        if sys.version_info[0] == 2:
+            # Python 2 adds these from Cf to the wrong categories
+            # TODO: Or has the specification changed?
+            self.assertEqual(invalid.pop('\u0604'), 'Cn')
+            self.assertEqual(invalid.pop('\u061c'), 'Cn')
+            self.assertEqual(invalid.pop('\u2068'), 'Cn')
+            self.assertEqual(invalid.pop('\u180e'), 'Zs')
+            self.assertEqual(invalid.pop('\u2067'), 'Cn')
+            self.assertEqual(invalid.pop('\u2066'), 'Cn')
+            self.assertEqual(invalid.pop('\u2069'), 'Cn')
+        self.assertCountEqual(invalid.items(), [])
+
+
+if __name__ == '__main__':
+    try:
+        unittest.main()
+    except SystemExit:
+        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/213332
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <commodorefabia...@gmx.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] [FEAT] chars: Generic module for char classes - change (pywikibot/core)

Reply via email to