XZise has uploaded a new change for review. https://gerrit.wikimedia.org/r/213332
Change subject: [FEAT] chars: Generic module for char classes ...................................................................... [FEAT] chars: Generic module for char classes Add the pywikibot.tools.chars module, which currently handles only invisible characters. This is now used by replace (instead of a script-specific implementation), and the PatchManager class also uses this module to replace invisible characters with placeholders. Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320 --- M pywikibot/diff.py A pywikibot/tools/chars.py M scripts/replace.py A tests/tools_chars_tests.py 4 files changed, 156 insertions(+), 10 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/32/213332/1 diff --git a/pywikibot/diff.py b/pywikibot/diff.py index 5acc741..1fa8e9b 100644 --- a/pywikibot/diff.py +++ b/pywikibot/diff.py @@ -26,6 +26,8 @@ BeautifulSoup = False import pywikibot +from pywikibot.tools import chars + from pywikibot.backports import format_range_unified # introduced in 2.7.2 from pywikibot.tools import deprecated_args @@ -221,7 +223,8 @@ """ @deprecated_args(n='context') - def __init__(self, text_a, text_b, context=0, by_letter=False): + def __init__(self, text_a, text_b, context=0, by_letter=False, + replace_invisible=False): """Constructor. @param text_a: base text @@ -233,6 +236,9 @@ @param by_letter: if text_a and text_b are single lines, comparison can be done letter by letter. @type by_letter: bool + @param replace_invisible: Replace invisible characters like U+200e with + the charnumber in brackets (e.g. <200e>). + @type replace_invisible: bool """ if '\n' in text_a or '\n' in text_b: self.a = text_a.splitlines(1) @@ -265,6 +271,7 @@ self.blocks = self.get_blocks() self.context = context self._super_hunks = self._generate_super_hunks() + self._replace_invisible = replace_invisible def get_blocks(self): """Return list with blocks of indexes which compose a and, where applicable, b. 
@@ -352,7 +359,10 @@ output += extend_context(previous_hunk.a_rng[1], hunk.a_rng[0]) previous_hunk = hunk output += hunk.diff_text - return output + extend_context(hunks[-1].a_rng[1], context_range[0][1]) + output = output + extend_context(hunks[-1].a_rng[1], context_range[0][1]) + if self._replace_invisible: + output = chars.replace_invisible(output) + return output def review_hunks(self): """Review hunks.""" diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py new file mode 100644 index 0000000..b555cb1 --- /dev/null +++ b/pywikibot/tools/chars.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +"""Miscellaneous helper functions (not wiki-dependent).""" +# +# (C) Pywikibot team, 2008-2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import sys + +from pywikibot.tools import LazyRegex + + +if sys.version_info[0] > 2: + unicode = str + + +# All characters in the Cf category in a static list. When testing each Unicode +# codepoint it takes longer especially when working with UCS2. The lists also +# differ between Python versions which can be avoided by this static list. 
+_category_cf = frozenset([ + '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', + '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e', + '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f', + '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e', + '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064', + '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a', + '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f', + '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd', + '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177', + '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020', + '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025', + '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a', + '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f', + '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034', + '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039', + '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e', + '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043', + '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048', + '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d', + '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052', + '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057', + '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c', + '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061', + '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066', + '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b', + '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070', + '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075', + '\U000e0076', 
'\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a', + '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f', +]) +_invisible_chars = frozenset(_category_cf) + +# TODO: Is that complex and a lazy regex justified? +invisible_regex = LazyRegex() +invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']' +invisible_regex.flags = 0 + + +def contains_invisible(text): + """Return True if the text contains any of the invisible characters.""" + return any(char in _invisible_chars for char in text) + + +def replace_invisible(text): + """Replace invisible characters by '<codepoint>'.""" + def replace(match): + match = match.group() + if sys.maxunicode < 0x10ffff and len(match) == 2: + mask = (1 << 10) - 1 + assert(ord(match[0]) & ~mask == 0xd800) + assert(ord(match[1]) & ~mask == 0xdc00) + codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask) + else: + codepoint = ord(match) + return '<{0:x}>'.format(codepoint) + return invisible_regex.sub(replace, text) diff --git a/scripts/replace.py b/scripts/replace.py index 7cb7764..0cf53c2 100755 --- a/scripts/replace.py +++ b/scripts/replace.py @@ -137,7 +137,6 @@ import re import time import sys -import unicodedata import pywikibot from pywikibot import i18n, textlib, pagegenerators, Bot @@ -145,6 +144,8 @@ # Imports predefined replacements tasks from fixes.py from pywikibot import fixes + +from pywikibot.tools.chars import contains_invisible if sys.version_info[0] > 2: basestring = (str, ) @@ -667,11 +668,6 @@ return pattern -def contains_format_characters(text): - """Return True when there are format characters (e.g. U+200E) in text.""" - return any(unicodedata.category(char) == 'Cf' for char in text) - - def main(*args): """ Process command line arguments and invoke bot. 
@@ -881,10 +877,10 @@ set_summary) for replacement in fix['replacements']: summary = None if len(replacement) < 3 else replacement[2] - if contains_format_characters(replacement[0]): + if contains_invisible(replacement[0]): pywikibot.warning('The old string "{0}" contains formatting ' 'characters like U+200E'.format(replacement[0])) - if contains_format_characters(replacement[1]): + if contains_invisible(replacement[1]): pywikibot.warning('The new string "{0}" contains formatting ' 'characters like U+200E'.format(replacement[1])) replacements.append(ReplacementListEntry( diff --git a/tests/tools_chars_tests.py b/tests/tools_chars_tests.py new file mode 100644 index 0000000..9818c58 --- /dev/null +++ b/tests/tools_chars_tests.py @@ -0,0 +1,60 @@ +#!/usr/bin/python +"""Test tools.chars package.""" +# -*- coding: utf-8 -*- +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. +from __future__ import unicode_literals + +__version__ = '$Id$' + +import sys +import unicodedata + +from pywikibot.tools import chars + +from tests.aspects import unittest, TestCase + + +class CharsTestCase(TestCase): + + """General test case testing the module.""" + + net = False + + def test_replace(self): + """Test replace_invisible.""" + self.assertEqual(chars.replace_invisible('Hello world!'), 'Hello world!') + self.assertEqual(chars.replace_invisible('\u200eRTL\u200f'), '<200e>RTL<200f>') + + def test_contains(self): + """Test contains_invisible.""" + self.assertFalse(chars.contains_invisible('Hello world!')) + self.assertTrue(chars.contains_invisible('\u200eRTL\u200f')) + + def test_category_cf(self): + """Test that all characters in _category_cf are actually in Cf.""" + invalid = {} + for char in chars._category_cf: + cat = unicodedata.category(char) + if cat != 'Cf': + invalid[char] = cat + if sys.version_info[0] == 2: + # Python 2 adds these from Cf to the wrong categories + # TODO: Or has the specification changed? 
+ self.assertEqual(invalid.pop('\u0604'), 'Cn') + self.assertEqual(invalid.pop('\u061c'), 'Cn') + self.assertEqual(invalid.pop('\u2068'), 'Cn') + self.assertEqual(invalid.pop('\u180e'), 'Zs') + self.assertEqual(invalid.pop('\u2067'), 'Cn') + self.assertEqual(invalid.pop('\u2066'), 'Cn') + self.assertEqual(invalid.pop('\u2069'), 'Cn') + self.assertCountEqual(invalid.items(), []) + + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass -- To view, visit https://gerrit.wikimedia.org/r/213332 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: XZise <commodorefabia...@gmx.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits