Author: Maciej Fijalkowski <fij...@gmail.com>
Branch: 
Changeset: r60235:8b3fd7dd838c
Date: 2013-01-20 16:13 +0200
http://bitbucket.org/pypy/pypy/changeset/8b3fd7dd838c/
Log:	shuffle stuff around

diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -9,6 +9,7 @@
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.runicode import MAXUNICODE
 from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
+from rpython.rlib.unicodedata.ucd import code_to_unichr, ORD
 
 import sys
 
@@ -30,25 +31,6 @@
 # The functions below are subtly different from the ones in runicode.py.
 # When PyPy implements Python 3 they should be merged.
 
-def UNICHR(c):
-    if c <= sys.maxunicode and c <= MAXUNICODE:
-        return unichr(c)
-    else:
-        c -= 0x10000
-        return (unichr(0xD800 + (c >> 10)) +
-                unichr(0xDC00 + (c & 0x03FF)))
-
-def ORD(u):
-    assert isinstance(u, unicode)
-    if len(u) == 1:
-        return ord(u[0])
-    elif len(u) == 2:
-        ch1 = ord(u[0])
-        ch2 = ord(u[1])
-        if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
-            return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
-    raise ValueError
-
 if MAXUNICODE > 0xFFFF:
     # Target is wide build
     def unichr_to_code_w(space, w_unichr):
@@ -69,12 +51,6 @@
                 'need a single Unicode character as parameter'))
         return space.int_w(space.ord(w_unichr))
 
-    def code_to_unichr(code):
-        if not we_are_translated() and sys.maxunicode == 0xFFFF:
-            # Host CPython is narrow build, generate surrogates
-            return UNICHR(code)
-        else:
-            return unichr(code)
 else:
     # Target is narrow build
     def unichr_to_code_w(space, w_unichr):
@@ -97,10 +73,6 @@
             raise OperationError(space.w_TypeError, space.wrap(
                 'need a single Unicode character as parameter'))
 
-    def code_to_unichr(code):
-        # generate surrogates for large codes
-        return UNICHR(code)
-
 
 class UCD(Wrappable):
     def __init__(self, unicodedb):
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1,49 +1,9 @@
 import sys
-from rpython.rlib.bitmanipulation import splitter
-from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.objectmodel import we_are_translated, specialize, enforceargs
+from rpython.rlib.objectmodel import specialize, enforceargs
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.unicodedata import unicodedb
-
-if rffi.sizeof(lltype.UniChar) == 4:
-    MAXUNICODE = 0x10ffff
-else:
-    MAXUNICODE = 0xffff
-BYTEORDER = sys.byteorder
-
-if MAXUNICODE > sys.maxunicode:
-    # A version of unichr which allows codes outside the BMP
-    # even on narrow unicode builds.
-    # It will be used when interpreting code on top of a UCS2 CPython,
-    # when sizeof(wchar_t) == 4.
-    # Note that Python3 uses a similar implementation.
-    def UNICHR(c):
-        assert not we_are_translated()
-        if c <= sys.maxunicode or c > MAXUNICODE:
-            return unichr(c)
-        else:
-            c -= 0x10000
-            return (unichr(0xD800 + (c >> 10)) +
-                    unichr(0xDC00 + (c & 0x03FF)))
-    UNICHR._flowspace_rewrite_directly_as_ = unichr
-    # ^^^ NB.: for translation, it's essential to use this hack instead
-    # of calling unichr() from UNICHR(), because unichr() detects if there
-    # is a "try:except ValueError" immediately around it.
-
-    def ORD(u):
-        assert not we_are_translated()
-        if isinstance(u, unicode) and len(u) == 2:
-            ch1 = ord(u[0])
-            ch2 = ord(u[1])
-            if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
-                return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
-        return ord(u)
-    ORD._flowspace_rewrite_directly_as_ = ord
-
-else:
-    UNICHR = unichr
-    ORD = ord
+from rpython.rlib.unicodedata.ucd import MAXUNICODE, UNICHR, BYTEORDER
 
 
 def default_unicode_error_decode(errors, encoding, msg, s,
@@ -446,16 +406,6 @@
         result.append(r)
     return result.build(), pos, bo
 
-def _STORECHAR(result, CH, byteorder):
-    hi = chr(((CH) >> 8) & 0xff)
-    lo = chr((CH) & 0xff)
-    if byteorder == 'little':
-        result.append(lo)
-        result.append(hi)
-    else:
-        result.append(hi)
-        result.append(lo)
-
 def unicode_encode_utf_16_helper(s, size, errors,
                                  errorhandler=None,
                                  byteorder='little'):
diff --git a/pypy/module/unicodedata/test_interp_ucd.py b/rpython/rlib/unicodedata/test/test_ucd.py
rename from pypy/module/unicodedata/test_interp_ucd.py
rename to rpython/rlib/unicodedata/test/test_ucd.py
--- a/pypy/module/unicodedata/test_interp_ucd.py
+++ b/rpython/rlib/unicodedata/test/test_ucd.py
@@ -1,6 +1,6 @@
 from rpython.rtyper.test.tool import BaseRtypingTest, LLRtypeMixin
 from rpython.rlib.unicodedata import unicodedb_5_2_0
-from pypy.module.unicodedata.interp_ucd import code_to_unichr
+from rpython.rlib.unicodedata.ucd import code_to_unichr
 
 class TestTranslated(BaseRtypingTest, LLRtypeMixin):
diff --git a/rpython/rlib/unicodedata/test/test_unicodedata.py b/rpython/rlib/unicodedata/test/test_unicodedata.py
--- a/rpython/rlib/unicodedata/test/test_unicodedata.py
+++ b/rpython/rlib/unicodedata/test/test_unicodedata.py
@@ -1,109 +1,6 @@
 import py
 from rpython.rlib.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0
 
-class AppTestUnicodeData:
-    spaceconfig = dict(usemodules=('unicodedata',))
-
-    def test_hangul_syllables(self):
-        import unicodedata
-        # Test all leading, vowel and trailing jamo
-        # but not every combination of them.
-        for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'),
-                           (0xAE69, 'HANGUL SYLLABLE GGAEG'),
-                           (0xB0D2, 'HANGUL SYLLABLE NYAGG'),
-                           (0xB33B, 'HANGUL SYLLABLE DYAEGS'),
-                           (0xB5A4, 'HANGUL SYLLABLE DDEON'),
-                           (0xB80D, 'HANGUL SYLLABLE RENJ'),
-                           (0xBA76, 'HANGUL SYLLABLE MYEONH'),
-                           (0xBCDF, 'HANGUL SYLLABLE BYED'),
-                           (0xBF48, 'HANGUL SYLLABLE BBOL'),
-                           (0xC1B1, 'HANGUL SYLLABLE SWALG'),
-                           (0xC41A, 'HANGUL SYLLABLE SSWAELM'),
-                           (0xC683, 'HANGUL SYLLABLE OELB'),
-                           (0xC8EC, 'HANGUL SYLLABLE JYOLS'),
-                           (0xCB55, 'HANGUL SYLLABLE JJULT'),
-                           (0xCDBE, 'HANGUL SYLLABLE CWEOLP'),
-                           (0xD027, 'HANGUL SYLLABLE KWELH'),
-                           (0xD290, 'HANGUL SYLLABLE TWIM'),
-                           (0xD4F9, 'HANGUL SYLLABLE PYUB'),
-                           (0xD762, 'HANGUL SYLLABLE HEUBS'),
-                           (0xAE27, 'HANGUL SYLLABLE GYIS'),
-                           (0xB090, 'HANGUL SYLLABLE GGISS'),
-                           (0xB0AD, 'HANGUL SYLLABLE NANG'),
-                           (0xB316, 'HANGUL SYLLABLE DAEJ'),
-                           (0xB57F, 'HANGUL SYLLABLE DDYAC'),
-                           (0xB7E8, 'HANGUL SYLLABLE RYAEK'),
-                           (0xBA51, 'HANGUL SYLLABLE MEOT'),
-                           (0xBCBA, 'HANGUL SYLLABLE BEP'),
-                           (0xBF23, 'HANGUL SYLLABLE BBYEOH'),
-                           (0xD7A3, 'HANGUL SYLLABLE HIH')):
-            assert unicodedata.name(unichr(code)) == name
-            assert unicodedata.lookup(name) == unichr(code)
-        # Test outside the range
-        py.test.raises(ValueError, unicodedata.name, unichr(0xAC00 - 1))
-        py.test.raises(ValueError, unicodedata.name, unichr(0xD7A3 + 1))
-
-    def test_cjk(self):
-        import sys
-        import unicodedata
-        cases = ((0x3400, 0x4DB5),
-                 (0x4E00, 0x9FA5))
-        if unicodedata.unidata_version >= "5":    # don't know the exact limit
-            cases = ((0x3400, 0x4DB5),
-                     (0x4E00, 0x9FCB),
-                     (0x20000, 0x2A6D6),
-                     (0x2A700, 0x2B734))
-        elif unicodedata.unidata_version >= "4.1":
-            cases = ((0x3400, 0x4DB5),
-                     (0x4E00, 0x9FBB),
-                     (0x20000, 0x2A6D6))
-        for first, last in cases:
-            # Test at and inside the boundary
-            for i in (first, first + 1, last - 1, last):
-                charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
-                char = ('\\U%08X' % i).decode('unicode-escape')
-                assert unicodedata.name(char) == charname
-                assert unicodedata.lookup(charname) == char
-            # Test outside the boundary
-            for i in first - 1, last + 1:
-                charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
-                char = ('\\U%08X' % i).decode('unicode-escape')
-                try:
-                    unicodedata.name(char)
-                except ValueError, e:
-                    assert e.message == 'no such name'
-                py.test.raises(KeyError, unicodedata.lookup, charname)
-
-    def test_bug_1704793(self):  # from CPython
-        import unicodedata
-        assert unicodedata.lookup("GOTHIC LETTER FAIHU") == u'\U00010346'
-
-    def test_normalize(self):
-        import unicodedata
-        py.test.raises(TypeError, unicodedata.normalize, 'x')
-
-    def test_normalize_wide(self):
-        import sys, unicodedata
-        if sys.maxunicode < 0x10ffff:
-            skip("requires a 'wide' python build.")
-        assert unicodedata.normalize('NFC', u'\U000110a5\U000110ba') == u'\U000110ab'
-
-    def test_linebreaks(self):
-        linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
-                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
-        for i in linebreaks:
-            for j in range(-2, 3):
-                lines = (unichr(i + j) + u'A').splitlines()
-                if i + j in linebreaks:
-                    assert len(lines) == 2
-                else:
-                    assert len(lines) == 1
-
-    def test_mirrored(self):
-        import unicodedata
-        # For no reason, unicodedata.mirrored() returns an int, not a bool
-        assert repr(unicodedata.mirrored(u' ')) == '0'
-
 class TestUnicodeData(object):
     def setup_class(cls):
         import random, unicodedata
diff --git a/rpython/rlib/unicodedata/ucd.py b/rpython/rlib/unicodedata/ucd.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/unicodedata/ucd.py
@@ -0,0 +1,87 @@
+
+import sys
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.objectmodel import we_are_translated
+
+
+if rffi.sizeof(lltype.UniChar) == 4:
+    MAXUNICODE = 0x10ffff
+else:
+    MAXUNICODE = 0xffff
+
+BYTEORDER = sys.byteorder
+
+if MAXUNICODE > sys.maxunicode:
+    # A version of unichr which allows codes outside the BMP
+    # even on narrow unicode builds.
+    # It will be used when interpreting code on top of a UCS2 CPython,
+    # when sizeof(wchar_t) == 4.
+    # Note that Python3 uses a similar implementation.
+    def UNICHR(c):
+        assert not we_are_translated()
+        if c <= sys.maxunicode or c > MAXUNICODE:
+            return unichr(c)
+        else:
+            c -= 0x10000
+            return (unichr(0xD800 + (c >> 10)) +
+                    unichr(0xDC00 + (c & 0x03FF)))
+    UNICHR._flowspace_rewrite_directly_as_ = unichr
+    # ^^^ NB.: for translation, it's essential to use this hack instead
+    # of calling unichr() from UNICHR(), because unichr() detects if there
+    # is a "try:except ValueError" immediately around it.
+
+    def ORD(u):
+        assert not we_are_translated()
+        if isinstance(u, unicode) and len(u) == 2:
+            ch1 = ord(u[0])
+            ch2 = ord(u[1])
+            if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+                return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+        return ord(u)
+    ORD._flowspace_rewrite_directly_as_ = ord
+
+else:
+    UNICHR = unichr
+    ORD = ord
+
+if MAXUNICODE > 0xFFFF:
+    def code_to_unichr(code):
+        if not we_are_translated() and sys.maxunicode == 0xFFFF:
+            # Host CPython is narrow build, generate surrogates
+            return UNICHR(code)
+        else:
+            return unichr(code)
+else:
+    def code_to_unichr(code):
+        # generate surrogates for large codes
+        return UNICHR(code)
+
+
+def UNICHR(c):
+    if c <= sys.maxunicode and c <= MAXUNICODE:
+        return unichr(c)
+    else:
+        c -= 0x10000
+        return (unichr(0xD800 + (c >> 10)) +
+                unichr(0xDC00 + (c & 0x03FF)))
+
+def ORD(u):
+    assert isinstance(u, unicode)
+    if len(u) == 1:
+        return ord(u[0])
+    elif len(u) == 2:
+        ch1 = ord(u[0])
+        ch2 = ord(u[1])
+        if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+            return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+    raise ValueError
+
+def _STORECHAR(result, CH, byteorder):
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
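
A side note for readers of this diff: the UNICHR/ORD helpers consolidated into rpython/rlib/unicodedata/ucd.py implement the standard UTF-16 surrogate-pair encoding for code points above the Basic Multilingual Plane. Below is a minimal, self-contained sketch of that arithmetic; it is not code from the changeset, and the helper names are invented for illustration:

    # Illustrative only -- these helpers do not exist in the PyPy tree.

    def to_surrogate_pair(code):
        # Split a supplementary-plane code point into a UTF-16 high/low
        # surrogate pair, the same arithmetic UNICHR uses above.
        assert 0x10000 <= code <= 0x10FFFF
        code -= 0x10000
        return 0xD800 + (code >> 10), 0xDC00 + (code & 0x03FF)

    def from_surrogate_pair(ch1, ch2):
        # Recombine a high/low surrogate pair into the original code point,
        # the same arithmetic ORD uses above.
        assert 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF
        return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000

    hi, lo = to_surrogate_pair(0x10346)   # GOTHIC LETTER FAIHU, as in the tests
    assert (hi, lo) == (0xD800, 0xDF46)
    assert from_surrogate_pair(hi, lo) == 0x10346

On a narrow (UCS-2) host, UNICHR returns those two code units as a length-2 unicode string and ORD reverses the mapping; on a wide build, plain unichr()/ord() suffice, which is what the build checks in ucd.py select between.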

_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
http://mail.python.org/mailman/listinfo/pypy-commit