Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: py3k Changeset: r60256:4b63836b7e97 Date: 2013-01-20 20:27 +0100 http://bitbucket.org/pypy/pypy/changeset/4b63836b7e97/
Log: hg merge default diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -9,6 +9,7 @@ from rpython.rlib.objectmodel import we_are_translated from rpython.rlib.runicode import MAXUNICODE from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0 +from rpython.rlib.runicode import code_to_unichr, ORD import sys @@ -30,25 +31,6 @@ # The functions below are subtly different from the ones in runicode.py. # When PyPy implements Python 3 they should be merged. -def UNICHR(c): - if c <= sys.maxunicode and c <= MAXUNICODE: - return unichr(c) - else: - c -= 0x10000 - return (unichr(0xD800 + (c >> 10)) + - unichr(0xDC00 + (c & 0x03FF))) - -def ORD(u): - assert isinstance(u, unicode) - if len(u) == 1: - return ord(u[0]) - elif len(u) == 2: - ch1 = ord(u[0]) - ch2 = ord(u[1]) - if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: - return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000 - raise ValueError - if MAXUNICODE > 0xFFFF: # Target is wide build def unichr_to_code_w(space, w_unichr): @@ -69,12 +51,6 @@ 'need a single Unicode character as parameter')) return space.int_w(space.ord(w_unichr)) - def code_to_unichr(code): - if not we_are_translated() and sys.maxunicode == 0xFFFF: - # Host CPython is narrow build, generate surrogates - return UNICHR(code) - else: - return unichr(code) else: # Target is narrow build def unichr_to_code_w(space, w_unichr): @@ -97,10 +73,6 @@ raise OperationError(space.w_TypeError, space.wrap( 'need a single Unicode character as parameter')) - def code_to_unichr(code): - # generate surrogates for large codes - return UNICHR(code) - class UCD(Wrappable): def __init__(self, unicodedb): diff --git a/pypy/module/unicodedata/test_unicodedata.py b/pypy/module/unicodedata/test_unicodedata.py new file mode 100644 --- /dev/null +++ b/pypy/module/unicodedata/test_unicodedata.py @@ -0,0 +1,103 @@ + +class AppTestUnicodeData: + spaceconfig = dict(usemodules=('unicodedata',)) + + def test_hangul_syllables(self): + import unicodedata + # Test all leading, vowel and trailing jamo + # but not every combination of them. + for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'), + (0xAE69, 'HANGUL SYLLABLE GGAEG'), + (0xB0D2, 'HANGUL SYLLABLE NYAGG'), + (0xB33B, 'HANGUL SYLLABLE DYAEGS'), + (0xB5A4, 'HANGUL SYLLABLE DDEON'), + (0xB80D, 'HANGUL SYLLABLE RENJ'), + (0xBA76, 'HANGUL SYLLABLE MYEONH'), + (0xBCDF, 'HANGUL SYLLABLE BYED'), + (0xBF48, 'HANGUL SYLLABLE BBOL'), + (0xC1B1, 'HANGUL SYLLABLE SWALG'), + (0xC41A, 'HANGUL SYLLABLE SSWAELM'), + (0xC683, 'HANGUL SYLLABLE OELB'), + (0xC8EC, 'HANGUL SYLLABLE JYOLS'), + (0xCB55, 'HANGUL SYLLABLE JJULT'), + (0xCDBE, 'HANGUL SYLLABLE CWEOLP'), + (0xD027, 'HANGUL SYLLABLE KWELH'), + (0xD290, 'HANGUL SYLLABLE TWIM'), + (0xD4F9, 'HANGUL SYLLABLE PYUB'), + (0xD762, 'HANGUL SYLLABLE HEUBS'), + (0xAE27, 'HANGUL SYLLABLE GYIS'), + (0xB090, 'HANGUL SYLLABLE GGISS'), + (0xB0AD, 'HANGUL SYLLABLE NANG'), + (0xB316, 'HANGUL SYLLABLE DAEJ'), + (0xB57F, 'HANGUL SYLLABLE DDYAC'), + (0xB7E8, 'HANGUL SYLLABLE RYAEK'), + (0xBA51, 'HANGUL SYLLABLE MEOT'), + (0xBCBA, 'HANGUL SYLLABLE BEP'), + (0xBF23, 'HANGUL SYLLABLE BBYEOH'), + (0xD7A3, 'HANGUL SYLLABLE HIH')): + assert unicodedata.name(chr(code)) == name + assert unicodedata.lookup(name) == chr(code) + # Test outside the range + raises(ValueError, unicodedata.name, chr(0xAC00 - 1)) + raises(ValueError, unicodedata.name, chr(0xD7A3 + 1)) + + def test_cjk(self): + import sys + import unicodedata + cases = ((0x3400, 0x4DB5), + (0x4E00, 0x9FA5)) + if unicodedata.unidata_version >= "5": # don't know the exact limit + cases = ((0x3400, 0x4DB5), + (0x4E00, 0x9FCB), + (0x20000, 0x2A6D6), + (0x2A700, 0x2B734)) + elif unicodedata.unidata_version >= "4.1": + cases = ((0x3400, 0x4DB5), + (0x4E00, 0x9FBB), + (0x20000, 0x2A6D6)) + for first, last in cases: + # Test at and inside the boundary + for i in (first, first + 1, last - 1, last): + charname = 'CJK UNIFIED IDEOGRAPH-%X'%i + char = chr(i) + assert unicodedata.name(char) == charname + assert unicodedata.lookup(charname) == char + # Test outside the boundary + for i in first - 1, last + 1: + charname = 'CJK UNIFIED IDEOGRAPH-%X'%i + char = chr(i) + try: + unicodedata.name(char) + except ValueError as e: + assert e.message == 'no such name' + raises(KeyError, unicodedata.lookup, charname) + + def test_bug_1704793(self): # from CPython + import unicodedata + assert unicodedata.lookup("GOTHIC LETTER FAIHU") == '\U00010346' + + def test_normalize(self): + import unicodedata + raises(TypeError, unicodedata.normalize, 'x') + + def test_normalize_wide(self): + import sys, unicodedata + if sys.maxunicode < 0x10ffff: + skip("requires a 'wide' python build.") + assert unicodedata.normalize('NFC', '\U000110a5\U000110ba') == '\U000110ab' + + def test_linebreaks(self): + linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85, + 0x1c, 0x1d, 0x1e, 0x2028, 0x2029) + for i in linebreaks: + for j in range(-2, 3): + lines = (chr(i + j) + 'A').splitlines() + if i + j in linebreaks: + assert len(lines) == 2 + else: + assert len(lines) == 1 + + def test_mirrored(self): + import unicodedata + # For no reason, unicodedata.mirrored() returns an int, not a bool + assert repr(unicodedata.mirrored(' ')) == '0' diff --git a/rpython/bin/rpython b/rpython/bin/rpython --- a/rpython/bin/rpython +++ b/rpython/bin/rpython @@ -7,7 +7,8 @@ run with --help for more information """ -import sys +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) from rpython.translator.goal.translate import main # no implicit targets diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -1,15 +1,16 @@ import sys -from rpython.rlib.bitmanipulation import splitter -from rpython.rtyper.lltypesystem import lltype, rffi -from rpython.rlib.objectmodel import we_are_translated, specialize, enforceargs +from rpython.rlib.objectmodel import specialize, we_are_translated from rpython.rlib.rstring import StringBuilder, UnicodeBuilder from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.unicodedata import unicodedb +from rpython.rtyper.lltypesystem import lltype, rffi + if rffi.sizeof(lltype.UniChar) == 4: MAXUNICODE = 0x10ffff else: MAXUNICODE = 0xffff + BYTEORDER = sys.byteorder if MAXUNICODE > sys.maxunicode: @@ -45,6 +46,27 @@ UNICHR = unichr ORD = ord +if MAXUNICODE > 0xFFFF: + def code_to_unichr(code): + if not we_are_translated() and sys.maxunicode == 0xFFFF: + # Host CPython is narrow build, generate surrogates + return UNICHR(code) + else: + return unichr(code) +else: + def code_to_unichr(code): + # generate surrogates for large codes + return UNICHR(code) + +def _STORECHAR(result, CH, byteorder): + hi = chr(((CH) >> 8) & 0xff) + lo = chr((CH) & 0xff) + if byteorder == 'little': + result.append(lo) + result.append(hi) + else: + result.append(hi) + result.append(lo) def default_unicode_error_decode(errors, encoding, msg, s, startingpos, endingpos): @@ -446,16 +468,6 @@ result.append(r) return result.build(), pos, bo -def _STORECHAR(result, CH, byteorder): - hi = chr(((CH) >> 8) & 0xff) - lo = chr((CH) & 0xff) - if byteorder == 'little': - result.append(lo) - result.append(hi) - else: - result.append(hi) - result.append(lo) - def unicode_encode_utf_16_helper(s, size, errors, errorhandler=None, byteorder='little'): diff --git a/pypy/module/unicodedata/test_interp_ucd.py b/rpython/rlib/unicodedata/test/test_ucd.py rename from pypy/module/unicodedata/test_interp_ucd.py rename to rpython/rlib/unicodedata/test/test_ucd.py --- a/pypy/module/unicodedata/test_interp_ucd.py +++ b/rpython/rlib/unicodedata/test/test_ucd.py @@ -1,6 +1,6 @@ from rpython.rtyper.test.tool import BaseRtypingTest, LLRtypeMixin from rpython.rlib.unicodedata import unicodedb_5_2_0 -from pypy.module.unicodedata.interp_ucd import code_to_unichr +from rpython.rlib.unicodedata.ucd import code_to_unichr class TestTranslated(BaseRtypingTest, LLRtypeMixin): diff --git a/rpython/rlib/unicodedata/test/test_unicodedata.py b/rpython/rlib/unicodedata/test/test_unicodedata.py --- a/rpython/rlib/unicodedata/test/test_unicodedata.py +++ b/rpython/rlib/unicodedata/test/test_unicodedata.py @@ -1,109 +1,6 @@ import py from rpython.rlib.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0 -class AppTestUnicodeData: - spaceconfig = dict(usemodules=('unicodedata',)) - - def test_hangul_syllables(self): - import unicodedata - # Test all leading, vowel and trailing jamo - # but not every combination of them. - for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'), - (0xAE69, 'HANGUL SYLLABLE GGAEG'), - (0xB0D2, 'HANGUL SYLLABLE NYAGG'), - (0xB33B, 'HANGUL SYLLABLE DYAEGS'), - (0xB5A4, 'HANGUL SYLLABLE DDEON'), - (0xB80D, 'HANGUL SYLLABLE RENJ'), - (0xBA76, 'HANGUL SYLLABLE MYEONH'), - (0xBCDF, 'HANGUL SYLLABLE BYED'), - (0xBF48, 'HANGUL SYLLABLE BBOL'), - (0xC1B1, 'HANGUL SYLLABLE SWALG'), - (0xC41A, 'HANGUL SYLLABLE SSWAELM'), - (0xC683, 'HANGUL SYLLABLE OELB'), - (0xC8EC, 'HANGUL SYLLABLE JYOLS'), - (0xCB55, 'HANGUL SYLLABLE JJULT'), - (0xCDBE, 'HANGUL SYLLABLE CWEOLP'), - (0xD027, 'HANGUL SYLLABLE KWELH'), - (0xD290, 'HANGUL SYLLABLE TWIM'), - (0xD4F9, 'HANGUL SYLLABLE PYUB'), - (0xD762, 'HANGUL SYLLABLE HEUBS'), - (0xAE27, 'HANGUL SYLLABLE GYIS'), - (0xB090, 'HANGUL SYLLABLE GGISS'), - (0xB0AD, 'HANGUL SYLLABLE NANG'), - (0xB316, 'HANGUL SYLLABLE DAEJ'), - (0xB57F, 'HANGUL SYLLABLE DDYAC'), - (0xB7E8, 'HANGUL SYLLABLE RYAEK'), - (0xBA51, 'HANGUL SYLLABLE MEOT'), - (0xBCBA, 'HANGUL SYLLABLE BEP'), - (0xBF23, 'HANGUL SYLLABLE BBYEOH'), - (0xD7A3, 'HANGUL SYLLABLE HIH')): - assert unicodedata.name(chr(code)) == name - assert unicodedata.lookup(name) == chr(code) - # Test outside the range - py.test.raises(ValueError, unicodedata.name, chr(0xAC00 - 1)) - py.test.raises(ValueError, unicodedata.name, chr(0xD7A3 + 1)) - - def test_cjk(self): - import sys - import unicodedata - cases = ((0x3400, 0x4DB5), - (0x4E00, 0x9FA5)) - if unicodedata.unidata_version >= "5": # don't know the exact limit - cases = ((0x3400, 0x4DB5), - (0x4E00, 0x9FCB), - (0x20000, 0x2A6D6), - (0x2A700, 0x2B734)) - elif unicodedata.unidata_version >= "4.1": - cases = ((0x3400, 0x4DB5), - (0x4E00, 0x9FBB), - (0x20000, 0x2A6D6)) - for first, last in cases: - # Test at and inside the boundary - for i in (first, first + 1, last - 1, last): - charname = 'CJK UNIFIED IDEOGRAPH-%X'%i - char = chr(i) - assert unicodedata.name(char) == charname - assert unicodedata.lookup(charname) == char - # Test outside the boundary - for i in first - 1, last + 1: - charname = 'CJK UNIFIED IDEOGRAPH-%X'%i - char = chr(i) - try: - unicodedata.name(char) - except ValueError as e: - assert e.message == 'no such name' - py.test.raises(KeyError, unicodedata.lookup, charname) - - def test_bug_1704793(self): # from CPython - import unicodedata - assert unicodedata.lookup("GOTHIC LETTER FAIHU") == '\U00010346' - - def test_normalize(self): - import unicodedata - py.test.raises(TypeError, unicodedata.normalize, 'x') - - def test_normalize_wide(self): - import sys, unicodedata - if sys.maxunicode < 0x10ffff: - skip("requires a 'wide' python build.") - assert unicodedata.normalize('NFC', '\U000110a5\U000110ba') == '\U000110ab' - - def test_linebreaks(self): - linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85, - 0x1c, 0x1d, 0x1e, 0x2028, 0x2029) - for i in linebreaks: - for j in range(-2, 3): - lines = (chr(i + j) + 'A').splitlines() - if i + j in linebreaks: - assert len(lines) == 2 - else: - assert len(lines) == 1 - - def test_mirrored(self): - import unicodedata - # For no reason, unicodedata.mirrored() returns an int, not a bool - assert repr(unicodedata.mirrored(' ')) == '0' - class TestUnicodeData(object): def setup_class(cls): import random, unicodedata _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit