Author: Maciej Fijalkowski <[email protected]>
Branch:
Changeset: r60235:8b3fd7dd838c
Date: 2013-01-20 16:13 +0200
http://bitbucket.org/pypy/pypy/changeset/8b3fd7dd838c/
Log: shuffle stuff around
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -9,6 +9,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.runicode import MAXUNICODE
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
+from rpython.rlib.unicodedata.ucd import code_to_unichr, ORD
import sys
@@ -30,25 +31,6 @@
# The functions below are subtly different from the ones in runicode.py.
# When PyPy implements Python 3 they should be merged.
-def UNICHR(c):
- if c <= sys.maxunicode and c <= MAXUNICODE:
- return unichr(c)
- else:
- c -= 0x10000
- return (unichr(0xD800 + (c >> 10)) +
- unichr(0xDC00 + (c & 0x03FF)))
-
-def ORD(u):
- assert isinstance(u, unicode)
- if len(u) == 1:
- return ord(u[0])
- elif len(u) == 2:
- ch1 = ord(u[0])
- ch2 = ord(u[1])
- if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
- return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
- raise ValueError
-
if MAXUNICODE > 0xFFFF:
# Target is wide build
def unichr_to_code_w(space, w_unichr):
@@ -69,12 +51,6 @@
'need a single Unicode character as parameter'))
return space.int_w(space.ord(w_unichr))
- def code_to_unichr(code):
- if not we_are_translated() and sys.maxunicode == 0xFFFF:
- # Host CPython is narrow build, generate surrogates
- return UNICHR(code)
- else:
- return unichr(code)
else:
# Target is narrow build
def unichr_to_code_w(space, w_unichr):
@@ -97,10 +73,6 @@
raise OperationError(space.w_TypeError, space.wrap(
'need a single Unicode character as parameter'))
- def code_to_unichr(code):
- # generate surrogates for large codes
- return UNICHR(code)
-
class UCD(Wrappable):
def __init__(self, unicodedb):
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1,49 +1,9 @@
import sys
-from rpython.rlib.bitmanipulation import splitter
-from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.objectmodel import we_are_translated, specialize, enforceargs
+from rpython.rlib.objectmodel import specialize, enforceargs
from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.unicodedata import unicodedb
-
-if rffi.sizeof(lltype.UniChar) == 4:
- MAXUNICODE = 0x10ffff
-else:
- MAXUNICODE = 0xffff
-BYTEORDER = sys.byteorder
-
-if MAXUNICODE > sys.maxunicode:
- # A version of unichr which allows codes outside the BMP
- # even on narrow unicode builds.
- # It will be used when interpreting code on top of a UCS2 CPython,
- # when sizeof(wchar_t) == 4.
- # Note that Python3 uses a similar implementation.
- def UNICHR(c):
- assert not we_are_translated()
- if c <= sys.maxunicode or c > MAXUNICODE:
- return unichr(c)
- else:
- c -= 0x10000
- return (unichr(0xD800 + (c >> 10)) +
- unichr(0xDC00 + (c & 0x03FF)))
- UNICHR._flowspace_rewrite_directly_as_ = unichr
- # ^^^ NB.: for translation, it's essential to use this hack instead
- # of calling unichr() from UNICHR(), because unichr() detects if there
- # is a "try:except ValueError" immediately around it.
-
- def ORD(u):
- assert not we_are_translated()
- if isinstance(u, unicode) and len(u) == 2:
- ch1 = ord(u[0])
- ch2 = ord(u[1])
- if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
- return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
- return ord(u)
- ORD._flowspace_rewrite_directly_as_ = ord
-
-else:
- UNICHR = unichr
- ORD = ord
+from rpython.rlib.unicodedata.ucd import MAXUNICODE, UNICHR, BYTEORDER
def default_unicode_error_decode(errors, encoding, msg, s,
@@ -446,16 +406,6 @@
result.append(r)
return result.build(), pos, bo
-def _STORECHAR(result, CH, byteorder):
- hi = chr(((CH) >> 8) & 0xff)
- lo = chr((CH) & 0xff)
- if byteorder == 'little':
- result.append(lo)
- result.append(hi)
- else:
- result.append(hi)
- result.append(lo)
-
def unicode_encode_utf_16_helper(s, size, errors,
errorhandler=None,
byteorder='little'):
diff --git a/pypy/module/unicodedata/test_interp_ucd.py b/rpython/rlib/unicodedata/test/test_ucd.py
rename from pypy/module/unicodedata/test_interp_ucd.py
rename to rpython/rlib/unicodedata/test/test_ucd.py
--- a/pypy/module/unicodedata/test_interp_ucd.py
+++ b/rpython/rlib/unicodedata/test/test_ucd.py
@@ -1,6 +1,6 @@
from rpython.rtyper.test.tool import BaseRtypingTest, LLRtypeMixin
from rpython.rlib.unicodedata import unicodedb_5_2_0
-from pypy.module.unicodedata.interp_ucd import code_to_unichr
+from rpython.rlib.unicodedata.ucd import code_to_unichr
class TestTranslated(BaseRtypingTest, LLRtypeMixin):
diff --git a/rpython/rlib/unicodedata/test/test_unicodedata.py b/rpython/rlib/unicodedata/test/test_unicodedata.py
--- a/rpython/rlib/unicodedata/test/test_unicodedata.py
+++ b/rpython/rlib/unicodedata/test/test_unicodedata.py
@@ -1,109 +1,6 @@
import py
from rpython.rlib.unicodedata import unicodedb_3_2_0, unicodedb_5_2_0
-class AppTestUnicodeData:
- spaceconfig = dict(usemodules=('unicodedata',))
-
- def test_hangul_syllables(self):
- import unicodedata
- # Test all leading, vowel and trailing jamo
- # but not every combination of them.
- for code, name in ((0xAC00, 'HANGUL SYLLABLE GA'),
- (0xAE69, 'HANGUL SYLLABLE GGAEG'),
- (0xB0D2, 'HANGUL SYLLABLE NYAGG'),
- (0xB33B, 'HANGUL SYLLABLE DYAEGS'),
- (0xB5A4, 'HANGUL SYLLABLE DDEON'),
- (0xB80D, 'HANGUL SYLLABLE RENJ'),
- (0xBA76, 'HANGUL SYLLABLE MYEONH'),
- (0xBCDF, 'HANGUL SYLLABLE BYED'),
- (0xBF48, 'HANGUL SYLLABLE BBOL'),
- (0xC1B1, 'HANGUL SYLLABLE SWALG'),
- (0xC41A, 'HANGUL SYLLABLE SSWAELM'),
- (0xC683, 'HANGUL SYLLABLE OELB'),
- (0xC8EC, 'HANGUL SYLLABLE JYOLS'),
- (0xCB55, 'HANGUL SYLLABLE JJULT'),
- (0xCDBE, 'HANGUL SYLLABLE CWEOLP'),
- (0xD027, 'HANGUL SYLLABLE KWELH'),
- (0xD290, 'HANGUL SYLLABLE TWIM'),
- (0xD4F9, 'HANGUL SYLLABLE PYUB'),
- (0xD762, 'HANGUL SYLLABLE HEUBS'),
- (0xAE27, 'HANGUL SYLLABLE GYIS'),
- (0xB090, 'HANGUL SYLLABLE GGISS'),
- (0xB0AD, 'HANGUL SYLLABLE NANG'),
- (0xB316, 'HANGUL SYLLABLE DAEJ'),
- (0xB57F, 'HANGUL SYLLABLE DDYAC'),
- (0xB7E8, 'HANGUL SYLLABLE RYAEK'),
- (0xBA51, 'HANGUL SYLLABLE MEOT'),
- (0xBCBA, 'HANGUL SYLLABLE BEP'),
- (0xBF23, 'HANGUL SYLLABLE BBYEOH'),
- (0xD7A3, 'HANGUL SYLLABLE HIH')):
- assert unicodedata.name(unichr(code)) == name
- assert unicodedata.lookup(name) == unichr(code)
- # Test outside the range
- py.test.raises(ValueError, unicodedata.name, unichr(0xAC00 - 1))
- py.test.raises(ValueError, unicodedata.name, unichr(0xD7A3 + 1))
-
- def test_cjk(self):
- import sys
- import unicodedata
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FA5))
- if unicodedata.unidata_version >= "5": # don't know the exact limit
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FCB),
- (0x20000, 0x2A6D6),
- (0x2A700, 0x2B734))
- elif unicodedata.unidata_version >= "4.1":
- cases = ((0x3400, 0x4DB5),
- (0x4E00, 0x9FBB),
- (0x20000, 0x2A6D6))
- for first, last in cases:
- # Test at and inside the boundary
- for i in (first, first + 1, last - 1, last):
- charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
- char = ('\\U%08X' % i).decode('unicode-escape')
- assert unicodedata.name(char) == charname
- assert unicodedata.lookup(charname) == char
- # Test outside the boundary
- for i in first - 1, last + 1:
- charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
- char = ('\\U%08X' % i).decode('unicode-escape')
- try:
- unicodedata.name(char)
- except ValueError, e:
- assert e.message == 'no such name'
- py.test.raises(KeyError, unicodedata.lookup, charname)
-
- def test_bug_1704793(self): # from CPython
- import unicodedata
- assert unicodedata.lookup("GOTHIC LETTER FAIHU") == u'\U00010346'
-
- def test_normalize(self):
- import unicodedata
- py.test.raises(TypeError, unicodedata.normalize, 'x')
-
- def test_normalize_wide(self):
- import sys, unicodedata
- if sys.maxunicode < 0x10ffff:
- skip("requires a 'wide' python build.")
- assert unicodedata.normalize('NFC', u'\U000110a5\U000110ba') == u'\U000110ab'
-
- def test_linebreaks(self):
- linebreaks = (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
- 0x1c, 0x1d, 0x1e, 0x2028, 0x2029)
- for i in linebreaks:
- for j in range(-2, 3):
- lines = (unichr(i + j) + u'A').splitlines()
- if i + j in linebreaks:
- assert len(lines) == 2
- else:
- assert len(lines) == 1
-
- def test_mirrored(self):
- import unicodedata
- # For no reason, unicodedata.mirrored() returns an int, not a bool
- assert repr(unicodedata.mirrored(u' ')) == '0'
-
class TestUnicodeData(object):
def setup_class(cls):
import random, unicodedata
diff --git a/rpython/rlib/unicodedata/ucd.py b/rpython/rlib/unicodedata/ucd.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/unicodedata/ucd.py
@@ -0,0 +1,87 @@
+
+import sys
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.objectmodel import we_are_translated
+
+
+if rffi.sizeof(lltype.UniChar) == 4:
+ MAXUNICODE = 0x10ffff
+else:
+ MAXUNICODE = 0xffff
+
+BYTEORDER = sys.byteorder
+
+if MAXUNICODE > sys.maxunicode:
+ # A version of unichr which allows codes outside the BMP
+ # even on narrow unicode builds.
+ # It will be used when interpreting code on top of a UCS2 CPython,
+ # when sizeof(wchar_t) == 4.
+ # Note that Python3 uses a similar implementation.
+ def UNICHR(c):
+ assert not we_are_translated()
+ if c <= sys.maxunicode or c > MAXUNICODE:
+ return unichr(c)
+ else:
+ c -= 0x10000
+ return (unichr(0xD800 + (c >> 10)) +
+ unichr(0xDC00 + (c & 0x03FF)))
+ UNICHR._flowspace_rewrite_directly_as_ = unichr
+ # ^^^ NB.: for translation, it's essential to use this hack instead
+ # of calling unichr() from UNICHR(), because unichr() detects if there
+ # is a "try:except ValueError" immediately around it.
+
+ def ORD(u):
+ assert not we_are_translated()
+ if isinstance(u, unicode) and len(u) == 2:
+ ch1 = ord(u[0])
+ ch2 = ord(u[1])
+ if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+ return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+ return ord(u)
+ ORD._flowspace_rewrite_directly_as_ = ord
+
+else:
+ UNICHR = unichr
+ ORD = ord
+
+if MAXUNICODE > 0xFFFF:
+ def code_to_unichr(code):
+ if not we_are_translated() and sys.maxunicode == 0xFFFF:
+ # Host CPython is narrow build, generate surrogates
+ return UNICHR(code)
+ else:
+ return unichr(code)
+else:
+ def code_to_unichr(code):
+ # generate surrogates for large codes
+ return UNICHR(code)
+
+
+def UNICHR(c):
+ if c <= sys.maxunicode and c <= MAXUNICODE:
+ return unichr(c)
+ else:
+ c -= 0x10000
+ return (unichr(0xD800 + (c >> 10)) +
+ unichr(0xDC00 + (c & 0x03FF)))
+
+def ORD(u):
+ assert isinstance(u, unicode)
+ if len(u) == 1:
+ return ord(u[0])
+ elif len(u) == 2:
+ ch1 = ord(u[0])
+ ch2 = ord(u[1])
+ if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+ return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+ raise ValueError
+
+def _STORECHAR(result, CH, byteorder):
+ hi = chr(((CH) >> 8) & 0xff)
+ lo = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(lo)
+ result.append(hi)
+ else:
+ result.append(hi)
+ result.append(lo)
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit