Author: Maciej Fijalkowski <[email protected]>
Branch:
Changeset: r60527:53723bf32fd5
Date: 2013-01-27 20:02 +0200
http://bitbucket.org/pypy/pypy/changeset/53723bf32fd5/
Log: Help with unicode issues
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -9,7 +9,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.runicode import MAXUNICODE
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ORD
+from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
import sys
@@ -28,8 +28,6 @@
# handling: on narrow unicode builds, a surrogate pair is considered as one
# unicode code point.
-# The functions below are subtly different from the ones in runicode.py.
-# When PyPy implements Python 3 they should be merged.
if MAXUNICODE > 0xFFFF:
# Target is wide build
@@ -41,7 +39,7 @@
if not we_are_translated() and sys.maxunicode == 0xFFFF:
# Host CPython is narrow build, accept surrogates
try:
- return ORD(space.unicode_w(w_unichr))
+ return ord_accepts_surrogate(space.unicode_w(w_unichr))
except ValueError:
raise OperationError(space.w_TypeError, space.wrap(
'need a single Unicode character as parameter'))
@@ -68,7 +66,7 @@
else:
# Accept surrogates
try:
- return ORD(space.unicode_w(w_unichr))
+ return ord_accepts_surrogate(space.unicode_w(w_unichr))
except ValueError:
raise OperationError(space.w_TypeError, space.wrap(
'need a single Unicode character as parameter'))
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -13,6 +13,26 @@
BYTEORDER = sys.byteorder
+# Python 2.7 has a preview of py3k behavior, so these functions are
+# used either when testing a wide-unicode PyPy on a narrow-unicode
+# CPython, or by the unicodedata module in PyPy.
+
+def unichr_returns_surrogate(c):
+ if c <= sys.maxunicode or c > MAXUNICODE:
+ return unichr(c)
+ else:
+ c -= 0x10000
+ return (unichr(0xD800 + (c >> 10)) +
+ unichr(0xDC00 + (c & 0x03FF)))
+
+def ord_accepts_surrogate(u):
+ if isinstance(u, unicode) and len(u) == 2:
+ ch1 = ord(u[0])
+ ch2 = ord(u[1])
+ if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
+ return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
+ return ord(u)
+
if MAXUNICODE > sys.maxunicode:
# A version of unichr which allows codes outside the BMP
# even on narrow unicode builds.
@@ -21,12 +41,7 @@
# Note that Python3 uses a similar implementation.
def UNICHR(c):
assert not we_are_translated()
- if c <= sys.maxunicode or c > MAXUNICODE:
- return unichr(c)
- else:
- c -= 0x10000
- return (unichr(0xD800 + (c >> 10)) +
- unichr(0xDC00 + (c & 0x03FF)))
+ return unichr_returns_surrogate(c)
UNICHR._flowspace_rewrite_directly_as_ = unichr
# ^^^ NB.: for translation, it's essential to use this hack instead
# of calling unichr() from UNICHR(), because unichr() detects if there
@@ -34,12 +49,7 @@
def ORD(u):
assert not we_are_translated()
- if isinstance(u, unicode) and len(u) == 2:
- ch1 = ord(u[0])
- ch2 = ord(u[1])
- if 0xD800 <= ch1 <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
- return (((ch1 - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000
- return ord(u)
+ return ord_accepts_surrogate(u)
ORD._flowspace_rewrite_directly_as_ = ord
else:
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit