Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72077:69df5d97a930
Date: 2014-06-16 10:13 -0500
http://bitbucket.org/pypy/pypy/changeset/69df5d97a930/
Log: WIP
diff --git a/pypy/interpreter/test/test_utf8.py
b/pypy/interpreter/test/test_utf8.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/test/test_utf8.py
@@ -0,0 +1,65 @@
+from pypy.interpreter.utf8 import (
+ Utf8Str, Utf8Builder, utf8chr, utf8ord)
+
+def build_utf8str():
+ builder = Utf8Builder()
+ builder.append('A') #0x41
+ builder.append(0x10F) #0xC4 0x8F
+ builder.append(0x20AC) #0xE2 0x82 0xAC
+ builder.append(0x1F63D) #0xF0 0x9F 0x98 0xBD
+ return builder.build()
+
+def test_builder():
+ s = build_utf8str()
+ assert not s._is_ascii
+
+ assert list(s.bytes) == [chr(i) for i in [
+ 0x41,
+ 0xC4, 0x8F,
+ 0xE2, 0x82, 0xAC,
+ 0xF0, 0x9F, 0x98, 0xBD,
+ ]]
+
+def test_unicode_literal_comparison():
+ builder = Utf8Builder()
+ builder.append(0x10F)
+ s = builder.build()
+ assert s == u'\u010F'
+ assert s[0] == u'\u010F'
+ assert s[0] == utf8chr(0x10F)
+
+def test_utf8chr():
+ assert utf8chr(65) == u'A'
+ assert utf8chr(0x7FF) == u'\u07FF'
+ assert utf8chr(0x17FF) == u'\u17FF'
+ assert utf8chr(0x10001) == u'\U00010001'
+
+def test_utf8ord():
+ s = build_utf8str()
+ assert utf8ord(s) == 65
+ assert utf8ord(s, 1) == 0x10F
+ assert utf8ord(s, 2) == 0x20AC
+ assert utf8ord(s, 3) == 0x1F63D
+
+def test_len():
+ s = build_utf8str()
+ assert len(s) == 4
+
+def test_getitem():
+ s = build_utf8str()
+
+ assert s[0] == utf8chr(65)
+ assert s[1] == utf8chr(0x10F)
+ assert s[2] == utf8chr(0x20AC)
+ assert s[3] == utf8chr(0x1F63D)
+ assert s[-1] == utf8chr(0x1F63D)
+ assert s[-2] == utf8chr(0x20AC)
+
+def test_getslice():
+ s = build_utf8str()
+
+ assert s[0:1] == u'A'
+ assert s[0:2] == u'A\u010F'
+ assert s[1:2] == u'\u010F'
+ assert s[-4:-3] == u'A'
+ assert s[-4:-2] == u'A\u010F'
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,6 +1,5 @@
from pypy.interpreter.error import OperationError
from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode
from pypy.module._codecs import interp_codecs
@specialize.memo()
@@ -35,29 +34,30 @@
# These functions take and return unwrapped rpython strings and unicodes
def decode_unicode_escape(space, string):
+ from pypy.interpreter.utf8 import decode_unicode_escape
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
- result, consumed = runicode.str_decode_unicode_escape(
+ result, consumed = decode_unicode_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
unicodedata_handler=unicodedata_handler)
return result
def decode_raw_unicode_escape(space, string):
- result, consumed = runicode.str_decode_raw_unicode_escape(
+ from pypy.interpreter.utf8 import decode_raw_unicode_escape
+ result, consumed = decode_raw_unicode_escape(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space))
return result
def decode_utf8(space, string):
- result, consumed = runicode.str_decode_utf_8(
+ from pypy.interpreter.utf8 import decode_utf_8
+ result, consumed = decode_utf_8(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
allow_surrogates=True)
return result
def encode_utf8(space, uni):
- return runicode.unicode_encode_utf_8(
- uni, len(uni), "strict",
- errorhandler=encode_error_handler(space),
- allow_surrogates=True)
+ # assumes uni is a Utf8Str, whose .bytes attribute already holds UTF-8 data
+ return uni.bytes
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/utf8.py
@@ -0,0 +1,569 @@
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.objectmodel import specialize
+from rpython.rlib.runicode import utf8_code_length
+
+MAXUNICODE = 0x10ffff
+
+def utf8chr(value):
+ # Like unichr, but returns a Utf8Str object
+ b = Utf8Builder()
+ b.append(value)
+ return b.build()
+
+def utf8ord(ustr, start=0):
+ bytes = ustr.bytes
+ start = ustr.index_of_char(start)
+ codepoint_length = utf8_code_length[ord(bytes[start])]
+
+ if codepoint_length == 1:
+ return ord(bytes[start])
+
+ elif codepoint_length == 2:
+ return ((ord(bytes[start]) & 0x1F) << 6 |
+ (ord(bytes[start + 1]) & 0x3F))
+ elif codepoint_length == 3:
+ return ((ord(bytes[start]) & 0xF) << 12 |
+ (ord(bytes[start + 1]) & 0x3F) << 6 |
+ (ord(bytes[start + 2]) & 0x3F))
+ else:
+ assert codepoint_length == 4
+ return ((ord(bytes[start]) & 0xF) << 18 |
+ (ord(bytes[start + 1]) & 0x3F) << 12 |
+ (ord(bytes[start + 2]) & 0x3F) << 6 |
+ (ord(bytes[start + 3]) & 0x3F))
+
+
+class Utf8Str(object):
+ _immutable_fields_ = ['bytes', '_is_ascii', '_len']
+
+ def __init__(self, data, is_ascii=False, length=-1):
+ # TODO: Maybe I can determine is_ascii rather than have it passed in?
+ # It really depends on what my model ends up looking like.
+ # It is worth noting that this check can be really fast. We just
+ # have to iterate the bytes while checking the high bit (& 0b10000000)
+
+ self.bytes = data
+ self._is_ascii = is_ascii
+
+ if length != -1:
+ self._len = length
+ else:
+ if not is_ascii:
+ #self._len = -1
+ self._calc_length()
+ else:
+ self._len = len(data)
+
+ def _calc_length(self):
+ pos = 0
+ length = 0
+
+ while pos < len(self.bytes):
+ length += 1
+ pos += utf8_code_length[ord(self.bytes[pos])]
+
+ self._len = length
+
+ def index_of_char(self, char):
+ byte = 0
+ pos = 0
+ while pos < char:
+ pos += 1
+ byte += utf8_code_length[ord(self.bytes[byte])]
+
+ return byte
+
+ def __getitem__(self, char_pos):
+ # Normalize negative indices first; otherwise s[-1] would become the slice [-1:0]
+ if char_pos < 0:
+ char_pos += self._len
+ return self[char_pos:char_pos+1]
+
+ def __getslice__(self, start, stop):
+ assert start < stop
+ # TODO: If start > _len or stop >= _len, then raise exception
+
+ if self._is_ascii:
+ return Utf8Str(self.bytes[start:stop], True)
+
+ start_byte = self.index_of_char(start)
+ stop_byte = start_byte
+ stop_pos = start
+ # TODO: Is detecting ascii-ness here actually useful? If it will
+ # happen in __init__ anyway, maybe it's not worth the extra
+ # complexity.
+ is_ascii = True
+ while stop_pos < stop:
+ stop_pos += 1
+ increment = utf8_code_length[ord(self.bytes[stop_byte])]
+ if increment != 1:
+ is_ascii = False
+ stop_byte += increment
+
+ return Utf8Str(self.bytes[start_byte:stop_byte], is_ascii,
+ stop - start)
+
+ def __len__(self):
+ return self._len
+
+ def __eq__(self, other):
+ """NOT_RPYTHON"""
+ if isinstance(other, Utf8Str):
+ return self.bytes == other.bytes
+ if isinstance(other, unicode):
+ return unicode(self.bytes, 'utf8') == other
+
+ return False
+
+class Utf8Builder(object):
+ @specialize.argtype(1)
+ def __init__(self, init_size=None):
+ if init_size is None:
+ self._builder = StringBuilder()
+ else:
+ self._builder = StringBuilder(init_size)
+ self._is_ascii = True
+
+
+ @specialize.argtype(1)
+ def append(self, c):
+ if isinstance(c, int):
+ if c < 0x80:
+ self._builder.append(chr(c))
+ elif c < 0x800:
+ self._builder.append(chr(0xC0 | (c >> 6)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ elif c < 0x10000:
+ self._builder.append(chr(0xE0 | (c >> 12)))
+ self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ elif c <= 0x10FFFF:
+ self._builder.append(chr(0xF0 | (c >> 18)))
+ self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
+ self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
+ self._builder.append(chr(0x80 | (c & 0x3F)))
+ self._is_ascii = False
+ else:
+ raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
+ else:
+ # TODO: Only allow ord(c) in [0, 127]
+ self._builder.append(c)
+
+ def append_slice(self, s, start, end, is_ascii=False):
+ self._builder.append_slice(s, start, end)
+ if not is_ascii:
+ self._is_ascii = False
+
+ def build(self):
+ return Utf8Str(self._builder.build(), self._is_ascii)
+
+
+# ____________________________________________________________
+# Escape-parsing functions
+
+def decode_raw_unicode_escape(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+ if size == 0:
+ # TODO:?
+ return Utf8Str('', True), 0
+
+ result = Utf8Builder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ if ch != '\\':
+ result.append(ch)
+ pos += 1
+ continue
+
+ # \u-escapes are interpreted only if the number of leading
+ # backslashes is odd
+ bs = pos
+ while pos < size:
+ pos += 1
+ if pos == size or s[pos] != '\\':
+ break
+ result.append('\\')
+
+ # we have a backslash at the end of the string, stop here
+ if pos >= size:
+ result.append('\\')
+ break
+
+ if ((pos - bs) & 1 == 0 or
+ pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')):
+ result.append('\\')
+ result.append(s[pos])
+ pos += 1
+ continue
+
+ digits = 4 if s[pos] == 'u' else 8
+ message = "truncated \\uXXXX"
+ pos += 1
+ pos = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
+
+ return result.build(), pos
+
+# Specialize on the errorhandler when it's a constant
[email protected]_or_var(4)
+def decode_unicode_escape(s, size, errors, final=False,
+ errorhandler=None,
+ unicodedata_handler=None):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+
+ if size == 0:
+ return Utf8Str('', True), 0
+
+ builder = Utf8Builder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+
+ # Non-escape characters are interpreted as Unicode ordinals
+ if ch != '\\':
+ builder.append(ch)
+ pos += 1
+ continue
+
+ # - Escapes
+ pos += 1
+ if pos >= size:
+ message = "\\ at end of string"
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ ch = s[pos]
+ pos += 1
+ # \x escapes
+ if ch == '\n': pass
+ elif ch == '\\': builder.append('\\')
+ elif ch == '\'': builder.append('\'')
+ elif ch == '\"': builder.append('\"')
+ elif ch == 'b' : builder.append('\b')
+ elif ch == 'f' : builder.append('\f')
+ elif ch == 't' : builder.append('\t')
+ elif ch == 'n' : builder.append('\n')
+ elif ch == 'r' : builder.append('\r')
+ elif ch == 'v' : builder.append('\v')
+ elif ch == 'a' : builder.append('\a')
+ elif '0' <= ch <= '7':
+ x = ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ if pos < size:
+ ch = s[pos]
+ if '0' <= ch <= '7':
+ pos += 1
+ x = (x<<3) + ord(ch) - ord('0')
+ builder.append(x)
+ # hex escapes
+ # \xXX
+ elif ch == 'x':
+ digits = 2
+ message = "truncated \\xXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \uXXXX
+ elif ch == 'u':
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \UXXXXXXXX
+ elif ch == 'U':
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
+ pos = hexescape(builder, s, pos, digits,
+ "unicodeescape", errorhandler, message, errors)
+
+ # \N{name}
+ elif ch == 'N':
+ message = "malformed \\N character escape"
+ look = pos
+ if unicodedata_handler is None:
+ message = ("\\N escapes not supported "
+ "(can't load unicodedata module)")
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, size)
+ builder.append(res)
+ continue
+
+ if look < size and s[look] == '{':
+ # look for the closing brace
+ while look < size and s[look] != '}':
+ look += 1
+ if look < size and s[look] == '}':
+ # found a name. look it up in the unicode database
+ message = "unknown Unicode character name"
+ name = s[pos+1:look]
+ code = unicodedata_handler.call(name)
+ if code < 0:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ continue
+ pos = look + 1
+ builder.append(code)
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ res, pos = errorhandler(errors, "unicodeescape",
+ message, s, pos-1, look+1)
+ builder.append(res)
+ else:
+ builder.append('\\')
+ builder.append(ch)
+
+ return builder.build(), pos
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits,
+ encoding, errorhandler, message, errors):
+ chr = 0
+ if pos + digits > len(s):
+ endinpos = pos
+ while endinpos < len(s) and s[endinpos] in hexdigits:
+ endinpos += 1
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, endinpos)
+ builder.append(res)
+ else:
+ try:
+ chr = r_uint(int(s[pos:pos+digits], 16))
+ except ValueError:
+ endinpos = pos
+ while s[endinpos] in hexdigits:
+ endinpos += 1
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, endinpos)
+ builder.append(res)
+ else:
+ # when we get here, chr is a 32-bit unicode character
+ if chr <= MAXUNICODE:
+ builder.append(chr)
+ pos += digits
+
+ else:
+ message = "illegal Unicode character"
+ res, pos = errorhandler(errors, encoding,
+ message, s, pos-2, pos+digits)
+ builder.append(res)
+ return pos
+
+# ____________________________________________________________
+
+# Converting bytes (utf8) to unicode?
+# I guess we just make sure we're looking at valid utf-8 and then make the
+# object?
+
+def decode_utf_8(s, size, errors, final=False,
+ errorhandler=None, allow_surrogates=False):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+ result = Utf8Builder(size)
+ pos = decode_utf_8_impl(s, size, errors, final, errorhandler, result,
+ allow_surrogates=allow_surrogates)
+ return result.build(), pos
+
+def decode_utf_8_impl(s, size, errors, final, errorhandler, result,
+ allow_surrogates):
+ if size == 0:
+ return 0
+
+ # TODO: Instead of assembling and then re-disassembling the codepoints,
+ # just use builder.append_slice
+ pos = 0
+ while pos < size:
+ ordch1 = ord(s[pos])
+ # fast path for ASCII
+ # XXX maybe use a while loop here
+ if ordch1 < 0x80:
+ result.append(ordch1)
+ pos += 1
+ continue
+
+ n = utf8_code_length[ordch1]
+ if pos + n > size:
+ if not final:
+ break
+ charsleft = size - pos - 1 # either 0, 1, 2
+ # note: when we get the 'unexpected end of data' we don't care
+ # about the pos anymore and we just ignore the value
+ if not charsleft:
+ # there's only the start byte and nothing else
+ r, pos = errorhandler(errors, 'utf8',
+ 'unexpected end of data',
+ s, pos, pos+1)
+ result.append(r)
+ break
+ ordch2 = ord(s[pos+1])
+ if n == 3:
+ # 3-bytes seq with only a continuation byte
+ if (ordch2>>6 != 0x2 or # 0b10
+ (ordch1 == 0xe0 and ordch2 < 0xa0)):
+ # or (ordch1 == 0xed and ordch2 > 0x9f)
+ # second byte invalid, take the first and continue
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+1)
+ result.append(r)
+ continue
+ else:
+ # second byte valid, but third byte missing
+ r, pos = errorhandler(errors, 'utf8',
+ 'unexpected end of data',
+ s, pos, pos+2)
+ result.append(r)
+ break
+ elif n == 4:
+ # 4-bytes seq with 1 or 2 continuation bytes
+ if (ordch2>>6 != 0x2 or # 0b10
+ (ordch1 == 0xf0 and ordch2 < 0x90) or
+ (ordch1 == 0xf4 and ordch2 > 0x8f)):
+ # second byte invalid, take the first and continue
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+1)
+ result.append(r)
+ continue
+ elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2: # 0b10
+ # third byte invalid, take the first two and continue
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+2)
+ result.append(r)
+ continue
+ else:
+ # there are only 1 or 2 valid continuation bytes; the others are missing
+ r, pos = errorhandler(errors, 'utf8',
+ 'unexpected end of data',
+ s, pos, pos+charsleft+1)
+ result.append(r)
+ break
+
+ if n == 0:
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid start byte',
+ s, pos, pos+1)
+ result.append(r)
+
+ elif n == 1:
+ assert 0, "ascii should have gone through the fast path"
+
+ elif n == 2:
+ ordch2 = ord(s[pos+1])
+ if ordch2>>6 != 0x2: # 0b10
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+1)
+ result.append(r)
+ continue
+ # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+ result.append(((ordch1 & 0x1F) << 6) + # 0b00011111
+ (ordch2 & 0x3F)) # 0b00111111
+ pos += 2
+
+ elif n == 3:
+ ordch2 = ord(s[pos+1])
+ ordch3 = ord(s[pos+2])
+ if (ordch2>>6 != 0x2 or # 0b10
+ (ordch1 == 0xe0 and ordch2 < 0xa0)
+ # surrogates shouldn't be valid UTF-8!
+ or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
+ ):
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+1)
+ result.append(r)
+ continue
+ elif ordch3>>6 != 0x2: # 0b10
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+2)
+ result.append(r)
+ continue
+ # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+ result.append((((ordch1 & 0x0F) << 12) + # 0b00001111
+ ((ordch2 & 0x3F) << 6) + # 0b00111111
+ (ordch3 & 0x3F))) # 0b00111111
+ pos += 3
+
+ elif n == 4:
+ ordch2 = ord(s[pos+1])
+ ordch3 = ord(s[pos+2])
+ ordch4 = ord(s[pos+3])
+ if (ordch2>>6 != 0x2 or # 0b10
+ (ordch1 == 0xf0 and ordch2 < 0x90) or
+ (ordch1 == 0xf4 and ordch2 > 0x8f)):
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+1)
+ result.append(r)
+ continue
+ elif ordch3>>6 != 0x2: # 0b10
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+2)
+ result.append(r)
+ continue
+ elif ordch4>>6 != 0x2: # 0b10
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+3)
+ result.append(r)
+ continue
+ # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+ c = (((ordch1 & 0x07) << 18) + # 0b00000111
+ ((ordch2 & 0x3F) << 12) + # 0b00111111
+ ((ordch3 & 0x3F) << 6) + # 0b00111111
+ (ordch4 & 0x3F)) # 0b00111111
+
+ # TODO: Why doesn't this raise an error when c > MAXUNICODE? If I'm
+ # converting utf8 -> utf8, is this necessary?
+ if c <= MAXUNICODE:
+ result.append(c)
+ pos += 4
+
+ return pos
+
+# ____________________________________________________________
+# Default error handlers
+
+
+def default_unicode_error_decode(errors, encoding, msg, s,
+ startingpos, endingpos):
+ if errors == 'replace':
+ return _unicode_error_replacement, endingpos
+ if errors == 'ignore':
+ return '', endingpos
+ raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
+_unicode_error_replacement = decode_raw_unicode_escape(
+ '\ufffd', 1, default_unicode_error_decode)
+
+def default_unicode_error_encode(errors, encoding, msg, u,
+ startingpos, endingpos):
+ if errors == 'replace':
+ return '?', None, endingpos
+ if errors == 'ignore':
+ return '', None, endingpos
+ raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
+
diff --git a/pypy/module/__builtin__/operation.py
b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -5,7 +5,8 @@
from pypy.interpreter import gateway
from pypy.interpreter.error import OperationError
from pypy.interpreter.gateway import unwrap_spec, WrappedDefault
-from rpython.rlib.runicode import UNICHR
+from pypy.interpreter.utf8 import Utf8Str, utf8chr
+#from rpython.rlib.runicode import UNICHR
from rpython.rlib.rfloat import isnan, isinf, round_double
from rpython.rlib import rfloat
import __builtin__
@@ -28,7 +29,8 @@
"Return a Unicode string of one character with the given ordinal."
# XXX range checking!
try:
- c = UNICHR(code)
+ #c = UNICHR(code)
+ c = utf8chr(code)
except ValueError:
raise OperationError(space.w_ValueError,
space.wrap("unichr() arg out of range"))
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -3,6 +3,7 @@
from pypy.interpreter import special
from pypy.interpreter.baseobjspace import ObjSpace, W_Root
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import Utf8Str
from pypy.interpreter.typedef import get_unique_interplevel_subclass
from pypy.objspace.std import (builtinshortcut, stdtypedef, frame, model,
transparent, callmethod)
@@ -158,8 +159,12 @@
return self.newint(x)
if isinstance(x, str):
return wrapstr(self, x)
+ if isinstance(x, Utf8Str):
+ return wrapunicode(self, x)
+
if isinstance(x, unicode):
- return wrapunicode(self, x)
+ import pdb; pdb.set_trace()
+
if isinstance(x, float):
return W_FloatObject(x)
if isinstance(x, W_Root):
diff --git a/pypy/objspace/std/unicodeobject.py
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -10,6 +10,7 @@
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
+from pypy.interpreter.utf8 import Utf8Str
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, interp2app, unwrap_spec
from pypy.module.unicodedata import unicodedb
@@ -29,7 +30,8 @@
_immutable_fields_ = ['_value']
def __init__(w_self, unistr):
- assert isinstance(unistr, unicode)
+ assert isinstance(unistr, Utf8Str)
+ #assert isinstance(unistr, unicode)
w_self._value = unistr
def __repr__(w_self):
@@ -1076,7 +1078,8 @@
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+#W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject(Utf8Str(''))
# Helper for converting int/long
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit