Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72608:d4419a342b68
Date: 2014-07-29 09:16 -0500
http://bitbucket.org/pypy/pypy/changeset/d4419a342b68/
Log: Fix most remaining module failures and some translation failures
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -10,6 +10,7 @@
from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \
INT_MIN, INT_MAX, UINT_MAX
+from pypy.interpreter.utf8 import Utf8Str
from pypy.interpreter.executioncontext import (ExecutionContext, ActionFlag,
UserDelAction)
from pypy.interpreter.error import OperationError, new_exception_class, oefmt
@@ -1545,7 +1546,10 @@
return self.str_w(w_obj)
def unicode_w(self, w_obj):
- return w_obj.unicode_w(self)
+ #return w_obj.unicode_w(self)
+ res = w_obj.unicode_w(self)
+ assert isinstance(res, Utf8Str)
+ return res
def unicode0_w(self, w_obj):
"Like unicode_w, but rejects strings with NUL bytes."
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -5,6 +5,7 @@
from pypy.interpreter.utf8 import (
Utf8Str, Utf8Builder, utf8chr, utf8ord)
from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.test.test_llinterp import interpret
def build_utf8str():
builder = Utf8Builder()
@@ -241,3 +242,25 @@
assert s == u[:4]
rffi.free_wcharp(wcharp)
+
+def test_translate_utf8():
+ def f():
+ s = build_utf8str()
+
+ s *= 10
+ s += Utf8Str('one')
+ return len(s)
+ assert interpret(f, []) == f()
+
+ def f():
+ one = Utf8Str("one")
+ two = Utf8Str("one")
+
+ return int(one == two) + int(not (one != two))
+ assert interpret(f, []) == f()
+
+ def f():
+ one = Utf8Str("one")
+
+ return one == None
+ assert interpret(f, []) == f()
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,10 +1,11 @@
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize
+from rpython.rlib.objectmodel import we_are_translated, specialize
from rpython.rlib.runicode import utf8_code_length
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
-from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rarithmetic import r_uint, intmask, base_int
from rpython.rtyper.lltypesystem import rffi, lltype
+
wchar_rint = rffi.r_uint
WCHAR_INTP = rffi.UINTP
WCHAR_INT = rffi.UINT
@@ -14,11 +15,11 @@
WCHAR_INT = rffi.USHORT
-def utf8chr(value, allow_large_codepoints=False):
+def utf8chr(value):
# Like unichr, but returns a Utf8Str object
# TODO: Do this without the builder so its faster
b = Utf8Builder()
- b.append(value, allow_large_codepoints=allow_large_codepoints)
+ b.append(value)
return b.build()
def utf8ord_bytes(bytes, start):
@@ -160,22 +161,26 @@
return hash(self.bytes)
def __eq__(self, other):
- """NOT_RPYTHON"""
if isinstance(other, Utf8Str):
return self.bytes == other.bytes
+ if other is None:
+ return False
if isinstance(other, unicode):
+ assert not we_are_translated()
return unicode(self.bytes, 'utf8') == other
- return False
+ raise ValueError()
def __ne__(self, other):
- """NOT_RPYTHON"""
if isinstance(other, Utf8Str):
return self.bytes != other.bytes
+ if other is None:
+ return True
if isinstance(other, unicode):
+ assert not we_are_translated()
return unicode(self.bytes, 'utf8') != other
- return True
+ raise ValueError()
def __lt__(self, other):
return self.bytes < other.bytes
@@ -194,7 +199,7 @@
if isinstance(other, Utf8Str):
return other.bytes in self.bytes
if isinstance(other, unicode):
- # TODO: Assert fail if translated
+ assert not we_are_translated()
return other in unicode(self.bytes, 'utf8')
if isinstance(other, str):
return other in self.bytes
@@ -247,6 +252,7 @@
else:
end = self.index_of_char(end)
+ assert start >= 0
return start, end
@specialize.argtype(2, 3)
@@ -257,10 +263,12 @@
if isinstance(other, Utf8Str):
pos = self.bytes.find(other.bytes, start, end)
- elif isinstance(other, unicode):
- pos = unicode(self.bytes, 'utf8').find(other, start, end)
elif isinstance(other, str):
pos = self.bytes.find(other, start, end)
+ else:
+ assert isinstance(other, unicode)
+ assert not we_are_translated()
+ pos = unicode(self.bytes, 'utf8').find(other, start, end)
if pos == -1:
return -1
@@ -469,7 +477,7 @@
builder = Utf8Builder()
i = 0;
while True:
- c = int(array[i])
+ c = intmask(array[i])
if c == 0:
break
@@ -504,7 +512,7 @@
if rffi.sizeof(rffi.WCHAR_T) == 2:
if i != size - 1 and 0xD800 <= c <= 0xDBFF:
i += 1
- c2 = int(array[i])
+ c2 = intmask(array[i])
if c2 == 0:
builder.append(c)
break
@@ -530,7 +538,7 @@
if rffi.sizeof(rffi.WCHAR_T) == 2:
if i != size - 1 and 0xD800 <= c <= 0xDBFF:
i += 1
- c2 = int(array[i])
+ c2 = intmask(array[i])
if not (0xDC00 <= c2 <= 0xDFFF):
builder.append(c)
c = c2
@@ -553,8 +561,14 @@
@specialize.argtype(1)
- def append(self, c, allow_large_codepoints=False):
- if isinstance(c, int) or isinstance(c, r_uint):
+ def append(self, c):
+ if isinstance(c, Utf8Str):
+ self._builder.append(c.bytes)
+ if not c._is_ascii:
+ self._is_ascii = False
+ elif isinstance(c, int) or isinstance(c, r_uint):
+ if isinstance(c, base_int):
+ c = intmask(c)
if c < 0x80:
self._builder.append(chr(c))
elif c < 0x800:
@@ -566,7 +580,7 @@
self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
self._builder.append(chr(0x80 | (c & 0x3F)))
self._is_ascii = False
- elif c <= 0x10FFFF or allow_large_codepoints:
+ elif c <= 0x10FFFF:
self._builder.append(chr(0xF0 | (c >> 18)))
self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
@@ -574,10 +588,6 @@
self._is_ascii = False
else:
raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
- elif isinstance(c, Utf8Str):
- self._builder.append(c.bytes)
- if not c._is_ascii:
- self._is_ascii = False
else:
# TODO: Remove this check?
if len(c) == 1:
@@ -769,3 +779,4 @@
del character_calc_value
del ForwardIterBase
del ReverseIterBase
+
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -143,20 +143,20 @@
keepalive_until_here(cdataobj)
return w_res
- def _convert_to_unichar(self, w_ob):
+ def _convert_to_uni_codepoint(self, w_ob):
space = self.space
if space.isinstance_w(w_ob, space.w_unicode):
s = space.unicode_w(w_ob)
if len(s) == 1:
- return s[0]
+ return utf8ord(s, 0)
if (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar)):
- return rffi.cast(rffi.CWCHARP, w_ob._cdata)[0]
+ return rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0]
raise self._convert_error("unicode string of length 1", w_ob)
def convert_from_object(self, cdata, w_ob):
- value = self._convert_to_unichar(w_ob)
- rffi.cast(utf8.WCHAR_INTP, cdata)[0] = utf8.wchar_rint(utf8ord(value))
+ value = self._convert_to_uni_codepoint(w_ob)
+ rffi.cast(utf8.WCHAR_INTP, cdata)[0] = utf8.wchar_rint(value)
class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -383,7 +383,7 @@
self.readtranslate = newline is None
self.readnl = newline
- self.writetranslate = (newline != Utf8Str(''))
+ self.writetranslate = (newline is None or len(newline) != 0)  # only newline == '' disables write translation
if not self.readuniversal:
self.writenl = self.readnl
if self.writenl == Utf8Str('\n'):
diff --git a/pypy/module/_rawffi/array.py b/pypy/module/_rawffi/array.py
--- a/pypy/module/_rawffi/array.py
+++ b/pypy/module/_rawffi/array.py
@@ -5,6 +5,7 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef, GetSetProperty, interp_attrproperty
+from pypy.interpreter.utf8 import utf8ord
from rpython.rtyper.lltypesystem import lltype, rffi
from pypy.interpreter.error import OperationError
from pypy.module._rawffi.interp_rawffi import segfault_exception
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -274,7 +274,7 @@
return ptr_val
else:
if T is rffi.CWCHARP:
- return utf8chr(int(rffi.cast(WCHAR_INTP, ptr)[ofs]))
+ return utf8chr(intmask(rffi.cast(WCHAR_INTP, ptr)[ofs]))
return rffi.cast(T, ptr)[ofs]
read_ptr._annspecialcase_ = 'specialize:arg(2)'
@@ -415,6 +415,7 @@
"Expected unicode string of length one as wide character"))
val = utf8ord(s)
+ #val = 0
if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
# Utf-16 must be used on systems with a 2 byte wchar_t to
# encode codepoints > 0xFFFF
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -5,7 +5,7 @@
from pypy.interpreter.typedef import make_weakref_descr
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError
-from pypy.interpreter.utf8 import utf8ord
+from pypy.interpreter.utf8 import Utf8Str, utf8ord
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit
@@ -286,7 +286,7 @@
space.w_None))
if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.wrap(u'')
+ w_emptystr = space.wrap(Utf8Str(''))
else:
w_emptystr = space.wrap('')
w_item = space.call_method(w_emptystr, 'join',
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -797,8 +797,13 @@
elif mytype.typecode == 'f':
item = float(item)
elif mytype.typecode == 'u':
- # TODO: Does this nned special handling for 16bit whar_t?
- item = utf8chr(intmask(item), allow_large_codepoints=True)
+ # TODO: Does this need special handling for 16-bit wchar_t?
+ try:
+ item = utf8chr(intmask(item))
+ except ValueError:
+ raise oefmt(space.w_ValueError,
+ 'character U+%s is not in range[U+0000; '
+ 'U+10ffff]', hex(intmask(item)))
return space.wrap(item)
# interface
@@ -998,9 +1003,9 @@
start = 0
# <a performance hack>
if oldlen == 1:
- if mytype.unwrap == 'str_w' or mytype.unwrap == 'unicode_w':
+ if mytype.unwrap == 'str_w':
zero = not ord(self.buffer[0])
- elif mytype.unwrap == 'int_w' or mytype.unwrap == 'bigint_w':
+ elif mytype.unwrap in ('int_w', 'bigint_w', 'unicode_w'):
zero = not widen(self.buffer[0])
#elif mytype.unwrap == 'float_w':
# value = ...float(self.buffer[0]) xxx handle the case of -0.0
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -834,12 +834,6 @@
assert repr(mya('i', [1, 2, 3])) == "array('i', [1, 2, 3])"
assert repr(mya('i', (1, 2, 3))) == "array('i', [1, 2, 3])"
- def test_unicode_outofrange(self):
- a = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
- b = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
- b.byteswap()
- assert a != b
-
def test_weakref(self):
import weakref
a = self.array('c', 'Hi!')
@@ -1032,6 +1026,11 @@
def test_fresh_array_buffer_str(self):
assert str(buffer(self.array('i'))) == ''
+ def test_unicode_outofrange(self):
+ b = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
+ b.byteswap()
+ raises(ValueError, "b[0]")
+
class AppTestArrayBuiltinShortcut(AppTestArray):
spaceconfig = AppTestArray.spaceconfig.copy()
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -3,9 +3,60 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rstruct.error import StructError
from rpython.rlib.rstruct.formatiterator import FormatIterator
+from rpython.rlib.rstruct.standardfmttable import standard_fmttable
+from rpython.rlib.unroll import unrolling_iterable
+from rpython.rtyper.lltypesystem import rffi
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8ord, utf8chr
+wchar_len = rffi.sizeof(rffi.WCHAR_T)
+
+unroll_pack_unichar_iter = unrolling_iterable(range(wchar_len-1, -1, -1))
+def pack_unichar(fmtiter):
+ value = utf8ord(fmtiter.accept_unicode_arg())
+
+ # TODO: What should be done on a system with sizeof(wchar_t) == 2? The
+ # value can't be split reasonably there.
+ #if not min <= value <= max:
+ # raise StructError(errormsg)
+
+ if fmtiter.bigendian:
+ for i in unroll_pack_unichar_iter:
+ x = (value >> (8*i)) & 0xff
+ fmtiter.result.append(chr(x))
+ else:
+ for i in unroll_pack_unichar_iter:
+ fmtiter.result.append(chr(value & 0xff))
+ value >>= 8
+
+unroll_upack_unichar_iter = unrolling_iterable(range(wchar_len))
+def unpack_unichar(fmtiter):
+ #intvalue = inttype(0)
+ intvalue = 0
+ s = fmtiter.read(wchar_len)
+ idx = 0
+ if fmtiter.bigendian:
+ for i in unroll_upack_unichar_iter:
+ x = ord(s[idx])
+ intvalue <<= 8
+ #intvalue |= inttype(x)
+ intvalue |= x
+ idx += 1
+ else:
+ for i in unroll_upack_unichar_iter:
+ x = ord(s[idx])
+ #intvalue |= inttype(x) << (8*i)
+ intvalue |= x << (8*i)
+ idx += 1
+
+ try:
+ value = utf8chr(intvalue)
+ except ValueError:
+ raise oefmt(fmtiter.space.w_ValueError,
+ 'character U+%s is not in range[U+0000; '
+ 'U+10ffff]', hex(intvalue))
+ fmtiter.appendobj(value)
class PackFormatIterator(FormatIterator):
def __init__(self, space, args_w, size):
@@ -20,11 +71,15 @@
@jit.unroll_safe
@specialize.arg(1)
def operate(self, fmtdesc, repetitions):
+ pack = fmtdesc.pack
+ if fmtdesc.fmtchar == 'u':
+ pack = pack_unichar
+
if fmtdesc.needcount:
- fmtdesc.pack(self, repetitions)
+ pack(self, repetitions)
else:
for i in range(repetitions):
- fmtdesc.pack(self)
+ pack(self)
_operate_is_specialized_ = True
@jit.unroll_safe
@@ -115,11 +170,15 @@
@jit.unroll_safe
@specialize.arg(1)
def operate(self, fmtdesc, repetitions):
+ unpack = fmtdesc.unpack
+ if fmtdesc.fmtchar == 'u':
+ unpack = unpack_unichar
+
if fmtdesc.needcount:
- fmtdesc.unpack(self, repetitions)
+ unpack(self, repetitions)
else:
for i in range(repetitions):
- fmtdesc.unpack(self)
+ unpack(self)
_operate_is_specialized_ = True
def align(self, mask):
diff --git a/pypy/module/struct/test/test_struct.py b/pypy/module/struct/test/test_struct.py
--- a/pypy/module/struct/test/test_struct.py
+++ b/pypy/module/struct/test/test_struct.py
@@ -412,6 +412,9 @@
assert s.unpack(s.pack(42)) == (42,)
assert s.unpack_from(memoryview(s.pack(42))) == (42,)
+ def test_unicode_outofrange(self):
+ raises(ValueError, "self.struct.unpack('u', '0000')")
+
class AppTestStructBuffer(object):
spaceconfig = dict(usemodules=['struct', '__pypy__'])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit