[pypy-commit] pypy utf8-unicode2: Fix most remaining module failures and some translation failures

waedt Tue, 29 Jul 2014 07:17:47 -0700

Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72608:d4419a342b68
Date: 2014-07-29 09:16 -0500
http://bitbucket.org/pypy/pypy/changeset/d4419a342b68/


Log:    Fix most remaining module failures and some translation failures

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -10,6 +10,7 @@
 from rpython.rlib.rarithmetic import r_uint, SHRT_MIN, SHRT_MAX, \
     INT_MIN, INT_MAX, UINT_MAX
 
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.interpreter.executioncontext import (ExecutionContext, ActionFlag,
     UserDelAction)
 from pypy.interpreter.error import OperationError, new_exception_class, oefmt
@@ -1545,7 +1546,10 @@
         return self.str_w(w_obj)
 
     def unicode_w(self, w_obj):
-        return w_obj.unicode_w(self)
+        #return w_obj.unicode_w(self)
+        res = w_obj.unicode_w(self)
+        assert isinstance(res, Utf8Str)
+        return res
 
     def unicode0_w(self, w_obj):
         "Like unicode_w, but rejects strings with NUL bytes."
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -5,6 +5,7 @@
 from pypy.interpreter.utf8 import (
     Utf8Str, Utf8Builder, utf8chr, utf8ord)
 from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.test.test_llinterp import interpret
 
 def build_utf8str():
     builder = Utf8Builder()
@@ -241,3 +242,25 @@
     assert s == u[:4]
 
     rffi.free_wcharp(wcharp)
+
+def test_translate_utf8():
+    def f():
+        s = build_utf8str()
+
+        s *= 10
+        s += Utf8Str('one')
+        return len(s)
+    assert interpret(f, []) == f()
+
+    def f():
+        one = Utf8Str("one")
+        two = Utf8Str("one")
+
+        return int(one == two) + int(not (one != two))
+    assert interpret(f, []) == f()
+
+    def f():
+        one = Utf8Str("one")
+
+        return one == None
+    assert interpret(f, []) == f()
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,10 +1,11 @@
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize
+from rpython.rlib.objectmodel import we_are_translated, specialize
 from rpython.rlib.runicode import utf8_code_length
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
-from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rarithmetic import r_uint, intmask, base_int
 from rpython.rtyper.lltypesystem import rffi, lltype
 
+
 wchar_rint = rffi.r_uint
 WCHAR_INTP = rffi.UINTP
 WCHAR_INT = rffi.UINT
@@ -14,11 +15,11 @@
     WCHAR_INT = rffi.USHORT
 
 
-def utf8chr(value, allow_large_codepoints=False):
+def utf8chr(value):
     # Like unichr, but returns a Utf8Str object
     # TODO: Do this without the builder so its faster
     b = Utf8Builder()
-    b.append(value, allow_large_codepoints=allow_large_codepoints)
+    b.append(value)
     return b.build()
 
 def utf8ord_bytes(bytes, start):
@@ -160,22 +161,26 @@
         return hash(self.bytes)
 
     def __eq__(self, other):
-        """NOT_RPYTHON"""
         if isinstance(other, Utf8Str):
             return self.bytes == other.bytes
+        if other is None:
+            return False
         if isinstance(other, unicode):
+            assert not we_are_translated()
             return unicode(self.bytes, 'utf8') == other
 
-        return False
+        raise ValueError()
 
     def __ne__(self, other):
-        """NOT_RPYTHON"""
         if isinstance(other, Utf8Str):
             return self.bytes != other.bytes
+        if other is None:
+            return True
         if isinstance(other, unicode):
+            assert not we_are_translated()
             return unicode(self.bytes, 'utf8') != other
 
-        return True
+        raise ValueError()
 
     def __lt__(self, other):
         return self.bytes < other.bytes
@@ -194,7 +199,7 @@
         if isinstance(other, Utf8Str):
             return other.bytes in self.bytes
         if isinstance(other, unicode):
-            # TODO: Assert fail if translated
+            assert not we_are_translated()
             return other in unicode(self.bytes, 'utf8')
         if isinstance(other, str):
             return other in self.bytes
@@ -247,6 +252,7 @@
         else:
             end = self.index_of_char(end)
 
+        assert start >= 0
         return start, end
 
     @specialize.argtype(2, 3)
@@ -257,10 +263,12 @@
 
         if isinstance(other, Utf8Str):
             pos = self.bytes.find(other.bytes, start, end)
-        elif isinstance(other, unicode):
-            pos = unicode(self.bytes, 'utf8').find(other, start, end)
         elif isinstance(other, str):
             pos = self.bytes.find(other, start, end)
+        else:
+            assert isinstance(other, unicode)
+            assert not we_are_translated()
+            pos = unicode(self.bytes, 'utf8').find(other, start, end)
 
         if pos == -1:
             return -1
@@ -469,7 +477,7 @@
         builder = Utf8Builder()
         i = 0;
         while True:
-            c = int(array[i])
+            c = intmask(array[i])
             if c == 0:
                 break
 
@@ -504,7 +512,7 @@
             if rffi.sizeof(rffi.WCHAR_T) == 2:
                 if i != size - 1 and 0xD800 <= c <= 0xDBFF:
                     i += 1
-                    c2 = int(array[i])
+                    c2 = intmask(array[i])
                     if c2 == 0:
                         builder.append(c)
                         break
@@ -530,7 +538,7 @@
             if rffi.sizeof(rffi.WCHAR_T) == 2:
                 if i != size - 1 and 0xD800 <= c <= 0xDBFF:
                     i += 1
-                    c2 = int(array[i])
+                    c2 = intmask(array[i])
                     if not (0xDC00 <= c2 <= 0xDFFF):
                         builder.append(c)
                         c = c2
@@ -553,8 +561,14 @@
 
 
     @specialize.argtype(1)
-    def append(self, c, allow_large_codepoints=False):
-        if isinstance(c, int) or isinstance(c, r_uint):
+    def append(self, c):
+        if isinstance(c, Utf8Str):
+            self._builder.append(c.bytes)
+            if not c._is_ascii:
+                self._is_ascii = False
+        elif isinstance(c, int) or isinstance(c, r_uint):
+            if isinstance(c, base_int):
+                c = intmask(c)
             if c < 0x80:
                 self._builder.append(chr(c))
             elif c < 0x800:
@@ -566,7 +580,7 @@
                 self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
                 self._builder.append(chr(0x80 | (c & 0x3F)))
                 self._is_ascii = False
-            elif c <= 0x10FFFF or allow_large_codepoints:
+            elif c <= 0x10FFFF:
                 self._builder.append(chr(0xF0 | (c >> 18)))
                 self._builder.append(chr(0x80 | (c >> 12 & 0x3F)))
                 self._builder.append(chr(0x80 | (c >> 6 & 0x3F)))
@@ -574,10 +588,6 @@
                 self._is_ascii = False
             else:
                 raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
-        elif isinstance(c, Utf8Str):
-            self._builder.append(c.bytes)
-            if not c._is_ascii:
-                self._is_ascii = False
         else:
             # TODO: Remove this check?
             if len(c) == 1:
@@ -769,3 +779,4 @@
 del character_calc_value
 del ForwardIterBase
 del ReverseIterBase
+
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -143,20 +143,20 @@
         keepalive_until_here(cdataobj)
         return w_res
 
-    def _convert_to_unichar(self, w_ob):
+    def _convert_to_uni_codepoint(self, w_ob):
         space = self.space
         if space.isinstance_w(w_ob, space.w_unicode):
             s = space.unicode_w(w_ob)
             if len(s) == 1:
-                return s[0]
+                return utf8ord(s, 0)
         if (isinstance(w_ob, cdataobj.W_CData) and
                isinstance(w_ob.ctype, W_CTypePrimitiveUniChar)):
-            return rffi.cast(rffi.CWCHARP, w_ob._cdata)[0]
+            return rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0]
         raise self._convert_error("unicode string of length 1", w_ob)
 
     def convert_from_object(self, cdata, w_ob):
-        value = self._convert_to_unichar(w_ob)
-        rffi.cast(utf8.WCHAR_INTP, cdata)[0] = utf8.wchar_rint(utf8ord(value))
+        value = self._convert_to_uni_codepoint(w_ob)
+        rffi.cast(utf8.WCHAR_INTP, cdata)[0] = utf8.wchar_rint(value)
 
 
 class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -383,7 +383,7 @@
         self.readtranslate = newline is None
         self.readnl = newline
 
-        self.writetranslate = (newline != Utf8Str(''))
+        self.writetranslate = (newline is None or len(newline) == 0)
         if not self.readuniversal:
             self.writenl = self.readnl
             if self.writenl == Utf8Str('\n'):
diff --git a/pypy/module/_rawffi/array.py b/pypy/module/_rawffi/array.py
--- a/pypy/module/_rawffi/array.py
+++ b/pypy/module/_rawffi/array.py
@@ -5,6 +5,7 @@
 
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef, GetSetProperty, interp_attrproperty
+from pypy.interpreter.utf8 import utf8ord
 from rpython.rtyper.lltypesystem import lltype, rffi
 from pypy.interpreter.error import OperationError
 from pypy.module._rawffi.interp_rawffi import segfault_exception
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -274,7 +274,7 @@
                 return ptr_val
     else:
         if T is rffi.CWCHARP:
-            return utf8chr(int(rffi.cast(WCHAR_INTP, ptr)[ofs]))
+            return utf8chr(intmask(rffi.cast(WCHAR_INTP, ptr)[ofs]))
         return rffi.cast(T, ptr)[ofs]
 read_ptr._annspecialcase_ = 'specialize:arg(2)'
 
@@ -415,6 +415,7 @@
                 "Expected unicode string of length one as wide character"))
 
         val = utf8ord(s)
+        #val = 0
         if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
             # Utf-16 must be used on systems with a 2 byte wchar_t to
             # encode codepoints > 0xFFFF
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -5,7 +5,7 @@
 from pypy.interpreter.typedef import make_weakref_descr
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError
-from pypy.interpreter.utf8 import utf8ord
+from pypy.interpreter.utf8 import Utf8Str, utf8ord
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit
 
@@ -286,7 +286,7 @@
                                      space.w_None))
 
         if space.isinstance_w(w_string, space.w_unicode):
-            w_emptystr = space.wrap(u'')
+            w_emptystr = space.wrap(Utf8Str(''))
         else:
             w_emptystr = space.wrap('')
         w_item = space.call_method(w_emptystr, 'join',
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -797,8 +797,13 @@
             elif mytype.typecode == 'f':
                 item = float(item)
             elif mytype.typecode == 'u':
-                # TODO: Does this nned special handling for 16bit whar_t?
-                item = utf8chr(intmask(item), allow_large_codepoints=True)
+                # TODO: Does this need special handling for 16bit whar_t?
+                try:
+                    item = utf8chr(intmask(item))
+                except ValueError:
+                    raise oefmt(space.w_ValueError,
+                                'character U+%s is not in range[U+0000; '
+                                'U+10ffff]', hex(intmask(item)))
             return space.wrap(item)
 
         # interface
@@ -998,9 +1003,9 @@
             start = 0
         # <a performance hack>
         if oldlen == 1:
-            if mytype.unwrap == 'str_w' or mytype.unwrap == 'unicode_w':
+            if mytype.unwrap == 'str_w':
                 zero = not ord(self.buffer[0])
-            elif mytype.unwrap == 'int_w' or mytype.unwrap == 'bigint_w':
+            elif mytype.unwrap in ('int_w', 'bigint_w', 'unicode_w'):
                 zero = not widen(self.buffer[0])
             #elif mytype.unwrap == 'float_w':
             #    value = ...float(self.buffer[0])  xxx handle the case of -0.0
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -834,12 +834,6 @@
         assert repr(mya('i', [1, 2, 3])) == "array('i', [1, 2, 3])"
         assert repr(mya('i', (1, 2, 3))) == "array('i', [1, 2, 3])"
 
-    def test_unicode_outofrange(self):
-        a = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
-        b = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
-        b.byteswap()
-        assert a != b
-
     def test_weakref(self):
         import weakref
         a = self.array('c', 'Hi!')
@@ -1032,6 +1026,11 @@
     def test_fresh_array_buffer_str(self):
         assert str(buffer(self.array('i'))) == ''
 
+    def test_unicode_outofrange(self):
+        b = self.array('u', unicode(r'\x01\u263a\x00\ufeff', 'unicode-escape'))
+        b.byteswap()
+        raises(ValueError, "b[0]")
+
 
 class AppTestArrayBuiltinShortcut(AppTestArray):
     spaceconfig = AppTestArray.spaceconfig.copy()
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -3,9 +3,60 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rstruct.error import StructError
 from rpython.rlib.rstruct.formatiterator import FormatIterator
+from rpython.rlib.rstruct.standardfmttable import standard_fmttable
+from rpython.rlib.unroll import unrolling_iterable
+from rpython.rtyper.lltypesystem import rffi
 
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8ord, utf8chr
 
+wchar_len = rffi.sizeof(rffi.WCHAR_T)
+
+unroll_pack_unichar_iter = unrolling_iterable(range(wchar_len-1, -1, -1))
+def pack_unichar(fmtiter):
+    value = utf8ord(fmtiter.accept_unicode_arg())
+
+    # TODO: What do I do on a system with sizeof(wchar_t) == 2? I can't
+    #       split it reasonably?
+    #if not min <= value <= max:
+    #    raise StructError(errormsg)
+
+    if fmtiter.bigendian:
+        for i in unroll_pack_unichar_iter:
+            x = (value >> (8*i)) & 0xff
+            fmtiter.result.append(chr(x))
+    else:
+        for i in unroll_pack_unichar_iter:
+            fmtiter.result.append(chr(value & 0xff))
+            value >>= 8
+
+unroll_upack_unichar_iter = unrolling_iterable(range(wchar_len))
+def unpack_unichar(fmtiter):
+    #intvalue = inttype(0)
+    intvalue = 0
+    s = fmtiter.read(wchar_len)
+    idx = 0
+    if fmtiter.bigendian:
+        for i in unroll_upack_unichar_iter:
+            x = ord(s[idx])
+            intvalue <<= 8
+            #intvalue |= inttype(x)
+            intvalue |= x
+            idx += 1
+    else:
+        for i in unroll_upack_unichar_iter:
+            x = ord(s[idx])
+            #intvalue |= inttype(x) << (8*i)
+            intvalue |= x << (8*i)
+            idx += 1
+
+    try:
+        value = utf8chr(intvalue)
+    except ValueError:
+        raise oefmt(fmtiter.space.w_ValueError,
+                    'character U+%s is not in range[U+0000; '
+                     'U+10ffff]', hex(intvalue))
+    fmtiter.appendobj(value)
 
 class PackFormatIterator(FormatIterator):
     def __init__(self, space, args_w, size):
@@ -20,11 +71,15 @@
     @jit.unroll_safe
     @specialize.arg(1)
     def operate(self, fmtdesc, repetitions):
+        pack = fmtdesc.pack
+        if fmtdesc.fmtchar == 'u':
+            pack = pack_unichar
+
         if fmtdesc.needcount:
-            fmtdesc.pack(self, repetitions)
+            pack(self, repetitions)
         else:
             for i in range(repetitions):
-                fmtdesc.pack(self)
+                pack(self)
     _operate_is_specialized_ = True
 
     @jit.unroll_safe
@@ -115,11 +170,15 @@
     @jit.unroll_safe
     @specialize.arg(1)
     def operate(self, fmtdesc, repetitions):
+        unpack = fmtdesc.unpack
+        if fmtdesc.fmtchar == 'u':
+            unpack = unpack_unichar
+
         if fmtdesc.needcount:
-            fmtdesc.unpack(self, repetitions)
+            unpack(self, repetitions)
         else:
             for i in range(repetitions):
-                fmtdesc.unpack(self)
+                unpack(self)
     _operate_is_specialized_ = True
 
     def align(self, mask):
diff --git a/pypy/module/struct/test/test_struct.py b/pypy/module/struct/test/test_struct.py
--- a/pypy/module/struct/test/test_struct.py
+++ b/pypy/module/struct/test/test_struct.py
@@ -412,6 +412,9 @@
         assert s.unpack(s.pack(42)) == (42,)
         assert s.unpack_from(memoryview(s.pack(42))) == (42,)
 
+    def test_unicode_outofrange(self):
+        raises(ValueError, "self.struct.unpack('u', '0000')")
+
 
 class AppTestStructBuffer(object):
     spaceconfig = dict(usemodules=['struct', '__pypy__'])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy utf8-unicode2: Fix most remaining module failures and some translation failures

Reply via email to