[pypy-commit] pypy utf8-unicode2: WIP fixing translation

waedt Mon, 04 Aug 2014 07:32:33 -0700

Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72691:8a2f88e6348d
Date: 2014-08-04 09:26 -0500
http://bitbucket.org/pypy/pypy/changeset/8a2f88e6348d/


Log:    WIP fixing translation

diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -243,24 +243,4 @@
 
     rffi.free_wcharp(wcharp)
 
-def test_translate_utf8():
-    def f():
-        s = build_utf8str()
 
-        s *= 10
-        s += Utf8Str('one')
-        return len(s)
-    assert interpret(f, []) == f()
-
-    def f():
-        one = Utf8Str("one")
-        two = Utf8Str("one")
-
-        return int(one == two) + int(not (one != two))
-    assert interpret(f, []) == f()
-
-    def f():
-        one = Utf8Str("one")
-
-        return one == None
-    assert interpret(f, []) == f()
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,9 +1,11 @@
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import we_are_translated, specialize
+from rpython.rlib.objectmodel import (
+    we_are_translated, specialize, import_from_mixin)
 from rpython.rlib.runicode import utf8_code_length
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rlib.rarithmetic import r_uint, intmask, base_int
 from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.tool.sourcetools import func_with_new_name
 
 
 wchar_rint = rffi.r_uint
@@ -26,21 +28,24 @@
     codepoint_length = utf8_code_length[ord(bytes[start])]
 
     if codepoint_length == 1:
-        return ord(bytes[start])
+        res = ord(bytes[start])
 
     elif codepoint_length == 2:
-        return ((ord(bytes[start]) & 0x1F) << 6 |
-                (ord(bytes[start + 1]) & 0x3F))
+        res = ((ord(bytes[start]) & 0x1F) << 6 |
+               (ord(bytes[start + 1]) & 0x3F))
     elif codepoint_length == 3:
-        return ((ord(bytes[start]) & 0xF) << 12 |
-                (ord(bytes[start + 1]) & 0x3F) << 6 |
-                (ord(bytes[start + 2]) & 0x3F))
+        res = ((ord(bytes[start]) & 0xF) << 12 |
+               (ord(bytes[start + 1]) & 0x3F) << 6 |
+               (ord(bytes[start + 2]) & 0x3F))
     else:
         assert codepoint_length == 4
-        return ((ord(bytes[start]) & 0xF) << 18 |
-                (ord(bytes[start + 1]) & 0x3F) << 12 |
-                (ord(bytes[start + 2]) & 0x3F) << 6 |
-                (ord(bytes[start + 3]) & 0x3F))
+        res = ((ord(bytes[start]) & 0xF) << 18 |
+               (ord(bytes[start + 1]) & 0x3F) << 12 |
+               (ord(bytes[start + 2]) & 0x3F) << 6 |
+               (ord(bytes[start + 3]) & 0x3F))
+
+    assert res >= 0
+    return res
 
 def utf8ord(ustr, start=0):
     start = ustr.index_of_char(start)
@@ -53,6 +58,45 @@
     else:
         return ord(s[pos])
 
+@specialize.argtype(0)
+def EQ(s1, s2):
+    if s1 is None:
+        return s1 is s2
+    if isinstance(s1, Utf8Str):
+        return s1.__eq__(s2)
+    else:
+        return s1 == s2
+
+@specialize.argtype(0)
+def NE(s1, s2):
+    if s1 is None:
+        return s1 is not s2
+    if isinstance(s1, Utf8Str):
+        return s1.__ne__(s2)
+    else:
+        return s1 != s2
+
+@specialize.argtype(0)
+def ADD(s1, s2):
+    if isinstance(s1, Utf8Str):
+        return s1.__add__(s2)
+    else:
+        return s1 + s2
+
+@specialize.argtype(0)
+def MUL(s1, s2):
+    if isinstance(s1, Utf8Str):
+        return s1.__mul__(s2)
+    else:
+        return s1 * s2
+
+@specialize.argtype(0, 1)
+def IN(s1, s2):
+    if isinstance(s1, Utf8Str):
+        return s2.__contains__(s1)
+    else:
+        return s1 in s2
+
 class Utf8Str(object):
     _immutable_fields_ = ['bytes', '_is_ascii', '_len']
 
@@ -69,7 +113,6 @@
             self._len = length
         else:
             if not is_ascii:
-                #self._len = -1
                 self._calc_length()
             else:
                 self._len = len(data)
@@ -112,14 +155,22 @@
             char_pos += self._len
         return self[char_pos:char_pos+1]
 
+    @specialize.argtype(1, 2)
     def __getslice__(self, start, stop):
+        if start is None:
+            start = 0
+        if stop is None:
+            stop = len(self)
+
+        assert start >= 0
         assert start <= stop
+
         if start == stop:
             return Utf8Str('')
-        # TODO: If start > _len or stop >= _len, then raise exception 
 
         if stop > len(self):
             stop = len(self)
+        assert stop >= 0
 
         if self._is_ascii:
             return Utf8Str(self.bytes[start:stop], True)
@@ -155,6 +206,7 @@
         return Utf8Str(self.bytes * count, self._is_ascii)
 
     def __len__(self):
+        assert self._len >= 0
         return self._len
 
     def __hash__(self):
@@ -252,13 +304,12 @@
         else:
             end = self.index_of_char(end)
 
-        assert start >= 0
         return start, end
 
-    @specialize.argtype(2, 3)
+    @specialize.argtype(1, 2, 3)
     def find(self, other, start=None, end=None):
         start, end = self._bound_check(start, end)
-        if start == -1:
+        if start < 0:
             return -1
 
         if isinstance(other, Utf8Str):
@@ -275,17 +326,18 @@
 
         return self.char_index_of_byte(pos)
 
-    @specialize.argtype(2, 3)
+    @specialize.argtype(1, 2, 3)
     def rfind(self, other, start=None, end=None):
         start, end = self._bound_check(start, end)
-        if start == -1:
+        if start < 0:
             return -1
 
         if isinstance(other, Utf8Str):
             pos = self.bytes.rfind(other.bytes, start, end)
         elif isinstance(other, unicode):
             return unicode(self.bytes, 'utf8').rfind(other, start, end)
-        elif isinstance(other, str):
+        else:
+            assert isinstance(other, str)
             pos = self.bytes.rfind(other, start, end)
 
         if pos == -1:
@@ -293,17 +345,18 @@
 
         return self.char_index_of_byte(pos)
 
-    @specialize.argtype(2, 3)
+    @specialize.argtype(1, 2, 3)
     def count(self, other, start=None, end=None):
         start, end = self._bound_check(start, end)
-        if start == -1:
+        if start < 0:
             return 0
 
         if isinstance(other, Utf8Str):
             count = self.bytes.count(other.bytes, start, end)
         elif isinstance(other, unicode):
             return unicode(self.bytes, 'utf8').count(other, start, end)
-        elif isinstance(other, str):
+        else:
+            assert isinstance(other, str)
             count = self.bytes.count(other, start, end)
 
         if count == -1:
@@ -319,7 +372,8 @@
         if other is not None:
             if isinstance(other, str):
                 other_bytes = other
-            if isinstance(other, Utf8Str):
+            else:
+                assert isinstance(other, Utf8Str)
                 other_bytes = other.bytes
             return [Utf8Str(s) for s in self.bytes.split(other_bytes, maxsplit)]
 
@@ -334,6 +388,7 @@
                 break
 
             start_byte = iter.byte_pos
+            assert start_byte >= 0
 
             if maxsplit == 0:
                 res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
@@ -349,8 +404,9 @@
                            self._is_ascii))
                 break
 
-            res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos],
-                               self._is_ascii))
+            end = iter.byte_pos
+            assert end >= 0
+            res.append(Utf8Str(self.bytes[start_byte:end], self._is_ascii))
             maxsplit -= 1
 
         return res
@@ -360,7 +416,8 @@
         if other is not None:
             if isinstance(other, str):
                 other_bytes = other
-            if isinstance(other, Utf8Str):
+            else:
+                assert isinstance(other, Utf8Str)
                 other_bytes = other.bytes
             return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes, maxsplit)]
 
@@ -397,21 +454,22 @@
         res.reverse()
         return res
 
-    @specialize.argtype(1)
+    #@specialize.argtype(1)
     def join(self, other):
         if len(other) == 0:
             return Utf8Str('')
 
         if isinstance(other[0], Utf8Str):
-            return Utf8Str(
-                self.bytes.join([s.bytes for s in other]),
-                self._is_ascii and all(s._is_ascii for s in other)
-            )
+            is_ascii = self._is_ascii
+            if is_ascii:
+                for s in other:
+                    if not s._is_ascii:
+                        is_ascii = False
+                    break
+            return Utf8Str(self.bytes.join([s.bytes for s in other]), is_ascii)
         else:
-            return Utf8Str(
-                self.bytes.join([s for s in other]),
-                self._is_ascii and all(s._is_ascii for s in other)
-            )
+            return Utf8Str(self.bytes.join([s for s in other]))
+    join._annspecialcase_ = 'specialize:arglistitemtype(1)'
 
     def as_unicode(self):
         """NOT_RPYTHON"""
@@ -423,6 +481,7 @@
         return Utf8Str(u.encode('utf-8'))
 
     def next_char(self, byte_pos):
+        assert byte_pos >= 0
         return byte_pos + utf8_code_length[ord(self.bytes[byte_pos])]
 
     def prev_char(self, byte_pos):
@@ -558,6 +617,7 @@
         else:
             self._builder = StringBuilder(init_size)
         self._is_ascii = True
+        self._length = 0
 
 
     @specialize.argtype(1)
@@ -566,9 +626,11 @@
             self._builder.append(c.bytes)
             if not c._is_ascii:
                 self._is_ascii = False
-        elif isinstance(c, int) or isinstance(c, r_uint):
-            if isinstance(c, base_int):
-                c = intmask(c)
+            self._length += len(c)
+
+        elif isinstance(c, int) or isinstance(c, base_int):
+            c = intmask(c)
+
             if c < 0x80:
                 self._builder.append(chr(c))
             elif c < 0x800:
@@ -588,12 +650,19 @@
                 self._is_ascii = False
             else:
                 raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
-        else:
+            self._length += 1
+        elif isinstance(c, str):
             # TODO: Remove this check?
             if len(c) == 1:
                 assert ord(c) < 128
             self._builder.append(c)
 
+            # XXX The assumption here is that the bytes being appended are
+            #     ASCII, ie 1:1 byte:char
+            self._length += len(c)
+        else:
+            raise TypeError()
+
     @specialize.argtype(1)
     def append_slice(self, s, start, end):
         if isinstance(s, str):
@@ -604,6 +673,7 @@
         else:
             raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
                             type(s))
+        self._length += end - start
 
     @specialize.argtype(1)
     def append_multiple_char(self, c, count):
@@ -613,12 +683,14 @@
             self._builder.append_multiple_char(chr(c), count)
             return
 
-        if len(c) > 1:
-            import pdb; pdb.set_trace()
         if isinstance(c, str):
             self._builder.append_multiple_char(c, count)
         else:
             self._builder.append_multiple_char(c.bytes, count)
+        self._length += count
+
+    def getlength(self):
+        return self._length
 
     def build(self):
         return Utf8Str(self._builder.build(), self._is_ascii)
@@ -746,9 +818,10 @@
         return iter
 
 def make_iterator(name, base, calc_value, default):
-    class C(base):
+    class C(object):
+        import_from_mixin(base, ['__init__', '__iter__'])
         _default = default
-        _value = calc_value
+        _value = func_with_new_name(calc_value, '_value')
     C.__name__ = name
     return C
 
@@ -780,3 +853,5 @@
 del ForwardIterBase
 del ReverseIterBase
 
+
+
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -6,7 +6,8 @@
 from rpython.rlib.unicodedata import unicodedb
 from rpython.rlib.runicode import utf8_code_length
 
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord, ORD
+from pypy.interpreter import utf8
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
 
 
 BYTEORDER = sys.byteorder
@@ -416,7 +417,7 @@
                 result.append(rs)
                 continue
             for ch in ru:
-                cd = ORD(ch, 0)
+                cd = utf8.ORD(ch, 0)
                 if cd < limit:
                     result.append(chr(cd))
                 else:
@@ -1293,7 +1294,7 @@
         ch = s[pos]
 
         c = mapping.get(ch, ERROR_CHAR)
-        if c == ERROR_CHAR:
+        if utf8.EQ(c, ERROR_CHAR):
             r, pos = errorhandler(errors, "charmap",
                                   "character maps to <undefined>",
                                   s,  pos, pos + 1)
@@ -1543,7 +1544,7 @@
             # py3k only
             errorhandler('strict', 'decimal', msg, s, collstart, collend)
         for i in range(len(ru)):
-            ch = ORD(ru, i)
+            ch = utf8.ORD(ru, i)
             if unicodedb.isspace(ch):
                 result.append(' ')
                 continue
@@ -1571,16 +1572,16 @@
     if errors == 'replace':
         return _unicode_error_replacement, endingpos
     if errors == 'ignore':
-        return '', endingpos
+        return Utf8Str(''), endingpos
     raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
 _unicode_error_replacement = Utf8Str.from_unicode(u'\ufffd')
 
 def default_unicode_error_encode(errors, encoding, msg, u,
                                  startingpos, endingpos):
     if errors == 'replace':
-        return '?', None, endingpos
+        return Utf8Str('?'), None, endingpos
     if errors == 'ignore':
-        return '', None, endingpos
+        return Utf8Str(''), None, endingpos
 
     if we_are_translated():
         # The constructor for UnicodeEncodeError requires an actual unicode
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -151,7 +151,7 @@
                 return utf8ord(s, 0)
         if (isinstance(w_ob, cdataobj.W_CData) and
                isinstance(w_ob.ctype, W_CTypePrimitiveUniChar)):
-            return rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0]
+            return intmask(rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0])
         raise self._convert_error("unicode string of length 1", w_ob)
 
     def convert_from_object(self, cdata, w_ob):
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -3,6 +3,7 @@
 from rpython.rlib.rstring import UnicodeBuilder
 from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
 
+from pypy.interpreter import utf8
 from pypy.interpreter.utf8 import Utf8Builder, Utf8Str, utf8chr, utf8ord
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -213,7 +214,7 @@
         text = utf8chr(0xfffd)
         return space.newtuple([space.wrap(text), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
-        text = utf8chr(0xfffd) * size
+        text = utf8.MUL(utf8chr(0xfffd), size)
         return space.newtuple([space.wrap(text), w_end])
     else:
         raise oefmt(space.w_TypeError,
@@ -264,7 +265,7 @@
             lnum = len(num)
             nb = zeros + 2 - lnum # num starts with '0x'
             if nb > 0:
-                builder.append_multiple_char(u'0', nb)
+                builder.append_multiple_char('0', nb)
             builder.append_slice(num, 2, lnum)
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
@@ -678,7 +679,7 @@
     string = space.readbuf_w(w_string).as_str()
 
     if len(string) == 0:
-        return space.newtuple([space.wrap(u''), space.wrap(0)])
+        return space.newtuple([space.wrap(Utf8Str('')), space.wrap(0)])
 
     final = True
     state = space.fromcache(CodecState)
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -111,7 +111,8 @@
 
     def resize_buffer(self, newlength):
         if len(self.buf) > newlength:
-            self.buf = self.buf[:newlength]
+            assert newlength >= 0
+            self.buf = self.buf[0:newlength]
         if len(self.buf) < newlength:
             self.buf.extend([Utf8Str('\0')] * (newlength - len(self.buf)))
 
@@ -190,8 +191,9 @@
             endpos += start
         else:
             endpos = end
+        self.pos = endpos
+        assert start >= 0
         assert endpos >= 0
-        self.pos = endpos
         return space.wrap(Utf8Str("").join(self.buf[start:endpos]))
 
     @unwrap_spec(pos=int, mode=int)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -6,6 +6,7 @@
 from pypy.interpreter.typedef import (
     GetSetProperty, TypeDef, generic_new_descr, interp_attrproperty,
     interp_attrproperty_w)
+from pypy.interpreter import utf8
 from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8ord
 from pypy.module._codecs import interp_codecs
 from pypy.module._io.interp_iobase import W_IOBase, convert_size, trap_eintr
@@ -76,7 +77,7 @@
         output = space.unicode_w(w_output)
         output_len = len(output)
         if self.pendingcr and (final or output_len):
-            output = Utf8Str('\r') + output
+            output = utf8.ADD(Utf8Str('\r'), output)
             self.pendingcr = False
             output_len += 1
 
@@ -85,7 +86,7 @@
         if not final and output_len > 0:
             last = output_len - 1
             assert last >= 0
-            if output[last] == Utf8Str('\r'):
+            if utf8ord(output, last) == ord('\r'):
                 output = output[:last]
                 self.pendingcr = True
                 output_len -= 1
@@ -101,7 +102,7 @@
         # for the \r
         only_lf = False
         if seennl == SEEN_LF or seennl == 0:
-            only_lf = (output.find(Utf8Str('\r')) < 0)
+            only_lf = (output.find('\r') < 0)
 
         if only_lf:
             # If not already seen, quick scan for a possible "\n" character.
@@ -371,8 +372,9 @@
             newline = None
         else:
             newline = space.unicode_w(w_newline)
-        if newline and newline not in (Utf8Str('\n'), Utf8Str('\r\n'),
-                                       Utf8Str('\r')):
+        if newline and not (utf8.EQ(newline, Utf8Str('\n')) or
+            utf8.EQ(newline, Utf8Str('\r\n')) or
+            utf8.EQ(newline, Utf8Str('\r'))):
             r = space.str_w(space.repr(w_newline))
             raise OperationError(space.w_ValueError, space.wrap(
                 "illegal newline value: %s" % (r,)))
@@ -386,7 +388,7 @@
         self.writetranslate = (newline is None or len(newline) == 0)
         if not self.readuniversal:
             self.writenl = self.readnl
-            if self.writenl == Utf8Str('\n'):
+            if utf8.EQ(self.writenl, Utf8Str('\n')):
                 self.writenl = None
         elif _WINDOWS:
             self.writenl = Utf8Str("\r\n")
@@ -662,7 +664,7 @@
                 offset_to_buffer = 0
             else:
                 assert self.decoded_chars_used == 0
-                line = remaining + self.decoded_chars
+                line = utf8.ADD(remaining, self.decoded_chars)
                 start = 0
                 offset_to_buffer = len(remaining)
                 remaining = None
diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -3,6 +3,7 @@
 
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.utf8 import Utf8Str
 
 from rpython.rlib import rlocale
 from pypy.module.exceptions.interp_exceptions import _new_exception, W_Exception
@@ -136,8 +137,8 @@
 
     s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
 
-    s1_c = rffi.unicode2wcharp(s1)
-    s2_c = rffi.unicode2wcharp(s2)
+    s1_c = Utf8Str.copy_to_new_wcharp(s1)
+    s2_c = Utf8Str.copy_to_new_wcharp(s2)
     try:
         result = _wcscoll(s1_c, s2_c)
     finally:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -6,6 +6,8 @@
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.interpreter import utf8
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module._codecs.interp_codecs import CodecState
 
 
@@ -87,7 +89,7 @@
 
     def _initialize(self):
         self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
-        self.pending = u""
+        self.pending = Utf8Str("")
 
     def _free(self):
         self.pending = None
@@ -100,7 +102,7 @@
         space = self.space
         state = space.fromcache(CodecState)
         if len(self.pending) > 0:
-            object = self.pending + object
+            object = utf8.ADD(self.pending, object)
         try:
             output = c_codecs.encodeex(self.encodebuf, object, self.errors,
                                        state.encode_error_handler, self.name,
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,9 +1,10 @@
 import sys
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib import rfloat, runicode
+from rpython.rlib import rfloat
 from rpython.rtyper.lltypesystem import lltype, rffi
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8chr
 from pypy.interpreter import unicodehelper
 
 OVF_DIGITS = len(str(sys.maxint))
@@ -30,6 +31,7 @@
 
     No bound checking is done, use carefully.
     """
+    '''
     from rpython.rtyper.annlowlevel import llstr, hlunicode
     from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
     from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
@@ -41,6 +43,12 @@
         ch = ll_s.chars[start+i]
         ll_res.chars[i] = cast_primitive(UniChar, ch)
     return hlunicode(ll_res)
+    '''
+    # TODO: Actually do this without slicing
+    from pypy.interpreter.utf8_codecs import str_decode_latin_1
+    assert start >= 0
+    assert end >= 0
+    return str_decode_latin_1(s[start:end], end - start, 'strict')[0]
 
 TYPE_UNKNOWN = 0
 TYPE_STRING = 1
@@ -369,7 +377,7 @@
             return # help the annotator to know that we'll never go beyond
                    # this point
         #
-        uchr = runicode.code_to_unichr(val)     # may be a surrogate pair again
+        uchr = utf8chr(val)     # may be a surrogate pair again
         utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
         builder.append(utf8_ch)
         return i
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -415,7 +415,6 @@
                 "Expected unicode string of length one as wide character"))
 
         val = utf8ord(s)
-        #val = 0
         if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
             # Utf-16 must be used on systems with a 2 byte wchar_t to
             # encode codepoints > 0xFFFF
@@ -597,7 +596,7 @@
 def wcharp2rawunicode(space, address, maxlength=-1):
     if maxlength == -1:
         return wcharp2unicode(space, address)
-    s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
+    s = Utf8Str.from_wcharpsize(rffi.cast(rffi.CWCHARP, address), maxlength)
     return space.wrap(s)
 
 @unwrap_spec(address=r_uint, newcontent='bufferstr')
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -5,6 +5,7 @@
 from pypy.interpreter.typedef import make_weakref_descr
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError
+from pypy.interpreter import utf8
 from pypy.interpreter.utf8 import Utf8Str, utf8ord
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit
@@ -121,6 +122,8 @@
                 pos = len(unicodestr)
             if endpos > len(unicodestr):
                 endpos = len(unicodestr)
+            assert pos >= 0
+            assert endpos >= 0
             return rsre_core.UnicodeMatchContext(self.code, unicodestr,
                                                  pos, endpos, self.flags)
         else:
@@ -232,7 +235,7 @@
         else:
             if space.isinstance_w(w_ptemplate, space.w_unicode):
                 filter_as_unicode = space.unicode_w(w_ptemplate)
-                literal = u'\\' not in filter_as_unicode
+                literal = utf8.IN('\\', filter_as_unicode)
             else:
                 try:
                     filter_as_string = space.str_w(w_ptemplate)
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,5 @@
 from pypy.interpreter.error import OperationError
-from pypy.interpreter.utf8 import Utf8Str
+from pypy.interpreter.utf8 import Utf8Str, utf8chr
 from pypy.interpreter import utf8_codecs
 from rpython.rtyper.lltypesystem import rffi, lltype
 from pypy.module.unicodedata import unicodedb
@@ -138,17 +138,17 @@
 @cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
 def Py_UNICODE_TOLOWER(space, ch):
     """Return the character ch converted to lower case."""
-    return unichr(unicodedb.tolower(ord(ch)))
+    return utf8chr(unicodedb.tolower(ord(ch)))
 
 @cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
 def Py_UNICODE_TOUPPER(space, ch):
     """Return the character ch converted to upper case."""
-    return unichr(unicodedb.toupper(ord(ch)))
+    return utf8chr(unicodedb.toupper(ord(ch)))
 
 @cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
 def Py_UNICODE_TOTITLE(space, ch):
     """Return the character ch converted to title case."""
-    return unichr(unicodedb.totitle(ord(ch)))
+    return utf8chr(unicodedb.totitle(ord(ch)))
 
 @cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
 def Py_UNICODE_TODECIMAL(space, ch):
@@ -331,7 +331,7 @@
     Therefore, modification of the resulting Unicode object is only allowed when u
     is NULL."""
     if wchar_p:
-        s = rffi.Utf8Str.from_wcharpsize(wchar_p, length)
+        s = Utf8Str.from_wcharpsize(wchar_p, length)
         return make_ref(space, space.wrap(s))
     else:
         return rffi.cast(PyObject, new_empty_unicode(space, length))
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -77,6 +77,7 @@
     descr_set_dict, descr_del_dict)
 from pypy.interpreter.gateway import interp2app
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.utf8 import Utf8Str
 from rpython.rlib import rwin32
 
 
@@ -126,7 +127,7 @@
             return space.call_function(space.w_unicode, w_as_str)
         lgt = len(self.args_w)
         if lgt == 0:
-            return space.wrap(u"")
+            return space.wrap(Utf8Str(""))
         if lgt == 1:
             return space.call_function(space.w_unicode, self.args_w[0])
         else:
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -2,6 +2,7 @@
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import ORD
 from rpython.rlib import rgc, jit
 from rpython.rtyper.lltypesystem import rffi, lltype
 from rpython.rtyper.tool import rffi_platform
@@ -589,8 +590,8 @@
                         "multi-byte encodings are not supported")
 
         for i in range(256):
-            c = translationmap[i]
-            if c == u'\ufffd':
+            c = ORD(translationmap, i)
+            if c == 0xFFFD:
                 info.c_map[i] = rffi.cast(rffi.INT, -1)
             else:
                 info.c_map[i] = rffi.cast(rffi.INT, c)
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -6,7 +6,7 @@
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.interpreter.error import OperationError
 from pypy.interpreter.typedef import TypeDef, interp_attrproperty
-from pypy.interpreter.utf8 import utf8chr
+from pypy.interpreter.utf8 import Utf8Str, utf8chr
 from rpython.rlib.rarithmetic import r_longlong
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
@@ -225,10 +225,12 @@
                 result[0] = ch
 
         if not composed: # If decomposed normalization we are done
-            return space.wrap(u''.join([unichr(i) for i in result[:j]]))
+            return space.wrap(Utf8Str('').join(
+                    [utf8chr(i) for i in result[:j]]))
 
         if j <= 1:
-            return space.wrap(u''.join([unichr(i) for i in result[:j]]))
+            return space.wrap(Utf8Str('').join(
+                    [utf8chr(i) for i in result[:j]]))
 
         current = result[0]
         starter_pos = 0
@@ -275,7 +277,8 @@
 
         result[starter_pos] = current
 
-        return space.wrap(u''.join([unichr(i) for i in result[:next_insert]]))
+        return space.wrap(Utf8Str('').join(
+                    [utf8chr(i) for i in result[:next_insert]]))
 
 
 methods = {}
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -9,7 +9,7 @@
 from rpython.rlib.rarithmetic import INT_MAX
 from rpython.tool.sourcetools import func_with_new_name
 from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.utf8 import Utf8Builder, ORD
+from pypy.interpreter.utf8 import Utf8Builder, ORD, utf8chr
 
 
 class BaseStringFormatter(object):
@@ -156,11 +156,6 @@
     # to build two subclasses of the BaseStringFormatter class,
     # each one getting its own subtle differences and RPython types.
 
-    if do_unicode:
-        const = unicode
-    else:
-        const = str
-
     class StringFormatter(BaseStringFormatter):
         def __init__(self, space, fmt, values_w, w_valuedict):
             BaseStringFormatter.__init__(self, space, values_w, w_valuedict)
@@ -365,6 +360,7 @@
                 return
             if prec >= 0 and prec < length:
                 length = prec   # ignore the end of the string if too long
+
             result = self.result
             padding = self.width - length
             if padding < 0:
@@ -475,7 +471,7 @@
                 n = space.int_w(w_value)
                 if do_unicode:
                     try:
-                        c = unichr(n)
+                        c = utf8chr(n)
                     except ValueError:
                         raise OperationError(space.w_OverflowError,
                             space.wrap("unicode character code out of range"))
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -681,8 +681,12 @@
                     buf.append(c)
             for i in range(d_state - 1, d_state - n_chars - 1, -1):
                 buf.append(digits[i])
+
+            zero = "0"
+            if self.is_unicode:
+                zero = Utf8Str("0")
             for i in range(n_zeros):
-                buf.append("0")
+                buf.append(zero)
 
         def _group_digits(self, spec, digits):
             buf = []
@@ -727,9 +731,12 @@
         def _upcase_string(self, s):
             buf = []
             for c in s:
-                index = ord(c)
+                index = ORD(c, 0)
                 if ord("a") <= index <= ord("z"):
-                    c = chr(index - 32)
+                    if self.is_unicode:
+                        c = utf8chr(index - 32)
+                    else:
+                        c = chr(index - 32)
                 buf.append(c)
             return self.empty.join(buf)
 
@@ -1061,7 +1068,7 @@
             tmp_align = self._align
             tmp_width = self._width
             self._fill_char = ord("\0")
-            self._align = "<"
+            self._align = ord("<")
             self._width = -1
 
             #determine if we have remainder, might include dec or exponent or both
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -9,7 +9,7 @@
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
-from pypy.interpreter.utf8 import ORD
+from pypy.interpreter import utf8
 from pypy.objspace.std import slicetype
 from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
 
@@ -29,6 +29,8 @@
         lenself = len(value)
         start, end = slicetype.unwrap_start_stop(
             space, lenself, w_start, w_end, upper_bound=upper_bound)
+        assert start >= 0
+        assert end >= 0
         return (value, start, end)
 
     def _multi_chr(self, c):
@@ -64,7 +66,7 @@
                 if e.match(space, space.w_TypeError):
                     return space.w_NotImplemented
                 raise
-            return self._new(self._val(space) + other)
+            return self._new(utf8.ADD(self._val(space), other))
 
         # Bytearray overrides this method, CPython doesn't support contacting
         # buffers and strs, and unicodes are always handled above
@@ -80,8 +82,9 @@
         if times <= 0:
             return self._empty()
         if self._len() == 1:
-            return self._new(self._multi_chr(self._val(space)[0]) * times)
-        return self._new(self._val(space) * times)
+            return self._new(utf8.MUL(self._multi_chr(self._val(space)[0]),
+                                      times))
+        return self._new(utf8.MUL(self._val(space), times))
 
     descr_rmul = descr_mul
 
@@ -142,7 +145,9 @@
         if d > 0:
             offset = d//2 + (d & width & 1)
             fillchar = self._multi_chr(fillchar[0])
-            centered = fillchar * offset + value + fillchar * (d - offset)
+            #centered = fillchar * offset + value + fillchar * (d - offset)
+            centered = utf8.ADD(utf8.ADD(utf8.MUL(fillchar, offset), value),
+                        utf8.MUL(fillchar, (d - offset)))
         else:
             centered = value
 
@@ -204,8 +209,11 @@
         expanded = oldtoken = splitted.pop(0)
 
         for token in splitted:
-            expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
-                                                         tabsize) + token
+            #expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
+            #                                             tabsize) + token
+            m = utf8.MUL(self._multi_chr(' '),
+                         self._tabindent(oldtoken, tabsize))
+            expanded = utf8.ADD(expanded, utf8.ADD(m, token))
             oldtoken = token
 
         return self._new(expanded)
@@ -219,8 +227,8 @@
             offset = len(token)
 
             while 1:
-                if (ORD(token, offset-1) == ord("\n") or
-                    ORD(token, offset-1) == ord("\r")):
+                if (utf8.ORD(token, offset-1) == ord("\n") or
+                    utf8.ORD(token, offset-1) == ord("\r")):
                     break
                 distance += 1
                 offset -= 1
@@ -457,7 +465,8 @@
         d = width - len(value)
         if d > 0:
             fillchar = self._multi_chr(fillchar[0])
-            value += fillchar * d
+            #value += fillchar * d
+            value = utf8.ADD(value, utf8.MUL(fillchar, d))
 
         return self._new(value)
 
@@ -471,7 +480,8 @@
         d = width - len(value)
         if d > 0:
             fillchar = self._multi_chr(fillchar[0])
-            value = fillchar * d + value
+            #value = fillchar * d + value
+            value = utf8.ADD(utf8.MUL(fillchar, d), value)
 
         return self._new(value)
 
@@ -606,8 +616,8 @@
             eol = pos
             pos += 1
             # read CRLF as one line break
-            if (pos < length and ORD(value, eol) == ord('\r') and
-                                 ORD(value, pos) == ord('\n')):
+            if (pos < length and utf8.ORD(value, eol) == ord('\r') and
+                                 utf8.ORD(value, pos) == ord('\n')):
                 pos += 1
             if keepends:
                 eol = pos
@@ -768,15 +778,16 @@
     def descr_zfill(self, space, width):
         selfval = self._val(space)
         if len(selfval) == 0:
-            return self._new(self._multi_chr('0') * width)
+            #return self._new(self._multi_chr('0') * width)
+            return self._new(utf8.MUL(self._multi_chr('0'), width))
         num_zeros = width - len(selfval)
         if num_zeros <= 0:
             # cannot return self, in case it is a subclass of str
             return self._new(selfval)
 
         builder = self._builder(width)
-        if len(selfval) > 0 and (ORD(selfval, 0) == ord('+') or
-                                 ORD(selfval, 0) == ord('-')):
+        if len(selfval) > 0 and (utf8.ORD(selfval, 0) == ord('+') or
+                                 utf8.ORD(selfval, 0) == ord('-')):
             # copy sign to first position
             builder.append(selfval[0])
             start = 1
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -8,6 +8,7 @@
 
 from pypy.interpreter import unicodehelper
 from pypy.interpreter.baseobjspace import W_Root
+from pypy.interpreter import utf8
 from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
 from pypy.interpreter.utf8_codecs import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
@@ -91,7 +92,7 @@
         return W_UnicodeObject(value)
 
     def _new_from_list(self, value):
-        return W_UnicodeObject(u''.join(value))
+        return W_UnicodeObject(Utf8Str('').join(value))
 
     def _empty(self):
         return W_UnicodeObject.EMPTY
@@ -109,12 +110,21 @@
 
     @staticmethod
     def _op_val(space, w_other):
+        if space.isinstance_w(w_other, space.w_str):
+            w_other = unicode_from_string(space, w_other)
+        elif not isinstance(w_other, W_UnicodeObject):
+            w_other = unicode_from_encoded_object(
+                space, w_other, None, "strict")
+        assert isinstance(w_other, W_UnicodeObject)
+        return w_other._value
+        '''
         if isinstance(w_other, W_UnicodeObject):
             return w_other._value
         if space.isinstance_w(w_other, space.w_str):
             return unicode_from_string(space, w_other)._value
         return unicode_from_encoded_object(
             space, w_other, None, "strict")._value
+        '''
 
     def _chr(self, char):
         assert len(char) == 1
@@ -228,7 +238,7 @@
 
     def descr_eq(self, space, w_other):
         try:
-            res = self._val(space) == self._op_val(space, w_other)
+            res = self._val(space).__eq__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -244,7 +254,7 @@
 
     def descr_ne(self, space, w_other):
         try:
-            res = self._val(space) != self._op_val(space, w_other)
+            res = self._val(space).__ne__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -260,7 +270,7 @@
 
     def descr_lt(self, space, w_other):
         try:
-            res = self._val(space) < self._op_val(space, w_other)
+            res = self._val(space).__lt__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -269,7 +279,7 @@
 
     def descr_le(self, space, w_other):
         try:
-            res = self._val(space) <= self._op_val(space, w_other)
+            res = self._val(space).__le__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -278,7 +288,7 @@
 
     def descr_gt(self, space, w_other):
         try:
-            res = self._val(space) > self._op_val(space, w_other)
+            res = self._val(space).__gt__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
@@ -287,7 +297,7 @@
 
     def descr_ge(self, space, w_other):
         try:
-            res = self._val(space) >= self._op_val(space, w_other)
+            res = self._val(space).__ge__(self._op_val(space, w_other))
         except OperationError as e:
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
diff --git a/pypy/tool/ann_override.py b/pypy/tool/ann_override.py
--- a/pypy/tool/ann_override.py
+++ b/pypy/tool/ann_override.py
@@ -21,12 +21,16 @@
 
     def specialize__wrap(pol,  funcdesc, args_s):
         from pypy.interpreter.baseobjspace import W_Root
+        from pypy.interpreter.utf8 import Utf8Str
         from rpython.annotator.classdef import ClassDef
         W_Root_def = funcdesc.bookkeeper.getuniqueclassdef(W_Root)
         typ = args_s[1].knowntype
         if isinstance(typ, ClassDef):
-            assert typ.issubclass(W_Root_def)
-            typ = W_Root
+            if typ.issubclass(W_Root_def):
+                typ = W_Root
+            else:
+                assert typ.classdesc.pyobj is Utf8Str
+                typ = Utf8Str
         else:
             assert not issubclass(typ, W_Root)
             assert typ != tuple, "space.wrap(tuple) forbidden; use newtuple()"
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy utf8-unicode2: WIP fixing translation

Reply via email to