[pypy-commit] pypy unicode-utf8: whack whack whack until we get to the point of getitem working

fijal Sat, 07 Oct 2017 06:11:51 -0700

Author: fijal
Branch: unicode-utf8
Changeset: r92623:ecf3b7cd79eb
Date: 2017-10-06 12:14 +0200
http://bitbucket.org/pypy/pypy/changeset/ecf3b7cd79eb/


Log:    whack whack whack until we get to the point of getitem working

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,2 +1,3 @@
 * unskip tests in test_unicodeobject.py
-* rutf8.prev_codepoint_pos should use r_uint
\ No newline at end of file
+* rutf8.prev_codepoint_pos should use r_uint
+* elidable in rutf8.check_utf8, WTF is wrong with that
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -126,8 +126,7 @@
         self.orig = handler
 
     def handle(self, errors, encoding, msg, s, pos, endpos):
-        s, p = self.orig(errors, encoding, msg, s, pos, endpos)
-        return s.decode("utf8"), p
+        return self.orig(errors, encoding, msg, s, pos, endpos)
 
 class EncodeWrapper(object):
     def __init__(self, handler):
@@ -145,7 +144,8 @@
 
 def str_decode_unicode_escape(s, slen, errors, final, errorhandler, ud_handler):
     w = DecodeWrapper(errorhandler)
-    u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final, w.handle,
+    u, pos = runicode.str_decode_unicode_escape(s, slen, errors, final,
+                                                w.handle,
                                                 ud_handler)
     return u.encode('utf8'), pos, len(u)
 
@@ -159,7 +159,7 @@
         return getattr(runicode, encoder_call_name)(u, len(u), errors,
                        w.handle)
     def decoder(s, slen, errors, final, errorhandler):
-        w = DecodeWrapper(errorhandler)
+        w = DecodeWrapper((errorhandler))
         u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle)
         return u.encode('utf8'), pos, len(u)
     encoder.__name__ = encoder_name
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -66,7 +66,7 @@
                             "position %d from error handler out of bounds",
                             newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
-            return w_replace._utf8, newpos
+            return w_replace._utf8.decode('utf8'), newpos
         return call_errorhandler
 
     def make_decode_errorhandler(self, space):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -38,8 +38,8 @@
         self._utf8 = utf8str
         self._length = length
         self._index_storage = rutf8.null_storage()
-        if not we_are_translated():
-            assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
+        #if not we_are_translated():
+        #    assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
 
     def __repr__(self):
         """representation for debugging purposes"""
diff --git a/rpython/jit/codewriter/effectinfo.py b/rpython/jit/codewriter/effectinfo.py
--- a/rpython/jit/codewriter/effectinfo.py
+++ b/rpython/jit/codewriter/effectinfo.py
@@ -6,6 +6,10 @@
 from rpython.tool.algo import bitstring
 
 
+class UnsupportedFieldExc(Exception):
+    pass
+
+
 class EffectInfo(object):
     _cache = {}
 
@@ -313,7 +317,10 @@
                 return
             if getattr(T.OF, fieldname) is lltype.Void:
                 return
-            descr = cpu.interiorfielddescrof(T, fieldname)
+            try:
+                descr = cpu.interiorfielddescrof(T, fieldname)
+            except UnsupportedFieldExc:
+                return
             descrs_interiorfields.append(descr)
 
         # a read or a write to an interiorfield, inside an array of
diff --git a/rpython/jit/codewriter/heaptracker.py b/rpython/jit/codewriter/heaptracker.py
--- a/rpython/jit/codewriter/heaptracker.py
+++ b/rpython/jit/codewriter/heaptracker.py
@@ -94,6 +94,7 @@
 
 def all_interiorfielddescrs(gccache, ARRAY, get_field_descr=None):
     from rpython.jit.backend.llsupport import descr
+    from rpython.jit.codewriter.effectinfo import UnsupportedFieldExc
 
     if get_field_descr is None:
         get_field_descr = descr.get_field_descr
@@ -107,7 +108,7 @@
         if name == 'typeptr':
             continue # dealt otherwise
         elif isinstance(FIELD, lltype.Struct):
-            raise Exception("unexpected array(struct(struct))")
+            raise UnsupportedFieldExc("unexpected array(struct(struct))")
         res.append(get_field_descr(gccache, ARRAY, name))
     return res
 
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -94,18 +94,20 @@
 
 def default_unicode_error_decode(errors, encoding, msg, s,
                                  startingpos, endingpos):
+    assert endingpos >= 0
     if errors == 'replace':
-        return u'\ufffd'.encode('utf8'), endingpos
+        return u'\ufffd', endingpos
     if errors == 'ignore':
-        return '', endingpos
+        return u'', endingpos
     raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
 
 def default_unicode_error_encode(errors, encoding, msg, u,
                                  startingpos, endingpos):
+    assert endingpos >= 0
     if errors == 'replace':
-        return '?', None, endingpos
+        return u'?', None, endingpos
     if errors == 'ignore':
-        return '', None, endingpos
+        return u'', None, endingpos
     raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
 
 # ____________________________________________________________
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -191,7 +191,7 @@
     def __init__(self, pos):
         self.pos = pos
 
-@jit.elidable
+#@jit.elidable
 def check_ascii(s):
     for i in range(len(s)):
         if ord(s[i]) > 0x7F:
@@ -289,12 +289,14 @@
             (ordch1 == 0xf4 and ordch2 > 0x8f))
 
 
-@jit.elidable
+#@jit.elidable
 def check_utf8(s, allow_surrogates=False):
     """Check that 's' is a utf-8-encoded byte string.
     Returns the length (number of chars) or raise CheckError.
     Note that surrogates are not handled specially here.
     """
+    import pdb
+    pdb.set_trace()
     pos = 0
     continuation_bytes = 0
     while pos < len(s):
@@ -416,6 +418,7 @@
         break
     return storage
 
+@jit.dont_look_inside
 def codepoint_position_at_index(utf8, storage, index):
     """ Return byte index of a character inside utf8 encoded string, given
     storage of type UTF8_INDEX_STORAGE.  The index must be smaller than
@@ -436,6 +439,7 @@
     else:
         return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
 
+@jit.dont_look_inside
 def codepoint_at_index(utf8, storage, index):
     """ Return codepoint of a character inside utf8 encoded string, given
     storage of type UTF8_INDEX_STORAGE
diff --git a/rpython/rtyper/rstr.py b/rpython/rtyper/rstr.py
--- a/rpython/rtyper/rstr.py
+++ b/rpython/rtyper/rstr.py
@@ -8,7 +8,88 @@
 from rpython.rtyper.rint import IntegerRepr
 from rpython.rtyper.rfloat import FloatRepr
 from rpython.tool.pairtype import pairtype, pair
-from rpython.tool.sourcetools import func_with_new_name
+
+def str_decode_utf8(s):
+    from rpython.rlib.rstring import UnicodeBuilder
+    from rpython.rlib import runicode
+
+    size = len(s)
+    if size == 0:
+        return u''
+
+    result = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ordch1 = ord(s[pos])
+        # fast path for ASCII
+        # XXX maybe use a while loop here
+        if ordch1 < 0x80:
+            result.append(unichr(ordch1))
+            pos += 1
+            continue
+
+        n = ord(runicode._utf8_code_length[ordch1 - 0x80])
+        if pos + n > size:
+            raise UnicodeDecodeError('utf8', s, pos, pos + 1,
+                                      'whatever')
+        if n == 0:
+            raise UnicodeDecodeError('utf8', s, pos, pos + 1,
+                                     'whatever')
+        elif n == 1:
+            assert 0, "ascii should have gone through the fast path"
+
+        elif n == 2:
+            ordch2 = ord(s[pos+1])
+            if runicode._invalid_byte_2_of_2(ordch2):
+
+                raise UnicodeDecodeError('utf8', s, pos, pos + 1,
+                                         'whatever')
+            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+            result.append(unichr(((ordch1 & 0x1F) << 6) +    # 0b00011111
+                                 (ordch2 & 0x3F)))           # 0b00111111
+            pos += 2
+
+        elif n == 3:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            if (runicode._invalid_byte_2_of_3(ordch1, ordch2, True) or
+                runicode._invalid_byte_3_of_3(ordch3)):
+                raise UnicodeDecodeError('utf8', s, pos, pos + 1,
+                                         'whatever')
+            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+            result.append(unichr(((ordch1 & 0x0F) << 12) +     # 0b00001111
+                                 ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                                 (ordch3 & 0x3F)))             # 0b00111111
+            pos += 3
+
+        elif n == 4:
+            ordch2 = ord(s[pos+1])
+            ordch3 = ord(s[pos+2])
+            ordch4 = ord(s[pos+3])
+            if (runicode._invalid_byte_2_of_4(ordch1, ordch2) or
+                runicode._invalid_byte_3_of_4(ordch3) or
+                runicode._invalid_byte_4_of_4(ordch4)):
+
+                raise UnicodeDecodeError('utf8', s, pos, pos + 1,
+                                         'whatever')
+            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+            c = (((ordch1 & 0x07) << 18) +      # 0b00000111
+                 ((ordch2 & 0x3F) << 12) +      # 0b00111111
+                 ((ordch3 & 0x3F) << 6) +       # 0b00111111
+                 (ordch4 & 0x3F))               # 0b00111111
+            if c <= runicode.MAXUNICODE:
+                result.append(runicode.UNICHR(c))
+            else:
+                # compute and append the two surrogates:
+                # translate from 10000..10FFFF to 0..FFFF
+                c -= 0x10000
+                # high surrogate = top 10 bits added to D800
+                result.append(unichr(0xD800 + (c >> 10)))
+                # low surrogate = bottom 10 bits added to DC00
+                result.append(unichr(0xDC00 + (c & 0x03FF)))
+            pos += 4
+
+    return result.build()
 
 
 class AbstractStringRepr(Repr):
@@ -16,13 +97,10 @@
     @jit.elidable
     def ll_decode_utf8(self, llvalue):
         from rpython.rtyper.annlowlevel import hlstr
-        from rpython.rlib import runicode
         value = hlstr(llvalue)
         assert value is not None
-        errorhandler = runicode.default_unicode_error_decode
         # NB. keep the arguments in sync with annotator/unaryop.py
-        u, pos = runicode.str_decode_utf_8_elidable(
-            value, len(value), 'strict', True, errorhandler, True)
+        u = str_decode_utf8(value)
         # XXX maybe the whole ''.decode('utf-8') should be not RPython.
         return self.ll.llunicode(u)
 
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: whack whack whack until we get to the point of getitem working

Reply via email to