[pypy-commit] pypy unicode-utf8: whack whack whack whack. I hate RPython

fijal Sat, 07 Oct 2017 06:11:41 -0700

Author: fijal
Branch: unicode-utf8
Changeset: r92615:0379d71a32bf
Date: 2017-10-05 18:40 +0200
http://bitbucket.org/pypy/pypy/changeset/0379d71a32bf/


Log:    whack whack whack whack. I hate RPython

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,1 +1,2 @@
 * unskip tests in test_unicodeobject.py
+* rutf8.prev_codepoint_pos should use r_uint
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -70,7 +70,7 @@
     try:
         length = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError as e:
-        XXX
+        raise Exception("foo")
         decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
                                     e.endpos)
         raise False, "unreachable"
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -37,7 +37,7 @@
         assert length >= 0
         self._utf8 = utf8str
         self._length = length
-        self._index_storage = None
+        self._index_storage = rutf8.null_storage()
         if not we_are_translated():
             assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
 
@@ -521,6 +521,8 @@
                 if keepends:
                     eol = pos
                     lgt += line_end_chars
+            assert eol >= 0
+            assert sol >= 0
             strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
         return space.newlist(strs_w)
 
@@ -636,7 +638,7 @@
     def _getitem_result(self, space, index):
         if index >= self._length:
             raise oefmt(space.w_IndexError, "string index out of range")
-        if self._index_storage is None:
+        if self._index_storage == rutf8.null_storage():
             self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
                 self._length)
         start = rutf8.codepoint_position_at_index(self._utf8,
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -7,7 +7,7 @@
 from rpython.rtyper.llannotation import SomePtr
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import newlist_hint, resizelist_hint, specialize, not_rpython
-from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH
+from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH, intmask
 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rtyper.extregistry import ExtRegistryEntry
 from rpython.tool.pairtype import pairtype
@@ -32,7 +32,9 @@
     if isutf8:
         from rpython.rlib.rutf8 import next_codepoint_pos
         assert pos >= 0
-        return next_codepoint_pos(s, pos)
+        r = next_codepoint_pos(s, pos)
+        assert r >= 0
+        return r
     else:
         return pos + 1
 
@@ -42,7 +44,7 @@
         from rpython.rlib.rutf8 import prev_codepoint_pos
         if pos <= 0:
             return -1
-        return prev_codepoint_pos(s, pos)
+        return intmask(prev_codepoint_pos(s, pos))
     else:
         return pos - 1
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -18,7 +18,7 @@
 from rpython.rlib.objectmodel import enforceargs
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rtyper.lltypesystem import lltype
 
 
@@ -81,6 +81,7 @@
     Assumes valid utf8.  'pos' must be before the end of the string.
     """
     chr1 = ord(code[pos])
+    assert pos >= 0
     if chr1 <= 0x7F:
         return pos + 1
     if chr1 <= 0xDF:
@@ -93,20 +94,24 @@
     """Gives the position of the previous codepoint.
     'pos' must not be zero.
     """
-    pos = r_uint(pos)
-    pos -= 1
+    pos -= 1 # ruint
     if pos >= len(code):     # for the case where pos - 1 == len(code):
+        assert pos >= 0
         return pos           # assume there is an extra '\x00' character
     chr1 = ord(code[pos])
     if chr1 <= 0x7F:
+        assert pos >= 0
         return pos
     pos -= 1
     if ord(code[pos]) >= 0xC0:
+        assert pos >= 0
         return pos
     pos -= 1
     if ord(code[pos]) >= 0xC0:
+        assert pos >= 0
         return pos
     pos -= 1
+    assert pos >= 0
     return pos
 
 def compute_length_utf8(s):
@@ -375,6 +380,9 @@
     for _j in range(16):
         ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
 
+def null_storage():
+    return lltype.nullptr(UTF8_INDEX_STORAGE)
+
 def create_utf8_index_storage(utf8, utf8len):
     """ Create an index storage which stores index of each 4th character
     in utf8 encoded unicode string.
@@ -421,6 +429,7 @@
     if index == 0:
         return prev_codepoint_pos(utf8, bytepos)
     elif index == 1:
+        assert bytepos >= 0
         return bytepos
     elif index == 2:
         return next_codepoint_pos(utf8, bytepos)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: whack whack whack whack. I hate RPython

Reply via email to