Author: fijal
Branch: unicode-utf8
Changeset: r92615:0379d71a32bf
Date: 2017-10-05 18:40 +0200
http://bitbucket.org/pypy/pypy/changeset/0379d71a32bf/
Log: whack whack whack whack. I hate RPython
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,1 +1,2 @@
* unskip tests in test_unicodeobject.py
+* rutf8.prev_codepoint_pos should use r_uint
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -70,7 +70,7 @@
try:
length = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError as e:
- XXX
+ raise Exception("foo")
decode_error_handler(space)('strict', 'utf8', e.msg, string, e.startpos,
e.endpos)
raise False, "unreachable"
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -37,7 +37,7 @@
assert length >= 0
self._utf8 = utf8str
self._length = length
- self._index_storage = None
+ self._index_storage = rutf8.null_storage()
if not we_are_translated():
assert rutf8.check_utf8(utf8str, allow_surrogates=True) == length
@@ -521,6 +521,8 @@
if keepends:
eol = pos
lgt += line_end_chars
+ assert eol >= 0
+ assert sol >= 0
strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
return space.newlist(strs_w)
@@ -636,7 +638,7 @@
def _getitem_result(self, space, index):
if index >= self._length:
raise oefmt(space.w_IndexError, "string index out of range")
- if self._index_storage is None:
+ if self._index_storage == rutf8.null_storage():
self._index_storage = rutf8.create_utf8_index_storage(self._utf8,
self._length)
start = rutf8.codepoint_position_at_index(self._utf8,
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -7,7 +7,7 @@
from rpython.rtyper.llannotation import SomePtr
from rpython.rlib import jit
from rpython.rlib.objectmodel import newlist_hint, resizelist_hint, specialize, not_rpython
-from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH
+from rpython.rlib.rarithmetic import ovfcheck, LONG_BIT as BLOOM_WIDTH, intmask
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rtyper.extregistry import ExtRegistryEntry
from rpython.tool.pairtype import pairtype
@@ -32,7 +32,9 @@
if isutf8:
from rpython.rlib.rutf8 import next_codepoint_pos
assert pos >= 0
- return next_codepoint_pos(s, pos)
+ r = next_codepoint_pos(s, pos)
+ assert r >= 0
+ return r
else:
return pos + 1
@@ -42,7 +44,7 @@
from rpython.rlib.rutf8 import prev_codepoint_pos
if pos <= 0:
return -1
- return prev_codepoint_pos(s, pos)
+ return intmask(prev_codepoint_pos(s, pos))
else:
return pos - 1
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -18,7 +18,7 @@
from rpython.rlib.objectmodel import enforceargs
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import jit
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rtyper.lltypesystem import lltype
@@ -81,6 +81,7 @@
Assumes valid utf8. 'pos' must be before the end of the string.
"""
chr1 = ord(code[pos])
+ assert pos >= 0
if chr1 <= 0x7F:
return pos + 1
if chr1 <= 0xDF:
@@ -93,20 +94,24 @@
"""Gives the position of the previous codepoint.
'pos' must not be zero.
"""
- pos = r_uint(pos)
- pos -= 1
+ pos -= 1 # ruint
if pos >= len(code): # for the case where pos - 1 == len(code):
+ assert pos >= 0
return pos # assume there is an extra '\x00' character
chr1 = ord(code[pos])
if chr1 <= 0x7F:
+ assert pos >= 0
return pos
pos -= 1
if ord(code[pos]) >= 0xC0:
+ assert pos >= 0
return pos
pos -= 1
if ord(code[pos]) >= 0xC0:
+ assert pos >= 0
return pos
pos -= 1
+ assert pos >= 0
return pos
def compute_length_utf8(s):
@@ -375,6 +380,9 @@
for _j in range(16):
ASCII_INDEX_STORAGE[_i].ofs[_j] = chr(_j * 4 + 1)
+def null_storage():
+ return lltype.nullptr(UTF8_INDEX_STORAGE)
+
def create_utf8_index_storage(utf8, utf8len):
""" Create an index storage which stores index of each 4th character
in utf8 encoded unicode string.
@@ -421,6 +429,7 @@
if index == 0:
return prev_codepoint_pos(utf8, bytepos)
elif index == 1:
+ assert bytepos >= 0
return bytepos
elif index == 2:
return next_codepoint_pos(utf8, bytepos)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit