Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72691:8a2f88e6348d
Date: 2014-08-04 09:26 -0500
http://bitbucket.org/pypy/pypy/changeset/8a2f88e6348d/
Log: WIP fixing translation
diff --git a/pypy/interpreter/test/test_utf8.py b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -243,24 +243,4 @@
rffi.free_wcharp(wcharp)
-def test_translate_utf8():
- def f():
- s = build_utf8str()
- s *= 10
- s += Utf8Str('one')
- return len(s)
- assert interpret(f, []) == f()
-
- def f():
- one = Utf8Str("one")
- two = Utf8Str("one")
-
- return int(one == two) + int(not (one != two))
- assert interpret(f, []) == f()
-
- def f():
- one = Utf8Str("one")
-
- return one == None
- assert interpret(f, []) == f()
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -1,9 +1,11 @@
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import we_are_translated, specialize
+from rpython.rlib.objectmodel import (
+ we_are_translated, specialize, import_from_mixin)
from rpython.rlib.runicode import utf8_code_length
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rlib.rarithmetic import r_uint, intmask, base_int
from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.tool.sourcetools import func_with_new_name
wchar_rint = rffi.r_uint
@@ -26,21 +28,24 @@
codepoint_length = utf8_code_length[ord(bytes[start])]
if codepoint_length == 1:
- return ord(bytes[start])
+ res = ord(bytes[start])
elif codepoint_length == 2:
- return ((ord(bytes[start]) & 0x1F) << 6 |
- (ord(bytes[start + 1]) & 0x3F))
+ res = ((ord(bytes[start]) & 0x1F) << 6 |
+ (ord(bytes[start + 1]) & 0x3F))
elif codepoint_length == 3:
- return ((ord(bytes[start]) & 0xF) << 12 |
- (ord(bytes[start + 1]) & 0x3F) << 6 |
- (ord(bytes[start + 2]) & 0x3F))
+ res = ((ord(bytes[start]) & 0xF) << 12 |
+ (ord(bytes[start + 1]) & 0x3F) << 6 |
+ (ord(bytes[start + 2]) & 0x3F))
else:
assert codepoint_length == 4
- return ((ord(bytes[start]) & 0xF) << 18 |
- (ord(bytes[start + 1]) & 0x3F) << 12 |
- (ord(bytes[start + 2]) & 0x3F) << 6 |
- (ord(bytes[start + 3]) & 0x3F))
+ res = ((ord(bytes[start]) & 0xF) << 18 |
+ (ord(bytes[start + 1]) & 0x3F) << 12 |
+ (ord(bytes[start + 2]) & 0x3F) << 6 |
+ (ord(bytes[start + 3]) & 0x3F))
+
+ assert res >= 0
+ return res
def utf8ord(ustr, start=0):
start = ustr.index_of_char(start)
@@ -53,6 +58,45 @@
else:
return ord(s[pos])
[email protected](0)
+def EQ(s1, s2):
+ if s1 is None:
+ return s1 is s2
+ if isinstance(s1, Utf8Str):
+ return s1.__eq__(s2)
+ else:
+ return s1 == s2
+
[email protected](0)
+def NE(s1, s2):
+ if s1 is None:
+ return s1 is not s2
+ if isinstance(s1, Utf8Str):
+ return s1.__ne__(s2)
+ else:
+ return s1 != s2
+
[email protected](0)
+def ADD(s1, s2):
+ if isinstance(s1, Utf8Str):
+ return s1.__add__(s2)
+ else:
+ return s1 + s2
+
[email protected](0)
+def MUL(s1, s2):
+ if isinstance(s1, Utf8Str):
+ return s1.__mul__(s2)
+ else:
+ return s1 * s2
+
[email protected](0, 1)
+def IN(s1, s2):
+ if isinstance(s1, Utf8Str):
+ return s2.__contains__(s1)
+ else:
+ return s1 in s2
+
class Utf8Str(object):
_immutable_fields_ = ['bytes', '_is_ascii', '_len']
@@ -69,7 +113,6 @@
self._len = length
else:
if not is_ascii:
- #self._len = -1
self._calc_length()
else:
self._len = len(data)
@@ -112,14 +155,22 @@
char_pos += self._len
return self[char_pos:char_pos+1]
+ @specialize.argtype(1, 2)
def __getslice__(self, start, stop):
+ if start is None:
+ start = 0
+ if stop is None:
+ stop = len(self)
+
+ assert start >= 0
assert start <= stop
+
if start == stop:
return Utf8Str('')
- # TODO: If start > _len or stop >= _len, then raise exception
if stop > len(self):
stop = len(self)
+ assert stop >= 0
if self._is_ascii:
return Utf8Str(self.bytes[start:stop], True)
@@ -155,6 +206,7 @@
return Utf8Str(self.bytes * count, self._is_ascii)
def __len__(self):
+ assert self._len >= 0
return self._len
def __hash__(self):
@@ -252,13 +304,12 @@
else:
end = self.index_of_char(end)
- assert start >= 0
return start, end
- @specialize.argtype(2, 3)
+ @specialize.argtype(1, 2, 3)
def find(self, other, start=None, end=None):
start, end = self._bound_check(start, end)
- if start == -1:
+ if start < 0:
return -1
if isinstance(other, Utf8Str):
@@ -275,17 +326,18 @@
return self.char_index_of_byte(pos)
- @specialize.argtype(2, 3)
+ @specialize.argtype(1, 2, 3)
def rfind(self, other, start=None, end=None):
start, end = self._bound_check(start, end)
- if start == -1:
+ if start < 0:
return -1
if isinstance(other, Utf8Str):
pos = self.bytes.rfind(other.bytes, start, end)
elif isinstance(other, unicode):
return unicode(self.bytes, 'utf8').rfind(other, start, end)
- elif isinstance(other, str):
+ else:
+ assert isinstance(other, str)
pos = self.bytes.rfind(other, start, end)
if pos == -1:
@@ -293,17 +345,18 @@
return self.char_index_of_byte(pos)
- @specialize.argtype(2, 3)
+ @specialize.argtype(1, 2, 3)
def count(self, other, start=None, end=None):
start, end = self._bound_check(start, end)
- if start == -1:
+ if start < 0:
return 0
if isinstance(other, Utf8Str):
count = self.bytes.count(other.bytes, start, end)
elif isinstance(other, unicode):
return unicode(self.bytes, 'utf8').count(other, start, end)
- elif isinstance(other, str):
+ else:
+ assert isinstance(other, str)
count = self.bytes.count(other, start, end)
if count == -1:
@@ -319,7 +372,8 @@
if other is not None:
if isinstance(other, str):
other_bytes = other
- if isinstance(other, Utf8Str):
+ else:
+ assert isinstance(other, Utf8Str)
other_bytes = other.bytes
return [Utf8Str(s) for s in self.bytes.split(other_bytes,
maxsplit)]
@@ -334,6 +388,7 @@
break
start_byte = iter.byte_pos
+ assert start_byte >= 0
if maxsplit == 0:
res.append(Utf8Str(self.bytes[start_byte:len(self.bytes)],
@@ -349,8 +404,9 @@
self._is_ascii))
break
- res.append(Utf8Str(self.bytes[start_byte:iter.byte_pos],
- self._is_ascii))
+ end = iter.byte_pos
+ assert end >= 0
+ res.append(Utf8Str(self.bytes[start_byte:end], self._is_ascii))
maxsplit -= 1
return res
@@ -360,7 +416,8 @@
if other is not None:
if isinstance(other, str):
other_bytes = other
- if isinstance(other, Utf8Str):
+ else:
+ assert isinstance(other, Utf8Str)
other_bytes = other.bytes
return [Utf8Str(s) for s in self.bytes.rsplit(other_bytes,
maxsplit)]
@@ -397,21 +454,22 @@
res.reverse()
return res
- @specialize.argtype(1)
+ #@specialize.argtype(1)
def join(self, other):
if len(other) == 0:
return Utf8Str('')
if isinstance(other[0], Utf8Str):
- return Utf8Str(
- self.bytes.join([s.bytes for s in other]),
- self._is_ascii and all(s._is_ascii for s in other)
- )
+ is_ascii = self._is_ascii
+ if is_ascii:
+ for s in other:
+ if not s._is_ascii:
+ is_ascii = False
+ break
+ return Utf8Str(self.bytes.join([s.bytes for s in other]), is_ascii)
else:
- return Utf8Str(
- self.bytes.join([s for s in other]),
- self._is_ascii and all(s._is_ascii for s in other)
- )
+ return Utf8Str(self.bytes.join([s for s in other]))
+ join._annspecialcase_ = 'specialize:arglistitemtype(1)'
def as_unicode(self):
"""NOT_RPYTHON"""
@@ -423,6 +481,7 @@
return Utf8Str(u.encode('utf-8'))
def next_char(self, byte_pos):
+ assert byte_pos >= 0
return byte_pos + utf8_code_length[ord(self.bytes[byte_pos])]
def prev_char(self, byte_pos):
@@ -558,6 +617,7 @@
else:
self._builder = StringBuilder(init_size)
self._is_ascii = True
+ self._length = 0
@specialize.argtype(1)
@@ -566,9 +626,11 @@
self._builder.append(c.bytes)
if not c._is_ascii:
self._is_ascii = False
- elif isinstance(c, int) or isinstance(c, r_uint):
- if isinstance(c, base_int):
- c = intmask(c)
+ self._length += len(c)
+
+ elif isinstance(c, int) or isinstance(c, base_int):
+ c = intmask(c)
+
if c < 0x80:
self._builder.append(chr(c))
elif c < 0x800:
@@ -588,12 +650,19 @@
self._is_ascii = False
else:
raise ValueError("Invalid unicode codepoint > 0x10FFFF.")
- else:
+ self._length += 1
+ elif isinstance(c, str):
# TODO: Remove this check?
if len(c) == 1:
assert ord(c) < 128
self._builder.append(c)
+ # XXX The assumption here is that the bytes being appended are
+ # ASCII, ie 1:1 byte:char
+ self._length += len(c)
+ else:
+ raise TypeError()
+
@specialize.argtype(1)
def append_slice(self, s, start, end):
if isinstance(s, str):
@@ -604,6 +673,7 @@
else:
raise TypeError("Invalid type '%s' for Utf8Str.append_slice" %
type(s))
+ self._length += end - start
@specialize.argtype(1)
def append_multiple_char(self, c, count):
@@ -613,12 +683,14 @@
self._builder.append_multiple_char(chr(c), count)
return
- if len(c) > 1:
- import pdb; pdb.set_trace()
if isinstance(c, str):
self._builder.append_multiple_char(c, count)
else:
self._builder.append_multiple_char(c.bytes, count)
+ self._length += count
+
+ def getlength(self):
+ return self._length
def build(self):
return Utf8Str(self._builder.build(), self._is_ascii)
@@ -746,9 +818,10 @@
return iter
def make_iterator(name, base, calc_value, default):
- class C(base):
+ class C(object):
+ import_from_mixin(base, ['__init__', '__iter__'])
_default = default
- _value = calc_value
+ _value = func_with_new_name(calc_value, '_value')
C.__name__ = name
return C
@@ -780,3 +853,5 @@
del ForwardIterBase
del ReverseIterBase
+
+
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -6,7 +6,8 @@
from rpython.rlib.unicodedata import unicodedb
from rpython.rlib.runicode import utf8_code_length
-from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord, ORD
+from pypy.interpreter import utf8
+from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
BYTEORDER = sys.byteorder
@@ -416,7 +417,7 @@
result.append(rs)
continue
for ch in ru:
- cd = ORD(ch, 0)
+ cd = utf8.ORD(ch, 0)
if cd < limit:
result.append(chr(cd))
else:
@@ -1293,7 +1294,7 @@
ch = s[pos]
c = mapping.get(ch, ERROR_CHAR)
- if c == ERROR_CHAR:
+ if utf8.EQ(c, ERROR_CHAR):
r, pos = errorhandler(errors, "charmap",
"character maps to <undefined>",
s, pos, pos + 1)
@@ -1543,7 +1544,7 @@
# py3k only
errorhandler('strict', 'decimal', msg, s, collstart, collend)
for i in range(len(ru)):
- ch = ORD(ru, i)
+ ch = utf8.ORD(ru, i)
if unicodedb.isspace(ch):
result.append(' ')
continue
@@ -1571,16 +1572,16 @@
if errors == 'replace':
return _unicode_error_replacement, endingpos
if errors == 'ignore':
- return '', endingpos
+ return Utf8Str(''), endingpos
raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
_unicode_error_replacement = Utf8Str.from_unicode(u'\ufffd')
def default_unicode_error_encode(errors, encoding, msg, u,
startingpos, endingpos):
if errors == 'replace':
- return '?', None, endingpos
+ return Utf8Str('?'), None, endingpos
if errors == 'ignore':
- return '', None, endingpos
+ return Utf8Str(''), None, endingpos
if we_are_translated():
# The constructor for UnicodeEncodeError requires an actual unicode
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -151,7 +151,7 @@
return utf8ord(s, 0)
if (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar)):
- return rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0]
+ return intmask(rffi.cast(utf8.WCHAR_INTP, w_ob._cdata)[0])
raise self._convert_error("unicode string of length 1", w_ob)
def convert_from_object(self, cdata, w_ob):
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -3,6 +3,7 @@
from rpython.rlib.rstring import UnicodeBuilder
from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
+from pypy.interpreter import utf8
from pypy.interpreter.utf8 import Utf8Builder, Utf8Str, utf8chr, utf8ord
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -213,7 +214,7 @@
text = utf8chr(0xfffd)
return space.newtuple([space.wrap(text), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
- text = utf8chr(0xfffd) * size
+ text = utf8.MUL(utf8chr(0xfffd), size)
return space.newtuple([space.wrap(text), w_end])
else:
raise oefmt(space.w_TypeError,
@@ -264,7 +265,7 @@
lnum = len(num)
nb = zeros + 2 - lnum # num starts with '0x'
if nb > 0:
- builder.append_multiple_char(u'0', nb)
+ builder.append_multiple_char('0', nb)
builder.append_slice(num, 2, lnum)
pos += 1
return space.newtuple([space.wrap(builder.build()), w_end])
@@ -678,7 +679,7 @@
string = space.readbuf_w(w_string).as_str()
if len(string) == 0:
- return space.newtuple([space.wrap(u''), space.wrap(0)])
+ return space.newtuple([space.wrap(Utf8Str('')), space.wrap(0)])
final = True
state = space.fromcache(CodecState)
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -111,7 +111,8 @@
def resize_buffer(self, newlength):
if len(self.buf) > newlength:
- self.buf = self.buf[:newlength]
+ assert newlength >= 0
+ self.buf = self.buf[0:newlength]
if len(self.buf) < newlength:
self.buf.extend([Utf8Str('\0')] * (newlength - len(self.buf)))
@@ -190,8 +191,9 @@
endpos += start
else:
endpos = end
+ self.pos = endpos
+ assert start >= 0
assert endpos >= 0
- self.pos = endpos
return space.wrap(Utf8Str("").join(self.buf[start:endpos]))
@unwrap_spec(pos=int, mode=int)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -6,6 +6,7 @@
from pypy.interpreter.typedef import (
GetSetProperty, TypeDef, generic_new_descr, interp_attrproperty,
interp_attrproperty_w)
+from pypy.interpreter import utf8
from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8ord
from pypy.module._codecs import interp_codecs
from pypy.module._io.interp_iobase import W_IOBase, convert_size, trap_eintr
@@ -76,7 +77,7 @@
output = space.unicode_w(w_output)
output_len = len(output)
if self.pendingcr and (final or output_len):
- output = Utf8Str('\r') + output
+ output = utf8.ADD(Utf8Str('\r'), output)
self.pendingcr = False
output_len += 1
@@ -85,7 +86,7 @@
if not final and output_len > 0:
last = output_len - 1
assert last >= 0
- if output[last] == Utf8Str('\r'):
+ if utf8ord(output, last) == ord('\r'):
output = output[:last]
self.pendingcr = True
output_len -= 1
@@ -101,7 +102,7 @@
# for the \r
only_lf = False
if seennl == SEEN_LF or seennl == 0:
- only_lf = (output.find(Utf8Str('\r')) < 0)
+ only_lf = (output.find('\r') < 0)
if only_lf:
# If not already seen, quick scan for a possible "\n" character.
@@ -371,8 +372,9 @@
newline = None
else:
newline = space.unicode_w(w_newline)
- if newline and newline not in (Utf8Str('\n'), Utf8Str('\r\n'),
- Utf8Str('\r')):
+ if newline and not (utf8.EQ(newline, Utf8Str('\n')) or
+ utf8.EQ(newline, Utf8Str('\r\n')) or
+ utf8.EQ(newline, Utf8Str('\r'))):
r = space.str_w(space.repr(w_newline))
raise OperationError(space.w_ValueError, space.wrap(
"illegal newline value: %s" % (r,)))
@@ -386,7 +388,7 @@
self.writetranslate = (newline is None or len(newline) == 0)
if not self.readuniversal:
self.writenl = self.readnl
- if self.writenl == Utf8Str('\n'):
+ if utf8.EQ(self.writenl, Utf8Str('\n')):
self.writenl = None
elif _WINDOWS:
self.writenl = Utf8Str("\r\n")
@@ -662,7 +664,7 @@
offset_to_buffer = 0
else:
assert self.decoded_chars_used == 0
- line = remaining + self.decoded_chars
+ line = utf8.ADD(remaining, self.decoded_chars)
start = 0
offset_to_buffer = len(remaining)
remaining = None
diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -3,6 +3,7 @@
from pypy.interpreter.error import OperationError
from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.utf8 import Utf8Str
from rpython.rlib import rlocale
from pypy.module.exceptions.interp_exceptions import _new_exception, W_Exception
@@ -136,8 +137,8 @@
s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
- s1_c = rffi.unicode2wcharp(s1)
- s2_c = rffi.unicode2wcharp(s2)
+ s1_c = Utf8Str.copy_to_new_wcharp(s1)
+ s2_c = Utf8Str.copy_to_new_wcharp(s2)
try:
result = _wcscoll(s1_c, s2_c)
finally:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -6,6 +6,8 @@
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.interpreter import utf8
+from pypy.interpreter.utf8 import Utf8Str
from pypy.module._codecs.interp_codecs import CodecState
@@ -87,7 +89,7 @@
def _initialize(self):
self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
- self.pending = u""
+ self.pending = Utf8Str("")
def _free(self):
self.pending = None
@@ -100,7 +102,7 @@
space = self.space
state = space.fromcache(CodecState)
if len(self.pending) > 0:
- object = self.pending + object
+ object = utf8.ADD(self.pending, object)
try:
output = c_codecs.encodeex(self.encodebuf, object, self.errors,
state.encode_error_handler, self.name,
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,9 +1,10 @@
import sys
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.objectmodel import specialize
-from rpython.rlib import rfloat, runicode
+from rpython.rlib import rfloat
from rpython.rtyper.lltypesystem import lltype, rffi
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8chr
from pypy.interpreter import unicodehelper
OVF_DIGITS = len(str(sys.maxint))
@@ -30,6 +31,7 @@
No bound checking is done, use carefully.
"""
+ '''
from rpython.rtyper.annlowlevel import llstr, hlunicode
from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
@@ -41,6 +43,12 @@
ch = ll_s.chars[start+i]
ll_res.chars[i] = cast_primitive(UniChar, ch)
return hlunicode(ll_res)
+ '''
+ # TODO: Actually do this without slicing
+ from pypy.interpreter.utf8_codecs import str_decode_latin_1
+ assert start >= 0
+ assert end >= 0
+ return str_decode_latin_1(s[start:end], end - start, 'strict')[0]
TYPE_UNKNOWN = 0
TYPE_STRING = 1
@@ -369,7 +377,7 @@
return # help the annotator to know that we'll never go beyond
# this point
#
- uchr = runicode.code_to_unichr(val) # may be a surrogate pair again
+ uchr = utf8chr(val) # may be a surrogate pair again
utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
builder.append(utf8_ch)
return i
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -415,7 +415,6 @@
"Expected unicode string of length one as wide character"))
val = utf8ord(s)
- #val = 0
if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
# Utf-16 must be used on systems with a 2 byte wchar_t to
# encode codepoints > 0xFFFF
@@ -597,7 +596,7 @@
def wcharp2rawunicode(space, address, maxlength=-1):
if maxlength == -1:
return wcharp2unicode(space, address)
- s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
+ s = Utf8Str.from_wcharpsize(rffi.cast(rffi.CWCHARP, address), maxlength)
return space.wrap(s)
@unwrap_spec(address=r_uint, newcontent='bufferstr')
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -5,6 +5,7 @@
from pypy.interpreter.typedef import make_weakref_descr
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError
+from pypy.interpreter import utf8
from pypy.interpreter.utf8 import Utf8Str, utf8ord
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit
@@ -121,6 +122,8 @@
pos = len(unicodestr)
if endpos > len(unicodestr):
endpos = len(unicodestr)
+ assert pos >= 0
+ assert endpos >= 0
return rsre_core.UnicodeMatchContext(self.code, unicodestr,
pos, endpos, self.flags)
else:
@@ -232,7 +235,7 @@
else:
if space.isinstance_w(w_ptemplate, space.w_unicode):
filter_as_unicode = space.unicode_w(w_ptemplate)
- literal = u'\\' not in filter_as_unicode
+ literal = utf8.IN('\\', filter_as_unicode)
else:
try:
filter_as_string = space.str_w(w_ptemplate)
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,5 @@
from pypy.interpreter.error import OperationError
-from pypy.interpreter.utf8 import Utf8Str
+from pypy.interpreter.utf8 import Utf8Str, utf8chr
from pypy.interpreter import utf8_codecs
from rpython.rtyper.lltypesystem import rffi, lltype
from pypy.module.unicodedata import unicodedb
@@ -138,17 +138,17 @@
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOLOWER(space, ch):
"""Return the character ch converted to lower case."""
- return unichr(unicodedb.tolower(ord(ch)))
+ return utf8chr(unicodedb.tolower(ord(ch)))
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOUPPER(space, ch):
"""Return the character ch converted to upper case."""
- return unichr(unicodedb.toupper(ord(ch)))
+ return utf8chr(unicodedb.toupper(ord(ch)))
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOTITLE(space, ch):
"""Return the character ch converted to title case."""
- return unichr(unicodedb.totitle(ord(ch)))
+ return utf8chr(unicodedb.totitle(ord(ch)))
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_TODECIMAL(space, ch):
@@ -331,7 +331,7 @@
Therefore, modification of the resulting Unicode object is only allowed when u
is NULL."""
if wchar_p:
- s = rffi.Utf8Str.from_wcharpsize(wchar_p, length)
+ s = Utf8Str.from_wcharpsize(wchar_p, length)
return make_ref(space, space.wrap(s))
else:
return rffi.cast(PyObject, new_empty_unicode(space, length))
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -77,6 +77,7 @@
descr_set_dict, descr_del_dict)
from pypy.interpreter.gateway import interp2app
from pypy.interpreter.error import OperationError
+from pypy.interpreter.utf8 import Utf8Str
from rpython.rlib import rwin32
@@ -126,7 +127,7 @@
return space.call_function(space.w_unicode, w_as_str)
lgt = len(self.args_w)
if lgt == 0:
- return space.wrap(u"")
+ return space.wrap(Utf8Str(""))
if lgt == 1:
return space.call_function(space.w_unicode, self.args_w[0])
else:
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -2,6 +2,7 @@
from pypy.interpreter.typedef import TypeDef, GetSetProperty
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import ORD
from rpython.rlib import rgc, jit
from rpython.rtyper.lltypesystem import rffi, lltype
from rpython.rtyper.tool import rffi_platform
@@ -589,8 +590,8 @@
"multi-byte encodings are not supported")
for i in range(256):
- c = translationmap[i]
- if c == u'\ufffd':
+ c = ORD(translationmap, i)
+ if c == 0xFFFD:
info.c_map[i] = rffi.cast(rffi.INT, -1)
else:
info.c_map[i] = rffi.cast(rffi.INT, c)
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -6,7 +6,7 @@
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError
from pypy.interpreter.typedef import TypeDef, interp_attrproperty
-from pypy.interpreter.utf8 import utf8chr
+from pypy.interpreter.utf8 import Utf8Str, utf8chr
from rpython.rlib.rarithmetic import r_longlong
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
@@ -225,10 +225,12 @@
result[0] = ch
if not composed: # If decomposed normalization we are done
- return space.wrap(u''.join([unichr(i) for i in result[:j]]))
+ return space.wrap(Utf8Str('').join(
+ [utf8chr(i) for i in result[:j]]))
if j <= 1:
- return space.wrap(u''.join([unichr(i) for i in result[:j]]))
+ return space.wrap(Utf8Str('').join(
+ [utf8chr(i) for i in result[:j]]))
current = result[0]
starter_pos = 0
@@ -275,7 +277,8 @@
result[starter_pos] = current
- return space.wrap(u''.join([unichr(i) for i in result[:next_insert]]))
+ return space.wrap(Utf8Str('').join(
+ [utf8chr(i) for i in result[:next_insert]]))
methods = {}
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -9,7 +9,7 @@
from rpython.rlib.rarithmetic import INT_MAX
from rpython.tool.sourcetools import func_with_new_name
from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.utf8 import Utf8Builder, ORD
+from pypy.interpreter.utf8 import Utf8Builder, ORD, utf8chr
class BaseStringFormatter(object):
@@ -156,11 +156,6 @@
# to build two subclasses of the BaseStringFormatter class,
# each one getting its own subtle differences and RPython types.
- if do_unicode:
- const = unicode
- else:
- const = str
-
class StringFormatter(BaseStringFormatter):
def __init__(self, space, fmt, values_w, w_valuedict):
BaseStringFormatter.__init__(self, space, values_w, w_valuedict)
@@ -365,6 +360,7 @@
return
if prec >= 0 and prec < length:
length = prec # ignore the end of the string if too long
+
result = self.result
padding = self.width - length
if padding < 0:
@@ -475,7 +471,7 @@
n = space.int_w(w_value)
if do_unicode:
try:
- c = unichr(n)
+ c = utf8chr(n)
except ValueError:
raise OperationError(space.w_OverflowError,
space.wrap("unicode character code out of range"))
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -681,8 +681,12 @@
buf.append(c)
for i in range(d_state - 1, d_state - n_chars - 1, -1):
buf.append(digits[i])
+
+ zero = "0"
+ if self.is_unicode:
+ zero = Utf8Str("0")
for i in range(n_zeros):
- buf.append("0")
+ buf.append(zero)
def _group_digits(self, spec, digits):
buf = []
@@ -727,9 +731,12 @@
def _upcase_string(self, s):
buf = []
for c in s:
- index = ord(c)
+ index = ORD(c, 0)
if ord("a") <= index <= ord("z"):
- c = chr(index - 32)
+ if self.is_unicode:
+ c = utf8chr(index - 32)
+ else:
+ c = chr(index - 32)
buf.append(c)
return self.empty.join(buf)
@@ -1061,7 +1068,7 @@
tmp_align = self._align
tmp_width = self._width
self._fill_char = ord("\0")
- self._align = "<"
+ self._align = ord("<")
self._width = -1
#determine if we have remainder, might include dec or exponent or both
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -9,7 +9,7 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
-from pypy.interpreter.utf8 import ORD
+from pypy.interpreter import utf8
from pypy.objspace.std import slicetype
from pypy.objspace.std.sliceobject import W_SliceObject, normalize_simple_slice
@@ -29,6 +29,8 @@
lenself = len(value)
start, end = slicetype.unwrap_start_stop(
space, lenself, w_start, w_end, upper_bound=upper_bound)
+ assert start >= 0
+ assert end >= 0
return (value, start, end)
def _multi_chr(self, c):
@@ -64,7 +66,7 @@
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
raise
- return self._new(self._val(space) + other)
+ return self._new(utf8.ADD(self._val(space), other))
# Bytearray overrides this method, CPython doesn't support contacting
# buffers and strs, and unicodes are always handled above
@@ -80,8 +82,9 @@
if times <= 0:
return self._empty()
if self._len() == 1:
- return self._new(self._multi_chr(self._val(space)[0]) * times)
- return self._new(self._val(space) * times)
+ return self._new(utf8.MUL(self._multi_chr(self._val(space)[0]),
+ times))
+ return self._new(utf8.MUL(self._val(space), times))
descr_rmul = descr_mul
@@ -142,7 +145,9 @@
if d > 0:
offset = d//2 + (d & width & 1)
fillchar = self._multi_chr(fillchar[0])
- centered = fillchar * offset + value + fillchar * (d - offset)
+ #centered = fillchar * offset + value + fillchar * (d - offset)
+ centered = utf8.ADD(utf8.ADD(utf8.MUL(fillchar, offset), value),
+ utf8.MUL(fillchar, (d - offset)))
else:
centered = value
@@ -204,8 +209,11 @@
expanded = oldtoken = splitted.pop(0)
for token in splitted:
- expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
- tabsize) + token
+ #expanded += self._multi_chr(' ') * self._tabindent(oldtoken,
+ # tabsize) + token
+ m = utf8.MUL(self._multi_chr(' '),
+ self._tabindent(oldtoken, tabsize))
+ expanded = utf8.ADD(expanded, utf8.ADD(m, token))
oldtoken = token
return self._new(expanded)
@@ -219,8 +227,8 @@
offset = len(token)
while 1:
- if (ORD(token, offset-1) == ord("\n") or
- ORD(token, offset-1) == ord("\r")):
+ if (utf8.ORD(token, offset-1) == ord("\n") or
+ utf8.ORD(token, offset-1) == ord("\r")):
break
distance += 1
offset -= 1
@@ -457,7 +465,8 @@
d = width - len(value)
if d > 0:
fillchar = self._multi_chr(fillchar[0])
- value += fillchar * d
+ #value += fillchar * d
+ value = utf8.ADD(value, utf8.MUL(fillchar, d))
return self._new(value)
@@ -471,7 +480,8 @@
d = width - len(value)
if d > 0:
fillchar = self._multi_chr(fillchar[0])
- value = fillchar * d + value
+ #value = fillchar * d + value
+ value = utf8.ADD(utf8.MUL(fillchar, d), value)
return self._new(value)
@@ -606,8 +616,8 @@
eol = pos
pos += 1
# read CRLF as one line break
- if (pos < length and ORD(value, eol) == ord('\r') and
- ORD(value, pos) == ord('\n')):
+ if (pos < length and utf8.ORD(value, eol) == ord('\r') and
+ utf8.ORD(value, pos) == ord('\n')):
pos += 1
if keepends:
eol = pos
@@ -768,15 +778,16 @@
def descr_zfill(self, space, width):
selfval = self._val(space)
if len(selfval) == 0:
- return self._new(self._multi_chr('0') * width)
+ #return self._new(self._multi_chr('0') * width)
+ return self._new(utf8.MUL(self._multi_chr('0'), width))
num_zeros = width - len(selfval)
if num_zeros <= 0:
# cannot return self, in case it is a subclass of str
return self._new(selfval)
builder = self._builder(width)
- if len(selfval) > 0 and (ORD(selfval, 0) == ord('+') or
- ORD(selfval, 0) == ord('-')):
+ if len(selfval) > 0 and (utf8.ORD(selfval, 0) == ord('+') or
+ utf8.ORD(selfval, 0) == ord('-')):
# copy sign to first position
builder.append(selfval[0])
start = 1
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -8,6 +8,7 @@
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
+from pypy.interpreter import utf8
from pypy.interpreter.utf8 import Utf8Str, Utf8Builder, utf8chr, utf8ord
from pypy.interpreter.utf8_codecs import (
make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
@@ -91,7 +92,7 @@
return W_UnicodeObject(value)
def _new_from_list(self, value):
- return W_UnicodeObject(u''.join(value))
+ return W_UnicodeObject(Utf8Str('').join(value))
def _empty(self):
return W_UnicodeObject.EMPTY
@@ -109,12 +110,21 @@
@staticmethod
def _op_val(space, w_other):
+ if space.isinstance_w(w_other, space.w_str):
+ w_other = unicode_from_string(space, w_other)
+ elif not isinstance(w_other, W_UnicodeObject):
+ w_other = unicode_from_encoded_object(
+ space, w_other, None, "strict")
+ assert isinstance(w_other, W_UnicodeObject)
+ return w_other._value
+ '''
if isinstance(w_other, W_UnicodeObject):
return w_other._value
if space.isinstance_w(w_other, space.w_str):
return unicode_from_string(space, w_other)._value
return unicode_from_encoded_object(
space, w_other, None, "strict")._value
+ '''
def _chr(self, char):
assert len(char) == 1
@@ -228,7 +238,7 @@
def descr_eq(self, space, w_other):
try:
- res = self._val(space) == self._op_val(space, w_other)
+ res = self._val(space).__eq__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -244,7 +254,7 @@
def descr_ne(self, space, w_other):
try:
- res = self._val(space) != self._op_val(space, w_other)
+ res = self._val(space).__ne__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -260,7 +270,7 @@
def descr_lt(self, space, w_other):
try:
- res = self._val(space) < self._op_val(space, w_other)
+ res = self._val(space).__lt__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -269,7 +279,7 @@
def descr_le(self, space, w_other):
try:
- res = self._val(space) <= self._op_val(space, w_other)
+ res = self._val(space).__le__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -278,7 +288,7 @@
def descr_gt(self, space, w_other):
try:
- res = self._val(space) > self._op_val(space, w_other)
+ res = self._val(space).__gt__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -287,7 +297,7 @@
def descr_ge(self, space, w_other):
try:
- res = self._val(space) >= self._op_val(space, w_other)
+ res = self._val(space).__ge__(self._op_val(space, w_other))
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
diff --git a/pypy/tool/ann_override.py b/pypy/tool/ann_override.py
--- a/pypy/tool/ann_override.py
+++ b/pypy/tool/ann_override.py
@@ -21,12 +21,16 @@
def specialize__wrap(pol, funcdesc, args_s):
from pypy.interpreter.baseobjspace import W_Root
+ from pypy.interpreter.utf8 import Utf8Str
from rpython.annotator.classdef import ClassDef
W_Root_def = funcdesc.bookkeeper.getuniqueclassdef(W_Root)
typ = args_s[1].knowntype
if isinstance(typ, ClassDef):
- assert typ.issubclass(W_Root_def)
- typ = W_Root
+ if typ.issubclass(W_Root_def):
+ typ = W_Root
+ else:
+ assert typ.classdesc.pyobj is Utf8Str
+ typ = Utf8Str
else:
assert not issubclass(typ, W_Root)
assert typ != tuple, "space.wrap(tuple) forbidden; use newtuple()"
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit