[pypy-commit] pypy unicode-utf8: merge heads
Author: Armin Rigo Branch: unicode-utf8 Changeset: r93134:25ac6121d03c Date: 2017-11-23 10:26 +0100 http://bitbucket.org/pypy/pypy/changeset/25ac6121d03c/ Log:merge heads diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -272,7 +272,7 @@ self._typed_unwrap_error(space, "unicode") def convert_to_w_unicode(self, space): -self._typed_unwrap_error(space, "unicode") +self._typed_unwrap_error(space, "unicode") def bytearray_list_of_chars_w(self, space): self._typed_unwrap_error(space, "bytearray") @@ -1759,6 +1759,11 @@ def utf8_w(self, w_obj): return w_obj.utf8_w(self) + +def unicode_w(self, w_obj): +# XXX: kill me! +return w_obj.utf8_w(self).decode('utf-8') + def convert_to_w_unicode(self, w_obj): return w_obj.convert_to_w_unicode(self) diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -184,9 +184,7 @@ start, end ) -if endpos >= 0: -endpos += start -else: +if endpos < 0: endpos = end assert endpos >= 0 self.pos = endpos diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -97,7 +97,7 @@ output_len -= 1 if output_len == 0: -return space.newutf8("", 1, FLAG_ASCII) +return space.newutf8("", 0, FLAG_ASCII) # Record which newlines are read and do newline translation if # desired, all in one pass. @@ -224,30 +224,28 @@ def _find_line_ending(self, line, start, end): size = end - start if self.readtranslate: - # Newlines are already translated, only search for \n -pos = line.find(u'\n', start, end) +pos = line.find('\n', start, end) if pos >= 0: -return pos - start + 1, 0 +return pos + 1, 0 else: return -1, size elif self.readuniversal: # Universal newline search. Find any of \r, \r\n, \n # The decoder ensures that \r\n are not split in two pieces -i = 0 +i = start while True: -# Fast path for non-control chars. The loop always ends -# since the Py_UNICODE storage is NUL-terminated. -while i < size and line[start + i] > '\r': +# Fast path for non-control chars. +while i < end and line[i] > '\r': i += 1 -if i >= size: +if i >= end: return -1, size -ch = line[start + i] +ch = line[i] i += 1 if ch == '\n': return i, 0 if ch == '\r': -if line[start + i] == '\n': +if line[i] == '\n': return i + 1, 0 else: return i, 0 @@ -255,7 +253,7 @@ # Non-universal mode. 
pos = line.find(self.readnl, start, end) if pos >= 0: -return pos - start + len(self.readnl), 0 +return pos + len(self.readnl), 0 else: pos = line.find(self.readnl[0], start, end) if pos >= 0: @@ -520,8 +518,13 @@ # _ # read methods -def _set_decoded_chars(self, chars): -self.decoded_chars = chars +def _unset_decoded(self): +self.decoded_chars = None +self.decoded_chars_used = 0 + +def _set_decoded(self, space, w_decoded): +check_decoded(space, w_decoded) +self.decoded_chars = space.utf8_w(w_decoded) self.decoded_chars_used = 0 def _get_decoded_chars(self, size): @@ -580,8 +583,7 @@ eof = space.len_w(w_input) == 0 w_decoded = space.call_method(self.w_decoder, "decode", w_input, space.newbool(eof)) -check_decoded(space, w_decoded) -self._set_decoded_chars(space.utf8_w(w_decoded)) +self._set_decoded(space, w_decoded) if space.len_w(w_decoded) > 0: eof = False @@ -617,13 +619,13 @@ w_bytes = space.call_method(self.w_buffer, "read") w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True) check_decoded(space, w_decoded) -w_result = space.newunicode(self._get_decoded_chars(-1)) +w_result = space.new_from_utf8(self._get_decoded_chars(-1))
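The change buried in this merge that matters most to callers is the new convention for _find_line_ending(): it now returns an absolute end position inside the decoded buffer rather than an offset relative to `start`, which is why the interp_stringio caller above keeps only the `endpos < 0` branch. A rough pure-Python sketch of the new contract follows; the function names and the readline() caller are hypothetical stand-ins for the RPython code, shown for the translated-newline case only.

    def find_line_ending_translated(line, start, end):
        # Translated-newline mode: only '\n' terminates a line.  Returns
        # (endpos, consumed): endpos is an *absolute* index just past the
        # newline, or -1 with consumed == end - start when there is none.
        pos = line.find('\n', start, end)
        if pos >= 0:
            return pos + 1, 0      # previously: pos - start + 1
        return -1, end - start

    def readline(buf, pos):
        # Caller side, mirroring the simplified interp_stringio logic:
        # only the "not found" case needs adjusting now.
        endpos, _ = find_line_ending_translated(buf, pos, len(buf))
        if endpos < 0:
            endpos = len(buf)
        return buf[pos:endpos], endpos

    assert readline("ab\ncd", 0) == ("ab\n", 3)
    assert readline("ab\ncd", 3) == ("cd", 5)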
[pypy-commit] pypy unicode-utf8: Tweak the unicode FLAG_xx values for performance; collapse two identical helpers; move combine_flags() to rutf8
Author: Armin Rigo Branch: unicode-utf8 Changeset: r93133:a1cf21d7a124 Date: 2017-11-23 10:24 +0100 http://bitbucket.org/pypy/pypy/changeset/a1cf21d7a124/ Log:Tweak the unicode FLAG_xx values for performance; collapse two identical helpers; move combine_flags() to rutf8 diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -3,6 +3,7 @@ from pypy.interpreter.error import OperationError from rpython.rlib.objectmodel import specialize from rpython.rlib import rutf8 +from rpython.rlib.rutf8 import combine_flags from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rstring import StringBuilder from pypy.module._codecs import interp_codecs @@ -43,14 +44,6 @@ from pypy.objspace.std.unicodeobject import encode_object return encode_object(space, w_data, encoding, errors) -def combine_flags(one, two): -if one == rutf8.FLAG_ASCII and two == rutf8.FLAG_ASCII: -return rutf8.FLAG_ASCII -elif (one == rutf8.FLAG_HAS_SURROGATES or - two == rutf8.FLAG_HAS_SURROGATES): -return rutf8.FLAG_HAS_SURROGATES -return rutf8.FLAG_REGULAR - def _has_surrogate(u): for c in u: @@ -788,7 +781,8 @@ # first surrogate surrogate = outCh else: -flag = combine_flags(flag, rutf8.unichr_to_flag(outCh)) +flag = combine_flags(flag, + rutf8.get_flag_from_code(outCh)) outsize += 1 assert outCh >= 0 rutf8.unichr_as_utf8_append(result, outCh, True) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -356,7 +356,7 @@ elif unicodedb.islower(ch): ch = unicodedb.toupper(ch) if ch >= 0x80: -flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR) +flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(builder, ch) return W_UnicodeObject(builder.build(), self._length, flag) @@ -381,7 +381,7 @@ else: ch = unicodedb.tolower(ch) if ch >= 0x80: -flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR) +flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(builder, ch) previous_is_cased = unicodedb.iscased(ch) return builder.build(), flag @@ -407,7 +407,7 @@ codepoint = space.int_w(w_newval) elif isinstance(w_newval, W_UnicodeObject): result.append(w_newval._utf8) -flag = unicodehelper.combine_flags(flag, w_newval._get_flag()) +flag = rutf8.combine_flags(flag, w_newval._get_flag()) result_length += w_newval._length continue else: @@ -416,7 +416,7 @@ "or unicode") try: if codepoint >= 0x80: -flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR) +flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(result, codepoint, allow_surrogates=True) result_length += 1 @@ -540,7 +540,7 @@ while pos < len(self._utf8): lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos)) if lower >= 0x80: -flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR) +flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates? 
pos = rutf8.next_codepoint_pos(self._utf8, pos) return W_UnicodeObject(builder.build(), self._len(), flag) @@ -642,7 +642,7 @@ if e.match(space, space.w_TypeError): return space.w_NotImplemented raise -flag = unicodehelper.combine_flags(self._get_flag(), w_other._get_flag()) +flag = rutf8.combine_flags(self._get_flag(), w_other._get_flag()) return W_UnicodeObject(self._utf8 + w_other._utf8, self._len() + w_other._len(), flag) @@ -667,7 +667,7 @@ # XXX Maybe the extra copy here is okay? It was basically going to # happen anyway, what with being placed into the builder w_u = self.convert_arg_to_w_unicode(space, w_s) -flag = unicodehelper.combine_flags(flag, w_u._get_flag()) +flag = rutf8.combine_flags(flag, w_u._get_flag()) unwrappe
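For reference, the helper that moved into rutf8 is tiny; the version deleted from unicodehelper.py behaves like the pure-Python sketch below. The numeric FLAG values here are made up for illustration. The commit message only says the real constants were tweaked for performance; one plausible motivation is that an ordered numbering lets combine_flags() collapse to a max(), but that is a guess, not something this diff shows.

    FLAG_ASCII = 0            # every codepoint < 0x80
    FLAG_REGULAR = 1          # arbitrary codepoints, but no lone surrogates
    FLAG_HAS_SURROGATES = 2   # contains codepoints in U+D800..U+DFFF

    def combine_flags(one, two):
        # Flag of the concatenation of two utf8 strings: ASCII only if both
        # sides are ASCII, surrogates if either side has them.
        if one == FLAG_ASCII and two == FLAG_ASCII:
            return FLAG_ASCII
        elif one == FLAG_HAS_SURROGATES or two == FLAG_HAS_SURROGATES:
            return FLAG_HAS_SURROGATES
        return FLAG_REGULAR

    assert combine_flags(FLAG_ASCII, FLAG_ASCII) == FLAG_ASCII
    assert combine_flags(FLAG_ASCII, FLAG_REGULAR) == FLAG_REGULAR
    assert combine_flags(FLAG_REGULAR, FLAG_HAS_SURROGATES) == FLAG_HAS_SURROGATES
    # with the ordering chosen above the same table is simply max(one, two)
    assert all(combine_flags(a, b) == max(a, b) for a in range(3) for b in range(3))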
[pypy-commit] pypy unicode-utf8: Tests and fixes for 'allow_surrogates=True' in various unicode methods
Author: Armin Rigo Branch: unicode-utf8 Changeset: r93135:16bfad77e3d5 Date: 2017-11-23 10:33 +0100 http://bitbucket.org/pypy/pypy/changeset/16bfad77e3d5/ Log:Tests and fixes for 'allow_surrogates=True' in various unicode methods diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -299,6 +299,7 @@ assert u"Brown Fox".title() == u"Brown Fox" assert u"bro!wn fox".title() == u"Bro!Wn Fox" assert u"brow\u4321n fox".title() == u"Brow\u4321N Fox" +assert u'\ud800'.title() == u'\ud800' def test_istitle(self): assert u"".istitle() == False @@ -328,10 +329,12 @@ assert u'A'.lower() == u'a' assert u'\u0105'.lower() == u'\u0105' assert u'\u0104'.lower() == u'\u0105' +assert u'\ud800'.lower() == u'\ud800' assert u'a'.upper() == u'A' assert u'A'.upper() == u'A' assert u'\u0105'.upper() == u'\u0104' assert u'\u0104'.upper() == u'\u0104' +assert u'\ud800'.upper() == u'\ud800' def test_capitalize(self): assert u"brown fox".capitalize() == u"Brown fox" @@ -354,6 +357,8 @@ # check with Ll chars with no upper - nothing changes here assert (u'\u019b\u1d00\u1d86\u0221\u1fb7'.capitalize() == u'\u019b\u1d00\u1d86\u0221\u1fb7') +assert u'\ud800'.capitalize() == u'\ud800' +assert u'xx\ud800'.capitalize() == u'Xx\ud800' def test_rjust(self): s = u"abc" @@ -844,6 +849,7 @@ def test_swapcase(self): assert u'\xe4\xc4\xdf'.swapcase() == u'\xc4\xe4\xdf' +assert u'\ud800'.swapcase() == u'\ud800' def test_buffer(self): buf = buffer(u'XY') diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -357,7 +357,7 @@ ch = unicodedb.toupper(ch) if ch >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) -rutf8.unichr_as_utf8_append(builder, ch) +rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) return W_UnicodeObject(builder.build(), self._length, flag) def descr_title(self, space): @@ -382,7 +382,7 @@ ch = unicodedb.tolower(ch) if ch >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) -rutf8.unichr_as_utf8_append(builder, ch) +rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) previous_is_cased = unicodedb.iscased(ch) return builder.build(), flag @@ -541,7 +541,7 @@ lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos)) if lower >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) -rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates? 
+rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True) pos = rutf8.next_codepoint_pos(self._utf8, pos) return W_UnicodeObject(builder.build(), self._len(), flag) @@ -721,7 +721,7 @@ if uchar >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) i = rutf8.next_codepoint_pos(value, i) -rutf8.unichr_as_utf8_append(builder, uchar) +rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True) return W_UnicodeObject(builder.build(), self._length, flag) @unwrap_spec(width=int) @@ -831,14 +831,14 @@ uchar = rutf8.codepoint_at_pos(value, 0) i = rutf8.next_codepoint_pos(value, 0) ch = unicodedb.toupper(uchar) -rutf8.unichr_as_utf8_append(builder, ch) +rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) if ch >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) while i < len(value): uchar = rutf8.codepoint_at_pos(value, i) i = rutf8.next_codepoint_pos(value, i) ch = unicodedb.tolower(uchar) -rutf8.unichr_as_utf8_append(builder, ch) +rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) if ch >= 0x80: flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) return W_UnicodeObject(builder.build(), self._len(), flag) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
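The common thread in these fixes: the case-mapping loops re-encode every codepoint, including ones they did not change, so the encoder has to accept lone surrogates that were already present in the string (as the new u'\ud800' tests check). A self-contained sketch of the boundary involved; unichr_as_utf8 below is a simplified, hypothetical stand-in for rutf8.unichr_as_utf8_append(), which appends to a builder in the real code.

    def unichr_as_utf8(code, allow_surrogates=False):
        # Encode one codepoint to UTF-8 (returned as a byte-per-char string),
        # optionally refusing lone surrogates like the real helper does.
        if 0xD800 <= code <= 0xDFFF and not allow_surrogates:
            raise ValueError("lone surrogate: 0x%x" % code)
        if code < 0x80:
            return chr(code)
        elif code < 0x800:
            return chr(0xC0 | (code >> 6)) + chr(0x80 | (code & 0x3F))
        elif code < 0x10000:
            return (chr(0xE0 | (code >> 12)) +
                    chr(0x80 | ((code >> 6) & 0x3F)) +
                    chr(0x80 | (code & 0x3F)))
        else:
            return (chr(0xF0 | (code >> 18)) +
                    chr(0x80 | ((code >> 12) & 0x3F)) +
                    chr(0x80 | ((code >> 6) & 0x3F)) +
                    chr(0x80 | (code & 0x3F)))

    # u'\ud800'.title() must come back as u'\ud800', so re-encoding the
    # unchanged character has to tolerate the surrogate:
    assert unichr_as_utf8(0xD800, allow_surrogates=True) == '\xed\xa0\x80'
    try:
        unichr_as_utf8(0xD800)          # the default still rejects it
    except ValueError:
        pass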
[pypy-commit] pypy unicode-utf8: Review for surrogates
Author: Armin Rigo Branch: unicode-utf8 Changeset: r93136:dc6582a05b85 Date: 2017-11-23 10:48 +0100 http://bitbucket.org/pypy/pypy/changeset/dc6582a05b85/ Log:Review for surrogates diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -370,14 +370,15 @@ builder.append(res) else: # when we get here, chr is a 32-bit unicode character -if chr > 0x10: +try: +rutf8.unichr_as_utf8_append(builder, intmask(chr), True) +except ValueError: message = "illegal Unicode character" res, pos = errorhandler(errors, encoding, message, s, pos-2, pos+digits) size, flag = rutf8.check_utf8(res, True) builder.append(res) else: -rutf8.unichr_as_utf8_append(builder, intmask(chr), True) flag = rutf8.get_flag_from_code(intmask(chr)) pos += digits size = 1 @@ -466,7 +467,7 @@ pos += 1 x = (x<<3) + ord(ch) - ord('0') outsize += 1 -if x >= 0x7F: +if x > 0x7F: rutf8.unichr_as_utf8_append(builder, x) flag = combine_flags(rutf8.FLAG_REGULAR, flag) else: @@ -524,7 +525,9 @@ pos = look + 1 outsize += 1 flag = combine_flags(flag, rutf8.get_flag_from_code(code)) -rutf8.unichr_as_utf8_append(builder, code) +rutf8.unichr_as_utf8_append(builder, code, +allow_surrogates=True) +# xxx 'code' is probably always within range here... else: res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) @@ -772,7 +775,8 @@ surrogate = 0 continue else: -rutf8.unichr_as_utf8_append(result, surrogate) +rutf8.unichr_as_utf8_append(result, surrogate, +allow_surrogates=True) flag = rutf8.FLAG_HAS_SURROGATES outsize += 1 surrogate = 0 @@ -1236,7 +1240,7 @@ result.append(r) continue -rutf8.unichr_as_utf8_append(result, ch) +rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True) pos += 4 r = result.build() lgt, flag = rutf8.check_utf8(r, True) @@ -1360,7 +1364,7 @@ s, pos, pos + unicode_bytes) result.append(res) continue -rutf8.unichr_as_utf8_append(result, intmask(t)) +rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True) pos += unicode_bytes r = result.build() lgt, flag = rutf8.check_utf8(r, True) diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -127,7 +127,7 @@ errorcb, namecb, stringdata) src = pypy_cjk_dec_outbuf(decodebuf) length = pypy_cjk_dec_outlen(decodebuf) -return rffi.wcharpsize2utf8(src, length) +return rffi.wcharpsize2utf8(src, length) # assumes no out-of-range chars def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb, stringdata): diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1012,6 +1012,7 @@ def wcharpsize2utf8(w, size): """ Helper to convert WCHARP pointer to utf8 in one go. Equivalent to wcharpsize2unicode().encode("utf8") +Raises ValueError if characters are outside range(0x11)! """ from rpython.rlib import rutf8 ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
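Two boundaries are being tightened in this review pass: 0x7F itself is still ASCII (hence the change from '>= 0x7F' to '> 0x7F'), and codepoints beyond the Unicode maximum now surface as a ValueError raised by the rutf8 appender, with the caller translating it into a codec error. The sketch below illustrates the intent only; the function bodies are not the RPython implementations.

    MAXUNICODE = 0x10FFFF
    FLAG_ASCII, FLAG_REGULAR, FLAG_HAS_SURROGATES = 0, 1, 2   # illustrative

    def get_flag_from_code(code):
        # 0x7F (DEL) is a single ASCII byte, so it must *not* flip the flag
        # to FLAG_REGULAR; that is the off-by-one fixed above.
        if code <= 0x7F:
            return FLAG_ASCII
        if 0xD800 <= code <= 0xDFFF:
            return FLAG_HAS_SURROGATES
        return FLAG_REGULAR

    def append_code(out, code):
        # New error-handling shape: the encoder raises for out-of-range
        # values and the caller routes that through the error callback.
        if code > MAXUNICODE:
            raise ValueError("illegal Unicode character: 0x%x" % code)
        out.append(code)

    assert get_flag_from_code(0x7F) == FLAG_ASCII
    assert get_flag_from_code(0x80) == FLAG_REGULAR
    assert get_flag_from_code(0xD800) == FLAG_HAS_SURROGATES

    out = []
    try:
        append_code(out, 0x110000)    # e.g. an out-of-range unicode-escape
    except ValueError:
        pass
    assert out == []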
[pypy-commit] pypy unicode-utf8: Fixes for _cffi_backend
Author: Armin Rigo Branch: unicode-utf8 Changeset: r93137:a94b5860dbb3 Date: 2017-11-23 15:40 +0100 http://bitbucket.org/pypy/pypy/changeset/a94b5860dbb3/ Log:Fixes for _cffi_backend diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py --- a/pypy/module/_cffi_backend/ctypearray.py +++ b/pypy/module/_cffi_backend/ctypearray.py @@ -64,13 +64,10 @@ elif space.isinstance_w(w_value, space.w_unicode): from pypy.module._cffi_backend import wchar_helper w_u = space.convert_arg_to_w_unicode(w_value) -if self.citem.size == 4: +if self.ctitem.size == 2: +length = wchar_helper.utf8_size_as_char16(w_u._utf8) +else: length = w_u._len() -else: -if not w_u._has_surrogates(): -length = w_u._len() -else: -length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len()) return (w_value, length + 1) else: explicitlength = space.getindex_w(w_value, space.w_OverflowError) diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -40,16 +40,13 @@ return ord(s[0]) def cast_unicode(self, w_ob): -import pdb -pdb.set_trace() space = self.space w_u = space.convert_arg_to_w_unicode(w_ob) if w_u._len() != 1: raise oefmt(space.w_TypeError, "cannot cast unicode string of length %d to ctype '%s'", w_u._len(), self.name) -ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) -return intmask(ordinal) +return rutf8.codepoint_at_pos(w_u._utf8, 0) def cast(self, w_ob): from pypy.module._cffi_backend import ctypeptr @@ -175,21 +172,19 @@ return self.space.newint(value)# r_uint => 'long' object def convert_to_object(self, cdata): -if self.is_signed_wchar: -code = ord(rffi.cast(rffi.CWCHARP, cdata)[0]) -return self.space.newutf8( -rutf8.unichr_as_utf8(code), 1, -rutf8.get_flag_from_code(code)) -else: -value = misc.read_raw_ulong_data(cdata, self.size) # r_uint -try: -u = wchar_helper.ordinal_to_unicode(value) -except wchar_helper.OutOfRange as e: -raise oefmt(self.space.w_ValueError, -"char32_t out of range for " -"conversion to unicode: %s", hex(e.ordinal)) -return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1, -rutf8.get_flag_from_code(ord(u))) +value = misc.read_raw_ulong_data(cdata, self.size) # r_uint +try: +utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True) +except ValueError: +if self.is_signed_wchar: +s = hex(intmask(value)) +else: +s = hex(value) +raise oefmt(self.space.w_ValueError, +"%s out of range for conversion to unicode: %s", +self.name, s) +flag = rutf8.get_flag_from_code(intmask(value)) +return self.space.newutf8(utf8, 1, flag) def string(self, cdataobj, maxlen): with cdataobj as ptr: @@ -200,7 +195,13 @@ # returns a r_uint. 
If self.size == 2, it is smaller than 0x1 space = self.space if space.isinstance_w(w_ob, space.w_unicode): -return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0) +w_u = space.convert_arg_to_w_unicode(w_ob) +if w_u._len() != 1: +raise self._convert_error("single character", w_ob) +ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) +if self.size == 2 and ordinal > 0x: +raise self._convert_error("single character <= 0x", w_ob) +return r_uint(ordinal) elif (isinstance(w_ob, cdataobj.W_CData) and isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and w_ob.ctype.size == self.size): @@ -214,15 +215,15 @@ def unpack_ptr(self, w_ctypeptr, ptr, length): if self.size == 2: -u = wchar_helper.unicode_from_char16(ptr, length) +utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length) else: try: -u = wchar_helper.unicode_from_char32(ptr, length) +utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length) except wchar_helper.OutOfRange as e: raise oefmt(self.space.w_ValueError, -"char32_t out of range for " -"conversion to unicode: %s", hex(e.ordinal)) -return self.s
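The ctypearray hunk above now sizes a 2-byte wchar_t buffer by asking wchar_helper.utf8_size_as_char16() for the number of UTF-16 code units, instead of special-casing strings with surrogates. The helper's name comes from the diff; its body is not shown there, but its contract can be written in a few lines of pure Python under the usual assumption that every codepoint above U+FFFF costs one extra code unit (a surrogate pair).

    def utf8_size_as_char16(utf8):
        # UTF-16 code units needed for a valid UTF-8 string: one per
        # codepoint, plus one more per codepoint above U+FFFF.
        units = 0
        for byte in bytearray(utf8):
            if byte & 0xC0 != 0x80:   # count lead bytes, skip continuations
                units += 1
            if byte >= 0xF0:          # 4-byte sequence => astral codepoint
                units += 1
        return units

    assert utf8_size_as_char16(u'abc'.encode('utf8')) == 3
    assert utf8_size_as_char16(u'\u1234'.encode('utf8')) == 1
    assert utf8_size_as_char16(u'\U00010437'.encode('utf8')) == 2   # pair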
[pypy-commit] pypy unicode-utf8: Utf8StringBuilder
Author: fijal Branch: unicode-utf8 Changeset: r93138:9ede67aee27e Date: 2017-11-23 15:49 +0100 http://bitbucket.org/pypy/pypy/changeset/9ede67aee27e/ Log:Utf8StringBuilder diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -16,9 +16,11 @@ """ import sys -from rpython.rlib.objectmodel import enforceargs, we_are_translated +from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize from rpython.rlib.rstring import StringBuilder from rpython.rlib import jit +from rpython.rlib.signature import signature +from rpython.rlib.types import char, none from rpython.rlib.rarithmetic import r_uint from rpython.rlib.unicodedata import unicodedb from rpython.rtyper.lltypesystem import lltype, rffi @@ -316,6 +318,11 @@ return res, flag raise CheckError(~res) +def get_utf8_length_flag(s): +""" Get the length and flag out of valid utf8. For now just calls check_utf8 +""" +return check_utf8(s, True) + @jit.elidable def _check_utf8(s, allow_surrogates, start, stop): pos = start @@ -655,6 +662,53 @@ return unicode_escape #, char_escape_helper +class Utf8StringBuilder(object): +def __init__(self, size=0): +self._s = StringBuilder(size) +self._lgt = 0 +self._flag = FLAG_ASCII + +def append(self, s): +# for strings +self._s.append(s) +newlgt, newflag = get_utf8_length_flag(s) +self._lgt += newlgt +self._flag = combine_flags(self._flag, newflag) + +@signature(char(), returns=none()) +def append_char(self, s): +# for characters, ascii +self._lgt += 1 +self._s.append(s) + +def append_code(self, code): +self._flag = combine_flags(self._flag, get_flag_from_code(code)) +self._lgt += 1 +unichr_as_utf8_append(self._s, code, True) + +def build(self): +return self._s.build() + +def get_flag(self): +return self._flag + +def get_length(self): +return self._lgt + +class Utf8StringIterator(object): +def __init__(self, utf8s): +self._utf8 = utf8s +self._end = len(utf8s) +self._pos = 0 + +def done(self): +return self._pos == self._end + +def next(self): +ret = codepoint_at_pos(self._utf8, self._pos) +self._pos = next_codepoint_pos(self._utf8, self._pos) +return ret + def decode_latin_1(s): if len(s) == 0: return s diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py --- a/rpython/rlib/test/test_rutf8.py +++ b/rpython/rlib/test/test_rutf8.py @@ -139,3 +139,39 @@ result = rutf8.surrogate_in_utf8(uni) expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff') assert result == expected + +@given(strategies.text()) +def test_get_utf8_length_flag(u): +exp_lgt = len(u) +exp_flag = rutf8.FLAG_ASCII +for c in u: +if ord(c) > 0x7F: +exp_flag = rutf8.FLAG_REGULAR +lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8')) +assert lgt == exp_lgt +assert flag == exp_flag + +def test_utf8_string_builder(): +s = rutf8.Utf8StringBuilder() +s.append("foo") +s.append_char("x") +assert s.get_flag() == rutf8.FLAG_ASCII +assert s.get_length() == 4 +assert s.build() == "foox" +s.append(u"\u1234".encode("utf8")) +assert s.get_flag() == rutf8.FLAG_REGULAR +assert s.get_length() == 5 +assert s.build().decode("utf8") == u"foox\u1234" +s.append("foo") +s.append_char("x") +assert s.get_flag() == rutf8.FLAG_REGULAR +assert s.get_length() == 9 +assert s.build().decode("utf8") == u"foox\u1234foox" +s = rutf8.Utf8StringBuilder() +s.append_code(0x1234) +assert s.build().decode("utf8") == u"\u1234" +assert s.get_flag() == rutf8.FLAG_REGULAR +assert s.get_length() == 1 +s.append_code(0xD800) +assert s.get_flag() == 
rutf8.FLAG_HAS_SURROGATES +assert s.get_length() == 2 ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
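The companion helper introduced here, get_utf8_length_flag(), is documented as a thin wrapper around check_utf8() for already-valid input. Its observable result, which the builder's incremental bookkeeping has to agree with, can be modelled in a few lines of plain Python; the FLAG values are again illustrative placeholders, and the real function also validates the input.

    FLAG_ASCII, FLAG_REGULAR, FLAG_HAS_SURROGATES = 0, 1, 2   # illustrative

    def get_utf8_length_flag(utf8):
        # One pass over a *valid* utf8 string: the codepoint count is the
        # number of non-continuation bytes; the flag is ASCII unless a byte
        # >= 0x80 appears, and HAS_SURROGATES if an encoded U+D800..U+DFFF
        # (lead byte 0xED followed by 0xA0..0xBF) appears.
        length = 0
        flag = FLAG_ASCII
        data = bytearray(utf8)
        for i, b in enumerate(data):
            if b & 0xC0 != 0x80:
                length += 1
            if b >= 0x80 and flag == FLAG_ASCII:
                flag = FLAG_REGULAR
            if b == 0xED and i + 1 < len(data) and data[i + 1] >= 0xA0:
                flag = FLAG_HAS_SURROGATES
        return length, flag

    assert get_utf8_length_flag(b'foox') == (4, FLAG_ASCII)
    assert get_utf8_length_flag(u'foox\u1234'.encode('utf8')) == (5, FLAG_REGULAR)
    assert get_utf8_length_flag(b'\xed\xa0\x80') == (1, FLAG_HAS_SURROGATES)  # U+D800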
[pypy-commit] pypy unicode-utf8: merge
Author: fijal Branch: unicode-utf8 Changeset: r93139:3e45feebc910 Date: 2017-11-23 15:49 +0100 http://bitbucket.org/pypy/pypy/changeset/3e45feebc910/ Log:merge diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py --- a/pypy/module/_cffi_backend/ctypearray.py +++ b/pypy/module/_cffi_backend/ctypearray.py @@ -64,13 +64,10 @@ elif space.isinstance_w(w_value, space.w_unicode): from pypy.module._cffi_backend import wchar_helper w_u = space.convert_arg_to_w_unicode(w_value) -if self.citem.size == 4: +if self.ctitem.size == 2: +length = wchar_helper.utf8_size_as_char16(w_u._utf8) +else: length = w_u._len() -else: -if not w_u._has_surrogates(): -length = w_u._len() -else: -length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len()) return (w_value, length + 1) else: explicitlength = space.getindex_w(w_value, space.w_OverflowError) diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -40,16 +40,13 @@ return ord(s[0]) def cast_unicode(self, w_ob): -import pdb -pdb.set_trace() space = self.space w_u = space.convert_arg_to_w_unicode(w_ob) if w_u._len() != 1: raise oefmt(space.w_TypeError, "cannot cast unicode string of length %d to ctype '%s'", w_u._len(), self.name) -ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) -return intmask(ordinal) +return rutf8.codepoint_at_pos(w_u._utf8, 0) def cast(self, w_ob): from pypy.module._cffi_backend import ctypeptr @@ -175,21 +172,19 @@ return self.space.newint(value)# r_uint => 'long' object def convert_to_object(self, cdata): -if self.is_signed_wchar: -code = ord(rffi.cast(rffi.CWCHARP, cdata)[0]) -return self.space.newutf8( -rutf8.unichr_as_utf8(code), 1, -rutf8.get_flag_from_code(code)) -else: -value = misc.read_raw_ulong_data(cdata, self.size) # r_uint -try: -u = wchar_helper.ordinal_to_unicode(value) -except wchar_helper.OutOfRange as e: -raise oefmt(self.space.w_ValueError, -"char32_t out of range for " -"conversion to unicode: %s", hex(e.ordinal)) -return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1, -rutf8.get_flag_from_code(ord(u))) +value = misc.read_raw_ulong_data(cdata, self.size) # r_uint +try: +utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True) +except ValueError: +if self.is_signed_wchar: +s = hex(intmask(value)) +else: +s = hex(value) +raise oefmt(self.space.w_ValueError, +"%s out of range for conversion to unicode: %s", +self.name, s) +flag = rutf8.get_flag_from_code(intmask(value)) +return self.space.newutf8(utf8, 1, flag) def string(self, cdataobj, maxlen): with cdataobj as ptr: @@ -200,7 +195,13 @@ # returns a r_uint. 
If self.size == 2, it is smaller than 0x1 space = self.space if space.isinstance_w(w_ob, space.w_unicode): -return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0) +w_u = space.convert_arg_to_w_unicode(w_ob) +if w_u._len() != 1: +raise self._convert_error("single character", w_ob) +ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) +if self.size == 2 and ordinal > 0x: +raise self._convert_error("single character <= 0x", w_ob) +return r_uint(ordinal) elif (isinstance(w_ob, cdataobj.W_CData) and isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and w_ob.ctype.size == self.size): @@ -214,15 +215,15 @@ def unpack_ptr(self, w_ctypeptr, ptr, length): if self.size == 2: -u = wchar_helper.unicode_from_char16(ptr, length) +utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length) else: try: -u = wchar_helper.unicode_from_char32(ptr, length) +utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length) except wchar_helper.OutOfRange as e: raise oefmt(self.space.w_ValueError, -"char32_t out of range for " -"conversion to unicode: %s", hex(e.ordinal)) -return self.space.newunicode(u) +
[pypy-commit] pypy unicode-utf8: provide explicit examples
Author: fijal
Branch: unicode-utf8
Changeset: r93140:d24fe4f59c96
Date: 2017-11-23 15:57 +0100
http://bitbucket.org/pypy/pypy/changeset/d24fe4f59c96/

Log:	provide explicit examples

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -30,6 +30,7 @@
 @settings(max_examples=1)
 @given(strategies.binary(), strategies.booleans())
+@example('\xf1\x80\x80\x80', False)
 def test_check_utf8(s, allow_surrogates):
     _test_check_utf8(s, allow_surrogates)

@@ -134,19 +135,23 @@
     assert repr(u) == repr_func(u.encode('utf8'))

 @given(strategies.lists(strategies.characters()))
+@example([u'\ud800', u'\udc00'])
 def test_surrogate_in_utf8(unichars):
     uni = u''.join(unichars).encode('utf-8')
     result = rutf8.surrogate_in_utf8(uni)
     expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
     assert result == expected

-@given(strategies.text())
-def test_get_utf8_length_flag(u):
+@given(strategies.lists(strategies.characters()))
+def test_get_utf8_length_flag(unichars):
+    u = u''.join(unichars)
     exp_lgt = len(u)
     exp_flag = rutf8.FLAG_ASCII
     for c in u:
         if ord(c) > 0x7F:
             exp_flag = rutf8.FLAG_REGULAR
+        if 0xD800 <= ord(c) <= 0xDFFF:
+            exp_flag = rutf8.FLAG_HAS_SURROGATES
     lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
     assert lgt == exp_lgt
     assert flag == exp_flag
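The point of the new decorators is that hypothesis' @example() pins specific inputs (a 4-byte UTF-8 sequence, and a surrogate pair) so they are exercised deterministically on every run rather than only when the random strategy happens to produce them. A generic illustration of the pattern, independent of rutf8 and with a made-up property:

    from hypothesis import given, example, strategies as st

    @given(st.binary())
    @example(b'\xf1\x80\x80\x80')     # always exercise this input as well
    def test_decode_roundtrip_or_reject(s):
        # Property: arbitrary bytes either fail to decode as UTF-8, or they
        # decode and re-encode back to exactly the same bytes.
        try:
            u = s.decode('utf-8')
        except UnicodeDecodeError:
            return
        assert u.encode('utf-8') == s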
[pypy-commit] pypy unicode-utf8: fix test on narrow host
Author: fijal
Branch: unicode-utf8
Changeset: r93141:eb564d44a7c8
Date: 2017-11-23 16:15 +0100
http://bitbucket.org/pypy/pypy/changeset/eb564d44a7c8/

Log:	fix test on narrow host

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -57,12 +57,13 @@
         assert ~(length) == e.start
     else:
         assert valid
-        assert length == len(u)
         if flag == rutf8.FLAG_ASCII:
             s.decode('ascii') # assert did not raise
         elif flag == rutf8.FLAG_HAS_SURROGATES:
             assert allow_surrogates
             assert _has_surrogates(s)
+        if sys.maxunicode == 0x10ffff or not _has_surrogates(s):
+            assert length == len(u)

 @given(strategies.characters())
 def test_next_pos(uni):
[pypy-commit] pypy unicode-utf8: fix tests on narrow host
Author: fijal
Branch: unicode-utf8
Changeset: r93142:fa3bcbe5b09f
Date: 2017-11-23 16:17 +0100
http://bitbucket.org/pypy/pypy/changeset/fa3bcbe5b09f/

Log:	fix tests on narrow host

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -138,7 +138,7 @@
 @given(strategies.lists(strategies.characters()))
 @example([u'\ud800', u'\udc00'])
 def test_surrogate_in_utf8(unichars):
-    uni = u''.join(unichars).encode('utf-8')
+    uni = ''.join([u.encode('utf8') for u in unichars])
     result = rutf8.surrogate_in_utf8(uni)
     expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
     assert result == expected
@@ -153,6 +153,7 @@
             exp_flag = rutf8.FLAG_REGULAR
         if 0xD800 <= ord(c) <= 0xDFFF:
             exp_flag = rutf8.FLAG_HAS_SURROGATES
+            break
     lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
     assert lgt == exp_lgt
     assert flag == exp_flag
[pypy-commit] pypy unicode-utf8: more tests
Author: fijal
Branch: unicode-utf8
Changeset: r93143:e4a568e4514c
Date: 2017-11-23 16:32 +0100
http://bitbucket.org/pypy/pypy/changeset/e4a568e4514c/

Log:	more tests

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -154,8 +154,9 @@
         if 0xD800 <= ord(c) <= 0xDFFF:
             exp_flag = rutf8.FLAG_HAS_SURROGATES
             break
-    lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
-    assert lgt == exp_lgt
+    lgt, flag = rutf8.get_utf8_length_flag(''.join([c.encode('utf8') for c in u]))
+    if exp_flag != rutf8.FLAG_HAS_SURROGATES:
+        assert lgt == exp_lgt
     assert flag == exp_flag

 def test_utf8_string_builder():
@@ -182,3 +183,11 @@
     s.append_code(0xD800)
     assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
     assert s.get_length() == 2
+
+@given(strategies.text())
+def test_utf8_iterator(arg):
+    u = rutf8.Utf8StringIterator(arg.encode('utf8'))
+    l = []
+    while not u.done():
+        l.append(unichr(u.next()))
+    assert list(arg) == l
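The new test drives Utf8StringIterator through its done()/next() protocol by hand. The iterator's whole job is to yield integer codepoints straight from a utf8 buffer without building an intermediate unicode string; a hypothetical pure-Python equivalent of that walk looks like this (it assumes the input is valid UTF-8, as the real iterator does):

    def iter_codepoints(utf8):
        # Decode codepoints with plain byte arithmetic, the way the iterator
        # combines codepoint_at_pos() and next_codepoint_pos().
        data = bytearray(utf8)
        i = 0
        while i < len(data):
            b = data[i]
            if b < 0x80:
                code, size = b, 1
            elif b < 0xE0:
                code, size = b & 0x1F, 2
            elif b < 0xF0:
                code, size = b & 0x0F, 3
            else:
                code, size = b & 0x07, 4
            for j in range(1, size):
                code = (code << 6) | (data[i + j] & 0x3F)
            yield code
            i += size

    encoded = u'a\u1234\U00010437'.encode('utf8')
    assert list(iter_codepoints(encoded)) == [0x61, 0x1234, 0x10437]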
[pypy-commit] pypy unicode-utf8: merge default
Author: fijal Branch: unicode-utf8 Changeset: r93144:177352fb8cf4 Date: 2017-11-23 16:46 +0100 http://bitbucket.org/pypy/pypy/changeset/177352fb8cf4/ Log:merge default diff too long, truncating to 2000 out of 7577 lines diff --git a/.hgignore b/.hgignore --- a/.hgignore +++ b/.hgignore @@ -71,6 +71,8 @@ ^lib_pypy/.+.c$ ^lib_pypy/.+.o$ ^lib_pypy/.+.so$ +^lib_pypy/.+.pyd$ +^lib_pypy/Release/ ^pypy/doc/discussion/.+\.html$ ^include/.+\.h$ ^include/.+\.inl$ diff --git a/extra_tests/requirements.txt b/extra_tests/requirements.txt new file mode 100644 --- /dev/null +++ b/extra_tests/requirements.txt @@ -0,0 +1,2 @@ +pytest +hypothesis diff --git a/extra_tests/test_bytes.py b/extra_tests/test_bytes.py new file mode 100644 --- /dev/null +++ b/extra_tests/test_bytes.py @@ -0,0 +1,84 @@ +from hypothesis import strategies as st +from hypothesis import given, example + +st_bytestring = st.binary() | st.binary().map(bytearray) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_find(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.find(u) <= len(prefix) +assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_index(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.index(u) <= len(prefix) +assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_rfind(u, prefix, suffix): +s = prefix + u + suffix +assert s.rfind(u) >= len(prefix) +assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_rindex(u, prefix, suffix): +s = prefix + u + suffix +assert s.rindex(u) >= len(prefix) +assert s.rindex(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +def adjust_indices(u, start, end): +if end < 0: +end = max(end + len(u), 0) +else: +end = min(end, len(u)) +if start < 0: +start = max(start + len(u), 0) +return start, end + +@given(st_bytestring, st_bytestring) +def test_startswith_basic(u, v): +assert u.startswith(v) is (u[:len(v)] == v) + +@example(b'x', b'', 1) +@example(b'x', b'', 2) +@given(st_bytestring, st_bytestring, st.integers()) +def test_startswith_start(u, v, start): +expected = u[start:].startswith(v) if v else (start <= len(u)) +assert u.startswith(v, start) is expected + +@example(b'x', b'', 1, 0) +@example(b'xx', b'', -1, 0) +@given(st_bytestring, st_bytestring, st.integers(), st.integers()) +def test_startswith_3(u, v, start, end): +if v: +expected = u[start:end].startswith(v) +else: # CPython leaks implementation details in this case +start0, end0 = adjust_indices(u, start, end) +expected = start0 <= len(u) and start0 <= end0 +assert u.startswith(v, start, end) is expected + +@given(st_bytestring, st_bytestring) +def test_endswith_basic(u, v): +if len(v) > len(u): +assert u.endswith(v) is False +else: +assert u.endswith(v) is (u[len(u) - len(v):] == v) + +@example(b'x', b'', 1) +@example(b'x', b'', 2) +@given(st_bytestring, st_bytestring, st.integers()) +def test_endswith_2(u, v, start): +expected = u[start:].endswith(v) if v else (start <= len(u)) +assert u.endswith(v, start) is expected + +@example(b'x', b'', 1, 0) +@example(b'xx', b'', -1, 0) +@given(st_bytestring, st_bytestring, st.integers(), st.integers()) +def test_endswith_3(u, v, start, end): +if v: +expected = u[start:end].endswith(v) +else: # CPython leaks implementation details in this case +start0, end0 = adjust_indices(u, start, end) +expected = start0 <= len(u) and start0 
<= end0 +assert u.endswith(v, start, end) is expected diff --git a/extra_tests/test_unicode.py b/extra_tests/test_unicode.py --- a/extra_tests/test_unicode.py +++ b/extra_tests/test_unicode.py @@ -1,3 +1,4 @@ +import sys import pytest from hypothesis import strategies as st from hypothesis import given, settings, example @@ -32,3 +33,89 @@ @given(s=st.text()) def test_composition(s, norm1, norm2, norm3): assert normalize(norm2, normalize(norm1, s)) == normalize(norm3, s) + +@given(st.text(), st.text(), st.text()) +def test_find(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.find(u) <= len(prefix) +assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st.text(), st.text(), st.text()) +def test_index(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.index(u) <= len(prefix) +assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st.text(), st.text(), st.text()) +def test_rfind(u, prefix, suffix): +s = prefix + u + suffix +assert s.rfind(u) >= len(prefix) +assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st.text(), st.text(), st.text()) +def test_rindex(u, prefix, suffi
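The adjust_indices() helper in the new extra_tests above captures how CPython clamps the start/end arguments of startswith()/endswith() before comparing, which is exactly what the corner-case @example inputs (empty needle, start past the end) probe. The helper below is copied from the diff; the loop around it is an illustrative restatement of the expected-value formula used in test_startswith_3.

    def adjust_indices(u, start, end):
        # Same clamping CPython applies to startswith/endswith arguments.
        if end < 0:
            end = max(end + len(u), 0)
        else:
            end = min(end, len(u))
        if start < 0:
            start = max(start + len(u), 0)
        return start, end

    # With an empty needle the result depends only on whether the clamped
    # start is still a valid position: b'x'.startswith(b'', 1) is True,
    # but b'x'.startswith(b'', 2) is False.
    for s, start, expected in [(b'x', 1, True), (b'x', 2, False), (b'xx', -1, True)]:
        start0, end0 = adjust_indices(s, start, len(s))
        assert (start0 <= len(s) and start0 <= end0) == expected
        assert s.startswith(b'', start) is expected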
[pypy-commit] pypy default: refactor
Author: Ronan Lamy Branch: Changeset: r93145:ff05ee1c4b6a Date: 2017-11-23 16:48 + http://bitbucket.org/pypy/pypy/changeset/ff05ee1c4b6a/ Log:refactor diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -541,6 +541,10 @@ self.decoded_chars_used += size return chars +def _has_data(self): +return (self.decoded_chars is not None and +self.decoded_chars_used < len(self.decoded_chars)) + def _read_chunk(self, space): """Read and decode the next chunk of data from the BufferedReader. The return value is True unless EOF was reached. The decoded string @@ -588,6 +592,19 @@ return not eof +def _ensure_data(self, space): +while not self._has_data(): +try: +if not self._read_chunk(space): +self._unset_decoded() +self.snapshot = None +return False +except OperationError as e: +if trap_eintr(space, e): +continue +raise +return True + def next_w(self, space): self._check_attached(space) self.telling = False @@ -621,23 +638,13 @@ builder = UnicodeBuilder(size) # Keep reading chunks until we have n characters to return -while True: +while remaining > 0: +if not self._ensure_data(space): +break data = self._get_decoded_chars(remaining) builder.append(data) remaining -= len(data) -if remaining <= 0: # Done -break - -try: -if not self._read_chunk(space): -# EOF -break -except OperationError as e: -if trap_eintr(space, e): -continue -raise - return space.newunicode(builder.build()) def readline_w(self, space, w_limit=None): @@ -653,20 +660,9 @@ while True: # First, get some data if necessary -has_data = True -while not self.decoded_chars: -try: -if not self._read_chunk(space): -has_data = False -break -except OperationError as e: -if trap_eintr(space, e): -continue -raise +has_data = self._ensure_data(space) if not has_data: # end of file -self._unset_decoded() -self.snapshot = None start = endpos = offset_to_buffer = 0 break ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
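The refactor moves the "call _read_chunk() until decoded data is available, retrying on EINTR" loop out of read_w() and readline_w() into one _ensure_data() helper, so both callers reduce to "while more is needed: ensure data, then consume it". The class below is an illustrative model of that control flow only (no decoder, no EINTR handling, chunks come from a plain list); all names other than _has_data/_ensure_data/_read_chunk are hypothetical.

    class DecodingReaderSketch(object):
        def __init__(self, chunks):
            self._chunks = list(chunks)
            self.decoded_chars = None
            self.decoded_chars_used = 0

        def _has_data(self):
            return (self.decoded_chars is not None and
                    self.decoded_chars_used < len(self.decoded_chars))

        def _read_chunk(self):
            if not self._chunks:
                return False                  # EOF
            self.decoded_chars = self._chunks.pop(0)
            self.decoded_chars_used = 0
            return True

        def _ensure_data(self):
            # the new helper: loop until data is buffered or EOF is reached
            while not self._has_data():
                if not self._read_chunk():
                    self.decoded_chars = None
                    return False
            return True

        def read(self, size):
            out = []
            remaining = size
            while remaining > 0:
                if not self._ensure_data():
                    break
                used = self.decoded_chars_used
                chunk = self.decoded_chars[used:used + remaining]
                self.decoded_chars_used += len(chunk)
                remaining -= len(chunk)
                out.append(chunk)
            return ''.join(out)

    r = DecodingReaderSketch(["abc", "defgh"])
    assert r.read(4) == "abcd"
    assert r.read(10) == "efgh"
    assert r.read(3) == ""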
[pypy-commit] pypy unicode-utf8: fix multibytecodec
Author: fijal Branch: unicode-utf8 Changeset: r93146:99ca8cf9bbc4 Date: 2017-11-23 18:30 +0100 http://bitbucket.org/pypy/pypy/changeset/99ca8cf9bbc4/ Log:fix multibytecodec diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -197,19 +197,21 @@ MBENC_FLUSH = 1 MBENC_RESET = 2 -def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None): +def encode(codec, unicodedata, length, errors="strict", errorcb=None, + namecb=None): encodebuf = pypy_cjk_enc_new(codec) if not encodebuf: raise MemoryError try: -return encodeex(encodebuf, unicodedata, errors, errorcb, namecb) +return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb) finally: pypy_cjk_enc_free(encodebuf) -def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None, +def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None, namecb=None, ignore_error=0): -inleft = len(unicodedata) -with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf: +inleft = length +inbuf = rffi.utf82wcharp(utf8data, length) +try: if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0: raise MemoryError if ignore_error == 0: @@ -221,16 +223,18 @@ if r == 0 or r == ignore_error: break multibytecodec_encerror(encodebuf, r, errors, -errorcb, namecb, unicodedata) +errorcb, namecb, utf8data) while flags & MBENC_RESET: r = pypy_cjk_enc_reset(encodebuf) if r == 0: break multibytecodec_encerror(encodebuf, r, errors, -errorcb, namecb, unicodedata) +errorcb, namecb, utf8data) src = pypy_cjk_enc_outbuf(encodebuf) length = pypy_cjk_enc_outlen(encodebuf) return rffi.charpsize2str(src, length) +finally: +lltype.free(inbuf, flavor='raw') def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb, unicodedata): @@ -256,21 +260,16 @@ elif errors == "replace": codec = pypy_cjk_enc_getcodec(encodebuf) try: -replace = encode(codec, u"?") +replace = encode(codec, "?", 1) except EncodeDecodeError: replace = "?" 
else: assert errorcb -XXX -retu, rets, end = errorcb(errors, namecb, reason, - unicodedata.encode("utf8"), start, end) -if rets is not None: -# py3k only -replace = rets -else: -assert retu is not None -codec = pypy_cjk_enc_getcodec(encodebuf) -replace = encode(codec, retu, "strict", errorcb, namecb) +rets, end = errorcb(errors, namecb, reason, +unicodedata, start, end) +codec = pypy_cjk_enc_getcodec(encodebuf) +lgt, _ = rutf8.get_utf8_length_flag(rets) +replace = encode(codec, rets, lgt, "strict", errorcb, namecb) with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) if r == MBERR_NOMEMORY: diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -1,4 +1,5 @@ from rpython.rtyper.lltypesystem import lltype +from rpython.rlib import rutf8 from pypy.module._multibytecodec import c_codecs from pypy.module._multibytecodec.interp_multibytecodec import ( MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror, @@ -65,7 +66,8 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) self.pending = object[pos:] -return space.newunicode(output) +lgt, flag = rutf8.get_utf8_length_flag(output) +return space.newutf8(output, lgt, flag) @unwrap_spec(errors="text_or_none") @@ -88,7 +90,8 @@ def _initialize(self): self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec) -self.pending = u"" +self.pending = "" +self.pending_len = 0 def _free(self): self.pending = None @@ -96,25 +99,37 @@ c_codecs.pypy_cjk_enc_free(self.encodebuf) self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO) -@unwrap_spec(object='utf8', final=bool) -def encode_w(self, object, final=False): -u_object = object.decode('utf8') +@unwrap_spec(final=bool) +def encode_w(self, space, w_object, final=False): +utf8data, length = space.utf8_len_w(w_object)
[pypy-commit] pypy default: Simplify _find_line_ending() and fix logic in the case of embedded \r and self.readnl=='\r\n'
Author: Ronan Lamy Branch: Changeset: r93147:8369cd92f7d0 Date: 2017-11-23 17:52 + http://bitbucket.org/pypy/pypy/changeset/8369cd92f7d0/ Log:Simplify _find_line_ending() and fix logic in the case of embedded \r and self.readnl=='\r\n' diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -216,14 +216,7 @@ def _find_line_ending(self, line, start, end): size = end - start -if self.readtranslate: -# Newlines are already translated, only search for \n -pos = line.find(u'\n', start, end) -if pos >= 0: -return pos + 1, 0 -else: -return -1, size -elif self.readuniversal: +if self.readuniversal: # Universal newline search. Find any of \r, \r\n, \n # The decoder ensures that \r\n are not split in two pieces i = start @@ -242,16 +235,22 @@ return i + 1, 0 else: return i, 0 +if self.readtranslate: +# Newlines are already translated, only search for \n +newline = u'\n' else: # Non-universal mode. -pos = line.find(self.readnl, start, end) -if pos >= 0: -return pos + len(self.readnl), 0 -else: -pos = line.find(self.readnl[0], start, end) -if pos >= 0: -return -1, pos - start -return -1, size +newline = self.readnl +end_scan = end - len(newline) + 1 +for i in range(start, end_scan): +ch = line[i] +if ch == newline[0]: +for j in range(1, len(newline)): +if line[i + j] != newline[j]: +break +else: +return i + len(newline), 0 +return -1, end_scan W_TextIOBase.typedef = TypeDef( ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
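The rewritten non-universal branch scans for the (possibly two-character) readnl by hand, and the new bound end_scan = end - len(newline) + 1 is what fixes the embedded-\r case: a bare '\r' in the middle of the buffer no longer cuts the search short, and a trailing partial match is reported as "not found". A pure-Python restatement of that branch (the function name is hypothetical; the body mirrors the diff):

    def find_line_ending_nonuniversal(line, newline, start, end):
        # Returns (endpos, scan_bound): endpos is the absolute index just
        # past a complete occurrence of `newline`, or -1; when not found,
        # the second value is end_scan, the last position at which a full
        # newline could still have started.
        end_scan = end - len(newline) + 1
        for i in range(start, end_scan):
            if line[i] == newline[0]:
                for j in range(1, len(newline)):
                    if line[i + j] != newline[j]:
                        break
                else:
                    return i + len(newline), 0
        return -1, end_scan

    # With readnl == '\r\n', an embedded bare '\r' is skipped over:
    assert find_line_ending_nonuniversal('a\rb\r\nc', '\r\n', 0, 6) == (5, 0)
    # ...and a buffer ending in '\r' is "not found yet" (the '\r' may be the
    # first half of a newline completed by the next chunk):
    assert find_line_ending_nonuniversal('abc\r', '\r\n', 0, 4) == (-1, 3)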
[pypy-commit] pypy unicode-utf8: one part of interp_sre
Author: fijal Branch: unicode-utf8 Changeset: r93148:5a057586add0 Date: 2017-11-23 19:02 +0100 http://bitbucket.org/pypy/pypy/changeset/5a057586add0/ Log:one part of interp_sre diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -7,7 +7,8 @@ from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.rarithmetic import intmask from rpython.rlib import jit -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder +from rpython.rlib.rstring import StringBuilder +from rpython.rlib.rutf8 import Utf8StringBuilder # # @@ -237,8 +238,8 @@ filter_is_callable = True else: if space.isinstance_w(w_ptemplate, space.w_unicode): -filter_as_unicode = space.unicode_w(w_ptemplate) -literal = u'\\' not in filter_as_unicode +filter_as_unicode = space.utf8_w(w_ptemplate) +literal = '\\' not in filter_as_unicode use_builder = ( space.isinstance_w(w_string, space.w_unicode) and literal) else: @@ -267,7 +268,7 @@ sublist_w = strbuilder = unicodebuilder = None if use_builder: if filter_as_unicode is not None: -unicodebuilder = UnicodeBuilder(ctx.end) +unicodebuilder = Utf8StringBuilder(ctx.end) else: assert filter_as_string is not None strbuilder = StringBuilder(ctx.end) @@ -335,7 +336,9 @@ return space.newbytes(strbuilder.build()), n else: assert unicodebuilder is not None -return space.newunicode(unicodebuilder.build()), n +return space.newutf8(unicodebuilder.build(), + unicodebuilder.get_length(), + unicodebuilder.get_flag()), n else: if space.isinstance_w(w_string, space.w_unicode): w_emptystr = space.newunicode(u'') ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: hg merge default
Author: Ronan Lamy Branch: unicode-utf8 Changeset: r93149:0797bb6394b6 Date: 2017-11-23 18:07 + http://bitbucket.org/pypy/pypy/changeset/0797bb6394b6/ Log:hg merge default diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -223,14 +223,7 @@ def _find_line_ending(self, line, start, end): size = end - start -if self.readtranslate: -# Newlines are already translated, only search for \n -pos = line.find('\n', start, end) -if pos >= 0: -return pos + 1, 0 -else: -return -1, size -elif self.readuniversal: +if self.readuniversal: # Universal newline search. Find any of \r, \r\n, \n # The decoder ensures that \r\n are not split in two pieces i = start @@ -249,16 +242,22 @@ return i + 1, 0 else: return i, 0 +if self.readtranslate: +# Newlines are already translated, only search for \n +newline = '\n' else: # Non-universal mode. -pos = line.find(self.readnl, start, end) -if pos >= 0: -return pos + len(self.readnl), 0 -else: -pos = line.find(self.readnl[0], start, end) -if pos >= 0: -return -1, pos - start -return -1, size +newline = self.readnl +end_scan = end - len(newline) + 1 +for i in range(start, end_scan): +ch = line[i] +if ch == newline[0]: +for j in range(1, len(newline)): +if line[i + j] != newline[j]: +break +else: +return i + len(newline), 0 +return -1, end_scan W_TextIOBase.typedef = TypeDef( @@ -548,6 +547,10 @@ self.decoded_chars_used += size return chars +def _has_data(self): +return (self.decoded_chars is not None and +self.decoded_chars_used < len(self.decoded_chars)) + def _read_chunk(self, space): """Read and decode the next chunk of data from the BufferedReader. The return value is True unless EOF was reached. The decoded string @@ -595,6 +598,19 @@ return not eof +def _ensure_data(self, space): +while not self._has_data(): +try: +if not self._read_chunk(space): +self._unset_decoded() +self.snapshot = None +return False +except OperationError as e: +if trap_eintr(space, e): +continue +raise +return True + def next_w(self, space): self._check_attached(space) self.telling = False @@ -628,23 +644,13 @@ builder = StringBuilder(size) # Keep reading chunks until we have n characters to return -while True: +while remaining > 0: +if not self._ensure_data(space): +break data = self._get_decoded_chars(remaining) builder.append(data) remaining -= len(data) -if remaining <= 0: # Done -break - -try: -if not self._read_chunk(space): -# EOF -break -except OperationError as e: -if trap_eintr(space, e): -continue -raise - return space.new_from_utf8(builder.build()) def readline_w(self, space, w_limit=None): @@ -660,20 +666,9 @@ while True: # First, get some data if necessary -has_data = True -while not self.decoded_chars: -try: -if not self._read_chunk(space): -has_data = False -break -except OperationError as e: -if trap_eintr(space, e): -continue -raise +has_data = self._ensure_data(space) if not has_data: # end of file -self._unset_decoded() -self.snapshot = None start = endpos = offset_to_buffer = 0 break ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy default: fix test use of eci for vmprof_stop_sampling, vmprof_start_sampling
Author: Matti Picus
Branch: 
Changeset: r93151:72001f56a97f
Date: 2017-11-23 20:28 +0200
http://bitbucket.org/pypy/pypy/changeset/72001f56a97f/

Log:	fix test use of eci for vmprof_stop_sampling, vmprof_start_sampling

diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -9,6 +9,7 @@
 from rpython.rtyper.tool import rffi_platform as platform
 from rpython.rlib import rthread, jit
 from rpython.rlib.objectmodel import we_are_translated
+from rpython.config.translationoption import get_translation_config

 class VMProfPlatformUnsupported(Exception):
     pass
@@ -133,11 +134,17 @@
 #endif
 """])

+if get_translation_config() is None:
+    # tests need the full eci here
+    _eci = global_eci
+else:
+    _eci = auto_eci
+
 vmprof_stop_sampling = rffi.llexternal("vmprof_stop_sampling", [],
-                                       rffi.INT, compilation_info=auto_eci,
+                                       rffi.INT, compilation_info=_eci,
                                        _nowrapper=True)
 vmprof_start_sampling = rffi.llexternal("vmprof_start_sampling", [],
-                                        lltype.Void, compilation_info=auto_eci,
+                                        lltype.Void, compilation_info=_eci,
                                         _nowrapper=True)
[pypy-commit] pypy default: cannot pip install vmprof on arm, s390x
Author: Matti Picus
Branch: 
Changeset: r93150:8c42f0f755c0
Date: 2017-11-23 18:48 +0200
http://bitbucket.org/pypy/pypy/changeset/8c42f0f755c0/

Log:	cannot pip install vmprof on arm, s390x

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 cffi>=1.4.0
-vmprof>=0.4.10 # required to parse log files in rvmprof tests
+
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine  #skip arm, s390x

 # hypothesis is used for test generation on untranslated tests
 hypothesis
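The replacement requirement uses a PEP 508 environment marker, so installing from requirements.txt simply skips vmprof on machines whose platform.machine() does not contain 'x86' (the ARM and s390x buildbots) instead of failing. Roughly how such a marker is evaluated at install time, as an illustration only: pip delegates to the 'packaging' library, and the canonical marker variable is platform_machine (the diff uses the older dotted spelling).

    import platform
    from packaging.markers import Marker

    marker = Marker("'x86' in platform_machine")
    print(platform.machine())   # e.g. 'x86_64' on a typical build machine
    print(marker.evaluate())    # True on x86/x86_64, False on arm or s390x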
[pypy-commit] pypy py3.5: merge default into py3.5
Author: Matti Picus Branch: py3.5 Changeset: r93152:ce6402cbdf3c Date: 2017-11-23 22:08 +0200 http://bitbucket.org/pypy/pypy/changeset/ce6402cbdf3c/ Log:merge default into py3.5 diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -184,9 +184,7 @@ start, end ) -if endpos >= 0: -endpos += start -else: +if endpos < 0: endpos = end assert endpos >= 0 self.pos = endpos diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -216,44 +216,41 @@ def _find_line_ending(self, line, start, end): size = end - start -if self.readtranslate: - -# Newlines are already translated, only search for \n -pos = line.find(u'\n', start, end) -if pos >= 0: -return pos - start + 1, 0 -else: -return -1, size -elif self.readuniversal: +if self.readuniversal: # Universal newline search. Find any of \r, \r\n, \n # The decoder ensures that \r\n are not split in two pieces -i = 0 +i = start while True: -# Fast path for non-control chars. The loop always ends -# since the Py_UNICODE storage is NUL-terminated. -while i < size and line[start + i] > '\r': +# Fast path for non-control chars. +while i < end and line[i] > '\r': i += 1 -if i >= size: +if i >= end: return -1, size -ch = line[start + i] +ch = line[i] i += 1 if ch == '\n': return i, 0 if ch == '\r': -if line[start + i] == '\n': +if line[i] == '\n': return i + 1, 0 else: return i, 0 +if self.readtranslate: +# Newlines are already translated, only search for \n +newline = u'\n' else: # Non-universal mode. -pos = line.find(self.readnl, start, end) -if pos >= 0: -return pos - start + len(self.readnl), 0 -else: -pos = line.find(self.readnl[0], start, end) -if pos >= 0: -return -1, pos - start -return -1, size +newline = self.readnl +end_scan = end - len(newline) + 1 +for i in range(start, end_scan): +ch = line[i] +if ch == newline[0]: +for j in range(1, len(newline)): +if line[i + j] != newline[j]: +break +else: +return i + len(newline), 0 +return -1, end_scan W_TextIOBase.typedef = TypeDef( @@ -549,8 +546,13 @@ # _ # read methods -def _set_decoded_chars(self, chars): -self.decoded_chars = chars +def _unset_decoded(self): +self.decoded_chars = None +self.decoded_chars_used = 0 + +def _set_decoded(self, space, w_decoded): +check_decoded(space, w_decoded) +self.decoded_chars = space.unicode_w(w_decoded) self.decoded_chars_used = 0 def _get_decoded_chars(self, size): @@ -574,6 +576,10 @@ self.decoded_chars_used += size return chars +def _has_data(self): +return (self.decoded_chars is not None and +self.decoded_chars_used < len(self.decoded_chars)) + def _read_chunk(self, space): """Read and decode the next chunk of data from the BufferedReader. The return value is True unless EOF was reached. The decoded string @@ -616,8 +622,7 @@ eof = input_buf.getlength() == 0 w_decoded = space.call_method(self.w_decoder, "decode", w_input, space.newbool(eof)) -check_decoded(space, w_decoded) -self._set_decoded_chars(space.unicode_w(w_decoded)) +self._set_decoded(space, w_decoded) if space.len_w(w_decoded) > 0: eof = False @@ -629,6 +634,19 @@ return not eof +def _ensure_data(self, space): +while not self._has_data(): +try: +if not self._read_chunk(space): +self._unset_decoded() +self.snapshot = None +return False +except OperationError as e: +if trap_eintr(space, e): +continue +raise +return True + def next_w(sel
[pypy-commit] pypy default: generate conf.h for tests
Author: Matti Picus
Branch: 
Changeset: r93153:d7c94a4970dd
Date: 2017-11-24 09:16 +0200
http://bitbucket.org/pypy/pypy/changeset/d7c94a4970dd/

Log:	generate conf.h for tests

diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+    if sys.platform.startswith('linux'):
+        from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+        configure_libbacktrace_linux()
[pypy-commit] pypy py3.5: merge default into py3.5
Author: Matti Picus
Branch: py3.5
Changeset: r93154:d2807ddb8178
Date: 2017-11-24 09:17 +0200
http://bitbucket.org/pypy/pypy/changeset/d2807ddb8178/

Log:	merge default into py3.5

diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+    if sys.platform.startswith('linux'):
+        from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+        configure_libbacktrace_linux()