Author: fijal Branch: unicode-utf8 Changeset: r93114:cefc9ed0b4c5 Date: 2017-11-21 17:19 +0100 http://bitbucket.org/pypy/pypy/changeset/cefc9ed0b4c5/
Log: general progress diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -74,8 +74,8 @@ substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: - utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr) - w_u = space.newutf8(utf, lgt, flag) + lgt, flag = unicodehelper.check_utf8_or_raise(space, substr) + w_u = space.newutf8(substr, lgt, flag) w_v = unicodehelper.encode(space, w_u, encoding) return w_v else: diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1094,9 +1094,9 @@ byteorder = BYTEORDER pos = 0 + index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) - pos = rutf8.next_codepoint_pos(s, pos) if ch < 0xD800: _STORECHAR(result, ch, byteorder) @@ -1106,27 +1106,27 @@ elif ch >= 0xE000 or allow_surrogates: _STORECHAR(result, ch, byteorder) else: - ru, pos = errorhandler(errors, public_encoding_name, + ru, newindex = errorhandler(errors, public_encoding_name, 'surrogates not allowed', s, pos-1, pos) - xxx - #if rs is not None: - # # py3k only - # if len(rs) % 2 != 0: - # errorhandler('strict', public_encoding_name, - # 'surrogates not allowed', - # s, pos-1, pos) - # result.append(rs) - # continue - for ch in ru: + for j in range(newindex - index): + pos = rutf8.next_codepoint_pos(s, pos) + j = 0 + while j < len(ru): + ch = rutf8.codepoint_at_pos(ru, j) if ord(ch) < 0xD800: _STORECHAR(result, ord(ch), byteorder) else: errorhandler('strict', public_encoding_name, 'surrogates not allowed', s, pos-1, pos) + j = rutf8.next_codepoint_pos(ru, j) + index = newindex continue + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + return result.build() def utf8_encode_utf_16(s, errors, @@ -1285,32 +1285,30 @@ byteorder = BYTEORDER pos = 0 + index = 0 while pos < size: ch = rutf8.codepoint_at_pos(s, pos) pos = rutf8.next_codepoint_pos(s, pos) - ch2 = 0 if not allow_surrogates and 0xD800 <= ch < 0xE000: - ru, pos = errorhandler(errors, public_encoding_name, + ru, newindex = errorhandler(errors, public_encoding_name, 'surrogates not allowed', s, pos-1, pos) - XXX - if rs is not None: - # py3k only - if len(rs) % 4 != 0: - errorhandler('strict', public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - result.append(rs) - continue - for ch in ru: + for j in range(newindex - index): + pos = rutf8.next_codepoint_pos(s, pos) + j = 0 + while j < len(ru): + ch = rutf8.codepoint_at_pos(ru, j) if ord(ch) < 0xD800: _STORECHAR32(result, ord(ch), byteorder) else: errorhandler('strict', public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) + 'surrogates not allowed', + s, pos-1, pos) + j = rutf8.next_codepoint_pos(ru, j) + index = newindex continue _STORECHAR32(result, ch, byteorder) + index += 1 return result.build() diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -2,8 +2,9 @@ from rpython.rtyper.lltypesystem import lltype, rffi from rpython.translator.tool.cbuild import ExternalCompilationInfo from rpython.translator import cdir +from rpython.rlib import rutf8 -UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD' +UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'.encode("utf8") class EncodeDecodeError(Exception): @@ -126,7 +127,7 @@ errorcb, namecb, stringdata) src = pypy_cjk_dec_outbuf(decodebuf) length = pypy_cjk_dec_outlen(decodebuf) - return rffi.wcharpsize2unicode(src, length) + return rffi.wcharpsize2utf8(src, length) def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb, stringdata): @@ -148,7 +149,7 @@ if errors == "strict": raise EncodeDecodeError(start, end, reason) elif errors == "ignore": - replace = u"" + replace = "" elif errors == "replace": replace = UNICODE_REPLACEMENT_CHARACTER else: @@ -156,8 +157,12 @@ replace, end = errorcb(errors, namecb, reason, stringdata, start, end) # 'replace' is RPython unicode here - with rffi.scoped_nonmoving_unicodebuffer(replace) as inbuf: - r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end) + lgt, _ = rutf8.check_utf8(replace, True) + inbuf = rffi.utf82wcharp(replace, lgt) + try: + r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end) + finally: + lltype.free(inbuf, flavor='raw') if r == MBERR_NOMEMORY: raise MemoryError @@ -256,6 +261,7 @@ replace = "?" else: assert errorcb + XXX retu, rets, end = errorcb(errors, namecb, reason, unicodedata.encode("utf8"), start, end) if rets is not None: diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -1,3 +1,6 @@ + +from rpython.rlib import rutf8 + from pypy.interpreter.baseobjspace import W_Root from pypy.interpreter.gateway import interp2app, unwrap_spec from pypy.interpreter.typedef import TypeDef @@ -18,13 +21,14 @@ state = space.fromcache(CodecState) # try: - u_output = c_codecs.decode(self.codec, input, errors, + utf8_output = c_codecs.decode(self.codec, input, errors, state.decode_error_handler, self.name) except c_codecs.EncodeDecodeError as e: raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) - return space.newtuple([space.newunicode(u_output), + lgt, flag = rutf8.check_utf8(utf8_output, True) + return space.newtuple([space.newutf8(utf8_output, lgt, flag), space.newint(len(input))]) @unwrap_spec(input='utf8', errors="text_or_none") @@ -74,7 +78,7 @@ space.newtext(e.reason)])) def wrap_unicodeencodeerror(space, e, input, inputlen, name): - flag = 13 + _, flag = rutf8.check_utf8(input, True) raise OperationError( space.w_UnicodeEncodeError, space.newtuple([ diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py --- a/pypy/objspace/std/newformat.py +++ b/pypy/objspace/std/newformat.py @@ -841,8 +841,7 @@ prefix = "0x" as_str = value.format(LONG_DIGITS[:base], prefix) if self.is_unicode: - XXX - return as_str.decode("latin-1") + return rutf8.decode_latin_1(as_str) return as_str def _int_to_base(self, base, value): diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1009,6 +1009,29 @@ wcharp2unicoden, wcharpsize2unicode, unicode2wchararray, unicode2rawmem, ) = make_string_mappings(unicode) +def wcharpsize2utf8(w, size): + """ Helper to convert WCHARP pointer to utf8 in one go. + Equivalent to wcharpsize2unicode().encode("utf8") + """ + from rpython.rlib import rutf8 + + s = StringBuilder(size) + for i in range(size): + rutf8.unichr_as_utf8_append(s, ord(w[i])) + return s.build() + +def utf82wcharp(utf8, utf8len): + from rpython.rlib import rutf8 + + w = lltype.malloc(CWCHARP.TO, utf8len, flavor='raw') + i = 0 + index = 0 + while i < len(utf8): + w[index] = unichr(rutf8.codepoint_at_pos(utf8, i)) + i = rutf8.next_codepoint_pos(utf8, i) + index += 1 + return w + # char** CCHARPP = lltype.Ptr(lltype.Array(CCHARP, hints={'nolength': True})) diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py --- a/rpython/rtyper/lltypesystem/test/test_rffi.py +++ b/rpython/rtyper/lltypesystem/test/test_rffi.py @@ -590,6 +590,14 @@ res = fn(expected_extra_mallocs=range(30)) assert res == 32 * len(d) + def test_wcharp_to_utf8(self): + wchar = lltype.malloc(CWCHARP.TO, 3, flavor='raw') + wchar[0] = u'\u1234' + wchar[1] = u'\x80' + wchar[2] = u'a' + assert wcharpsize2utf8(wchar, 3).decode("utf8") == u'\u1234\x80a' + lltype.free(wchar, flavor='raw') + class TestRffiInternals: def test_struct_create(self): X = CStruct('xx', ('one', INT)) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit