Author: fijal Branch: unicode-utf8 Changeset: r93113:5ffbd0a736d9 Date: 2017-11-21 15:51 +0100 http://bitbucket.org/pypy/pypy/changeset/5ffbd0a736d9/
Log: some rpython fixes diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -211,12 +211,16 @@ r, newpos = errorhandler(errors, 'ascii', msg, utf8, pos, endpos) for j in range(newpos - pos): + i = rutf8.next_codepoint_pos(utf8, i) + + j = 0 + while j < len(r): c = rutf8.codepoint_at_pos(r, j) if c > 0x7F: errorhandler("strict", 'ascii', 'ordinal not in range(128)', utf8, - pos, pos + 1) - i = rutf8.next_codepoint_pos(utf8, i) + pos, pos + 1) + j = rutf8.next_codepoint_pos(r, j) pos = newpos res.append(r) else: @@ -382,8 +386,8 @@ size, flag = rutf8.check_utf8(res, True) builder.append(res) else: - rutf8.unichr_as_utf8_append(builder, chr, True) - flag = rutf8.get_flag_from_code(chr) + rutf8.unichr_as_utf8_append(builder, intmask(chr), True) + flag = rutf8.get_flag_from_code(intmask(chr)) pos += digits size = 1 @@ -755,27 +759,31 @@ if inShift: # in a base-64 section if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch) + assert base64buffer >= 0 base64bits += 6 pos += 1 if base64bits >= 16: # enough bits for a UTF-16 value outCh = base64buffer >> (base64bits - 16) + assert outCh >= 0 base64bits -= 16 base64buffer &= (1 << base64bits) - 1 # clear high bits assert outCh <= 0xffff if surrogate: # expecting a second surrogate if outCh >= 0xDC00 and outCh <= 0xDFFF: - xxxx - result.append( - UNICHR((((surrogate & 0x3FF)<<10) | - (outCh & 0x3FF)) + 0x10000)) + code = (((surrogate & 0x3FF)<<10) | + (outCh & 0x3FF)) + 0x10000 + rutf8.unichr_as_utf8_append(result, code) + outsize += 1 + flag = combine_flags(flag, rutf8.FLAG_REGULAR) surrogate = 0 continue else: - YYYY - result.append(unichr(surrogate)) + rutf8.unichr_as_utf8_append(result, surrogate) + flag = rutf8.FLAG_HAS_SURROGATES + outsize += 1 surrogate = 0 # Not done with outCh: falls back to next line if outCh >= 0xD800 and outCh <= 0xDBFF: @@ -784,6 +792,7 @@ else: flag = combine_flags(flag, rutf8.unichr_to_flag(outCh)) outsize += 1 + assert outCh >= 0 rutf8.unichr_as_utf8_append(result, outCh, True) else: diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -19,7 +19,7 @@ from rpython.rlib.objectmodel import enforceargs, we_are_translated from rpython.rlib.rstring import StringBuilder from rpython.rlib import jit -from rpython.rlib.rarithmetic import r_uint, intmask +from rpython.rlib.rarithmetic import r_uint from rpython.rlib.unicodedata import unicodedb from rpython.rtyper.lltypesystem import lltype, rffi @@ -27,6 +27,7 @@ def unichr_as_utf8(code, allow_surrogates=False): """Encode code (numeric value) as utf8 encoded string """ + assert code >= 0 code = r_uint(code) if code <= r_uint(0x7F): # Encode ASCII _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit