Author: fijal Branch: unicode-utf8 Changeset: r93106:b2f3bd9151c0 Date: 2017-11-20 23:05 +0100 http://bitbucket.org/pypy/pypy/changeset/b2f3bd9151c0/
Log: work on formatting diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -25,7 +25,7 @@ # Fast version of the "strict" errors handler. def raise_unicode_exception_encode(errors, encoding, msg, utf8, startingpos, endingpos): - u_len, flag = rutf8.check_utf8(utf8) + u_len, flag = rutf8.check_utf8(utf8, True) raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), space.newutf8(utf8, u_len, flag), @@ -60,13 +60,6 @@ return True return False -def get_flag_from_code(oc): - if oc <= 0x7F: - return rutf8.FLAG_ASCII - if 0xD800 <= oc <= 0xDFFF: - return rutf8.FLAG_HAS_SURROGATES - return rutf8.FLAG_REGULAR - # These functions take and return unwrapped rpython strings def decode_unicode_escape(space, string): state = space.fromcache(interp_codecs.CodecState) @@ -138,6 +131,24 @@ except rutf8.CheckError: return _str_decode_latin_1_slowpath(s, errors, final, errorhandler) +def _str_decode_latin_1_slowpath(s, errors, final, errorhandler): + res = StringBuilder(len(s)) + i = 0 + while i < len(s): + if ord(s[i]) > 0x7F: + while i < len(s) and ord(s[i]) > 0x7F: + rutf8.unichr_as_utf8_append(res, ord(s[i])) + i += 1 + else: + start = i + end = i + 1 + while end < len(s) and ord(s[end]) <= 0x7F: + end += 1 + res.append_slice(s, start, end) + i = end + # cannot be ASCII, cannot have surrogates, I believe + return res.build(), len(s), len(s), rutf8.FLAG_REGULAR + def utf8_encode_latin_1(s, errors, errorhandler): try: rutf8.check_ascii(s) @@ -159,7 +170,6 @@ res.append(chr(oc)) i += 1 else: - XXX r, pos = errorhandler(errors, 'latin1', 'ordinal not in range(256)', s, cur, cur + 1) @@ -358,7 +368,7 @@ builder.append(res) else: rutf8.unichr_as_utf8_append(builder, chr, True) - flag = get_flag_from_code(chr) + flag = rutf8.get_flag_from_code(chr) pos += digits size = 1 @@ -503,7 +513,7 @@ continue pos = look + 1 outsize += 1 - flag = combine_flags(flag, get_flag_from_code(code)) + flag = combine_flags(flag, rutf8.get_flag_from_code(code)) rutf8.unichr_as_utf8_append(builder, code) else: res, pos = errorhandler(errors, "unicodeescape", diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py --- a/pypy/objspace/std/bytearrayobject.py +++ b/pypy/objspace/std/bytearrayobject.py @@ -189,14 +189,17 @@ return new_bytearray(space, w_bytearraytype, []) def descr_reduce(self, space): + from pypy.interpreter.unicodehelper import str_decode_latin_1 + assert isinstance(self, W_BytearrayObject) w_dict = self.getdict(space) if w_dict is None: w_dict = space.w_None + s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict', + True, None) return space.newtuple([ space.type(self), space.newtuple([ - space.newunicode(''.join(self.getdata()).decode('latin-1')), - space.newtext('latin-1')]), + space.newutf8(s, lgt, flag), space.newtext('latin-1')]), w_dict]) @staticmethod diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -1,11 +1,11 @@ """String formatting routines""" import sys -from rpython.rlib import jit +from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import specialize from rpython.rlib.rarithmetic import INT_MAX from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder +from rpython.rlib.rstring import StringBuilder from rpython.rlib.unroll import unrolling_iterable from rpython.tool.sourcetools import func_with_new_name @@ -153,18 +153,15 @@ # to build two subclasses of the BaseStringFormatter class, # each one getting its own subtle differences and RPython types. - if do_unicode: - const = unicode - else: - const = str - class StringFormatter(BaseStringFormatter): def __init__(self, space, fmt, values_w, w_valuedict): BaseStringFormatter.__init__(self, space, values_w, w_valuedict) - self.fmt = fmt # either a string or a unicode + self.fmt = fmt # always a string, if unicode, utf8 encoded def peekchr(self): - # return the 'current' character + # Return the 'current' character. Note that this returns utf8 + # encoded part, but this is ok since we only need one-character + # comparisons try: return self.fmt[self.fmtpos] except IndexError: @@ -201,7 +198,8 @@ if self.w_valuedict is None: raise oefmt(space.w_TypeError, "format requires a mapping") if do_unicode: - w_key = space.newunicode(key) + lgt, flag = rutf8.check_utf8(key, True) + w_key = space.newutf8(key, lgt, flag) else: w_key = space.newbytes(key) return space.getitem(self.w_valuedict, w_key) @@ -287,10 +285,7 @@ @jit.look_inside_iff(lambda self: jit.isconstant(self.fmt)) def format(self): lgt = len(self.fmt) + 4 * len(self.values_w) + 10 - if do_unicode: - result = UnicodeBuilder(lgt) - else: - result = StringBuilder(lgt) + result = StringBuilder(lgt) self.result = result while True: # fast path: consume as many characters as possible @@ -311,7 +306,7 @@ c = self.peekchr() self.forward() if c == '%': - self.std_wp(const('%')) + self.std_wp('%', False) continue if w_value is None: w_value = self.nextinputvalue() @@ -333,22 +328,27 @@ def unknown_fmtchar(self): space = self.space - c = self.fmt[self.fmtpos - 1] - w_s = space.newunicode(c) if do_unicode else space.newbytes(c) + if do_unicode: + cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1) + flag = rutf8.get_flag_from_code(cp) + w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag) + else: + cp = ord(self.fmt[self.fmtpos - 1]) + w_s = space.newbytes(chr(cp)) raise oefmt(space.w_ValueError, "unsupported format character %R (%s) at index %d", - w_s, hex(ord(c)), self.fmtpos - 1) + w_s, hex(cp), self.fmtpos - 1) - @specialize.argtype(1) - def std_wp(self, r): + @specialize.arg(2) + def std_wp(self, r, is_string=False): length = len(r) - if do_unicode and isinstance(r, str): + if do_unicode and is_string: # convert string to unicode using the default encoding - r = self.space.unicode_w(self.space.newbytes(r)) + r = self.space.utf8_w(self.space.newbytes(r)) prec = self.prec if prec == -1 and self.width == 0: # fast path - self.result.append(const(r)) + self.result.append(r) return if prec >= 0 and prec < length: length = prec # ignore the end of the string if too long @@ -358,12 +358,12 @@ padding = 0 assert padding >= 0 if not self.f_ljust and padding > 0: - result.append_multiple_char(const(' '), padding) + result.append_multiple_char(' ', padding) # add any padding at the left of 'r' padding = 0 result.append_slice(r, 0, length) # add 'r' itself if padding > 0: - result.append_multiple_char(const(' '), padding) + result.append_multiple_char(' ', padding) # add any remaining padding at the right def std_wp_number(self, r, prefix=''): @@ -375,10 +375,10 @@ # result.append(), and no startswith() if not f_sign and # not f_blank). if self.f_sign and not r.startswith('-'): - result.append(const('+')) + result.append('+') elif self.f_blank and not r.startswith('-'): - result.append(const(' ')) - result.append(const(r)) + result.append(' ') + result.append(r) return # add a '+' or ' ' sign if necessary sign = r.startswith('-') @@ -405,18 +405,18 @@ assert padding >= 0 if padnumber == '>': - result.append_multiple_char(const(' '), padding) + result.append_multiple_char(' ', padding) # pad with spaces on the left if sign: - result.append(const(r[0])) # the sign - result.append(const(prefix)) # the prefix + result.append(r[0]) # the sign + result.append(prefix) # the prefix if padnumber == '0': - result.append_multiple_char(const('0'), padding) + result.append_multiple_char('0', padding) # pad with zeroes - result.append_slice(const(r), int(sign), len(r)) + result.append_slice(r, int(sign), len(r)) # the rest of the number if padnumber == '<': # spaces on the right - result.append_multiple_char(const(' '), padding) + result.append_multiple_char(' ', padding) def string_formatting(self, w_value): space = self.space @@ -425,8 +425,7 @@ raise oefmt(space.w_TypeError, "operand does not support unary str") w_result = space.get_and_call_function(w_impl, w_value) - if space.isinstance_w(w_result, - space.w_unicode): + if space.isinstance_w(w_result, space.w_unicode): raise NeedUnicodeFormattingError return space.bytes_w(w_result) @@ -443,11 +442,11 @@ else: from pypy.objspace.std.unicodeobject import unicode_from_object w_value = unicode_from_object(space, w_value) - s = space.unicode_w(w_value) - self.std_wp(s) + s = space.utf8_w(w_value) + self.std_wp(s, False) def fmt_r(self, w_value): - self.std_wp(self.space.text_w(self.space.repr(w_value))) + self.std_wp(self.space.text_w(self.space.repr(w_value)), True) def fmt_c(self, w_value): self.prec = -1 # just because @@ -456,30 +455,30 @@ s = space.bytes_w(w_value) if len(s) != 1: raise oefmt(space.w_TypeError, "%c requires int or char") - self.std_wp(s) + self.std_wp(s, True) elif space.isinstance_w(w_value, space.w_unicode): if not do_unicode: raise NeedUnicodeFormattingError - ustr = space.unicode_w(w_value) + ustr = space.utf8_w(w_value) if len(ustr) != 1: raise oefmt(space.w_TypeError, "%c requires int or unichar") - self.std_wp(ustr) + self.std_wp(ustr, False) else: n = space.int_w(w_value) if do_unicode: try: - c = unichr(n) + c = rutf8.unichr_as_utf8(n) except ValueError: raise oefmt(space.w_OverflowError, "unicode character code out of range") - self.std_wp(c) + self.std_wp(c, False) else: try: s = chr(n) except ValueError: raise oefmt(space.w_OverflowError, "character code not in range(256)") - self.std_wp(s) + self.std_wp(s, True) return StringFormatter @@ -510,11 +509,12 @@ pass else: return space.newbytes(result) - # XXX for now, this is performance critical - fmt = space.utf8_w(w_fmt).decode("utf8") + fmt = space.utf8_w(w_fmt) formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict) result = formatter.format() - return space.newunicode(result) + # this can force strings, not sure if it's a problem or not + lgt, flag = rutf8.check_utf8(result, True) + return space.newutf8(result, lgt, flag) def mod_format(space, w_format, w_values, do_unicode=False): if space.isinstance_w(w_values, space.w_tuple): diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py --- a/pypy/objspace/std/newformat.py +++ b/pypy/objspace/std/newformat.py @@ -4,11 +4,12 @@ import string from pypy.interpreter.error import OperationError, oefmt -from rpython.rlib import rstring, runicode, rlocale, rfloat, jit +from rpython.rlib import rstring, runicode, rlocale, rfloat, jit, rutf8 from rpython.rlib.objectmodel import specialize from rpython.rlib.rfloat import copysign, formatd from rpython.rlib.rarithmetic import r_uint, intmask from pypy.interpreter.signature import Signature +from pypy.interpreter import unicodehelper @specialize.argtype(1) @@ -50,7 +51,8 @@ if for_unicode: def wrap(self, u): - return self.space.newunicode(u) + lgt, flag = rutf8.check_utf8(u, True) + return self.space.newutf8(u, lgt, flag) else: def wrap(self, s): return self.space.newbytes(s) @@ -59,7 +61,6 @@ def __init__(self, space, template): self.space = space - self.empty = u"" if self.is_unicode else "" self.template = template def build(self, args): @@ -80,10 +81,7 @@ def _build_string(self, start, end, level): space = self.space - if self.is_unicode: - out = rstring.UnicodeBuilder() - else: - out = rstring.StringBuilder() + out = rstring.StringBuilder() if not level: raise oefmt(space.w_ValueError, "Recursion depth exceeded") level -= 1 @@ -344,7 +342,7 @@ w_conversion]) self.parser_list_w.append(w_entry) self.last_end = end + 1 - return self.empty + return "" # w_obj = self._get_argument(name) if conversion is not None: @@ -352,7 +350,7 @@ if recursive: spec = self._build_string(spec_start, end, level) w_rendered = self.space.format(w_obj, self.wrap(spec)) - unwrapper = "unicode_w" if self.is_unicode else "bytes_w" + unwrapper = "utf8_w" if self.is_unicode else "bytes_w" to_interp = getattr(self.space, unwrapper) return to_interp(w_rendered) @@ -379,8 +377,10 @@ def format_method(space, w_string, args, is_unicode): if is_unicode: template = unicode_template_formatter(space, - space.unicode_w(w_string)) - return space.newunicode(template.build(args)) + space.utf8_w(w_string)) + r = template.build(args) + lgt, flag = rutf8.check_utf8(r, True) + return space.newutf8(r, lgt, flag) else: template = str_template_formatter(space, space.bytes_w(w_string)) return space.newbytes(template.build(args)) @@ -416,7 +416,8 @@ if for_unicode: def wrap(self, u): - return self.space.newunicode(u) + lgt, flag = rutf8.check_utf8(u, True) + return self.space.newutf8(u, lgt, flag) else: def wrap(self, s): return self.space.newbytes(s) @@ -426,7 +427,6 @@ def __init__(self, space, spec): self.space = space - self.empty = u"" if self.is_unicode else "" self.spec = spec def _is_alignment(self, c): @@ -492,8 +492,9 @@ presentation_type = spec[i] if self.is_unicode: try: - the_type = spec[i].encode("ascii")[0] - except UnicodeEncodeError: + rutf8.check_utf8(spec[i], True) + the_type = spec[i][0] + except rutf8.CheckError: raise oefmt(space.w_ValueError, "invalid presentation type") else: @@ -538,8 +539,9 @@ return total def _lit(self, s): + assert len(s) == 1 if self.is_unicode: - return s.decode("latin-1") + return rutf8.unichr_as_utf8(ord(s[0])) else: return s @@ -551,10 +553,7 @@ return builder.build() def _builder(self): - if self.is_unicode: - return rstring.UnicodeBuilder() - else: - return rstring.StringBuilder() + return rstring.StringBuilder() def _unknown_presentation(self, tp): raise oefmt(self.space.w_ValueError, @@ -598,8 +597,8 @@ thousands = "" grouping = "\xFF" # special value to mean 'stop' if self.is_unicode: - self._loc_dec = dec.decode("latin-1") - self._loc_thousands = thousands.decode("latin-1") + self._loc_dec = rutf8.decode_latin_1(dec) + self._loc_thousands = rutf8.decode_latin_1(thousands) else: self._loc_dec = dec self._loc_thousands = thousands @@ -718,7 +717,7 @@ ts = self._loc_thousands if need_separator else None self._fill_digits(buf, digits, left, n_chars, n_zeros, ts) buf.reverse() - self._grouped_digits = self.empty.join(buf) + self._grouped_digits = "".join(buf) def _upcase_string(self, s): buf = [] @@ -727,7 +726,7 @@ if ord("a") <= index <= ord("z"): c = chr(index - 32) buf.append(c) - return self.empty.join(buf) + return "".join(buf) def _fill_number(self, spec, num, to_digits, to_prefix, fill_char, @@ -736,10 +735,7 @@ if spec.n_lpadding: out.append_multiple_char(fill_char[0], spec.n_lpadding) if spec.n_sign: - if self.is_unicode: - sign = spec.sign.decode("latin-1") - else: - sign = spec.sign + sign = self._lit(spec.sign) out.append(sign) if spec.n_prefix: pref = num[to_prefix:to_prefix + spec.n_prefix] @@ -783,13 +779,13 @@ raise oefmt(space.w_ValueError, "sign not allowed with 'c' presentation type") value = space.int_w(w_num) - max_char = runicode.MAXUNICODE if self.is_unicode else 0xFF + max_char = 0x10FFFF if self.is_unicode else 0xFF if not (0 <= value <= max_char): raise oefmt(space.w_OverflowError, "%%c arg not in range(%s)", hex(max_char)) if self.is_unicode: - result = runicode.UNICHR(value) + result = rutf8.unichr_as_utf8(value) else: result = chr(value) n_digits = 1 @@ -845,6 +841,7 @@ prefix = "0x" as_str = value.format(LONG_DIGITS[:base], prefix) if self.is_unicode: + XXX return as_str.decode("latin-1") return as_str @@ -852,7 +849,7 @@ if base == 10: s = str(value) if self.is_unicode: - return s.decode("latin-1") + return rutf8.decode_latin_1(s) return s # This part is slow. negative = value < 0 @@ -893,7 +890,7 @@ i -= 1 buf[i] = "-" assert i >= 0 - return self.empty.join(buf[i:]) + return "".join(buf[i:]) def format_int_or_long(self, w_num, kind): space = self.space @@ -975,7 +972,7 @@ have_dec_point, to_remainder = self._parse_number(result, to_number) n_remainder = len(result) - to_remainder if self.is_unicode: - digits = result.decode("latin-1") + digits = rutf8.decode_latin_1(result) else: digits = result spec = self._calc_num_width(0, sign, to_number, n_digits, @@ -1081,8 +1078,8 @@ to_imag_number) if self.is_unicode: - re_num = re_num.decode("latin-1") - im_num = im_num.decode("latin-1") + re_num = rutf8.decode_latin_1(re_num) + im_num = rutf8.decode_latin_1(im_num) #set remainder, in CPython _parse_number sets this #using n_re_digits causes tests to fail @@ -1111,7 +1108,7 @@ self._fill_char = tmp_fill_char #compute L and R padding - stored in self._left_pad and self._right_pad - self._calc_padding(self.empty, re_spec.n_total + im_spec.n_total + 1 + + self._calc_padding("", re_spec.n_total + im_spec.n_total + 1 + add_parens * 2) out = self._builder() @@ -1172,7 +1169,7 @@ @specialize.arg(2) def run_formatter(space, w_format_spec, meth, *args): if space.isinstance_w(w_format_spec, space.w_unicode): - formatter = unicode_formatter(space, space.unicode_w(w_format_spec)) + formatter = unicode_formatter(space, space.utf8_w(w_format_spec)) return getattr(formatter, meth)(*args) else: formatter = str_formatter(space, space.bytes_w(w_format_spec)) diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py --- a/pypy/objspace/std/test/test_liststrategies.py +++ b/pypy/objspace/std/test/test_liststrategies.py @@ -600,9 +600,9 @@ def test_unicode(self): l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")]) assert isinstance(l1.strategy, BytesListStrategy) - l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")]) + l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, 2), self.space.newutf8("zwei", 4, 2)]) assert isinstance(l2.strategy, UnicodeListStrategy) - l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")]) + l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, 2)]) assert isinstance(l3.strategy, ObjectListStrategy) def test_listview_bytes(self): @@ -626,7 +626,7 @@ # the same for unicode w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')]) w_l.getitems = None - assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb" + assert space.utf8_w(space.call_method(space.wrap(u"c"), "join", w_l)) == "acb" def test_string_join_returns_same_instance(self): space = self.space diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -331,12 +331,11 @@ def descr__format__(self, space, w_format_spec): if not space.isinstance_w(w_format_spec, space.w_unicode): w_format_spec = space.call_function(space.w_unicode, w_format_spec) - spec = space.unicode_w(w_format_spec) + spec = space.utf8_w(w_format_spec) formatter = newformat.unicode_formatter(space, spec) self2 = unicode_from_object(space, self) assert isinstance(self2, W_UnicodeObject) - # XXX - return formatter.format_string(self2._utf8.decode("utf8")) + return formatter.format_string(self2._utf8) def descr_mod(self, space, w_values): return mod_format(space, self, w_values, do_unicode=True) @@ -526,12 +525,12 @@ def descr_formatter_parser(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_parser() def descr_formatter_field_name_split(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_field_name_split() def descr_lower(self, space): @@ -1188,8 +1187,7 @@ rutf8.check_ascii(s) except rutf8.CheckError as a: eh = unicodehelper.encode_error_handler(space) - u_len = w_object._len() - eh(None, "ascii", "ordinal not in range(128)", s, u_len, + eh(None, "ascii", "ordinal not in range(128)", s, a.pos, a.pos + 1) assert False, "always raises" return space.newbytes(s) @@ -1260,7 +1258,7 @@ # test_unicode_conversion_with__str__ if w_unicode_method is None: if space.isinstance_w(w_obj, space.w_unicode): - return space.newunicode(space.unicode_w(w_obj)) + return unicodehelper.convert_arg_to_w_unicode(space, w_obj) w_unicode_method = space.lookup(w_obj, "__str__") if w_unicode_method is not None: w_res = space.get_and_call_function(w_unicode_method, w_obj) diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -123,6 +123,13 @@ continuation_bytes += 1 return len(s) - continuation_bytes +def get_flag_from_code(oc): + if oc <= 0x7F: + return FLAG_ASCII + if 0xD800 <= oc <= 0xDFFF: + return FLAG_HAS_SURROGATES + return FLAG_REGULAR + def codepoint_at_pos(code, pos): """ Give a codepoint in code at pos - assumes valid utf8, no checking! """ @@ -651,3 +658,30 @@ return unicode_escape #, char_escape_helper +def decode_latin_1(s): + if len(s) == 0: + return s + if len(s) == 1 and ord(s[0]) <= 0x7F: + return s + try: + check_ascii(s) + return s + except CheckError: + return _decode_latin_1_slowpath(s) + +def _decode_latin_1_slowpath(s): + res = StringBuilder(len(s)) + i = 0 + while i < len(s): + if ord(s[i]) > 0x7F: + while i < len(s) and ord(s[i]) > 0x7F: + unichr_as_utf8_append(res, ord(s[i])) + i += 1 + else: + start = i + end = i + 1 + while end < len(s) and ord(s[end]) <= 0x7F: + end += 1 + res.append_slice(s, start, end) + i = end + return res.build() _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit