[pypy-commit] pypy unicode-utf8: work on formatting

fijal Mon, 20 Nov 2017 14:06:28 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93106:b2f3bd9151c0
Date: 2017-11-20 23:05 +0100
http://bitbucket.org/pypy/pypy/changeset/b2f3bd9151c0/


Log:    work on formatting

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -25,7 +25,7 @@
     # Fast version of the "strict" errors handler.
     def raise_unicode_exception_encode(errors, encoding, msg, utf8,
                                        startingpos, endingpos):
-        u_len, flag = rutf8.check_utf8(utf8)
+        u_len, flag = rutf8.check_utf8(utf8, True)
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
                                              space.newutf8(utf8, u_len, flag),
@@ -60,13 +60,6 @@
             return True
     return False
 
-def get_flag_from_code(oc):
-    if oc <= 0x7F:
-        return rutf8.FLAG_ASCII
-    if 0xD800 <= oc <= 0xDFFF:
-        return rutf8.FLAG_HAS_SURROGATES
-    return rutf8.FLAG_REGULAR
-
 # These functions take and return unwrapped rpython strings
 def decode_unicode_escape(space, string):
     state = space.fromcache(interp_codecs.CodecState)
@@ -138,6 +131,24 @@
     except rutf8.CheckError:
         return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
 
+def _str_decode_latin_1_slowpath(s, errors, final, errorhandler):
+    res = StringBuilder(len(s))
+    i = 0
+    while i < len(s):
+        if ord(s[i]) > 0x7F:
+            while i < len(s) and ord(s[i]) > 0x7F:
+                rutf8.unichr_as_utf8_append(res, ord(s[i]))
+                i += 1
+        else:
+            start = i
+            end = i + 1
+            while end < len(s) and ord(s[end]) <= 0x7F:
+                end += 1
+            res.append_slice(s, start, end)
+            i = end
+    # cannot be ASCII, cannot have surrogates, I believe
+    return res.build(), len(s), len(s), rutf8.FLAG_REGULAR
+
 def utf8_encode_latin_1(s, errors, errorhandler):
     try:
         rutf8.check_ascii(s)
@@ -159,7 +170,6 @@
                 res.append(chr(oc))
                 i += 1
             else:
-                XXX
                 r, pos = errorhandler(errors, 'latin1',
                                       'ordinal not in range(256)', s, cur,
                                       cur + 1)
@@ -358,7 +368,7 @@
                 builder.append(res)
             else:
                 rutf8.unichr_as_utf8_append(builder, chr, True)
-                flag = get_flag_from_code(chr)
+                flag = rutf8.get_flag_from_code(chr)
                 pos += digits
                 size = 1
 
@@ -503,7 +513,7 @@
                         continue
                     pos = look + 1
                     outsize += 1
-                    flag = combine_flags(flag, get_flag_from_code(code))
+                    flag = combine_flags(flag, rutf8.get_flag_from_code(code))
                     rutf8.unichr_as_utf8_append(builder, code)
                 else:
                     res, pos = errorhandler(errors, "unicodeescape",
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -189,14 +189,17 @@
         return new_bytearray(space, w_bytearraytype, [])
 
     def descr_reduce(self, space):
+        from pypy.interpreter.unicodehelper import str_decode_latin_1
+
         assert isinstance(self, W_BytearrayObject)
         w_dict = self.getdict(space)
         if w_dict is None:
             w_dict = space.w_None
+        s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict',
+            True, None)
         return space.newtuple([
             space.type(self), space.newtuple([
-                space.newunicode(''.join(self.getdata()).decode('latin-1')),
-                space.newtext('latin-1')]),
+                space.newutf8(s, lgt, flag), space.newtext('latin-1')]),
             w_dict])
 
     @staticmethod
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -1,11 +1,11 @@
 """String formatting routines"""
 import sys
 
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rarithmetic import INT_MAX
 from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.tool.sourcetools import func_with_new_name
 
@@ -153,18 +153,15 @@
     # to build two subclasses of the BaseStringFormatter class,
     # each one getting its own subtle differences and RPython types.
 
-    if do_unicode:
-        const = unicode
-    else:
-        const = str
-
     class StringFormatter(BaseStringFormatter):
         def __init__(self, space, fmt, values_w, w_valuedict):
             BaseStringFormatter.__init__(self, space, values_w, w_valuedict)
-            self.fmt = fmt    # either a string or a unicode
+            self.fmt = fmt    # always a string, if unicode, utf8 encoded
 
         def peekchr(self):
-            # return the 'current' character
+            # Return the 'current' character. Note that this returns utf8
+            # encoded part, but this is ok since we only need one-character
+            # comparisons
             try:
                 return self.fmt[self.fmtpos]
             except IndexError:
@@ -201,7 +198,8 @@
             if self.w_valuedict is None:
                 raise oefmt(space.w_TypeError, "format requires a mapping")
             if do_unicode:
-                w_key = space.newunicode(key)
+                lgt, flag = rutf8.check_utf8(key, True)
+                w_key = space.newutf8(key, lgt, flag)
             else:
                 w_key = space.newbytes(key)
             return space.getitem(self.w_valuedict, w_key)
@@ -287,10 +285,7 @@
         @jit.look_inside_iff(lambda self: jit.isconstant(self.fmt))
         def format(self):
             lgt = len(self.fmt) + 4 * len(self.values_w) + 10
-            if do_unicode:
-                result = UnicodeBuilder(lgt)
-            else:
-                result = StringBuilder(lgt)
+            result = StringBuilder(lgt)
             self.result = result
             while True:
                 # fast path: consume as many characters as possible
@@ -311,7 +306,7 @@
                 c = self.peekchr()
                 self.forward()
                 if c == '%':
-                    self.std_wp(const('%'))
+                    self.std_wp('%', False)
                     continue
                 if w_value is None:
                     w_value = self.nextinputvalue()
@@ -333,22 +328,27 @@
 
         def unknown_fmtchar(self):
             space = self.space
-            c = self.fmt[self.fmtpos - 1]
-            w_s = space.newunicode(c) if do_unicode else space.newbytes(c)
+            if do_unicode:
+                cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
+                flag = rutf8.get_flag_from_code(cp)
+                w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag)
+            else:
+                cp = ord(self.fmt[self.fmtpos - 1])
+                w_s = space.newbytes(chr(cp))
             raise oefmt(space.w_ValueError,
                         "unsupported format character %R (%s) at index %d",
-                        w_s, hex(ord(c)), self.fmtpos - 1)
+                        w_s, hex(cp), self.fmtpos - 1)
 
-        @specialize.argtype(1)
-        def std_wp(self, r):
+        @specialize.arg(2)
+        def std_wp(self, r, is_string=False):
             length = len(r)
-            if do_unicode and isinstance(r, str):
+            if do_unicode and is_string:
                 # convert string to unicode using the default encoding
-                r = self.space.unicode_w(self.space.newbytes(r))
+                r = self.space.utf8_w(self.space.newbytes(r))
             prec = self.prec
             if prec == -1 and self.width == 0:
                 # fast path
-                self.result.append(const(r))
+                self.result.append(r)
                 return
             if prec >= 0 and prec < length:
                 length = prec   # ignore the end of the string if too long
@@ -358,12 +358,12 @@
                 padding = 0
             assert padding >= 0
             if not self.f_ljust and padding > 0:
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
                 # add any padding at the left of 'r'
                 padding = 0
             result.append_slice(r, 0, length)       # add 'r' itself
             if padding > 0:
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
             # add any remaining padding at the right
 
         def std_wp_number(self, r, prefix=''):
@@ -375,10 +375,10 @@
                 # result.append(), and no startswith() if not f_sign and
                 # not f_blank).
                 if self.f_sign and not r.startswith('-'):
-                    result.append(const('+'))
+                    result.append('+')
                 elif self.f_blank and not r.startswith('-'):
-                    result.append(const(' '))
-                result.append(const(r))
+                    result.append(' ')
+                result.append(r)
                 return
             # add a '+' or ' ' sign if necessary
             sign = r.startswith('-')
@@ -405,18 +405,18 @@
 
             assert padding >= 0
             if padnumber == '>':
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
                 # pad with spaces on the left
             if sign:
-                result.append(const(r[0]))        # the sign
-            result.append(const(prefix))               # the prefix
+                result.append(r[0])        # the sign
+            result.append(prefix)               # the prefix
             if padnumber == '0':
-                result.append_multiple_char(const('0'), padding)
+                result.append_multiple_char('0', padding)
                 # pad with zeroes
-            result.append_slice(const(r), int(sign), len(r))
+            result.append_slice(r, int(sign), len(r))
             # the rest of the number
             if padnumber == '<':           # spaces on the right
-                result.append_multiple_char(const(' '), padding)
+                result.append_multiple_char(' ', padding)
 
         def string_formatting(self, w_value):
             space = self.space
@@ -425,8 +425,7 @@
                 raise oefmt(space.w_TypeError,
                             "operand does not support unary str")
             w_result = space.get_and_call_function(w_impl, w_value)
-            if space.isinstance_w(w_result,
-                                              space.w_unicode):
+            if space.isinstance_w(w_result, space.w_unicode):
                 raise NeedUnicodeFormattingError
             return space.bytes_w(w_result)
 
@@ -443,11 +442,11 @@
                 else:
                     from pypy.objspace.std.unicodeobject import unicode_from_object
                     w_value = unicode_from_object(space, w_value)
-                s = space.unicode_w(w_value)
-            self.std_wp(s)
+                s = space.utf8_w(w_value)
+            self.std_wp(s, False)
 
         def fmt_r(self, w_value):
-            self.std_wp(self.space.text_w(self.space.repr(w_value)))
+            self.std_wp(self.space.text_w(self.space.repr(w_value)), True)
 
         def fmt_c(self, w_value):
             self.prec = -1     # just because
@@ -456,30 +455,30 @@
                 s = space.bytes_w(w_value)
                 if len(s) != 1:
                     raise oefmt(space.w_TypeError, "%c requires int or char")
-                self.std_wp(s)
+                self.std_wp(s, True)
             elif space.isinstance_w(w_value, space.w_unicode):
                 if not do_unicode:
                     raise NeedUnicodeFormattingError
-                ustr = space.unicode_w(w_value)
+                ustr = space.utf8_w(w_value)
                 if len(ustr) != 1:
                     raise oefmt(space.w_TypeError, "%c requires int or unichar")
-                self.std_wp(ustr)
+                self.std_wp(ustr, False)
             else:
                 n = space.int_w(w_value)
                 if do_unicode:
                     try:
-                        c = unichr(n)
+                        c = rutf8.unichr_as_utf8(n)
                     except ValueError:
                         raise oefmt(space.w_OverflowError,
                                     "unicode character code out of range")
-                    self.std_wp(c)
+                    self.std_wp(c, False)
                 else:
                     try:
                         s = chr(n)
                     except ValueError:
                         raise oefmt(space.w_OverflowError,
                                     "character code not in range(256)")
-                    self.std_wp(s)
+                    self.std_wp(s, True)
 
     return StringFormatter
 
@@ -510,11 +509,12 @@
             pass
         else:
             return space.newbytes(result)
-    # XXX for now, this is performance critical
-    fmt = space.utf8_w(w_fmt).decode("utf8")
+    fmt = space.utf8_w(w_fmt)
     formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
     result = formatter.format()
-    return space.newunicode(result)
+    # this can force strings, not sure if it's a problem or not
+    lgt, flag = rutf8.check_utf8(result, True)
+    return space.newutf8(result, lgt, flag)
 
 def mod_format(space, w_format, w_values, do_unicode=False):
     if space.isinstance_w(w_values, space.w_tuple):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,11 +4,12 @@
 import string
 
 from pypy.interpreter.error import OperationError, oefmt
-from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
+from rpython.rlib import rstring, runicode, rlocale, rfloat, jit, rutf8
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rfloat import copysign, formatd
 from rpython.rlib.rarithmetic import r_uint, intmask
 from pypy.interpreter.signature import Signature
+from pypy.interpreter import unicodehelper
 
 
 @specialize.argtype(1)
@@ -50,7 +51,8 @@
 
         if for_unicode:
             def wrap(self, u):
-                return self.space.newunicode(u)
+                lgt, flag = rutf8.check_utf8(u, True)
+                return self.space.newutf8(u, lgt, flag)
         else:
             def wrap(self, s):
                 return self.space.newbytes(s)
@@ -59,7 +61,6 @@
 
         def __init__(self, space, template):
             self.space = space
-            self.empty = u"" if self.is_unicode else ""
             self.template = template
 
         def build(self, args):
@@ -80,10 +81,7 @@
 
         def _build_string(self, start, end, level):
             space = self.space
-            if self.is_unicode:
-                out = rstring.UnicodeBuilder()
-            else:
-                out = rstring.StringBuilder()
+            out = rstring.StringBuilder()
             if not level:
                 raise oefmt(space.w_ValueError, "Recursion depth exceeded")
             level -= 1
@@ -344,7 +342,7 @@
                         w_conversion])
                     self.parser_list_w.append(w_entry)
                     self.last_end = end + 1
-                return self.empty
+                return ""
             #
             w_obj = self._get_argument(name)
             if conversion is not None:
@@ -352,7 +350,7 @@
             if recursive:
                 spec = self._build_string(spec_start, end, level)
             w_rendered = self.space.format(w_obj, self.wrap(spec))
-            unwrapper = "unicode_w" if self.is_unicode else "bytes_w"
+            unwrapper = "utf8_w" if self.is_unicode else "bytes_w"
             to_interp = getattr(self.space, unwrapper)
             return to_interp(w_rendered)
 
@@ -379,8 +377,10 @@
 def format_method(space, w_string, args, is_unicode):
     if is_unicode:
         template = unicode_template_formatter(space,
-                                              space.unicode_w(w_string))
-        return space.newunicode(template.build(args))
+                                              space.utf8_w(w_string))
+        r = template.build(args)
+        lgt, flag = rutf8.check_utf8(r, True)
+        return space.newutf8(r, lgt, flag)
     else:
         template = str_template_formatter(space, space.bytes_w(w_string))
         return space.newbytes(template.build(args))
@@ -416,7 +416,8 @@
 
         if for_unicode:
             def wrap(self, u):
-                return self.space.newunicode(u)
+                lgt, flag = rutf8.check_utf8(u, True)
+                return self.space.newutf8(u, lgt, flag)
         else:
             def wrap(self, s):
                 return self.space.newbytes(s)
@@ -426,7 +427,6 @@
 
         def __init__(self, space, spec):
             self.space = space
-            self.empty = u"" if self.is_unicode else ""
             self.spec = spec
 
         def _is_alignment(self, c):
@@ -492,8 +492,9 @@
                 presentation_type = spec[i]
                 if self.is_unicode:
                     try:
-                        the_type = spec[i].encode("ascii")[0]
-                    except UnicodeEncodeError:
+                        rutf8.check_utf8(spec[i], True)
+                        the_type = spec[i][0]
+                    except rutf8.CheckError:
                         raise oefmt(space.w_ValueError,
                                     "invalid presentation type")
                 else:
@@ -538,8 +539,9 @@
             return total
 
         def _lit(self, s):
+            assert len(s) == 1
             if self.is_unicode:
-                return s.decode("latin-1")
+                return rutf8.unichr_as_utf8(ord(s[0]))
             else:
                 return s
 
@@ -551,10 +553,7 @@
             return builder.build()
 
         def _builder(self):
-            if self.is_unicode:
-                return rstring.UnicodeBuilder()
-            else:
-                return rstring.StringBuilder()
+            return rstring.StringBuilder()
 
         def _unknown_presentation(self, tp):
             raise oefmt(self.space.w_ValueError,
@@ -598,8 +597,8 @@
                 thousands = ""
                 grouping = "\xFF"    # special value to mean 'stop'
             if self.is_unicode:
-                self._loc_dec = dec.decode("latin-1")
-                self._loc_thousands = thousands.decode("latin-1")
+                self._loc_dec = rutf8.decode_latin_1(dec)
+                self._loc_thousands = rutf8.decode_latin_1(thousands)
             else:
                 self._loc_dec = dec
                 self._loc_thousands = thousands
@@ -718,7 +717,7 @@
                 ts = self._loc_thousands if need_separator else None
                 self._fill_digits(buf, digits, left, n_chars, n_zeros, ts)
             buf.reverse()
-            self._grouped_digits = self.empty.join(buf)
+            self._grouped_digits = "".join(buf)
 
         def _upcase_string(self, s):
             buf = []
@@ -727,7 +726,7 @@
                 if ord("a") <= index <= ord("z"):
                     c = chr(index - 32)
                 buf.append(c)
-            return self.empty.join(buf)
+            return "".join(buf)
 
 
         def _fill_number(self, spec, num, to_digits, to_prefix, fill_char,
@@ -736,10 +735,7 @@
             if spec.n_lpadding:
                 out.append_multiple_char(fill_char[0], spec.n_lpadding)
             if spec.n_sign:
-                if self.is_unicode:
-                    sign = spec.sign.decode("latin-1")
-                else:
-                    sign = spec.sign
+                sign = self._lit(spec.sign)
                 out.append(sign)
             if spec.n_prefix:
                 pref = num[to_prefix:to_prefix + spec.n_prefix]
@@ -783,13 +779,13 @@
                     raise oefmt(space.w_ValueError,
                                 "sign not allowed with 'c' presentation type")
                 value = space.int_w(w_num)
-                max_char = runicode.MAXUNICODE if self.is_unicode else 0xFF
+                max_char = 0x10FFFF if self.is_unicode else 0xFF
                 if not (0 <= value <= max_char):
                     raise oefmt(space.w_OverflowError,
                                 "%%c arg not in range(%s)",
                                 hex(max_char))
                 if self.is_unicode:
-                    result = runicode.UNICHR(value)
+                    result = rutf8.unichr_as_utf8(value)
                 else:
                     result = chr(value)
                 n_digits = 1
@@ -845,6 +841,7 @@
                 prefix = "0x"
             as_str = value.format(LONG_DIGITS[:base], prefix)
             if self.is_unicode:
+                XXX
                 return as_str.decode("latin-1")
             return as_str
 
@@ -852,7 +849,7 @@
             if base == 10:
                 s = str(value)
                 if self.is_unicode:
-                    return s.decode("latin-1")
+                    return rutf8.decode_latin_1(s)
                 return s
             # This part is slow.
             negative = value < 0
@@ -893,7 +890,7 @@
                 i -= 1
                 buf[i] = "-"
             assert i >= 0
-            return self.empty.join(buf[i:])
+            return "".join(buf[i:])
 
         def format_int_or_long(self, w_num, kind):
             space = self.space
@@ -975,7 +972,7 @@
             have_dec_point, to_remainder = self._parse_number(result, to_number)
             n_remainder = len(result) - to_remainder
             if self.is_unicode:
-                digits = result.decode("latin-1")
+                digits = rutf8.decode_latin_1(result)
             else:
                 digits = result
             spec = self._calc_num_width(0, sign, to_number, n_digits,
@@ -1081,8 +1078,8 @@
                                                                to_imag_number)
 
             if self.is_unicode:
-                re_num = re_num.decode("latin-1")
-                im_num = im_num.decode("latin-1")
+                re_num = rutf8.decode_latin_1(re_num)
+                im_num = rutf8.decode_latin_1(im_num)
 
             #set remainder, in CPython _parse_number sets this
             #using n_re_digits causes tests to fail
@@ -1111,7 +1108,7 @@
             self._fill_char = tmp_fill_char
 
             #compute L and R padding - stored in self._left_pad and self._right_pad
-            self._calc_padding(self.empty, re_spec.n_total + im_spec.n_total + 1 +
+            self._calc_padding("", re_spec.n_total + im_spec.n_total + 1 +
                                            add_parens * 2)
 
             out = self._builder()
@@ -1172,7 +1169,7 @@
 @specialize.arg(2)
 def run_formatter(space, w_format_spec, meth, *args):
     if space.isinstance_w(w_format_spec, space.w_unicode):
-        formatter = unicode_formatter(space, space.unicode_w(w_format_spec))
+        formatter = unicode_formatter(space, space.utf8_w(w_format_spec))
         return getattr(formatter, meth)(*args)
     else:
         formatter = str_formatter(space, space.bytes_w(w_format_spec))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -600,9 +600,9 @@
     def test_unicode(self):
         l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
         assert isinstance(l1.strategy, BytesListStrategy)
-        l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")])
+        l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, 2), self.space.newutf8("zwei", 4, 2)])
         assert isinstance(l2.strategy, UnicodeListStrategy)
-        l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")])
+        l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, 2)])
         assert isinstance(l3.strategy, ObjectListStrategy)
 
     def test_listview_bytes(self):
@@ -626,7 +626,7 @@
         # the same for unicode
         w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
         w_l.getitems = None
-        assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
+        assert space.utf8_w(space.call_method(space.wrap(u"c"), "join", w_l)) == "acb"
 
     def test_string_join_returns_same_instance(self):
         space = self.space
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -331,12 +331,11 @@
     def descr__format__(self, space, w_format_spec):
         if not space.isinstance_w(w_format_spec, space.w_unicode):
             w_format_spec = space.call_function(space.w_unicode, w_format_spec)
-        spec = space.unicode_w(w_format_spec)
+        spec = space.utf8_w(w_format_spec)
         formatter = newformat.unicode_formatter(space, spec)
         self2 = unicode_from_object(space, self)
         assert isinstance(self2, W_UnicodeObject)
-        # XXX
-        return formatter.format_string(self2._utf8.decode("utf8"))
+        return formatter.format_string(self2._utf8)
 
     def descr_mod(self, space, w_values):
         return mod_format(space, self, w_values, do_unicode=True)
@@ -526,12 +525,12 @@
 
     def descr_formatter_parser(self, space):
         from pypy.objspace.std.newformat import unicode_template_formatter
-        tformat = unicode_template_formatter(space, space.unicode_w(self))
+        tformat = unicode_template_formatter(space, space.utf8_w(self))
         return tformat.formatter_parser()
 
     def descr_formatter_field_name_split(self, space):
         from pypy.objspace.std.newformat import unicode_template_formatter
-        tformat = unicode_template_formatter(space, space.unicode_w(self))
+        tformat = unicode_template_formatter(space, space.utf8_w(self))
         return tformat.formatter_field_name_split()
 
     def descr_lower(self, space):
@@ -1188,8 +1187,7 @@
                 rutf8.check_ascii(s)
             except rutf8.CheckError as a:
                 eh = unicodehelper.encode_error_handler(space)
-                u_len = w_object._len()
-                eh(None, "ascii", "ordinal not in range(128)", s, u_len,
+                eh(None, "ascii", "ordinal not in range(128)", s,
                     a.pos, a.pos + 1)
                 assert False, "always raises"
             return space.newbytes(s)
@@ -1260,7 +1258,7 @@
         # test_unicode_conversion_with__str__
         if w_unicode_method is None:
             if space.isinstance_w(w_obj, space.w_unicode):
-                return space.newunicode(space.unicode_w(w_obj))
+                return unicodehelper.convert_arg_to_w_unicode(space, w_obj)
             w_unicode_method = space.lookup(w_obj, "__str__")
         if w_unicode_method is not None:
             w_res = space.get_and_call_function(w_unicode_method, w_obj)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -123,6 +123,13 @@
             continuation_bytes += 1
     return len(s) - continuation_bytes
 
+def get_flag_from_code(oc):
+    if oc <= 0x7F:
+        return FLAG_ASCII
+    if 0xD800 <= oc <= 0xDFFF:
+        return FLAG_HAS_SURROGATES
+    return FLAG_REGULAR
+
 def codepoint_at_pos(code, pos):
     """ Give a codepoint in code at pos - assumes valid utf8, no checking!
     """
@@ -651,3 +658,30 @@
 
     return unicode_escape #, char_escape_helper
 
+def decode_latin_1(s):
+    if len(s) == 0:
+        return s
+    if len(s) == 1 and ord(s[0]) <= 0x7F:
+        return s
+    try:
+        check_ascii(s)
+        return s
+    except CheckError:
+        return _decode_latin_1_slowpath(s)
+
+def _decode_latin_1_slowpath(s):
+    res = StringBuilder(len(s))
+    i = 0
+    while i < len(s):
+        if ord(s[i]) > 0x7F:
+            while i < len(s) and ord(s[i]) > 0x7F:
+                unichr_as_utf8_append(res, ord(s[i]))
+                i += 1
+        else:
+            start = i
+            end = i + 1
+            while end < len(s) and ord(s[end]) <= 0x7F:
+                end += 1
+            res.append_slice(s, start, end)
+            i = end
+    return res.build()
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: work on formatting

Reply via email to