Author: fijal
Branch: unicode-utf8
Changeset: r93106:b2f3bd9151c0
Date: 2017-11-20 23:05 +0100
http://bitbucket.org/pypy/pypy/changeset/b2f3bd9151c0/
Log: work on formatting
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -25,7 +25,7 @@
# Fast version of the "strict" errors handler.
def raise_unicode_exception_encode(errors, encoding, msg, utf8,
startingpos, endingpos):
- u_len, flag = rutf8.check_utf8(utf8)
+ u_len, flag = rutf8.check_utf8(utf8, True)
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
space.newutf8(utf8, u_len, flag),
@@ -60,13 +60,6 @@
return True
return False
-def get_flag_from_code(oc):
- if oc <= 0x7F:
- return rutf8.FLAG_ASCII
- if 0xD800 <= oc <= 0xDFFF:
- return rutf8.FLAG_HAS_SURROGATES
- return rutf8.FLAG_REGULAR
-
# These functions take and return unwrapped rpython strings
def decode_unicode_escape(space, string):
state = space.fromcache(interp_codecs.CodecState)
@@ -138,6 +131,24 @@
except rutf8.CheckError:
return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
+def _str_decode_latin_1_slowpath(s, errors, final, errorhandler):
+ res = StringBuilder(len(s))
+ i = 0
+ while i < len(s):
+ if ord(s[i]) > 0x7F:
+ while i < len(s) and ord(s[i]) > 0x7F:
+ rutf8.unichr_as_utf8_append(res, ord(s[i]))
+ i += 1
+ else:
+ start = i
+ end = i + 1
+ while end < len(s) and ord(s[end]) <= 0x7F:
+ end += 1
+ res.append_slice(s, start, end)
+ i = end
+ # cannot be ASCII, cannot have surrogates, I believe
+ return res.build(), len(s), len(s), rutf8.FLAG_REGULAR
+
def utf8_encode_latin_1(s, errors, errorhandler):
try:
rutf8.check_ascii(s)
@@ -159,7 +170,6 @@
res.append(chr(oc))
i += 1
else:
- XXX
r, pos = errorhandler(errors, 'latin1',
'ordinal not in range(256)', s, cur,
cur + 1)
@@ -358,7 +368,7 @@
builder.append(res)
else:
rutf8.unichr_as_utf8_append(builder, chr, True)
- flag = get_flag_from_code(chr)
+ flag = rutf8.get_flag_from_code(chr)
pos += digits
size = 1
@@ -503,7 +513,7 @@
continue
pos = look + 1
outsize += 1
- flag = combine_flags(flag, get_flag_from_code(code))
+ flag = combine_flags(flag, rutf8.get_flag_from_code(code))
rutf8.unichr_as_utf8_append(builder, code)
else:
res, pos = errorhandler(errors, "unicodeescape",
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -189,14 +189,17 @@
return new_bytearray(space, w_bytearraytype, [])
def descr_reduce(self, space):
+ from pypy.interpreter.unicodehelper import str_decode_latin_1
+
assert isinstance(self, W_BytearrayObject)
w_dict = self.getdict(space)
if w_dict is None:
w_dict = space.w_None
+ s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict',
+ True, None)
return space.newtuple([
space.type(self), space.newtuple([
- space.newunicode(''.join(self.getdata()).decode('latin-1')),
- space.newtext('latin-1')]),
+ space.newutf8(s, lgt, flag), space.newtext('latin-1')]),
w_dict])
@staticmethod
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -1,11 +1,11 @@
"""String formatting routines"""
import sys
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rarithmetic import INT_MAX
from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
from rpython.rlib.unroll import unrolling_iterable
from rpython.tool.sourcetools import func_with_new_name
@@ -153,18 +153,15 @@
# to build two subclasses of the BaseStringFormatter class,
# each one getting its own subtle differences and RPython types.
- if do_unicode:
- const = unicode
- else:
- const = str
-
class StringFormatter(BaseStringFormatter):
def __init__(self, space, fmt, values_w, w_valuedict):
BaseStringFormatter.__init__(self, space, values_w, w_valuedict)
- self.fmt = fmt # either a string or a unicode
+ self.fmt = fmt # always a string, if unicode, utf8 encoded
def peekchr(self):
- # return the 'current' character
+ # Return the 'current' character. Note that this returns utf8
+ # encoded part, but this is ok since we only need one-character
+ # comparisons
try:
return self.fmt[self.fmtpos]
except IndexError:
@@ -201,7 +198,8 @@
if self.w_valuedict is None:
raise oefmt(space.w_TypeError, "format requires a mapping")
if do_unicode:
- w_key = space.newunicode(key)
+ lgt, flag = rutf8.check_utf8(key, True)
+ w_key = space.newutf8(key, lgt, flag)
else:
w_key = space.newbytes(key)
return space.getitem(self.w_valuedict, w_key)
@@ -287,10 +285,7 @@
@jit.look_inside_iff(lambda self: jit.isconstant(self.fmt))
def format(self):
lgt = len(self.fmt) + 4 * len(self.values_w) + 10
- if do_unicode:
- result = UnicodeBuilder(lgt)
- else:
- result = StringBuilder(lgt)
+ result = StringBuilder(lgt)
self.result = result
while True:
# fast path: consume as many characters as possible
@@ -311,7 +306,7 @@
c = self.peekchr()
self.forward()
if c == '%':
- self.std_wp(const('%'))
+ self.std_wp('%', False)
continue
if w_value is None:
w_value = self.nextinputvalue()
@@ -333,22 +328,27 @@
def unknown_fmtchar(self):
space = self.space
- c = self.fmt[self.fmtpos - 1]
- w_s = space.newunicode(c) if do_unicode else space.newbytes(c)
+ if do_unicode:
+ cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
+ flag = rutf8.get_flag_from_code(cp)
+ w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag)
+ else:
+ cp = ord(self.fmt[self.fmtpos - 1])
+ w_s = space.newbytes(chr(cp))
raise oefmt(space.w_ValueError,
"unsupported format character %R (%s) at index %d",
- w_s, hex(ord(c)), self.fmtpos - 1)
+ w_s, hex(cp), self.fmtpos - 1)
- @specialize.argtype(1)
- def std_wp(self, r):
+ @specialize.arg(2)
+ def std_wp(self, r, is_string=False):
length = len(r)
- if do_unicode and isinstance(r, str):
+ if do_unicode and is_string:
# convert string to unicode using the default encoding
- r = self.space.unicode_w(self.space.newbytes(r))
+ r = self.space.utf8_w(self.space.newbytes(r))
prec = self.prec
if prec == -1 and self.width == 0:
# fast path
- self.result.append(const(r))
+ self.result.append(r)
return
if prec >= 0 and prec < length:
length = prec # ignore the end of the string if too long
@@ -358,12 +358,12 @@
padding = 0
assert padding >= 0
if not self.f_ljust and padding > 0:
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# add any padding at the left of 'r'
padding = 0
result.append_slice(r, 0, length) # add 'r' itself
if padding > 0:
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# add any remaining padding at the right
def std_wp_number(self, r, prefix=''):
@@ -375,10 +375,10 @@
# result.append(), and no startswith() if not f_sign and
# not f_blank).
if self.f_sign and not r.startswith('-'):
- result.append(const('+'))
+ result.append('+')
elif self.f_blank and not r.startswith('-'):
- result.append(const(' '))
- result.append(const(r))
+ result.append(' ')
+ result.append(r)
return
# add a '+' or ' ' sign if necessary
sign = r.startswith('-')
@@ -405,18 +405,18 @@
assert padding >= 0
if padnumber == '>':
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
# pad with spaces on the left
if sign:
- result.append(const(r[0])) # the sign
- result.append(const(prefix)) # the prefix
+ result.append(r[0]) # the sign
+ result.append(prefix) # the prefix
if padnumber == '0':
- result.append_multiple_char(const('0'), padding)
+ result.append_multiple_char('0', padding)
# pad with zeroes
- result.append_slice(const(r), int(sign), len(r))
+ result.append_slice(r, int(sign), len(r))
# the rest of the number
if padnumber == '<': # spaces on the right
- result.append_multiple_char(const(' '), padding)
+ result.append_multiple_char(' ', padding)
def string_formatting(self, w_value):
space = self.space
@@ -425,8 +425,7 @@
raise oefmt(space.w_TypeError,
"operand does not support unary str")
w_result = space.get_and_call_function(w_impl, w_value)
- if space.isinstance_w(w_result,
- space.w_unicode):
+ if space.isinstance_w(w_result, space.w_unicode):
raise NeedUnicodeFormattingError
return space.bytes_w(w_result)
@@ -443,11 +442,11 @@
else:
from pypy.objspace.std.unicodeobject import unicode_from_object
w_value = unicode_from_object(space, w_value)
- s = space.unicode_w(w_value)
- self.std_wp(s)
+ s = space.utf8_w(w_value)
+ self.std_wp(s, False)
def fmt_r(self, w_value):
- self.std_wp(self.space.text_w(self.space.repr(w_value)))
+ self.std_wp(self.space.text_w(self.space.repr(w_value)), True)
def fmt_c(self, w_value):
self.prec = -1 # just because
@@ -456,30 +455,30 @@
s = space.bytes_w(w_value)
if len(s) != 1:
raise oefmt(space.w_TypeError, "%c requires int or char")
- self.std_wp(s)
+ self.std_wp(s, True)
elif space.isinstance_w(w_value, space.w_unicode):
if not do_unicode:
raise NeedUnicodeFormattingError
- ustr = space.unicode_w(w_value)
+ ustr = space.utf8_w(w_value)
if len(ustr) != 1:
raise oefmt(space.w_TypeError, "%c requires int or unichar")
- self.std_wp(ustr)
+ self.std_wp(ustr, False)
else:
n = space.int_w(w_value)
if do_unicode:
try:
- c = unichr(n)
+ c = rutf8.unichr_as_utf8(n)
except ValueError:
raise oefmt(space.w_OverflowError,
"unicode character code out of range")
- self.std_wp(c)
+ self.std_wp(c, False)
else:
try:
s = chr(n)
except ValueError:
raise oefmt(space.w_OverflowError,
"character code not in range(256)")
- self.std_wp(s)
+ self.std_wp(s, True)
return StringFormatter
@@ -510,11 +509,12 @@
pass
else:
return space.newbytes(result)
- # XXX for now, this is performance critical
- fmt = space.utf8_w(w_fmt).decode("utf8")
+ fmt = space.utf8_w(w_fmt)
formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
result = formatter.format()
- return space.newunicode(result)
+ # this can force strings, not sure if it's a problem or not
+ lgt, flag = rutf8.check_utf8(result, True)
+ return space.newutf8(result, lgt, flag)
def mod_format(space, w_format, w_values, do_unicode=False):
if space.isinstance_w(w_values, space.w_tuple):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -4,11 +4,12 @@
import string
from pypy.interpreter.error import OperationError, oefmt
-from rpython.rlib import rstring, runicode, rlocale, rfloat, jit
+from rpython.rlib import rstring, runicode, rlocale, rfloat, jit, rutf8
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rfloat import copysign, formatd
from rpython.rlib.rarithmetic import r_uint, intmask
from pypy.interpreter.signature import Signature
+from pypy.interpreter import unicodehelper
@specialize.argtype(1)
@@ -50,7 +51,8 @@
if for_unicode:
def wrap(self, u):
- return self.space.newunicode(u)
+ lgt, flag = rutf8.check_utf8(u, True)
+ return self.space.newutf8(u, lgt, flag)
else:
def wrap(self, s):
return self.space.newbytes(s)
@@ -59,7 +61,6 @@
def __init__(self, space, template):
self.space = space
- self.empty = u"" if self.is_unicode else ""
self.template = template
def build(self, args):
@@ -80,10 +81,7 @@
def _build_string(self, start, end, level):
space = self.space
- if self.is_unicode:
- out = rstring.UnicodeBuilder()
- else:
- out = rstring.StringBuilder()
+ out = rstring.StringBuilder()
if not level:
raise oefmt(space.w_ValueError, "Recursion depth exceeded")
level -= 1
@@ -344,7 +342,7 @@
w_conversion])
self.parser_list_w.append(w_entry)
self.last_end = end + 1
- return self.empty
+ return ""
#
w_obj = self._get_argument(name)
if conversion is not None:
@@ -352,7 +350,7 @@
if recursive:
spec = self._build_string(spec_start, end, level)
w_rendered = self.space.format(w_obj, self.wrap(spec))
- unwrapper = "unicode_w" if self.is_unicode else "bytes_w"
+ unwrapper = "utf8_w" if self.is_unicode else "bytes_w"
to_interp = getattr(self.space, unwrapper)
return to_interp(w_rendered)
@@ -379,8 +377,10 @@
def format_method(space, w_string, args, is_unicode):
if is_unicode:
template = unicode_template_formatter(space,
- space.unicode_w(w_string))
- return space.newunicode(template.build(args))
+ space.utf8_w(w_string))
+ r = template.build(args)
+ lgt, flag = rutf8.check_utf8(r, True)
+ return space.newutf8(r, lgt, flag)
else:
template = str_template_formatter(space, space.bytes_w(w_string))
return space.newbytes(template.build(args))
@@ -416,7 +416,8 @@
if for_unicode:
def wrap(self, u):
- return self.space.newunicode(u)
+ lgt, flag = rutf8.check_utf8(u, True)
+ return self.space.newutf8(u, lgt, flag)
else:
def wrap(self, s):
return self.space.newbytes(s)
@@ -426,7 +427,6 @@
def __init__(self, space, spec):
self.space = space
- self.empty = u"" if self.is_unicode else ""
self.spec = spec
def _is_alignment(self, c):
@@ -492,8 +492,9 @@
presentation_type = spec[i]
if self.is_unicode:
try:
- the_type = spec[i].encode("ascii")[0]
- except UnicodeEncodeError:
+ rutf8.check_utf8(spec[i], True)
+ the_type = spec[i][0]
+ except rutf8.CheckError:
raise oefmt(space.w_ValueError,
"invalid presentation type")
else:
@@ -538,8 +539,9 @@
return total
def _lit(self, s):
+ assert len(s) == 1
if self.is_unicode:
- return s.decode("latin-1")
+ return rutf8.unichr_as_utf8(ord(s[0]))
else:
return s
@@ -551,10 +553,7 @@
return builder.build()
def _builder(self):
- if self.is_unicode:
- return rstring.UnicodeBuilder()
- else:
- return rstring.StringBuilder()
+ return rstring.StringBuilder()
def _unknown_presentation(self, tp):
raise oefmt(self.space.w_ValueError,
@@ -598,8 +597,8 @@
thousands = ""
grouping = "\xFF" # special value to mean 'stop'
if self.is_unicode:
- self._loc_dec = dec.decode("latin-1")
- self._loc_thousands = thousands.decode("latin-1")
+ self._loc_dec = rutf8.decode_latin_1(dec)
+ self._loc_thousands = rutf8.decode_latin_1(thousands)
else:
self._loc_dec = dec
self._loc_thousands = thousands
@@ -718,7 +717,7 @@
ts = self._loc_thousands if need_separator else None
self._fill_digits(buf, digits, left, n_chars, n_zeros, ts)
buf.reverse()
- self._grouped_digits = self.empty.join(buf)
+ self._grouped_digits = "".join(buf)
def _upcase_string(self, s):
buf = []
@@ -727,7 +726,7 @@
if ord("a") <= index <= ord("z"):
c = chr(index - 32)
buf.append(c)
- return self.empty.join(buf)
+ return "".join(buf)
def _fill_number(self, spec, num, to_digits, to_prefix, fill_char,
@@ -736,10 +735,7 @@
if spec.n_lpadding:
out.append_multiple_char(fill_char[0], spec.n_lpadding)
if spec.n_sign:
- if self.is_unicode:
- sign = spec.sign.decode("latin-1")
- else:
- sign = spec.sign
+ sign = self._lit(spec.sign)
out.append(sign)
if spec.n_prefix:
pref = num[to_prefix:to_prefix + spec.n_prefix]
@@ -783,13 +779,13 @@
raise oefmt(space.w_ValueError,
"sign not allowed with 'c' presentation type")
value = space.int_w(w_num)
- max_char = runicode.MAXUNICODE if self.is_unicode else 0xFF
+ max_char = 0x10FFFF if self.is_unicode else 0xFF
if not (0 <= value <= max_char):
raise oefmt(space.w_OverflowError,
"%%c arg not in range(%s)",
hex(max_char))
if self.is_unicode:
- result = runicode.UNICHR(value)
+ result = rutf8.unichr_as_utf8(value)
else:
result = chr(value)
n_digits = 1
@@ -845,6 +841,7 @@
prefix = "0x"
as_str = value.format(LONG_DIGITS[:base], prefix)
if self.is_unicode:
+ XXX
return as_str.decode("latin-1")
return as_str
@@ -852,7 +849,7 @@
if base == 10:
s = str(value)
if self.is_unicode:
- return s.decode("latin-1")
+ return rutf8.decode_latin_1(s)
return s
# This part is slow.
negative = value < 0
@@ -893,7 +890,7 @@
i -= 1
buf[i] = "-"
assert i >= 0
- return self.empty.join(buf[i:])
+ return "".join(buf[i:])
def format_int_or_long(self, w_num, kind):
space = self.space
@@ -975,7 +972,7 @@
have_dec_point, to_remainder = self._parse_number(result, to_number)
n_remainder = len(result) - to_remainder
if self.is_unicode:
- digits = result.decode("latin-1")
+ digits = rutf8.decode_latin_1(result)
else:
digits = result
spec = self._calc_num_width(0, sign, to_number, n_digits,
@@ -1081,8 +1078,8 @@
to_imag_number)
if self.is_unicode:
- re_num = re_num.decode("latin-1")
- im_num = im_num.decode("latin-1")
+ re_num = rutf8.decode_latin_1(re_num)
+ im_num = rutf8.decode_latin_1(im_num)
#set remainder, in CPython _parse_number sets this
#using n_re_digits causes tests to fail
@@ -1111,7 +1108,7 @@
self._fill_char = tmp_fill_char
#compute L and R padding - stored in self._left_pad and self._right_pad
- self._calc_padding(self.empty, re_spec.n_total + im_spec.n_total + 1 +
+ self._calc_padding("", re_spec.n_total + im_spec.n_total + 1 +
add_parens * 2)
out = self._builder()
@@ -1172,7 +1169,7 @@
@specialize.arg(2)
def run_formatter(space, w_format_spec, meth, *args):
if space.isinstance_w(w_format_spec, space.w_unicode):
- formatter = unicode_formatter(space, space.unicode_w(w_format_spec))
+ formatter = unicode_formatter(space, space.utf8_w(w_format_spec))
return getattr(formatter, meth)(*args)
else:
formatter = str_formatter(space, space.bytes_w(w_format_spec))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -600,9 +600,9 @@
def test_unicode(self):
l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
assert isinstance(l1.strategy, BytesListStrategy)
- l2 = W_ListObject(self.space, [self.space.newunicode(u"eins"), self.space.newunicode(u"zwei")])
+ l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, 2), self.space.newutf8("zwei", 4, 2)])
assert isinstance(l2.strategy, UnicodeListStrategy)
- l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newunicode(u"zwei")])
+ l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, 2)])
assert isinstance(l3.strategy, ObjectListStrategy)
def test_listview_bytes(self):
@@ -626,7 +626,7 @@
# the same for unicode
w_l = self.space.newlist([self.space.wrap(u'a'), self.space.wrap(u'b')])
w_l.getitems = None
- assert space.unicode_w(space.call_method(space.wrap(u"c"), "join", w_l)) == u"acb"
+ assert space.utf8_w(space.call_method(space.wrap(u"c"), "join", w_l)) == "acb"
def test_string_join_returns_same_instance(self):
space = self.space
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -331,12 +331,11 @@
def descr__format__(self, space, w_format_spec):
if not space.isinstance_w(w_format_spec, space.w_unicode):
w_format_spec = space.call_function(space.w_unicode, w_format_spec)
- spec = space.unicode_w(w_format_spec)
+ spec = space.utf8_w(w_format_spec)
formatter = newformat.unicode_formatter(space, spec)
self2 = unicode_from_object(space, self)
assert isinstance(self2, W_UnicodeObject)
- # XXX
- return formatter.format_string(self2._utf8.decode("utf8"))
+ return formatter.format_string(self2._utf8)
def descr_mod(self, space, w_values):
return mod_format(space, self, w_values, do_unicode=True)
@@ -526,12 +525,12 @@
def descr_formatter_parser(self, space):
from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
+ tformat = unicode_template_formatter(space, space.utf8_w(self))
return tformat.formatter_parser()
def descr_formatter_field_name_split(self, space):
from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
+ tformat = unicode_template_formatter(space, space.utf8_w(self))
return tformat.formatter_field_name_split()
def descr_lower(self, space):
@@ -1188,8 +1187,7 @@
rutf8.check_ascii(s)
except rutf8.CheckError as a:
eh = unicodehelper.encode_error_handler(space)
- u_len = w_object._len()
- eh(None, "ascii", "ordinal not in range(128)", s, u_len,
+ eh(None, "ascii", "ordinal not in range(128)", s,
a.pos, a.pos + 1)
assert False, "always raises"
return space.newbytes(s)
@@ -1260,7 +1258,7 @@
# test_unicode_conversion_with__str__
if w_unicode_method is None:
if space.isinstance_w(w_obj, space.w_unicode):
- return space.newunicode(space.unicode_w(w_obj))
+ return unicodehelper.convert_arg_to_w_unicode(space, w_obj)
w_unicode_method = space.lookup(w_obj, "__str__")
if w_unicode_method is not None:
w_res = space.get_and_call_function(w_unicode_method, w_obj)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -123,6 +123,13 @@
continuation_bytes += 1
return len(s) - continuation_bytes
+def get_flag_from_code(oc):
+ if oc <= 0x7F:
+ return FLAG_ASCII
+ if 0xD800 <= oc <= 0xDFFF:
+ return FLAG_HAS_SURROGATES
+ return FLAG_REGULAR
+
def codepoint_at_pos(code, pos):
""" Give a codepoint in code at pos - assumes valid utf8, no checking!
"""
@@ -651,3 +658,30 @@
return unicode_escape #, char_escape_helper
+def decode_latin_1(s):
+ if len(s) == 0:
+ return s
+ if len(s) == 1 and ord(s[0]) <= 0x7F:
+ return s
+ try:
+ check_ascii(s)
+ return s
+ except CheckError:
+ return _decode_latin_1_slowpath(s)
+
+def _decode_latin_1_slowpath(s):
+ res = StringBuilder(len(s))
+ i = 0
+ while i < len(s):
+ if ord(s[i]) > 0x7F:
+ while i < len(s) and ord(s[i]) > 0x7F:
+ unichr_as_utf8_append(res, ord(s[i]))
+ i += 1
+ else:
+ start = i
+ end = i + 1
+ while end < len(s) and ord(s[end]) <= 0x7F:
+ end += 1
+ res.append_slice(s, start, end)
+ i = end
+ return res.build()
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit