Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r94835:f287dec62c4e Date: 2018-07-08 21:38 -0700 http://bitbucket.org/pypy/pypy/changeset/f287dec62c4e/
Log: fixes, start to handle some edge cases diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -122,7 +122,7 @@ if w_fill is None: w_fill = space.w_None if space.is_w(w_type, space.w_dict): - ulist = space.listview_unicode(w_keys) + ulist = space.listview_utf8(w_keys) if ulist is not None: strategy = space.fromcache(UnicodeDictStrategy) storage = strategy.get_storage_fromkeys(ulist, w_fill) @@ -1183,21 +1183,21 @@ # we should implement the same shortcuts as we do for BytesDictStrategy - ## def setitem_str(self, w_dict, key, w_value): - ## assert key is not None - ## self.unerase(w_dict.dstorage)[key] = w_value + def setitem_str(self, w_dict, key, w_value): + assert key is not None + self.unerase(w_dict.dstorage)[key] = w_value - ## def getitem(self, w_dict, w_key): - ## space = self.space - ## # -- This is called extremely often. Hack for performance -- - ## if type(w_key) is space.StringObjectCls: - ## return self.getitem_str(w_dict, w_key.unwrap(space)) - ## # -- End of performance hack -- - ## return AbstractTypedStrategy.getitem(self, w_dict, w_key) + def getitem(self, w_dict, w_key): + space = self.space + # -- This is called extremely often. Hack for performance -- + if type(w_key) is space.StringObjectCls: + return self.getitem_str(w_dict, w_key.unwrap(space)) + # -- End of performance hack -- + return AbstractTypedStrategy.getitem(self, w_dict, w_key) - ## def getitem_str(self, w_dict, key): - ## assert key is not None - ## return self.unerase(w_dict.dstorage).get(key, None) + def getitem_str(self, w_dict, key): + assert key is not None + return self.unerase(w_dict.dstorage).get(key, None) def listview_utf8(self, w_dict): return self.unerase(w_dict.dstorage).keys() @@ -1208,18 +1208,26 @@ def wrapkey(space, key): return space.newutf8(key, len(key)) - ## @jit.look_inside_iff(lambda self, w_dict: - ## w_dict_unrolling_heuristic(w_dict)) - ## def view_as_kwargs(self, w_dict): - ## d = self.unerase(w_dict.dstorage) - ## l = len(d) - ## keys, values = [None] * l, [None] * l - ## i = 0 - ## for key, val in d.iteritems(): - ## keys[i] = key - ## values[i] = val - ## i += 1 - ## return keys, values + @jit.look_inside_iff(lambda self, w_dict: + w_dict_unrolling_heuristic(w_dict)) + def view_as_kwargs(self, w_dict): + d = self.unerase(w_dict.dstorage) + l = len(d) + keys, values = [None] * l, [None] * l + i = 0 + for key, val in d.iteritems(): + keys[i] = key + values[i] = val + i += 1 + return keys, values + + def get_storage_fromkeys(self, keys_w, w_fill): + """Return an initialized storage with keys and fill values""" + storage = {} + mark_dict_non_null(storage) + for key in keys_w: + storage[key] = w_fill + return self.erase(storage) create_iterator_classes(UnicodeDictStrategy) @@ -1426,7 +1434,7 @@ typename = space.type(self).getname(space) w_seq = space.call_function(space.w_list, self) seq_repr = space.utf8_w(space.repr(w_seq)) - return space.newtext(b"%s(%s)" % (typename, seq_repr)) + return space.newtext(u"%s(%s)" % (typename, seq_repr.decode('utf8'))) def descr_len(self, space): return space.len(self.w_dict) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -327,14 +327,13 @@ return W_ListObject.newlist_bytes(self, list_s) def newlist_text(self, list_t): - return self.newlist_unicode([ + return self.newlist_utf8([ str_decode_utf8(s, "string", True, None, allow_surrogates=True)[0] for s in list_t]) - def newlist_utf8(self, list_u, is_ascii): - if is_ascii: - return W_ListObject.newlist_utf8(self, list_u) - return ObjSpace.newlist_utf8(self, list_u, False) + def newlist_utf8(self, list_u, is_ascii=True): + # TODO ignoring is_ascii, is that correct? + return W_ListObject.newlist_utf8(self, list_u) def newlist_int(self, list_i): return W_ListObject.newlist_int(self, list_i) @@ -553,8 +552,7 @@ return w_obj.listview_utf8() if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject: return w_obj.listview_utf8() - if (isinstance(w_obj, W_UnicodeObject) and not self._uses_unicode_iter(w_obj) - and w_obj.is_ascii()): + if isinstance(w_obj, W_UnicodeObject) and self._uses_unicode_iter(w_obj): return w_obj.listview_utf8() if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj): return w_obj.getitems_utf8() diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py --- a/pypy/objspace/std/test/test_dictmultiobject.py +++ b/pypy/objspace/std/test/test_dictmultiobject.py @@ -1247,6 +1247,11 @@ self.hash_count += 1 return unicode.__hash__(self) + def is_ascii(self): + return True + + def unwrapped(self): + return True # the minimal 'space' needed to use a W_DictMultiObject class FakeSpace: @@ -1285,15 +1290,17 @@ def text_w(self, u): assert isinstance(u, unicode) - return u.encode('utf-8') + return FakeUnicode(u) def bytes_w(self, string): assert isinstance(string, str) return string - def utf8_w(self, b): + def utf8_w(self, u): + if isinstance(u, unicode): + u = u.encode('utf8') assert isinstance(u, str) - return b + return u def int_w(self, integer, allow_conversion=True): assert isinstance(integer, int) @@ -1301,12 +1308,17 @@ def wrap(self, obj): if isinstance(obj, str): - return obj.decode('ascii') + return FakeUnicode(obj.decode('ascii')) return obj def newtext(self, string): - assert isinstance(string, str) - return string.decode('utf-8') + if isinstance(string, str): + return FakeUnicode(string.decode('utf-8')) + assert isinstance(string, unicode) + return FakeUnicode(string) + + def newutf8(self, obj, lgt): + return obj def newbytes(self, obj): return obj diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -77,22 +77,30 @@ assert space.int_w(w_index) == rexpected expected = u.startswith(v, start) + if expected and start > len(u): + expected = False # python2 vs. python3 w_res = space.call_method(w_u, 'startswith', w_v, space.newint(start)) assert w_res is space.newbool(expected) expected = u.startswith(v, start, start + len1) + if expected and start > len(u): + expected = False # python2 vs. python3 w_res = space.call_method(w_u, 'startswith', w_v, space.newint(start), space.newint(start + len1)) assert w_res is space.newbool(expected) expected = u.endswith(v, start) + if expected and start > len(u): + expected = False # python2 vs. python3 w_res = space.call_method(w_u, 'endswith', w_v, space.newint(start)) assert w_res is space.newbool(expected) expected = u.endswith(v, start, start + len1) + if expected and start > len(u): + expected = False # python2 vs. python3 w_res = space.call_method(w_u, 'endswith', w_v, space.newint(start), space.newint(start + len1)) @@ -102,6 +110,7 @@ space = self.space w_uni = space.wrap(u'abcd') assert space.text_w(w_uni) == 'abcd' + # TODO : how to handle this? w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd)) space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -35,6 +35,7 @@ @enforceargs(utf8str=str) def __init__(self, utf8str, length): assert isinstance(utf8str, bytes) + # TODO: how to handle surrogates assert length >= 0 self._utf8 = utf8str self._length = length @@ -125,7 +126,8 @@ if isinstance(w_other, W_UnicodeObject): return w_other if space.isinstance_w(w_other, space.w_bytes): - return unicode_from_bytes(space, w_other) + raise oefmt(space.w_TypeError, + "Can't convert '%T' object to str implicitly", w_other) if strict: raise oefmt(space.w_TypeError, "%s arg must be None, unicode or str", strict) @@ -142,8 +144,6 @@ def _multi_chr(self, unichar): return unichar - _builder = UnicodeBuilder - def _generic_name(self): return "str" @@ -373,14 +373,15 @@ return mod_format(space, w_values, self, fmt_type=FORMAT_UNICODE) def descr_swapcase(self, space): - input = self._utf8 - builder = rutf8.Utf8StringBuilder(len(input)) - for ch in rutf8.Utf8StringIterator(input): + value = self._utf8 + builder = rutf8.Utf8StringBuilder(len(value)) + for ch in rutf8.Utf8StringIterator(value): if unicodedb.isupper(ch): - ch = unicodedb.tolower(ch) + codes = unicodedb.tolower_full(ch) elif unicodedb.islower(ch): - ch = unicodedb.toupper(ch) - builder.append_code(ch) + codes = unicodedb.toupper_full(ch) + for c in codes: + builder.append_code(c) return self.from_utf8builder(builder) def descr_title(self, space): @@ -393,15 +394,51 @@ input = self._utf8 builder = rutf8.Utf8StringBuilder(len(input)) previous_is_cased = False + i = 0 for ch in rutf8.Utf8StringIterator(input): - if not previous_is_cased: - ch = unicodedb.totitle(ch) + if ch == 0x3a3: + codes = [self._handle_capital_sigma(input, i),] + elif not previous_is_cased: + codes = unicodedb.totitle_full(ch) else: - ch = unicodedb.tolower(ch) - builder.append_code(ch) - previous_is_cased = unicodedb.iscased(ch) + codes = unicodedb.tolower_full(ch) + for c in codes: + builder.append_code(c) + previous_is_cased = unicodedb.iscased(codes[-1]) + i += 1 return self.from_utf8builder(builder) + def _handle_capital_sigma(self, value, i): + # U+03A3 is in the Final_Sigma context when, it is found like this: + #\p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* \p{cased}) + # where \p{xxx} is a character with property xxx. + + # TODO: find a better way for utf8 -> codepoints + value = [ch for ch in rutf8.Utf8StringIterator(value)] + j = i - 1 + final_sigma = False + while j >= 0: + ch = value[j] + if unicodedb.iscaseignorable(ch): + j -= 1 + continue + final_sigma = unicodedb.iscased(ch) + break + if final_sigma: + j = i + 1 + length = len(value) + while j < length: + ch = value[j] + if unicodedb.iscaseignorable(ch): + j += 1 + continue + final_sigma = not unicodedb.iscased(ch) + break + if final_sigma: + return 0x3C2 + else: + return 0x3C3 + def descr_translate(self, space, w_table): builder = rutf8.Utf8StringBuilder(len(self._utf8)) for codepoint in rutf8.Utf8StringIterator(self._utf8): @@ -519,23 +556,29 @@ return space.is_w(space.type(w_obj), space.w_unicode) def descr_casefold(self, space): - value = self._val(space) - builder = self._builder(len(value)) - for c in value: - c_ord = ord(c) - folded = unicodedb.casefold_lookup(c_ord) + value = self._utf8 + builder = rutf8.Utf8StringBuilder(len(value)) + for ch in rutf8.Utf8StringIterator(value): + folded = unicodedb.casefold_lookup(ch) if folded is None: - builder.append(unichr(unicodedb.tolower(c_ord))) + builder.append_code(unicodedb.tolower(ch)) else: for r in folded: - builder.append(unichr(r)) - return self._new(builder.build()) + builder.append_code(r) + return self.from_utf8builder(builder) def descr_lower(self, space): - builder = rutf8.Utf8StringBuilder(len(self._utf8)) - for ch in rutf8.Utf8StringIterator(self._utf8): - lower = unicodedb.tolower(ch) - builder.append_code(lower) + value = self._utf8 + builder = rutf8.Utf8StringBuilder(len(value)) + i = 0 + for ch in rutf8.Utf8StringIterator(value): + if ch == 0x3a3: + codes = [self._handle_capital_sigma(value, i),] + else: + codes = unicodedb.tolower_full(ch) + for c in codes: + builder.append_code(c) + i += 1 return self.from_utf8builder(builder) def descr_isdecimal(self, space): @@ -589,11 +632,18 @@ value = self._utf8 if space.isinstance_w(w_prefix, space.w_tuple): return self._startswith_tuple(space, value, w_prefix, start, end) - return space.newbool(self._startswith(space, value, w_prefix, start, + try: + return space.newbool(self._startswith(space, value, w_prefix, start, end)) + except OperationError as e: + if e.match(space, space.w_TypeError): + raise oefmt(space.w_TypeError, 'startswith first arg must be str ' + 'or a tuple of str, not %T', w_prefix) def _startswith(self, space, value, w_prefix, start, end): prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if start > len(value): + return False if len(prefix) == 0: return True return startswith(value, prefix, start, end) @@ -603,11 +653,18 @@ value = self._utf8 if space.isinstance_w(w_suffix, space.w_tuple): return self._endswith_tuple(space, value, w_suffix, start, end) - return space.newbool(self._endswith(space, value, w_suffix, start, + try: + return space.newbool(self._endswith(space, value, w_suffix, start, end)) + except OperationError as e: + if e.match(space, space.w_TypeError): + raise oefmt(space.w_TypeError, 'endswith first arg must be str ' + 'or a tuple of str, not %T', w_suffix) def _endswith(self, space, value, w_prefix, start, end): prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if start > len(value): + return False if len(prefix) == 0: return True return endswith(value, prefix, start, end) @@ -684,8 +741,9 @@ def descr_upper(self, space): builder = rutf8.Utf8StringBuilder(len(self._utf8)) for ch in rutf8.Utf8StringIterator(self._utf8): - ch = unicodedb.toupper(ch) - builder.append_code(ch) + codes = unicodedb.toupper_full(ch) + for c in codes: + builder.append_code(c) return self.from_utf8builder(builder) @unwrap_spec(width=int) @@ -792,14 +850,16 @@ builder = rutf8.Utf8StringBuilder(len(self._utf8)) it = rutf8.Utf8StringIterator(self._utf8) uchar = it.next() - ch = unicodedb.toupper(uchar) - builder.append_code(ch) + codes = unicodedb.toupper_full(uchar) + # can sometimes give more than one, like for omega-with-Ypogegrammeni, 8179 + for c in codes: + builder.append_code(c) for ch in it: ch = unicodedb.tolower(ch) builder.append_code(ch) return self.from_utf8builder(builder) - @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' ')) def descr_center(self, space, width, w_fillchar): value = self._utf8 fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8 @@ -978,14 +1038,14 @@ end_index = len(self._utf8) if start > 0: if start > self._length: - start_index = end_index + start_index = end_index + 1 else: start_index = self._index_to_byte(start) if end < self._length: end_index = self._index_to_byte(end) return (start_index, end_index) - @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' ')) def descr_rjust(self, space, width, w_fillchar): value = self._utf8 lgt = self._len() @@ -1004,7 +1064,7 @@ return W_UnicodeObject(value, lgt) - @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' ')) def descr_ljust(self, space, width, w_fillchar): value = self._utf8 w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar) @@ -1080,23 +1140,11 @@ def descr_isprintable(self, space): - for uchar in self._value: - if not unicodedb.isprintable(ord(uchar)): + for ch in rutf8.Utf8StringIterator(self._utf8): + if not unicodedb.isprintable(ch): return space.w_False return space.w_True - def _fix_fillchar(func): - # XXX: hack - from rpython.tool.sourcetools import func_with_new_name - func = func_with_new_name(func, func.__name__) - func.unwrap_spec = func.unwrap_spec.copy() - func.unwrap_spec['w_fillchar'] = WrappedDefault(u' ') - return func - - descr_center = _fix_fillchar(StringMethods.descr_center) - descr_ljust = _fix_fillchar(StringMethods.descr_ljust) - descr_rjust = _fix_fillchar(StringMethods.descr_rjust) - @staticmethod def _iter_getitem_result(self, space, index): assert isinstance(self, W_UnicodeObject) @@ -1172,7 +1220,7 @@ def decode_object(space, w_obj, encoding, errors): if encoding is None: encoding = getdefaultencoding(space) - if errors is None or errors == 'strict': + if errors is None or errors == 'strict' or errors == 'surrogateescape': if encoding == 'ascii': s = space.charbuf_w(w_obj) unicodehelper.check_ascii_or_raise(space, s) @@ -1824,7 +1872,7 @@ def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False): if not isinstance(w_unistr, W_UnicodeObject): raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) - value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space)) + value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8')) return unicodehelper.encode_utf8(space, value, allow_surrogates=allow_surrogates) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit