Author: fijal Branch: unicode-utf8 Changeset: r93326:b4456e64ff3b Date: 2017-12-09 15:10 +0200 http://bitbucket.org/pypy/pypy/changeset/b4456e64ff3b/
Log: fix unicodedata module diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -7,11 +7,8 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import TypeDef, interp_attrproperty from rpython.rlib.rarithmetic import r_longlong -from rpython.rlib.objectmodel import we_are_translated -from rpython.rlib.runicode import MAXUNICODE from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0 -from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate -import sys +from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8 # Contants for Hangul characters @@ -30,49 +27,17 @@ # unicode code point. -if MAXUNICODE > 0xFFFF: - # Target is wide build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) +# Target is wide build +def unichr_to_code_w(space, w_unichr): + if not space.isinstance_w(w_unichr, space.w_unicode): + raise oefmt( + space.w_TypeError, 'argument 1 must be unicode, not %T', + w_unichr) - if not we_are_translated() and sys.maxunicode == 0xFFFF: - # Host CPython is narrow build, accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - else: - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - -else: - # Target is narrow build - def unichr_to_code_w(space, w_unichr): - if not space.isinstance_w(w_unichr, space.w_unicode): - raise oefmt( - space.w_TypeError, 'argument 1 must be unicode, not %T', - w_unichr) - - if not we_are_translated() and sys.maxunicode > 0xFFFF: - # Host CPython is wide build, forbid surrogates - if not space.len_w(w_unichr) == 1: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") - return space.int_w(space.ord(w_unichr)) - - else: - # Accept surrogates - try: - return ord_accepts_surrogate(space.unicode_w(w_unichr)) - except TypeError: - raise oefmt(space.w_TypeError, - "need a single Unicode character as parameter") + if not space.len_w(w_unichr) == 1: + raise oefmt(space.w_TypeError, + "need a single Unicode character as parameter") + return space.int_w(space.ord(w_unichr)) class UCD(W_Root): @@ -110,7 +75,7 @@ except KeyError: msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name)) raise OperationError(space.w_KeyError, msg) - return space.newunicode(code_to_unichr(code)) + return space.newutf8(unichr_as_utf8(code), 1) def name(self, space, w_unichr, w_default=None): code = unichr_to_code_w(space, w_unichr) @@ -259,10 +224,10 @@ result[0] = ch if not composed: # If decomposed normalization we are done - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) if j <= 1: - return space.newunicode(u''.join([unichr(i) for i in result[:j]])) + return self.build(space, result, stop=j) current = result[0] starter_pos = 0 @@ -310,7 +275,13 @@ result[starter_pos] = current - return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]])) + return self.build(space, result, stop=next_insert) + + def build(self, space, r, stop): + builder = Utf8StringBuilder(stop * 3) + for i in range(stop): + builder.append_code(r[i]) + return space.newutf8(builder.build(), stop) methods = {} diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py --- a/pypy/module/unicodedata/test/test_hyp.py +++ b/pypy/module/unicodedata/test/test_hyp.py @@ -1,3 +1,4 @@ + import pytest try: from hypothesis import given, strategies as st, example, settings @@ -5,12 +6,14 @@ pytest.skip("hypothesis required") from pypy.module.unicodedata.interp_ucd import ucd +from rpython.rlib.rutf8 import get_utf8_length def make_normalization(space, NF_code): def normalize(s): - w_s = space.newunicode(s) + u = s.encode('utf8') + w_s = space.newutf8(u, get_utf8_length(u)) w_res = ucd.normalize(space, NF_code, w_s) - return space.unicode_w(w_res) + return space.utf8_w(w_res).decode('utf8') return normalize all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD'] _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit