Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8-test
Changeset: r93334:1bb5950b8ff5
Date: 2017-12-09 14:51 +0000
http://bitbucket.org/pypy/pypy/changeset/1bb5950b8ff5/
Log: hg merge unicode-utf8 diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py --- a/pypy/module/struct/formatiterator.py +++ b/pypy/module/struct/formatiterator.py @@ -1,6 +1,6 @@ from rpython.rlib.rarithmetic import (r_uint, r_ulonglong, r_longlong, maxint, intmask) -from rpython.rlib import jit +from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import specialize from rpython.rlib.rstruct.error import StructError from rpython.rlib.rstruct.formatiterator import FormatIterator @@ -107,7 +107,7 @@ def accept_unicode_arg(self): w_obj = self.accept_obj_arg() - return self.space.unicode_w(w_obj) + return self.space.utf8_len_w(w_obj) def accept_float_arg(self): w_obj = self.accept_obj_arg() @@ -191,6 +191,10 @@ assert 0, "unreachable" self.result_w.append(w_value) + def append_utf8(self, value): + w_ch = self.space.newutf8(rutf8.unichr_as_utf8(r_uint(value)), 1) + self.result_w.append(w_ch) + def get_pos(self): return self.pos diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py --- a/pypy/module/unicodedata/interp_ucd.py +++ b/pypy/module/unicodedata/interp_ucd.py @@ -75,6 +75,7 @@ except KeyError: msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name)) raise OperationError(space.w_KeyError, msg) + assert code >= 0 return space.newutf8(unichr_as_utf8(code), 1) def name(self, space, w_unichr, w_default=None): diff --git a/rpython/rlib/rstruct/nativefmttable.py b/rpython/rlib/rstruct/nativefmttable.py --- a/rpython/rlib/rstruct/nativefmttable.py +++ b/rpython/rlib/rstruct/nativefmttable.py @@ -4,7 +4,7 @@ """ import struct -from rpython.rlib import jit, longlong2float +from rpython.rlib import rutf8, longlong2float from rpython.rlib.objectmodel import specialize from rpython.rlib.rarithmetic import r_singlefloat, widen, intmask from rpython.rlib.rstruct import standardfmttable as std @@ -139,17 +139,17 @@ from rpython.rlib.rstruct import unichar def 
pack_unichar(fmtiter): - unistr = fmtiter.accept_unicode_arg() - if len(unistr) != 1: + utf8, lgt = fmtiter.accept_unicode_arg() + if lgt != 1: raise StructError("expected a unicode string of length 1") - c = unistr[0] # string->char conversion for the annotator - unichar.pack_unichar(c, fmtiter.wbuf, fmtiter.pos) + uchr = rutf8.codepoint_at_pos(utf8, 0) + unichar.pack_codepoint(uchr, fmtiter.wbuf, fmtiter.pos) fmtiter.advance(unichar.UNICODE_SIZE) @specialize.argtype(0) def unpack_unichar(fmtiter): data = fmtiter.read(unichar.UNICODE_SIZE) - fmtiter.appendobj(unichar.unpack_unichar(data)) + fmtiter.append_utf8(unichar.unpack_codepoint(data)) native_fmttable['u'] = {'size': unichar.UNICODE_SIZE, 'alignment': unichar.UNICODE_SIZE, diff --git a/rpython/rlib/rstruct/unichar.py b/rpython/rlib/rstruct/unichar.py --- a/rpython/rlib/rstruct/unichar.py +++ b/rpython/rlib/rstruct/unichar.py @@ -3,12 +3,8 @@ """ import sys -from rpython.rlib.runicode import MAXUNICODE -if MAXUNICODE <= 65535: - UNICODE_SIZE = 2 -else: - UNICODE_SIZE = 4 +UNICODE_SIZE = 4 BIGENDIAN = sys.byteorder == "big" def pack_unichar(unich, buf, pos): @@ -34,7 +30,7 @@ buf.setitem(pos+2, chr((unich >> 16) & 0xFF)) buf.setitem(pos+3, chr(unich >> 24)) -def unpack_unichar(rawstring): +def unpack_codepoint(rawstring): assert len(rawstring) == UNICODE_SIZE if UNICODE_SIZE == 2: if BIGENDIAN: @@ -54,4 +50,7 @@ ord(rawstring[1]) << 8 | ord(rawstring[2]) << 16 | ord(rawstring[3]) << 24) - return unichr(n) + return n + +def unpack_unichar(rawstring): + return unichr(unpack_codepoint(rawstring)) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit