Author: fijal
Branch: unicode-utf8
Changeset: r93155:109fd5f5d4eb
Date: 2017-11-23 20:52 +0100
http://bitbucket.org/pypy/pypy/changeset/109fd5f5d4eb/
Log: start working on pypyjson diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1760,10 +1760,6 @@ def utf8_w(self, w_obj): return w_obj.utf8_w(self) - def unicode_w(self, w_obj): - # XXX: kill me! - return w_obj.utf8_w(self).decode('utf-8') - def convert_to_w_unicode(self, w_obj): return w_obj.convert_to_w_unicode(self) diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -1,7 +1,7 @@ import sys from rpython.rlib.rstring import StringBuilder from rpython.rlib.objectmodel import specialize, always_inline, r_dict -from rpython.rlib import rfloat, runicode +from rpython.rlib import rfloat, runicode, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper @@ -19,29 +19,6 @@ return 0.0 return x * NEG_POW_10[exp] -def strslice2unicode_latin1(s, start, end): - """ - Convert s[start:end] to unicode. s is supposed to be an RPython string - encoded in latin-1, which means that the numeric value of each char is the - same as the corresponding unicode code point. - - Internally it's implemented at the level of low-level helpers, to avoid - the extra copy we would need if we take the actual slice first. - - No bound checking is done, use carefully. 
- """ - from rpython.rtyper.annlowlevel import llstr, hlunicode - from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE - from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar - length = end-start - ll_s = llstr(s) - ll_res = malloc(UNICODE, length) - ll_res.hash = 0 - for i in range(length): - ch = ll_s.chars[start+i] - ll_res.chars[i] = cast_primitive(UniChar, ch) - return hlunicode(ll_res) - def slice_eq(a, b): (ll_chars1, start1, length1, _) = a (ll_chars2, start2, length2, _) = b @@ -312,8 +289,7 @@ bits |= ord(ch) if ch == '"': self.pos = i - return self.space.newunicode( - self._create_string(start, i - 1, bits)) + return self._create_string(start, i - 1, bits) elif ch == '\\' or ch < '\x20': self.pos = i-1 return self.decode_string_escaped(start) @@ -322,12 +298,15 @@ if bits & 0x80: # the 8th bit is set, it's an utf8 string content_utf8 = self.getslice(start, end) - return unicodehelper.decode_utf8(self.space, content_utf8) + lgt, flag = unicodehelper.check_utf8_or_raise(self.space, + content_utf8) + return self.space.newutf8(content_utf8, lgt, flag) else: # ascii only, fast path (ascii is a strict subset of # latin1, and we already checked that all the chars are < # 128) - return strslice2unicode_latin1(self.s, start, end) + return self.space.newutf8(self.getslice(start, end), + end - start, rutf8.FLAG_ASCII) def decode_string_escaped(self, start): i = self.pos @@ -340,9 +319,10 @@ i += 1 if ch == '"': content_utf8 = builder.build() - content_unicode = unicodehelper.decode_utf8(self.space, content_utf8) + lgt, f = unicodehelper.check_utf8_or_raise(self.space, + content_utf8) self.pos = i - return self.space.newunicode(content_unicode) + return self.space.newutf8(content_utf8, lgt, f) elif ch == '\\': i = self.decode_escape_sequence(i, builder) elif ch < '\x20': diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py --- a/pypy/module/_pypyjson/test/test__pypyjson.py +++ 
b/pypy/module/_pypyjson/test/test__pypyjson.py @@ -10,10 +10,14 @@ assert dec.skip_whitespace(8) == len(s) dec.close() +class FakeSpace(object): + def newutf8(self, s, l, f): + return s + def test_decode_key(): s1 = "123" * 100 s = ' "%s" "%s" ' % (s1, s1) - dec = JSONDecoder('fake space', s) + dec = JSONDecoder(FakeSpace(), s) assert dec.pos == 0 x = dec.decode_key(0) assert x == s1 diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -367,23 +367,10 @@ assert isinstance(utf8s, str) return W_UnicodeObject(utf8s, length, flag) - def new_from_utf8(self, utf8s): - # XXX: kill me! - assert isinstance(utf8s, str) - length, flag = rutf8.check_utf8(utf8s, True) - return W_UnicodeObject(utf8s, length, flag) - def newfilename(self, s): assert isinstance(s, str) # on pypy3, this decodes the byte string return W_BytesObject(s) # with the filesystem encoding - def newunicode(self, unistr): - # XXX: kill me! - assert isinstance(unistr, unicode) - utf8s = unistr.encode("utf-8") - length, flag = rutf8.check_utf8(utf8s, True) - return self.newutf8(utf8s, length, flag) - def type(self, w_obj): jit.promote(w_obj.__class__) return w_obj.getclass(self) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit