Author: Maciej Fijalkowski <fij...@gmail.com> Branch: Changeset: r48276:51aebf5ad032 Date: 2011-10-20 19:49 +0200 http://bitbucket.org/pypy/pypy/changeset/51aebf5ad032/
Log: merge faster-json branch. It provides encode() optimization for json module, that runs faster than the C extension on CPython (for our benchmark) diff --git a/lib-python/modified-2.7/json/encoder.py b/lib-python/modified-2.7/json/encoder.py --- a/lib-python/modified-2.7/json/encoder.py +++ b/lib-python/modified-2.7/json/encoder.py @@ -2,14 +2,7 @@ """ import re -try: - from _json import encode_basestring_ascii as c_encode_basestring_ascii -except ImportError: - c_encode_basestring_ascii = None -try: - from _json import make_encoder as c_make_encoder -except ImportError: - c_make_encoder = None +from __pypy__.builders import StringBuilder, UnicodeBuilder ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]') ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') @@ -24,8 +17,7 @@ '\t': '\\t', } for i in range(0x20): - ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i)) - #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) + ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) # Assume this produces an infinity on all machines (probably not guaranteed) INFINITY = float('1e66666') @@ -37,10 +29,9 @@ """ def replace(match): return ESCAPE_DCT[match.group(0)] - return '"' + ESCAPE.sub(replace, s) + '"' + return ESCAPE.sub(replace, s) - -def py_encode_basestring_ascii(s): +def encode_basestring_ascii(s): """Return an ASCII-only JSON representation of a Python string """ @@ -53,20 +44,18 @@ except KeyError: n = ord(s) if n < 0x10000: - return '\\u{0:04x}'.format(n) - #return '\\u%04x' % (n,) + return '\\u%04x' % (n,) else: # surrogate pair n -= 0x10000 s1 = 0xd800 | ((n >> 10) & 0x3ff) s2 = 0xdc00 | (n & 0x3ff) - return '\\u{0:04x}\\u{1:04x}'.format(s1, s2) - #return '\\u%04x\\u%04x' % (s1, s2) - return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' - - -encode_basestring_ascii = ( - c_encode_basestring_ascii or py_encode_basestring_ascii) + return '\\u%04x\\u%04x' % (s1, s2) + if ESCAPE_ASCII.search(s): + return str(ESCAPE_ASCII.sub(replace, s)) + return s +py_encode_basestring_ascii = lambda s: '"' + encode_basestring_ascii(s) + '"' +c_encode_basestring_ascii = None class JSONEncoder(object): """Extensible JSON <http://json.org> encoder for Python data structures. @@ -147,6 +136,17 @@ self.skipkeys = skipkeys self.ensure_ascii = ensure_ascii + if ensure_ascii: + self.encoder = encode_basestring_ascii + else: + self.encoder = encode_basestring + if encoding != 'utf-8': + orig_encoder = self.encoder + def encoder(o): + if isinstance(o, str): + o = o.decode(encoding) + return orig_encoder(o) + self.encoder = encoder self.check_circular = check_circular self.allow_nan = allow_nan self.sort_keys = sort_keys @@ -184,24 +184,126 @@ '{"foo": ["bar", "baz"]}' """ - # This is for extremely simple cases and benchmarks. + if self.check_circular: + markers = {} + else: + markers = None + if self.ensure_ascii: + builder = StringBuilder() + else: + builder = UnicodeBuilder() + self._encode(o, markers, builder, 0) + return builder.build() + + def _emit_indent(self, builder, _current_indent_level): + if self.indent is not None: + _current_indent_level += 1 + newline_indent = '\n' + (' ' * (self.indent * + _current_indent_level)) + separator = self.item_separator + newline_indent + builder.append(newline_indent) + else: + separator = self.item_separator + return separator, _current_indent_level + + def _emit_unindent(self, builder, _current_indent_level): + if self.indent is not None: + builder.append('\n') + builder.append(' ' * (self.indent * (_current_indent_level - 1))) + + def _encode(self, o, markers, builder, _current_indent_level): if isinstance(o, basestring): - if isinstance(o, str): - _encoding = self.encoding - if (_encoding is not None - and not (_encoding == 'utf-8')): - o = o.decode(_encoding) - if self.ensure_ascii: - return encode_basestring_ascii(o) + builder.append('"') + builder.append(self.encoder(o)) + builder.append('"') + elif o is None: + builder.append('null') + elif o is True: + builder.append('true') + elif o is False: + builder.append('false') + elif isinstance(o, (int, long)): + builder.append(str(o)) + elif isinstance(o, float): + builder.append(self._floatstr(o)) + elif isinstance(o, (list, tuple)): + if not o: + builder.append('[]') + return + self._encode_list(o, markers, builder, _current_indent_level) + elif isinstance(o, dict): + if not o: + builder.append('{}') + return + self._encode_dict(o, markers, builder, _current_indent_level) + else: + self._mark_markers(markers, o) + res = self.default(o) + self._encode(res, markers, builder, _current_indent_level) + self._remove_markers(markers, o) + return res + + def _encode_list(self, l, markers, builder, _current_indent_level): + self._mark_markers(markers, l) + builder.append('[') + first = True + separator, _current_indent_level = self._emit_indent(builder, + _current_indent_level) + for elem in l: + if first: + first = False else: - return encode_basestring(o) - # This doesn't pass the iterator directly to ''.join() because the - # exceptions aren't as detailed. The list call should be roughly - # equivalent to the PySequence_Fast that ''.join() would do. - chunks = self.iterencode(o, _one_shot=True) - if not isinstance(chunks, (list, tuple)): - chunks = list(chunks) - return ''.join(chunks) + builder.append(separator) + self._encode(elem, markers, builder, _current_indent_level) + del elem # XXX grumble + self._emit_unindent(builder, _current_indent_level) + builder.append(']') + self._remove_markers(markers, l) + + def _encode_dict(self, d, markers, builder, _current_indent_level): + self._mark_markers(markers, d) + first = True + builder.append('{') + separator, _current_indent_level = self._emit_indent(builder, + _current_indent_level) + if self.sort_keys: + items = sorted(d.items(), key=lambda kv: kv[0]) + else: + items = d.iteritems() + + for key, v in items: + if first: + first = False + else: + builder.append(separator) + if isinstance(key, basestring): + pass + # JavaScript is weakly typed for these, so it makes sense to + # also allow them. Many encoders seem to do something like this. + elif isinstance(key, float): + key = self._floatstr(key) + elif key is True: + key = 'true' + elif key is False: + key = 'false' + elif key is None: + key = 'null' + elif isinstance(key, (int, long)): + key = str(key) + elif self.skipkeys: + continue + else: + raise TypeError("key " + repr(key) + " is not a string") + builder.append('"') + builder.append(self.encoder(key)) + builder.append('"') + builder.append(self.key_separator) + self._encode(v, markers, builder, _current_indent_level) + del key + del v # XXX grumble + self._emit_unindent(builder, _current_indent_level) + builder.append('}') + self._remove_markers(markers, d) def iterencode(self, o, _one_shot=False): """Encode the given object and yield each string @@ -217,86 +319,54 @@ markers = {} else: markers = None - if self.ensure_ascii: - _encoder = encode_basestring_ascii + return self._iterencode(o, markers, 0) + + def _floatstr(self, o): + # Check for specials. Note that this type of test is processor + # and/or platform-specific, so do tests which don't depend on the + # internals. + + if o != o: + text = 'NaN' + elif o == INFINITY: + text = 'Infinity' + elif o == -INFINITY: + text = '-Infinity' else: - _encoder = encode_basestring - if self.encoding != 'utf-8': - def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding): - if isinstance(o, str): - o = o.decode(_encoding) - return _orig_encoder(o) + return FLOAT_REPR(o) - def floatstr(o, allow_nan=self.allow_nan, - _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY): - # Check for specials. Note that this type of test is processor - # and/or platform-specific, so do tests which don't depend on the - # internals. + if not self.allow_nan: + raise ValueError( + "Out of range float values are not JSON compliant: " + + repr(o)) - if o != o: - text = 'NaN' - elif o == _inf: - text = 'Infinity' - elif o == _neginf: - text = '-Infinity' - else: - return _repr(o) + return text - if not allow_nan: - raise ValueError( - "Out of range float values are not JSON compliant: " + - repr(o)) + def _mark_markers(self, markers, o): + if markers is not None: + if id(o) in markers: + raise ValueError("Circular reference detected") + markers[id(o)] = None - return text + def _remove_markers(self, markers, o): + if markers is not None: + del markers[id(o)] - - if (_one_shot and c_make_encoder is not None - and not self.indent and not self.sort_keys): - _iterencode = c_make_encoder( - markers, self.default, _encoder, self.indent, - self.key_separator, self.item_separator, self.sort_keys, - self.skipkeys, self.allow_nan) - else: - _iterencode = _make_iterencode( - markers, self.default, _encoder, self.indent, floatstr, - self.key_separator, self.item_separator, self.sort_keys, - self.skipkeys, _one_shot) - return _iterencode(o, 0) - -def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, - _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, - ## HACK: hand-optimized bytecode; turn globals into locals - ValueError=ValueError, - basestring=basestring, - dict=dict, - float=float, - id=id, - int=int, - isinstance=isinstance, - list=list, - long=long, - str=str, - tuple=tuple, - ): - - def _iterencode_list(lst, _current_indent_level): + def _iterencode_list(self, lst, markers, _current_indent_level): if not lst: yield '[]' return - if markers is not None: - markerid = id(lst) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = lst + self._mark_markers(markers, lst) buf = '[' - if _indent is not None: + if self.indent is not None: _current_indent_level += 1 - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent + newline_indent = '\n' + (' ' * (self.indent * + _current_indent_level)) + separator = self.item_separator + newline_indent buf += newline_indent else: newline_indent = None - separator = _item_separator + separator = self.item_separator first = True for value in lst: if first: @@ -304,7 +374,7 @@ else: buf = separator if isinstance(value, basestring): - yield buf + _encoder(value) + yield buf + '"' + self.encoder(value) + '"' elif value is None: yield buf + 'null' elif value is True: @@ -314,44 +384,43 @@ elif isinstance(value, (int, long)): yield buf + str(value) elif isinstance(value, float): - yield buf + _floatstr(value) + yield buf + self._floatstr(value) else: yield buf if isinstance(value, (list, tuple)): - chunks = _iterencode_list(value, _current_indent_level) + chunks = self._iterencode_list(value, markers, + _current_indent_level) elif isinstance(value, dict): - chunks = _iterencode_dict(value, _current_indent_level) + chunks = self._iterencode_dict(value, markers, + _current_indent_level) else: - chunks = _iterencode(value, _current_indent_level) + chunks = self._iterencode(value, markers, + _current_indent_level) for chunk in chunks: yield chunk if newline_indent is not None: _current_indent_level -= 1 - yield '\n' + (' ' * (_indent * _current_indent_level)) + yield '\n' + (' ' * (self.indent * _current_indent_level)) yield ']' - if markers is not None: - del markers[markerid] + self._remove_markers(markers, lst) - def _iterencode_dict(dct, _current_indent_level): + def _iterencode_dict(self, dct, markers, _current_indent_level): if not dct: yield '{}' return - if markers is not None: - markerid = id(dct) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = dct + self._mark_markers(markers, dct) yield '{' - if _indent is not None: + if self.indent is not None: _current_indent_level += 1 - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - item_separator = _item_separator + newline_indent + newline_indent = '\n' + (' ' * (self.indent * + _current_indent_level)) + item_separator = self.item_separator + newline_indent yield newline_indent else: newline_indent = None - item_separator = _item_separator + item_separator = self.item_separator first = True - if _sort_keys: + if self.sort_keys: items = sorted(dct.items(), key=lambda kv: kv[0]) else: items = dct.iteritems() @@ -361,7 +430,7 @@ # JavaScript is weakly typed for these, so it makes sense to # also allow them. Many encoders seem to do something like this. elif isinstance(key, float): - key = _floatstr(key) + key = self._floatstr(key) elif key is True: key = 'true' elif key is False: @@ -370,7 +439,7 @@ key = 'null' elif isinstance(key, (int, long)): key = str(key) - elif _skipkeys: + elif self.skipkeys: continue else: raise TypeError("key " + repr(key) + " is not a string") @@ -378,10 +447,10 @@ first = False else: yield item_separator - yield _encoder(key) - yield _key_separator + yield '"' + self.encoder(key) + '"' + yield self.key_separator if isinstance(value, basestring): - yield _encoder(value) + yield '"' + self.encoder(value) + '"' elif value is None: yield 'null' elif value is True: @@ -391,26 +460,28 @@ elif isinstance(value, (int, long)): yield str(value) elif isinstance(value, float): - yield _floatstr(value) + yield self._floatstr(value) else: if isinstance(value, (list, tuple)): - chunks = _iterencode_list(value, _current_indent_level) + chunks = self._iterencode_list(value, markers, + _current_indent_level) elif isinstance(value, dict): - chunks = _iterencode_dict(value, _current_indent_level) + chunks = self._iterencode_dict(value, markers, + _current_indent_level) else: - chunks = _iterencode(value, _current_indent_level) + chunks = self._iterencode(value, markers, + _current_indent_level) for chunk in chunks: yield chunk if newline_indent is not None: _current_indent_level -= 1 - yield '\n' + (' ' * (_indent * _current_indent_level)) + yield '\n' + (' ' * (self.indent * _current_indent_level)) yield '}' - if markers is not None: - del markers[markerid] + self._remove_markers(markers, dct) - def _iterencode(o, _current_indent_level): + def _iterencode(self, o, markers, _current_indent_level): if isinstance(o, basestring): - yield _encoder(o) + yield '"' + self.encoder(o) + '"' elif o is None: yield 'null' elif o is True: @@ -420,23 +491,19 @@ elif isinstance(o, (int, long)): yield str(o) elif isinstance(o, float): - yield _floatstr(o) + yield self._floatstr(o) elif isinstance(o, (list, tuple)): - for chunk in _iterencode_list(o, _current_indent_level): + for chunk in self._iterencode_list(o, markers, + _current_indent_level): yield chunk elif isinstance(o, dict): - for chunk in _iterencode_dict(o, _current_indent_level): + for chunk in self._iterencode_dict(o, markers, + _current_indent_level): yield chunk else: - if markers is not None: - markerid = id(o) - if markerid in markers: - raise ValueError("Circular reference detected") - markers[markerid] = o - o = _default(o) - for chunk in _iterencode(o, _current_indent_level): + self._mark_markers(markers, o) + obj = self.default(o) + for chunk in self._iterencode(obj, markers, + _current_indent_level): yield chunk - if markers is not None: - del markers[markerid] - - return _iterencode + self._remove_markers(markers, o) diff --git a/lib-python/modified-2.7/json/tests/test_unicode.py b/lib-python/modified-2.7/json/tests/test_unicode.py --- a/lib-python/modified-2.7/json/tests/test_unicode.py +++ b/lib-python/modified-2.7/json/tests/test_unicode.py @@ -80,3 +80,9 @@ self.assertEqual(type(json.loads(u'["a"]')[0]), unicode) # Issue 10038. self.assertEqual(type(json.loads('"foo"')), unicode) + + def test_encode_not_utf_8(self): + self.assertEqual(json.dumps('\xb1\xe6', encoding='iso8859-2'), + '"\\u0105\\u0107"') + self.assertEqual(json.dumps(['\xb1\xe6'], encoding='iso8859-2'), + '["\\u0105\\u0107"]') _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit