Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de> Branch: Changeset: r97581:89a3715662ef Date: 2019-09-21 21:06 +0200 http://bitbucket.org/pypy/pypy/changeset/89a3715662ef/
Log: merge json-decoder-maps: implement a much faster json decoder (3x speedup for large json files, 2x less memory) used techniques: - SWAR (SIMD within a register) techniques for finding the end of whitespace and the end of strings - cache strings aggressively - use maps for representing the resulting objects. diff too long, truncating to 2000 out of 2102 lines diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -5,3 +5,9 @@ .. this is a revision shortly after release-pypy-7.2.0 .. startrev: 78cd4acbcbec + +.. branch: json-decoder-maps + +Much faster and more memory-efficient JSON decoding. The resulting +dictionaries that come out of the JSON decoder have faster lookups too. + diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -1,11 +1,13 @@ import sys from rpython.rlib.rstring import StringBuilder -from rpython.rlib.objectmodel import specialize, always_inline, r_dict -from rpython.rlib import rfloat, rutf8 +from rpython.rlib.objectmodel import specialize, always_inline +from rpython.rlib import rfloat, runicode, jit, objectmodel, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi from rpython.rlib.rarithmetic import r_uint from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper +from pypy.interpreter.baseobjspace import W_Root +from pypy.module._pypyjson import simd OVF_DIGITS = len(str(sys.maxint)) @@ -15,45 +17,107 @@ # precomputing negative powers of 10 is MUCH faster than using e.g. math.pow # at runtime NEG_POW_10 = [10.0**-i for i in range(16)] +del i + def neg_pow_10(x, exp): if exp >= len(NEG_POW_10): return 0.0 return x * NEG_POW_10[exp] -def slice_eq(a, b): - (ll_chars1, start1, length1, _) = a - (ll_chars2, start2, length2, _) = b - if length1 != length2: - return False - j = start2 - for i in range(start1, start1 + length1): - if ll_chars1[i] != ll_chars2[j]: - return False - j += 1 - return True -def slice_hash(a): - (ll_chars, start, length, h) = a - return h +class IntCache(object): + """ A cache for wrapped ints between START and END """ -TYPE_UNKNOWN = 0 -TYPE_STRING = 1 -class JSONDecoder(object): + # I also tried various combinations of having an LRU cache for ints as + # well, didn't really help. + + # XXX one thing to do would be to use withintprebuilt in general again, + # hidden behind a 'we_are_jitted' + + START = -10 + END = 256 + + def __init__(self, space): + self.space = space + self.cache = [self.space.newint(i) + for i in range(self.START, self.END)] + + def newint(self, intval): + if self.START <= intval < self.END: + return self.cache[intval - self.START] + return self.space.newint(intval) + + +class JSONDecoder(W_Root): + + LRU_SIZE = 16 + LRU_MASK = LRU_SIZE - 1 + + DEFAULT_SIZE_SCRATCH = 20 + + # string caching is only used if the total size of the message is larger + # than a megabyte. Below that, there can't be that many repeated big + # strings anyway (some experiments showed this to be a reasonable cutoff + # size) + MIN_SIZE_FOR_STRING_CACHE = 1024 * 1024 + + # evaluate the string cache for 200 strings, before looking at the hit rate + # and deciding whether to keep doing it + STRING_CACHE_EVALUATION_SIZE = 200 + + # keep using the string cache if at least 25% of all decoded strings are a + # hit in the cache + STRING_CACHE_USEFULNESS_FACTOR = 4 + + def __init__(self, space, s): self.space = space + self.w_empty_string = space.newutf8("", 0) + self.s = s + # we put our string in a raw buffer so: # 1) we automatically get the '\0' sentinel at the end of the string, # which means that we never have to check for the "end of string" # 2) we can pass the buffer directly to strtod - self.ll_chars = rffi.str2charp(s) + self.ll_chars, self.llobj, self.flag = rffi.get_nonmovingbuffer_ll_final_null(self.s) self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw') self.pos = 0 - self.cache = r_dict(slice_eq, slice_hash, simple_hash_eq=True) + self.intcache = space.fromcache(IntCache) + + # two caches, one for keys, one for general strings. they both have the + # form {hash-as-int: StringCacheEntry} and they don't deal with + # collisions at all. For every hash there is simply one string stored + # and we ignore collisions. + self.cache_keys = {} + self.cache_values = {} + + # we don't cache *all* non-key strings, that would be too expensive. + # instead, keep a cache of the last 16 strings hashes around and add a + # string to the cache only if its hash is seen a second time + self.lru_cache = [0] * self.LRU_SIZE + self.lru_index = 0 + + self.startmap = self.space.fromcache(Terminator) + + # keep a list of objects that are created with maps that aren't clearly + # useful. If they turn out to be useful in the end we are good, + # otherwise convert them to dicts (see .close()) + self.unclear_objects = [] + + # this is a freelist of lists that store the decoded value of an + # object, before they get copied into the eventual dict + self.scratch = [[None] * self.DEFAULT_SIZE_SCRATCH] + def close(self): - rffi.free_charp(self.ll_chars) + rffi.free_nonmovingbuffer_ll(self.ll_chars, self.llobj, self.flag) lltype.free(self.end_ptr, flavor='raw') + # clean up objects that are instances of now blocked maps + for w_obj in self.unclear_objects: + jsonmap = self._get_jsonmap_from_dict(w_obj) + if jsonmap.is_state_blocked(): + self._devolve_jsonmap_dict(w_obj) def getslice(self, start, end): assert start >= 0 @@ -61,23 +125,22 @@ return self.s[start:end] def skip_whitespace(self, i): + ll_chars = self.ll_chars while True: - ch = self.ll_chars[i] + ch = ll_chars[i] if is_whitespace(ch): - i+=1 + i += 1 else: break return i - @specialize.arg(1) - def _raise(self, msg, *args): - raise oefmt(self.space.w_ValueError, msg, *args) - - def decode_any(self, i): + def decode_any(self, i, contextmap=None): + """ Decode an object at position i. Optionally pass a contextmap, if + the value is decoded as the value of a dict. """ i = self.skip_whitespace(i) ch = self.ll_chars[i] if ch == '"': - return self.decode_string(i+1) + return self.decode_string(i+1, contextmap) elif ch == '[': return self.decode_array(i+1) elif ch == '{': @@ -102,6 +165,11 @@ self._raise("No JSON object could be decoded: unexpected '%s' at char %d", ch, i) + + @specialize.arg(1) + def _raise(self, msg, *args): + raise oefmt(self.space.w_ValueError, msg, *args) + def decode_null(self, i): if (self.ll_chars[i] == 'u' and self.ll_chars[i+1] == 'l' and @@ -162,7 +230,7 @@ return self.decode_int_slow(start) self.pos = i - return self.space.newint(intval) + return self.intcache.newint(intval) def decode_float(self, i): from rpython.rlib import rdtoa @@ -214,7 +282,22 @@ ovf_maybe = (count >= OVF_DIGITS) return i, ovf_maybe, sign * intval + def _raise_control_char_in_string(self, ch, startindex, currindex): + if ch == '\0': + self._raise("Unterminated string starting at char %d", + startindex - 1) + else: + self._raise("Invalid control character at char %d", currindex-1) + + def _raise_object_error(self, ch, start, i): + if ch == '\0': + self._raise("Unterminated object starting at char %d", start) + else: + self._raise("Unexpected '%s' when decoding object (char %d)", + ch, i) + def decode_array(self, i): + """ Decode a list. i must be after the opening '[' """ w_list = self.space.newlist([]) start = i i = self.skip_whitespace(start) @@ -248,62 +331,116 @@ self.pos = i+1 return self.space.newdict() - d = self._create_empty_dict() + if self.scratch: + values_w = self.scratch.pop() + else: + values_w = [None] * self.DEFAULT_SIZE_SCRATCH + nextindex = 0 + currmap = self.startmap while True: # parse a key: value - w_name = self.decode_key(i) + currmap = self.decode_key_map(i, currmap) i = self.skip_whitespace(self.pos) ch = self.ll_chars[i] if ch != ':': self._raise("No ':' found at char %d", i) i += 1 - i = self.skip_whitespace(i) - # - w_value = self.decode_any(i) - d[w_name] = w_value + + w_value = self.decode_any(i, currmap) + + if nextindex == len(values_w): # full + values_w = values_w + [None] * len(values_w) # double + values_w[nextindex] = w_value + nextindex += 1 i = self.skip_whitespace(self.pos) ch = self.ll_chars[i] i += 1 if ch == '}': self.pos = i - return self._create_dict(d) + self.scratch.append(values_w) # can reuse next time + if currmap.is_state_blocked(): + dict_w = self._switch_to_dict(currmap, values_w, nextindex) + return self._create_dict(dict_w) + values_w = values_w[:nextindex] + w_res = self._create_dict_map(values_w, currmap) + if not currmap.is_state_useful(): + self.unclear_objects.append(w_res) + return w_res elif ch == ',': - pass - elif ch == '\0': - self._raise("Unterminated object starting at char %d", start) + i = self.skip_whitespace(i) + if currmap.is_state_blocked(): + self.scratch.append(values_w) # can reuse next time + dict_w = self._switch_to_dict(currmap, values_w, nextindex) + return self.decode_object_dict(i, start, dict_w) else: - self._raise("Unexpected '%s' when decoding object (char %d)", - ch, i-1) + self._raise_object_error(ch, start, i - 1) - def decode_string(self, i): - start = i - bits = 0 + def _create_dict_map(self, values_w, jsonmap): + from pypy.objspace.std.jsondict import from_values_and_jsonmap + return from_values_and_jsonmap(self.space, values_w, jsonmap) + + def _devolve_jsonmap_dict(self, w_dict): + from pypy.objspace.std.jsondict import devolve_jsonmap_dict + devolve_jsonmap_dict(w_dict) + + def _get_jsonmap_from_dict(self, w_dict): + from pypy.objspace.std.jsondict import get_jsonmap_from_dict + return get_jsonmap_from_dict(w_dict) + + def _switch_to_dict(self, currmap, values_w, nextindex): + dict_w = self._create_empty_dict() + currmap.fill_dict(dict_w, values_w) + assert len(dict_w) == nextindex + return dict_w + + def decode_object_dict(self, i, start, dict_w): while True: - # this loop is a fast path for strings which do not contain escape - # characters + # parse a key: value + w_key = self.decode_key_string(i) + i = self.skip_whitespace(self.pos) + ch = self.ll_chars[i] + if ch != ':': + self._raise("No ':' found at char %d", i) + i += 1 + + w_value = self.decode_any(i) + dict_w[w_key] = w_value + i = self.skip_whitespace(self.pos) ch = self.ll_chars[i] i += 1 - bits |= ord(ch) - if ch == '"': + if ch == '}': self.pos = i - return self._create_string(start, i - 1, bits) - elif ch == '\\' or ch < '\x20': - self.pos = i-1 - return self.decode_string_escaped(start) + return self._create_dict(dict_w) + elif ch == ',': + i = self.skip_whitespace(i) + else: + self._raise_object_error(ch, start, i - 1) - def _create_string(self, start, end, bits): - if bits & 0x80: - # the 8th bit is set, it's an utf8 string - content_utf8 = self.getslice(start, end) + def decode_string_uncached(self, i): + start = i + ll_chars = self.ll_chars + nonascii, i = simd.find_end_of_string_no_hash(ll_chars, i, len(self.s)) + ch = ll_chars[i] + if ch == '\\': + self.pos = i + return self.decode_string_escaped(start, nonascii) + if ch < '\x20': + self._raise_control_char_in_string(ch, start, i) + else: + assert ch == '"' + + self.pos = i + 1 + return self._create_string_wrapped(start, i, nonascii) + + def _create_string_wrapped(self, start, end, nonascii): + content = self.getslice(start, end) + if nonascii: + # contains non-ascii chars, we need to check that it's valid utf-8 lgt = unicodehelper.check_utf8_or_raise(self.space, - content_utf8) - return self.space.newutf8(content_utf8, lgt) + content) else: - # ascii only, fast path (ascii is a strict subset of - # latin1, and we already checked that all the chars are < - # 128) - return self.space.newutf8(self.getslice(start, end), - end - start) + lgt = end - start + return self.space.newutf8(content, lgt) def _create_dict(self, d): from pypy.objspace.std.dictmultiobject import from_unicode_key_dict @@ -313,8 +450,7 @@ from pypy.objspace.std.dictmultiobject import create_empty_unicode_key_dict return create_empty_unicode_key_dict(self.space) - - def decode_string_escaped(self, start): + def decode_string_escaped(self, start, nonascii): i = self.pos builder = StringBuilder((i - start) * 2) # just an estimate assert start >= 0 @@ -325,25 +461,21 @@ i += 1 if ch == '"': content_utf8 = builder.build() - lgt = unicodehelper.check_utf8_or_raise(self.space, + length = unicodehelper.check_utf8_or_raise(self.space, content_utf8) self.pos = i - return self.space.newutf8(content_utf8, lgt) + return self.space.newutf8(content_utf8, length) elif ch == '\\': - i = self.decode_escape_sequence(i, builder) + i = self.decode_escape_sequence_to_utf8(i, builder) elif ch < '\x20': - if ch == '\0': - self._raise("Unterminated string starting at char %d", - start - 1) - else: - self._raise("Invalid control character at char %d", i-1) + self._raise_control_char_in_string(ch, start, i) else: builder.append(ch) - def decode_escape_sequence(self, i, builder): + def decode_escape_sequence_to_utf8(self, i, stringbuilder): ch = self.ll_chars[i] i += 1 - put = builder.append + put = stringbuilder.append if ch == '\\': put('\\') elif ch == '"': put('"' ) elif ch == '/': put('/' ) @@ -353,22 +485,37 @@ elif ch == 'r': put('\r') elif ch == 't': put('\t') elif ch == 'u': - return self.decode_escape_sequence_unicode(i, builder) + # may be a surrogate pair + return self.decode_escape_sequence_unicode(i, stringbuilder) else: self._raise("Invalid \\escape: %s (char %d)", ch, i-1) return i + def _get_int_val_from_hex4(self, i): + ll_chars = self.ll_chars + res = 0 + for i in range(i, i + 4): + ch = ord(ll_chars[i]) + if ord('a') <= ch <= ord('f'): + digit = ch - ord('a') + 10 + elif ord('A') <= ch <= ord('F'): + digit = ch - ord('A') + 10 + elif ord('0') <= ch <= ord('9'): + digit = ch - ord('0') + else: + raise ValueError + res = (res << 4) + digit + return res + def decode_escape_sequence_unicode(self, i, builder): # at this point we are just after the 'u' of the \u1234 sequence. start = i i += 4 - hexdigits = self.getslice(start, i) try: - val = int(hexdigits, 16) + val = self._get_int_val_from_hex4(start) if (0xd800 <= val <= 0xdbff and self.ll_chars[i] == '\\' and self.ll_chars[i+1] == 'u'): - hexdigits = self.getslice(i+2, i+6) - lowsurr = int(hexdigits, 16) + lowsurr = self._get_int_val_from_hex4(i + 2) if 0xdc00 <= lowsurr <= 0xdfff: # decode surrogate pair val = 0x10000 + (((val - 0xd800) << 10) | @@ -383,45 +530,618 @@ builder.append(utf8_ch) return i - def decode_key(self, i): - """ returns a wrapped unicode """ - from rpython.rlib.rarithmetic import intmask - i = self.skip_whitespace(i) + def decode_string(self, i, contextmap=None): + """ Decode a string at position i (which is right after the opening "). + Optionally pass a contextmap, if the value is decoded as the value of a + dict.""" + + ll_chars = self.ll_chars + start = i + ch = ll_chars[i] + if ch == '"': + self.pos = i + 1 + return self.w_empty_string # surprisingly common + + cache = True + if contextmap is not None: + # keep some statistics about the usefulness of the string cache on + # the contextmap + # the intuition about the contextmap is as follows: + # often there are string values stored in dictionaries that can + # never be usefully cached, like unique ids of objects. Then the + # strings *in those fields* of all objects should never be cached. + # However, the content of other fields can still be useful to + # cache. + contextmap.decoded_strings += 1 + if not contextmap.should_cache_strings(): + cache = False + if len(self.s) < self.MIN_SIZE_FOR_STRING_CACHE: + cache = False + + if not cache: + return self.decode_string_uncached(i) + + strhash, nonascii, i = simd.find_end_of_string(ll_chars, i, len(self.s)) + ch = ll_chars[i] + if ch == '\\': + self.pos = i + return self.decode_string_escaped(start, nonascii) + if ch < '\x20': + self._raise_control_char_in_string(ch, start, i) + else: + assert ch == '"' + + self.pos = i + 1 + + length = i - start + strhash ^= length + + # check cache first: + try: + entry = self.cache_values[strhash] + except KeyError: + w_res = self._create_string_wrapped(start, i, nonascii) + # only add *some* strings to the cache, because keeping them all is + # way too expensive. first we check if the contextmap has caching + # disabled completely. if not, we check whether we have recently + # seen the same hash already, if yes, we cache the string. + if ((contextmap is not None and + contextmap.decoded_strings < self.STRING_CACHE_EVALUATION_SIZE) or + strhash in self.lru_cache): + entry = StringCacheEntry( + self.getslice(start, start + length), w_res) + self.cache_values[strhash] = entry + else: + self.lru_cache[self.lru_index] = strhash + self.lru_index = (self.lru_index + 1) & self.LRU_MASK + return w_res + if not entry.compare(ll_chars, start, length): + # collision! hopefully rare + return self._create_string_wrapped(start, i, nonascii) + if contextmap is not None: + contextmap.cache_hits += 1 + return entry.w_uni + + def decode_key_map(self, i, currmap): + """ Given the current map currmap of an object, decode the next key at + position i. This returns the new map of the object. """ + newmap = self._decode_key_map(i, currmap) + currmap.observe_transition(newmap, self.startmap) + return newmap + + def _decode_key_map(self, i, currmap): + ll_chars = self.ll_chars + # first try to see whether we happen to find currmap.nextmap_first + nextmap = currmap.fast_path_key_parse(self, i) + if nextmap is not None: + return nextmap + + start = i + ch = ll_chars[i] + if ch != '"': + self._raise("Key name must be string at char %d", i) + i += 1 + w_key = self._decode_key_string(i) + return currmap.get_next(w_key, self.s, start, self.pos, self.startmap) + + def _decode_key_string(self, i): + """ decode key at position i as a string. Key strings are always + cached, since they repeat a lot. """ + ll_chars = self.ll_chars + start = i + + strhash, nonascii, i = simd.find_end_of_string(ll_chars, i, len(self.s)) + + ch = ll_chars[i] + if ch == '\\': + self.pos = i + w_key = self.decode_string_escaped(start, nonascii) + return w_key + if ch < '\x20': + self._raise_control_char_in_string(ch, start, i) + length = i - start + strhash ^= length + self.pos = i + 1 + # check cache first: + try: + entry = self.cache_keys[strhash] + except KeyError: + w_res = self._create_string_wrapped(start, i, nonascii) + entry = StringCacheEntry( + self.getslice(start, start + length), w_res) + self.cache_keys[strhash] = entry + return w_res + if not entry.compare(ll_chars, start, length): + # collision! hopefully rare + w_res = self._create_string_wrapped(start, i, nonascii) + else: + w_res = entry.w_uni + return w_res + + def decode_key_string(self, i): ll_chars = self.ll_chars ch = ll_chars[i] if ch != '"': self._raise("Key name must be string at char %d", i) i += 1 + return self._decode_key_string(i) - start = i - bits = 0 - strhash = ord(ll_chars[i]) << 7 - while True: - ch = ll_chars[i] + +class StringCacheEntry(object): + """ A cache entry, bundling the encoded version of a string as it appears + in the input string, and its wrapped decoded variant. """ + def __init__(self, repr, w_uni): + # repr is the escaped string + self.repr = repr + # uni is the wrapped decoded string + self.w_uni = w_uni + + def compare(self, ll_chars, start, length): + """ Check whether self.repr occurs at ll_chars[start:start+length] """ + if length != len(self.repr): + return False + index = start + for c in self.repr: + if not ll_chars[index] == c: + return False + index += 1 + return True + + +class MapBase(object): + """ A map implementation to speed up parsing of json dicts, and to + represent the resulting dicts more compactly and make access faster. """ + + # the basic problem we are trying to solve is the following: dicts in + # json can either be used as objects, or as dictionaries with arbitrary + # string keys. We want to use maps for the former, but not for the + # latter. But we don't know in advance which kind of dict is which. + + # Therefore we create "preliminary" maps where we aren't quite sure yet + # whether they are really useful maps or not. If we see them used often + # enough, we promote them to "useful" maps, which we will actually + # instantiate objects with. + + # If we determine that a map is not used often enough, we can turn it + # into a "blocked" map, which is a point in the map tree where we will + # switch to regular dicts, when we reach that part of the tree. + + # One added complication: We want to keep the number of preliminary maps + # bounded to prevent generating tons of useless maps. but also not too + # small, to support having a json file that contains many uniform objects + # with tons of keys. That's where the idea of "fringe" maps comes into + # play. They are maps that sit between known useful nodes and preliminary + # nodes in the map transition tree. We bound only the number of fringe + # nodes we are considering (to MAX_FRINGE), but not the number of + # preliminary maps. When we have too many fringe maps, we remove the least + # commonly instantiated fringe map and mark it as blocked. + + # allowed graph edges or nodes in nextmap_all: + # USEFUL ------- + # / \ \ + # v v v + # FRINGE USEFUL BLOCKED + # | + # v + # PRELIMINARY + # | + # v + # PRELIMINARY + + # state transitions: + # PRELIMINARY + # / | \ + # | v v + # | FRINGE -> USEFUL + # | | + # \ | + # v v + # BLOCKED + + # the nextmap_first edge can only be these graph edges: + # USEFUL + # | + # v + # USEFUL + # + # FRINGE + # | + # v + # PRELIMINARY + # | + # v + # PRELIMINARY + + USEFUL = 'u' + PRELIMINARY = 'p' + FRINGE = 'f' # buffer between PRELIMINARY and USEFUL + BLOCKED = 'b' + + # tunable parameters + MAX_FRINGE = 40 + USEFUL_THRESHOLD = 5 + + def __init__(self, space): + self.space = space + + # a single transition is stored in .nextmap_first + self.nextmap_first = None + + # nextmap_all is only initialized after seeing the *second* transition + # but then it also contains .nextmap_first + self.nextmap_all = None # later dict {key: nextmap} + + # keep some statistics about every map: how often it was instantiated + # and how many non-blocked leaves the map transition tree has, starting + # from self + self.instantiation_count = 0 + self.number_of_leaves = 1 + + def _check_invariants(self): + if self.nextmap_all: + for next in self.nextmap_all.itervalues(): + next._check_invariants() + elif self.nextmap_first: + self.nextmap_first._check_invariants() + + def get_next(self, w_key, string, start, stop, terminator): + from pypy.objspace.std.dictmultiobject import unicode_hash, unicode_eq + if isinstance(self, JSONMap): + assert not self.state == MapBase.BLOCKED + nextmap_first = self.nextmap_first + if (nextmap_first is not None and + nextmap_first.w_key.eq_w(w_key)): + return nextmap_first + + assert stop >= 0 + assert start >= 0 + + if nextmap_first is None: + # first transition ever seen, don't initialize nextmap_all + next = self._make_next_map(w_key, string[start:stop]) + self.nextmap_first = next + else: + if self.nextmap_all is None: + # 2nd transition ever seen + self.nextmap_all = objectmodel.r_dict(unicode_eq, unicode_hash, + force_non_null=True, simple_hash_eq=True) + self.nextmap_all[nextmap_first.w_key] = nextmap_first + else: + next = self.nextmap_all.get(w_key, None) + if next is not None: + return next + # if we are at this point we didn't find the transition yet, so + # create a new one + next = self._make_next_map(w_key, string[start:stop]) + self.nextmap_all[w_key] = next + + # one new leaf has been created + self.change_number_of_leaves(1) + + terminator.register_potential_fringe(next) + return next + + def change_number_of_leaves(self, difference): + """ add difference to .number_of_leaves of self and its parents """ + if not difference: + return + parent = self + while isinstance(parent, JSONMap): + parent.number_of_leaves += difference + parent = parent.prev + parent.number_of_leaves += difference # terminator + + def fast_path_key_parse(self, decoder, position): + """ Fast path when parsing the next key: We speculate that we will + always see a commonly seen next key, and use strcmp (implemented in + key_repr_cmp) to check whether that is the case. """ + nextmap_first = self.nextmap_first + if nextmap_first: + ll_chars = decoder.ll_chars + assert isinstance(nextmap_first, JSONMap) + if nextmap_first.key_repr_cmp(ll_chars, position): + decoder.pos = position + len(nextmap_first.key_repr) + return nextmap_first + return None + + def observe_transition(self, newmap, terminator): + """ observe a transition from self to newmap. + This does a few things, including updating the self size estimate with + the knowledge that one object transitioned from self to newmap. + also it potentially decides that self should move to state USEFUL.""" + newmap.instantiation_count += 1 + if isinstance(self, JSONMap) and self.state == MapBase.FRINGE: + if self.is_useful(): + self.mark_useful(terminator) + + def _make_next_map(self, w_key, key_repr): + return JSONMap(self.space, self, w_key, key_repr) + + def fill_dict(self, dict_w, values_w): + """ recursively fill the dictionary dict_w in the correct order, + reading from values_w.""" + raise NotImplementedError("abstract base") + + def _all_dot(self, output): + identity = objectmodel.compute_unique_id(self) + output.append('%s [shape=box%s];' % (identity, self._get_dot_text())) + if self.nextmap_all: + for w_key, value in self.nextmap_all.items(): + assert isinstance(value, JSONMap) + if value is self.nextmap_first: + color = ", color=blue" + else: + color = "" + output.append('%s -> %s [label="%s"%s];' % ( + identity, objectmodel.compute_unique_id(value), value.w_key._utf8, color)) + value._all_dot(output) + elif self.nextmap_first is not None: + value = self.nextmap_first + output.append('%s -> %s [label="%s", color=blue];' % ( + identity, objectmodel.compute_unique_id(value), value.w_key._utf8)) + value._all_dot(output) + + + def _get_dot_text(self): + return ", label=base" + + def view(self): + from dotviewer import graphclient + import pytest + r = ["digraph G {"] + self._all_dot(r) + r.append("}") + p = pytest.ensuretemp("jsonmap").join("temp.dot") + p.write("\n".join(r)) + graphclient.display_dot_file(str(p)) + + +class Terminator(MapBase): + """ The root node of the map transition tree. """ + def __init__(self, space): + MapBase.__init__(self, space) + # a set of all map nodes that are currently in the FRINGE state + self.current_fringe = {} + + def register_potential_fringe(self, prelim): + """ add prelim to the fringe, if its prev is either a Terminator or + useful. """ + prev = prelim.prev + if (isinstance(prev, Terminator) or + isinstance(prev, JSONMap) and prev.state == MapBase.USEFUL): + assert prelim.state == MapBase.PRELIMINARY + prelim.state = MapBase.FRINGE + + if len(self.current_fringe) > MapBase.MAX_FRINGE: + self.cleanup_fringe() + self.current_fringe[prelim] = None + + def remove_from_fringe(self, former_fringe): + """ Remove former_fringe from self.current_fringe. """ + assert former_fringe.state in (MapBase.USEFUL, MapBase.BLOCKED) + del self.current_fringe[former_fringe] + + def cleanup_fringe(self): + """ remove the least-instantiated fringe map and block it.""" + min_fringe = None + min_avg = 1e200 + for f in self.current_fringe: + assert f.state == MapBase.FRINGE + avg = f.average_instantiation() + if avg < min_avg: + min_avg = avg + min_fringe = f + assert min_fringe + min_fringe.mark_blocked(self) + + def fill_dict(self, dict_w, values_w): + """ recursively fill the dictionary dict_w in the correct order, + reading from values_w.""" + return 0 + + def _check_invariants(self): + for fringe in self.current_fringe: + assert fringe.state == MapBase.FRINGE + +class JSONMap(MapBase): + """ A map implementation to speed up parsing """ + + def __init__(self, space, prev, w_key, key_repr): + MapBase.__init__(self, space) + + self.prev = prev + self.w_key = w_key + self.key_repr = key_repr + + self.state = MapBase.PRELIMINARY + + # key decoding stats + self.decoded_strings = 0 + self.cache_hits = 0 + + # for jsondict support + self.key_to_index = None + self.keys_in_order = None + self.strategy_instance = None + + def __repr__(self): + return "<JSONMap key_repr=%s #instantiation=%s #leaves=%s prev=%r>" % ( + self.key_repr, self.instantiation_count, self.number_of_leaves, self.prev) + + def _get_terminator(self): # only for _check_invariants + while isinstance(self, JSONMap): + self = self.prev + assert isinstance(self, Terminator) + return self + + def _check_invariants(self): + assert self.state in ( + MapBase.USEFUL, + MapBase.PRELIMINARY, + MapBase.FRINGE, + MapBase.BLOCKED, + ) + + prev = self.prev + if isinstance(prev, JSONMap): + prevstate = prev.state + else: + prevstate = MapBase.USEFUL + + if prevstate == MapBase.USEFUL: + assert self.state != MapBase.PRELIMINARY + elif prevstate == MapBase.PRELIMINARY: + assert self.state == MapBase.PRELIMINARY + elif prevstate == MapBase.FRINGE: + assert self.state == MapBase.PRELIMINARY + else: + # if prevstate is BLOCKED, we shouldn't have recursed here! + assert False, "should be unreachable" + + if self.state == MapBase.BLOCKED: + assert self.nextmap_first is None + assert self.nextmap_all is None + elif self.state == MapBase.FRINGE: + assert self in self._get_terminator().current_fringe + + MapBase._check_invariants(self) + + def mark_useful(self, terminator): + """ mark self as useful, and also the most commonly instantiated + children, recursively """ + was_fringe = self.state == MapBase.FRINGE + assert self.state in (MapBase.FRINGE, MapBase.PRELIMINARY) + self.state = MapBase.USEFUL + if was_fringe: + terminator.remove_from_fringe(self) + # find the most commonly instantiated child, store it into + # nextmap_first and mark it useful, recursively + maxchild = self.nextmap_first + if self.nextmap_all is not None: + for child in self.nextmap_all.itervalues(): + if child.instantiation_count > maxchild.instantiation_count: + maxchild = child + if maxchild is not None: + maxchild.mark_useful(terminator) + if self.nextmap_all: + for child in self.nextmap_all.itervalues(): + if child is not maxchild: + terminator.register_potential_fringe(child) + self.nextmap_first = maxchild + + def mark_blocked(self, terminator): + """ mark self and recursively all its children as blocked.""" + was_fringe = self.state == MapBase.FRINGE + self.state = MapBase.BLOCKED + if was_fringe: + terminator.remove_from_fringe(self) + if self.nextmap_all: + for next in self.nextmap_all.itervalues(): + next.mark_blocked(terminator) + elif self.nextmap_first: + self.nextmap_first.mark_blocked(terminator) + self.nextmap_first = None + self.nextmap_all = None + self.change_number_of_leaves(-self.number_of_leaves + 1) + + def is_state_blocked(self): + return self.state == MapBase.BLOCKED + + def is_state_useful(self): + return self.state == MapBase.USEFUL + + def average_instantiation(self): + """ the number of instantiations, divided by the number of leaves. We + want to favor nodes that have either a high instantiation count, or few + leaves below it. """ + return self.instantiation_count / float(self.number_of_leaves) + + def is_useful(self): + return self.average_instantiation() > self.USEFUL_THRESHOLD + + def should_cache_strings(self): + """ return whether strings parsed in the context of this map should be + cached. """ + # we should cache if either we've seen few strings so far (less than + # STRING_CACHE_EVALUATION_SIZE), or if we've seen many, and the cache + # hit rate has been high enough + return not (self.decoded_strings > JSONDecoder.STRING_CACHE_EVALUATION_SIZE and + self.cache_hits * JSONDecoder.STRING_CACHE_USEFULNESS_FACTOR < self.decoded_strings) + + def key_repr_cmp(self, ll_chars, i): + # XXX should we use "real" memcmp (here in particular, and in other + # places in RPython in general)? + for j, c in enumerate(self.key_repr): + if ll_chars[i] != c: + return False i += 1 - if ch == '"': - break - elif ch == '\\' or ch < '\x20': - self.pos = i-1 - return self.decode_string_escaped(start) - strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i])) - bits |= ord(ch) - length = i - start - 1 - if length == 0: - strhash = -1 + return True + + def fill_dict(self, dict_w, values_w): + index = self.prev.fill_dict(dict_w, values_w) + dict_w[self.w_key] = values_w[index] + return index + 1 + + # _____________________________________________________ + # methods for JsonDictStrategy + + @jit.elidable + def get_index(self, w_key): + from pypy.objspace.std.unicodeobject import W_UnicodeObject + assert isinstance(w_key, W_UnicodeObject) + return self.get_key_to_index().get(w_key, -1) + + def get_key_to_index(self): + from pypy.objspace.std.dictmultiobject import unicode_hash, unicode_eq + key_to_index = self.key_to_index + if key_to_index is None: + key_to_index = self.key_to_index = objectmodel.r_dict(unicode_eq, unicode_hash, + force_non_null=True, simple_hash_eq=True) + # compute depth + curr = self + depth = 0 + while True: + depth += 1 + curr = curr.prev + if not isinstance(curr, JSONMap): + break + + curr = self + while depth: + depth -= 1 + key_to_index[curr.w_key] = depth + curr = curr.prev + if not isinstance(curr, JSONMap): + break + return key_to_index + + def get_keys_in_order(self): + keys_in_order = self.keys_in_order + if keys_in_order is None: + key_to_index = self.get_key_to_index() + keys_in_order = self.keys_in_order = [None] * len(key_to_index) + for w_key, index in key_to_index.iteritems(): + keys_in_order[index] = w_key + return keys_in_order + + # _____________________________________________________ + + def _get_dot_text(self): + if self.nextmap_all is None: + l = int(self.nextmap_first is not None) else: - strhash ^= length - strhash = intmask(strhash) - self.pos = i - # check cache first: - key = (ll_chars, start, length, strhash) - try: - return self.cache[key] - except KeyError: - pass - res = self._create_string(start, i - 1, bits) - self.cache[key] = res + l = len(self.nextmap_all) + extra = "" + if self.decoded_strings: + extra = "\\n%s/%s (%s%%)" % (self.cache_hits, self.decoded_strings, self.cache_hits/float(self.decoded_strings)) + res = ', label="#%s\\nchildren: %s%s"' % (self.instantiation_count, l, extra) + if self.state == MapBase.BLOCKED: + res += ", fillcolor=lightsalmon" + if self.state == MapBase.FRINGE: + res += ", fillcolor=lightgray" + if self.state == MapBase.PRELIMINARY: + res += ", fillcolor=lightslategray" return res @@ -442,3 +1162,4 @@ return w_res finally: decoder.close() + diff --git a/pypy/module/_pypyjson/simd.py b/pypy/module/_pypyjson/simd.py new file mode 100644 --- /dev/null +++ b/pypy/module/_pypyjson/simd.py @@ -0,0 +1,225 @@ +from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rlib import objectmodel, unroll +from rpython.rlib.rarithmetic import r_uint, intmask, LONG_BIT +from rpython.jit.backend.detect_cpu import autodetect + +# accelerators for string operations using simd on regular word sizes (*not* +# SSE instructions). this style is sometimes called SWAR (SIMD Within A +# Register) or "broadword techniques" + +# XXX remove wordsize and endianness restrictions properly, so far only x86-64 +# is tested + +USE_SIMD = False +if LONG_BIT == 64: + WORD_SIZE = 8 + EVERY_BYTE_ONE = 0x0101010101010101 + EVERY_BYTE_HIGHEST_BIT = 0x8080808080808080 + if autodetect() == "x86-64": + USE_SIMD = True +else: + WORD_SIZE = 4 + EVERY_BYTE_ONE = 0x01010101 + EVERY_BYTE_HIGHEST_BIT = 0x80808080 + + +# helpers + +unrolling_wordsize = unroll.unrolling_iterable(range(WORD_SIZE)) + +def char_repeated_word_width(ch): + return r_uint(EVERY_BYTE_ONE) * ord(ch) + +def any_char_zero(word): + return (word - r_uint(EVERY_BYTE_ONE)) & ~word & r_uint(EVERY_BYTE_HIGHEST_BIT) + +def any_char_in_words_zero(*words): + return _any_char_in_any_word_zero_accum(0, *words) + +def _any_char_in_any_word_zero_accum(accum, word, *words): + accum |= (word - r_uint(EVERY_BYTE_ONE)) & ~word + if not words: + return accum & r_uint(EVERY_BYTE_HIGHEST_BIT) + return _any_char_in_any_word_zero_accum(accum, *words) + +def print_chars(word): + # for debugging + out = '' + for i in range(WORD_SIZE): + out += chr(word & 0xff) + word >>= 8 + return out + +def index_nonzero(word): + # XXX can be done very cheap in theory + assert word + for i in unrolling_wordsize: + if word & 0xff: + return i + word >>= 8 + assert 0 + +def index_zero(word): + # XXX can be done very cheap in theory + assert any_char_zero(word) + for i in unrolling_wordsize: + if not word & 0xff: + return i + word >>= 8 + assert 0 # XXX ??? + +def set_high_bytes_to_zero(word, keep_bytes): + mask = ((~r_uint(0)) << (8 * keep_bytes)) + return word & ~mask + + + +@objectmodel.always_inline +def position_string_ender(word): + maskquote = char_repeated_word_width('"') + maskbackslash = char_repeated_word_width('\\') + maskx20 = char_repeated_word_width(chr(0xff - 0x1f)) + # x1 and x2 check for equality, if a byte is 0 the corresponding + # char is equal to " or \ + x1 = maskquote ^ word + x2 = maskbackslash ^ word + # x3 checks for char < 0x20, the byte is 0 in that case + x3 = maskx20 & word + return any_char_in_words_zero(x1, x2, x3) + +@objectmodel.always_inline +def find_end_of_string_simd_unaligned(ll_chars, startpos, length): + ch = ll_chars[startpos] + strhash = (ord(ch) << 7) ^ 0x345678 + + wordarray = rffi.cast(rffi.ULONGP, rffi.ptradd(ll_chars, startpos)) + num_safe_reads = (length - startpos) // WORD_SIZE + + bits = 0 + for i in range(num_safe_reads): + word = wordarray[i] + cond = position_string_ender(word) + if cond: + break + bits |= word + strhash = intmask((1000003 * strhash) ^ intmask(word)) + else: + # didn't find end of string yet, look at remaining chars + word = 0 + shift = 0 + i = startpos + num_safe_reads * WORD_SIZE + while True: # this loop should run at most WORD_SIZE times, + # if we assume that ll_chars[length] == '\x00' + ch = ll_chars[i] + if ch == '"' or ch == '\\' or ch < '\x20': + break + i += 1 + bits |= ord(ch) + word |= ord(ch) << shift + shift += 8 + if shift: + strhash = intmask((1000003 * strhash) ^ intmask(word)) + + nonascii = bool(bits & char_repeated_word_width(chr(0x80))) + return strhash, nonascii, i + + # compute endposition + nonzero = index_nonzero(cond) + endposition = startpos + i * WORD_SIZE + nonzero + if nonzero: + word = set_high_bytes_to_zero(word, nonzero) + bits |= word + strhash = intmask((1000003 * strhash) ^ intmask(word)) + + nonascii = bool(bits & char_repeated_word_width(chr(0x80))) + + return strhash, nonascii, endposition + +@objectmodel.always_inline +def find_end_of_string_simd_unaligned_no_hash(ll_chars, startpos, length): + ch = ll_chars[startpos] + + wordarray = rffi.cast(rffi.ULONGP, rffi.ptradd(ll_chars, startpos)) + num_safe_reads = (length - startpos) // WORD_SIZE + + bits = 0 + for i in range(num_safe_reads): + word = wordarray[i] + cond = position_string_ender(word) + if cond: + break + bits |= word + else: + # didn't find end of string yet, look at remaining chars + i = startpos + num_safe_reads * WORD_SIZE + while True: # this loop should run at most WORD_SIZE times, + # if we assume that ll_chars[length] == '\x00' + ch = ll_chars[i] + if ch == '"' or ch == '\\' or ch < '\x20': + break + i += 1 + bits |= ord(ch) + + nonascii = bool(bits & char_repeated_word_width(chr(0x80))) + return nonascii, i + + # compute endposition + nonzero = index_nonzero(cond) + endposition = startpos + i * WORD_SIZE + nonzero + if nonzero: + word = set_high_bytes_to_zero(word, nonzero) + bits |= word + + nonascii = bool(bits & char_repeated_word_width(chr(0x80))) + + return nonascii, endposition + + +@objectmodel.always_inline +def find_end_of_string_slow(ll_chars, i, length): + ch = ll_chars[i] + strhash = (ord(ch) << 7) ^ 0x345678 + word = 0 + shift = 0 + + bits = 0 + + while True: + # this loop is a fast path for strings which do not contain escape + # characters + ch = ll_chars[i] + if ch == '"' or ch == '\\' or ch < '\x20': + break + i += 1 + bits |= ord(ch) + + word |= ord(ch) << shift + shift += 8 + if shift == WORD_SIZE * 8: + strhash = intmask((1000003 * strhash) ^ word) + shift = 0 + word = 0 + + if shift: + strhash = intmask((1000003 * strhash) ^ word) + return strhash, bool(bits & 0x80), i + +@objectmodel.always_inline +def find_end_of_string_slow_no_hash(ll_chars, i, length): + bits = 0 + while True: + # this loop is a fast path for strings which do not contain escape + # characters + ch = ll_chars[i] + if ch == '"' or ch == '\\' or ch < '\x20': + break + i += 1 + bits |= ord(ch) + return bool(bits & 0x80), i + +if USE_SIMD: + find_end_of_string = find_end_of_string_simd_unaligned + find_end_of_string_no_hash = find_end_of_string_simd_unaligned_no_hash +else: + find_end_of_string = find_end_of_string_slow + find_end_of_string_no_hash = find_end_of_string_slow_no_hash diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py --- a/pypy/module/_pypyjson/test/test__pypyjson.py +++ b/pypy/module/_pypyjson/test/test__pypyjson.py @@ -1,31 +1,266 @@ # -*- encoding: utf-8 -*- -from pypy.module._pypyjson.interp_decoder import JSONDecoder +import pytest +from pypy.module._pypyjson.interp_decoder import JSONDecoder, Terminator, MapBase +from rpython.rtyper.lltypesystem import lltype, rffi -def test_skip_whitespace(): - s = ' hello ' - dec = JSONDecoder('fake space', s) - assert dec.pos == 0 - assert dec.skip_whitespace(0) == 3 - assert dec.skip_whitespace(3) == 3 - assert dec.skip_whitespace(8) == len(s) - dec.close() -class FakeSpace(object): - def newutf8(self, s, l): - return s +class TestJson(object): + def test_skip_whitespace(self): + s = ' hello ' + dec = JSONDecoder(self.space, s) + assert dec.pos == 0 + assert dec.skip_whitespace(0) == 3 + assert dec.skip_whitespace(3) == 3 + assert dec.skip_whitespace(8) == len(s) + dec.close() -def test_decode_key(): - s1 = "123" * 100 - s = ' "%s" "%s" ' % (s1, s1) - dec = JSONDecoder(FakeSpace(), s) - assert dec.pos == 0 - x = dec.decode_key(0) - assert x == s1 - # check caching - y = dec.decode_key(dec.pos) - assert y == s1 - assert y is x - dec.close() + def test_json_map(self): + m = Terminator(self.space) + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_c = self.space.newutf8("c", 1) + m1 = m.get_next(w_a, '"a"', 0, 3, m) + assert m1.w_key == w_a + assert m1.nextmap_first is None + assert m1.key_repr == '"a"' + assert m1.key_repr_cmp('"a": 123', 0) + assert not m1.key_repr_cmp('b": 123', 0) + assert m.nextmap_first.w_key == w_a + + m2 = m.get_next(w_a, '"a"', 0, 3, m) + assert m2 is m1 + + m3 = m.get_next(w_b, '"b"', 0, 3, m) + assert m3.w_key == w_b + assert m3.nextmap_first is None + assert m3.key_repr == '"b"' + assert m.nextmap_first is m1 + + m4 = m3.get_next(w_c, '"c"', 0, 3, m) + assert m4.w_key == w_c + assert m4.nextmap_first is None + assert m4.key_repr == '"c"' + assert m3.nextmap_first is m4 + + def test_json_map_get_index(self): + m = Terminator(self.space) + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_c = self.space.newutf8("c", 1) + m1 = m.get_next(w_a, 'a"', 0, 2, m) + assert m1.get_index(w_a) == 0 + assert m1.get_index(w_b) == -1 + + m2 = m.get_next(w_b, 'b"', 0, 2, m) + assert m2.get_index(w_b) == 0 + assert m2.get_index(w_a) == -1 + + m3 = m2.get_next(w_c, 'c"', 0, 2, m) + assert m3.get_index(w_b) == 0 + assert m3.get_index(w_c) == 1 + assert m3.get_index(w_a) == -1 + + def test_jsonmap_fill_dict(self): + from collections import OrderedDict + m = Terminator(self.space) + space = self.space + w_a = space.newutf8("a", 1) + w_b = space.newutf8("b", 1) + w_c = space.newutf8("c", 1) + m1 = m.get_next(w_a, 'a"', 0, 2, m) + m2 = m1.get_next(w_b, 'b"', 0, 2, m) + m3 = m2.get_next(w_c, 'c"', 0, 2, m) + d = OrderedDict() + m3.fill_dict(d, [space.w_None, space.w_None, space.w_None]) + assert list(d) == [w_a, w_b, w_c] + + + def test_decode_key_map(self): + m = Terminator(self.space) + m_diff = Terminator(self.space) + for s1 in ["abc", "1001" * 10, u"ä".encode("utf-8")]: + s = ' "%s" "%s" "%s"' % (s1, s1, s1) + dec = JSONDecoder(self.space, s) + assert dec.pos == 0 + m1 = dec.decode_key_map(dec.skip_whitespace(0), m) + assert m1.w_key._utf8 == s1 + assert m1.key_repr == '"%s"' % s1 + + # check caching on w_key level + m2 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff) + assert m1.w_key is m2.w_key + + # check caching on map level + m3 = dec.decode_key_map(dec.skip_whitespace(dec.pos), m_diff) + assert m3 is m2 + dec.close() + + def test_decode_string_caching(self): + for s1 in ["abc", u"ä".encode("utf-8")]: + s = '"%s" "%s" "%s"' % (s1, s1, s1) + dec = JSONDecoder(self.space, s) + dec.MIN_SIZE_FOR_STRING_CACHE = 0 + assert dec.pos == 0 + w_x = dec.decode_string(1) + w_y = dec.decode_string(dec.skip_whitespace(dec.pos) + 1) + assert w_x is not w_y + # check caching + w_z = dec.decode_string(dec.skip_whitespace(dec.pos) + 1) + assert w_z is w_y + dec.close() + + def _make_some_maps(self): + # base -> m1 -> m2 -> m3 + # \-> m4 + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_c = self.space.newutf8("c", 1) + w_d = self.space.newutf8("d", 1) + base = Terminator(self.space) + base.instantiation_count = 6 + m1 = base.get_next(w_a, 'a"', 0, 2, base) + m2 = m1.get_next(w_b, 'b"', 0, 2, base) + m3 = m2.get_next(w_c, 'c"', 0, 2, base) + m4 = m2.get_next(w_d, 'd"', 0, 2, base) + return base, m1, m2, m3, m4 + + # unit tests for map state transistions + def test_fringe_to_useful(self): + base, m1, m2, m3, m4 = self._make_some_maps() + base.instantiation_count = 6 + assert m1.state == MapBase.FRINGE + m1.instantiation_count = 6 + + assert m2.state == MapBase.PRELIMINARY + m2.instantiation_count = 6 + + assert m3.state == MapBase.PRELIMINARY + m3.instantiation_count = 2 + assert m2.nextmap_first is m3 + + assert m4.state == MapBase.PRELIMINARY + m4.instantiation_count = 4 + + m1.mark_useful(base) + assert m1.state == MapBase.USEFUL + assert m2.state == MapBase.USEFUL + assert m3.state == MapBase.FRINGE + assert m4.state == MapBase.USEFUL + assert m2.nextmap_first is m4 + + assert m1.number_of_leaves == 2 + base._check_invariants() + + def test_number_of_leaves(self): + w_x = self.space.newutf8("x", 1) + base, m1, m2, m3, m4 = self._make_some_maps() + assert base.number_of_leaves == 2 + assert m1.number_of_leaves == 2 + assert m2.number_of_leaves == 2 + assert m3.number_of_leaves == 1 + assert m4.number_of_leaves == 1 + m5 = m2.get_next(w_x, 'x"', 0, 2, base) + assert base.number_of_leaves == 3 + assert m1.number_of_leaves == 3 + assert m2.number_of_leaves == 3 + assert m5.number_of_leaves == 1 + + def test_number_of_leaves_after_mark_blocked(self): + w_x = self.space.newutf8("x", 1) + base, m1, m2, m3, m4 = self._make_some_maps() + m5 = m2.get_next(w_x, 'x"', 0, 2, base) + assert base.number_of_leaves == 3 + m2.mark_blocked(base) + assert base.number_of_leaves == 1 + + def test_mark_useful_cleans_fringe(self): + base, m1, m2, m3, m4 = self._make_some_maps() + base.instantiation_count = 6 + assert m1.state == MapBase.FRINGE + m1.instantiation_count = 6 + m2.instantiation_count = 6 + m3.instantiation_count = 2 + m4.instantiation_count = 4 + assert base.current_fringe == {m1: None} + + m1.mark_useful(base) + assert base.current_fringe == {m3: None} + + def test_cleanup_fringe(self): + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_c = self.space.newutf8("c", 1) + w_d = self.space.newutf8("d", 1) + base = Terminator(self.space) + base.instantiation_count = 6 + m1 = base.get_next(w_a, 'a"', 0, 2, base) + m2 = base.get_next(w_b, 'b"', 0, 2, base) + m3 = base.get_next(w_c, 'c"', 0, 2, base) + m4 = base.get_next(w_d, 'd"', 0, 2, base) + m5 = m4.get_next(w_a, 'a"', 0, 2, base) + base.instantiation_count = 7 + m1.instantiation_count = 2 + m2.instantiation_count = 2 + m3.instantiation_count = 2 + m4.instantiation_count = 1 + m5.instantiation_count = 1 + assert base.current_fringe == dict.fromkeys([m1, m2, m3, m4]) + + base.cleanup_fringe() + assert base.current_fringe == dict.fromkeys([m1, m2, m3]) + assert m4.state == MapBase.BLOCKED + assert m4.nextmap_first is None + assert m4.nextmap_all is None + assert m5.state == MapBase.BLOCKED + assert m5.nextmap_first is None + assert m5.nextmap_all is None + + def test_deal_with_blocked(self): + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_c = self.space.newutf8("c", 1) + space = self.space + s = '{"a": 1, "b": 2, "c": 3}' + dec = JSONDecoder(space, s) + dec.startmap = base = Terminator(space) + m1 = base.get_next(w_a, 'a"', 0, 2, base) + m2 = m1.get_next(w_b, 'b"', 0, 2, base) + m2.mark_blocked(base) + w_res = dec.decode_object(1) + assert space.int_w(space.len(w_res)) == 3 + assert space.int_w(space.getitem(w_res, w_a)) == 1 + assert space.int_w(space.getitem(w_res, w_b)) == 2 + assert space.int_w(space.getitem(w_res, w_c)) == 3 + dec.close() + + def test_deal_with_blocked_number_of_leaves(self): + w_a = self.space.newutf8("a", 1) + w_b = self.space.newutf8("b", 1) + w_x = self.space.newutf8("x", 1) + w_u = self.space.newutf8("u", 1) + space = self.space + base = Terminator(space) + m1 = base.get_next(w_a, 'a"', 0, 2, base) + m2 = m1.get_next(w_b, 'b"', 0, 2, base) + m2.get_next(w_x, 'x"', 0, 2, base) + m2.get_next(w_u, 'u"', 0, 2, base) + assert base.number_of_leaves == 2 + m2.mark_blocked(base) + assert base.number_of_leaves == 1 + + def test_instatiation_count(self): + m = Terminator(self.space) + dec = JSONDecoder(self.space, '"abc" "def"') + m1 = dec.decode_key_map(dec.skip_whitespace(0), m) + m2 = dec.decode_key_map(dec.skip_whitespace(6), m1) + m1 = dec.decode_key_map(dec.skip_whitespace(0), m) + m2 = dec.decode_key_map(dec.skip_whitespace(6), m1) + m1 = dec.decode_key_map(dec.skip_whitespace(0), m) + + assert m1.instantiation_count == 3 + assert m2.instantiation_count == 2 + dec.close() + class AppTest(object): spaceconfig = {"objspace.usemodules._pypyjson": True} @@ -55,7 +290,7 @@ raises(ValueError, _pypyjson.loads, 'fa') raises(ValueError, _pypyjson.loads, 'f') raises(ValueError, _pypyjson.loads, 'falXX') - + def test_decode_string(self): import _pypyjson @@ -85,7 +320,7 @@ import _pypyjson assert _pypyjson.loads(r'"\\"') == u'\\' assert _pypyjson.loads(r'"\""') == u'"' - assert _pypyjson.loads(r'"\/"') == u'/' + assert _pypyjson.loads(r'"\/"') == u'/' assert _pypyjson.loads(r'"\b"') == u'\b' assert _pypyjson.loads(r'"\f"') == u'\f' assert _pypyjson.loads(r'"\n"') == u'\n' @@ -101,12 +336,19 @@ import _pypyjson s = r'"hello\nworld' # missing the trailing " raises(ValueError, "_pypyjson.loads(s)") - + def test_escape_sequence_unicode(self): import _pypyjson s = r'"\u1234"' assert _pypyjson.loads(s) == u'\u1234' + def test_escape_sequence_mixed_with_utf8(self): + import _pypyjson + utf8 = u'ä"'.encode("utf-8") + assert _pypyjson.loads(r'"abc\\' + utf8) == u'abc\\ä' + assert _pypyjson.loads(r'"abc\"' + utf8) == u'abc"ä' + assert _pypyjson.loads(r'"def\u1234' + utf8) == u'def\u1234ä' + def test_invalid_utf_8(self): import _pypyjson s = '"\xe0"' # this is an invalid UTF8 sequence inside a string @@ -176,13 +418,18 @@ s = '{"hello": "world", "aaa": "bbb"}' assert _pypyjson.loads(s) == {'hello': 'world', 'aaa': 'bbb'} + assert _pypyjson.loads(s) == {'hello': 'world', + 'aaa': 'bbb'} raises(ValueError, _pypyjson.loads, '{"key"') raises(ValueError, _pypyjson.loads, '{"key": 42') + assert _pypyjson.loads('{"neighborhood": ""}') == { + "neighborhood": ""} + def test_decode_object_nonstring_key(self): import _pypyjson raises(ValueError, "_pypyjson.loads('{42: 43}')") - + def test_decode_array(self): import _pypyjson assert _pypyjson.loads('[]') == [] @@ -263,3 +510,4 @@ for inputtext, errmsg in test_cases: exc = raises(ValueError, _pypyjson.loads, inputtext) assert str(exc.value) == errmsg + diff --git a/pypy/module/_pypyjson/test/test_simd.py b/pypy/module/_pypyjson/test/test_simd.py new file mode 100644 --- /dev/null +++ b/pypy/module/_pypyjson/test/test_simd.py @@ -0,0 +1,110 @@ +import sys +import pytest +from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rlib.rarithmetic import r_uint, intmask + +from pypy.module._pypyjson.simd import USE_SIMD +from pypy.module._pypyjson.simd import find_end_of_string_slow +from pypy.module._pypyjson.simd import find_end_of_string_slow_no_hash +from pypy.module._pypyjson.simd import print_chars +from pypy.module._pypyjson.simd import find_end_of_string_simd_unaligned, WORD_SIZE +from pypy.module._pypyjson.simd import find_end_of_string_simd_unaligned_no_hash + +try: + from hypothesis import example, given, strategies +except ImportError: + pytest.skip("missing hypothesis!") + +if not USE_SIMD: + pytest.skip("only implemented for 64 bit for now") + +def fill_to_word_size(res, ch=" "): + if len(res) % WORD_SIZE != 0: + res += ch * (WORD_SIZE - (len(res) % WORD_SIZE)) + return res + +def string_to_word(s): + assert len(s) == WORD_SIZE + ll_chars, llobj, flag = rffi.get_nonmovingbuffer_ll_final_null(s) + try: + wordarray = rffi.cast(rffi.ULONGP, ll_chars) + return wordarray[0] + finally: + rffi.free_nonmovingbuffer_ll(ll_chars, llobj, flag) + +def ll(callable, string, *args): + ll_chars, llobj, flag = rffi.get_nonmovingbuffer_ll_final_null(string) + try: + return callable(ll_chars, *args) + finally: + rffi.free_nonmovingbuffer_ll(ll_chars, llobj, flag) + +word = strategies.builds( + r_uint, strategies.integers(min_value=-sys.maxint-1, max_value=sys.maxint)) + +def build_string(prefix, content, end, suffix): + res = prefix + '"' + "".join([chr(x) for x in content]) + end + suffix + return fill_to_word_size(res), len(prefix) + 1 + +string_in_context_strategy = strategies.builds( + build_string, prefix=strategies.binary(), + content=strategies.lists(strategies.integers(1, 255), min_size=1), + end=strategies.sampled_from('"\\\x00\x01'), + suffix=strategies.binary()) + +def compare(string, res1, res2): + hash1, nonascii1, endindex1 = res1 + hash2, nonascii2, endindex2 = res2 + assert endindex1 == endindex2 + if string[endindex1 - 1] == '"': + assert hash1 == hash2 + assert nonascii1 == nonascii2 + + +@example(('" \x80" ', 1)) +@example(('"\x01" ', 1)) +@example(('"aaaaaaaa"\x00\x00\x00\x00\x00\x00\x00 ', 1)) +@example(('"aaaaaaaa" ', 1)) +@example(('"12"', 1)) +@example(('"1234567abcdefghAB"', 1)) +@example(('"1234567abcdefgh"', 1)) +@example((' "123456ABCDEF" \x00', 2)) +@example((' "123456aaaaaaaaABCDEF"\x00', 2)) +@given(string_in_context_strategy) +def test_find_end_of_string(a): + (string, startindex) = a + res = ll(find_end_of_string_slow, string, startindex, len(string)) + hash, nonascii1, endposition1 = res + res2 = ll(find_end_of_string_slow_no_hash, string, startindex, len(string)) + assert res2 == (nonascii1, endposition1) + ch = string[endposition1] + assert ch == '"' or ch == '\\' or ch < '\x20' + for ch in string[startindex:endposition1]: + assert not (ch == '"' or ch == '\\' or ch < '\x20') + compare(string, res, ll(find_end_of_string_simd_unaligned, string, startindex, len(string))) + + nonascii2, endposition2 = ll(find_end_of_string_simd_unaligned_no_hash, string, startindex, len(string)) + assert nonascii1 == nonascii2 + assert endposition1 == endposition2 + +@given(string_in_context_strategy, strategies.binary(min_size=1)) +def test_find_end_of_string_position_invariance(a, prefix): + fn = find_end_of_string_simd_unaligned + (string, startindex) = a + h1, nonascii1, i1 = ll(fn, string, startindex, len(string)) + string2 = prefix + string + h2, nonascii2, i2 = ll(fn, string2, startindex + len(prefix), len(string) + len(prefix)) + assert h1 == h2 + assert nonascii1 == nonascii2 + assert i1 + len(prefix) == i2 + +@given(string_in_context_strategy, strategies.binary(min_size=1)) +def test_find_end_of_string_position_invariance_no_hash(a, prefix): + fn = find_end_of_string_simd_unaligned_no_hash + (string, startindex) = a + nonascii1, i1 = ll(fn, string, startindex, len(string)) + string2 = prefix + string + nonascii2, i2 = ll(fn, string2, startindex + len(prefix), len(string) + len(prefix)) + assert nonascii1 == nonascii2 + assert i1 + len(prefix) == i2 + diff --git a/pypy/objspace/std/jsondict.py b/pypy/objspace/std/jsondict.py new file mode 100644 --- /dev/null +++ b/pypy/objspace/std/jsondict.py @@ -0,0 +1,167 @@ +"""dict implementation specialized for object loaded by the _pypyjson module. + +Somewhat similar to MapDictStrategy, also uses a map. +""" + +from rpython.rlib import jit, rerased, objectmodel, debug + +from pypy.objspace.std.dictmultiobject import ( + UnicodeDictStrategy, DictStrategy, + create_iterator_classes, W_DictObject) + + +def from_values_and_jsonmap(space, values_w, jsonmap): + if not objectmodel.we_are_translated(): + assert len(values_w) == len(jsonmap.get_keys_in_order()) + assert len(values_w) != 0 + debug.make_sure_not_resized(values_w) + strategy = jsonmap.strategy_instance + if strategy is None: + jsonmap.strategy_instance = strategy = JsonDictStrategy(space, jsonmap) + storage = strategy.erase(values_w) + return W_DictObject(space, strategy, storage) + +def devolve_jsonmap_dict(w_dict): + assert isinstance(w_dict, W_DictObject) + strategy = w_dict.get_strategy() + assert isinstance(strategy, JsonDictStrategy) + strategy.switch_to_unicode_strategy(w_dict) + +def get_jsonmap_from_dict(w_dict): + assert isinstance(w_dict, W_DictObject) + strategy = w_dict.get_strategy() + assert isinstance(strategy, JsonDictStrategy) + return strategy.jsonmap + +class JsonDictStrategy(DictStrategy): + erase, unerase = rerased.new_erasing_pair("jsondict") + erase = staticmethod(erase) + unerase = staticmethod(unerase) + + _immutable_fields_ = ['jsonmap'] + + def __init__(self, space, jsonmap): + DictStrategy.__init__(self, space) + self.jsonmap = jsonmap + + def wrap(self, w_key): + return w_key + + def wrapkey(space, key): + return key + + def get_empty_storage(self): + raise NotImplementedError("should not be reachable") + + def is_correct_type(self, w_obj): + space = self.space + return space.is_w(space.type(w_obj), space.w_unicode) + + def _never_equal_to(self, w_lookup_type): + return False + + def length(self, w_dict): + return len(self.unerase(w_dict.dstorage)) + + def getitem(self, w_dict, w_key): + if self.is_correct_type(w_key): + return self.getitem_unicode(w_dict, w_key) + else: + self.switch_to_unicode_strategy(w_dict) + return w_dict.getitem(w_key) + + def getitem_unicode(self, w_dict, w_key): + storage_w = self.unerase(w_dict.dstorage) + if jit.isconstant(w_key): + jit.promote(self) + index = self.jsonmap.get_index(w_key) + if index == -1: + return None + return storage_w[index] + + def setitem(self, w_dict, w_key, w_value): + if self.is_correct_type(w_key): + storage_w = self.unerase(w_dict.dstorage) + index = self.jsonmap.get_index(w_key) + if index != -1: + storage_w[index] = w_value + return + self.switch_to_unicode_strategy(w_dict) + w_dict.setitem(w_key, w_value) + + def setdefault(self, w_dict, w_key, w_default): + if self.is_correct_type(w_key): + w_result = self.getitem_unicode(w_dict, w_key) + if w_result is not None: + return w_result + self.switch_to_unicode_strategy(w_dict) + return w_dict.setdefault(w_key, w_default) + + def delitem(self, w_dict, w_key): + self.switch_to_unicode_strategy(w_dict) + return w_dict.delitem(w_key) + + def popitem(self, w_dict): + self.switch_to_unicode_strategy(w_dict) + return w_dict.popitem() + + def switch_to_unicode_strategy(self, w_dict): + strategy = self.space.fromcache(UnicodeDictStrategy) + values_w = self.unerase(w_dict.dstorage) + storage = strategy.get_empty_storage() + d_new = strategy.unerase(storage) + keys_in_order = self.jsonmap.get_keys_in_order() + assert len(keys_in_order) == len(values_w) + for index, w_key in enumerate(keys_in_order): + assert w_key is not None + assert type(w_key) is self.space.UnicodeObjectCls + d_new[w_key] = values_w[index] + w_dict.set_strategy(strategy) + w_dict.dstorage = storage + + def w_keys(self, w_dict): + return self.space.newlist(self.jsonmap.get_keys_in_order()) + + def values(self, w_dict): + return self.unerase(w_dict.dstorage)[:] # to make resizable + + def items(self, w_dict): + space = self.space + storage_w = self.unerase(w_dict.dstorage) + res = [None] * len(storage_w) + for index, w_key in enumerate(self.jsonmap.get_keys_in_order()): + res[index] = space.newtuple([w_key, storage_w[index]]) + return res + + def getiterkeys(self, w_dict): + return iter(self.jsonmap.get_keys_in_order()) + + def getitervalues(self, w_dict): + storage_w = self.unerase(w_dict.dstorage) + return iter(storage_w) + + def getiteritems_with_hash(self, w_dict): + storage_w = self.unerase(w_dict.dstorage) + return ZipItemsWithHash(self.jsonmap.get_keys_in_order(), storage_w) + + +class ZipItemsWithHash(object): + def __init__(self, list1, list2): + assert len(list1) == len(list2) + self.list1 = list1 + self.list2 = list2 + self.i = 0 + + def __iter__(self): + return self + + def next(self): + i = self.i + if i >= len(self.list1): + raise StopIteration + self.i = i + 1 + w_key = self.list1[i] + return (w_key, self.list2[i], w_key.hash_w()) + + +create_iterator_classes(JsonDictStrategy) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit