Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: json-decoder-maps
Changeset: r97543:5a9168e9cb01
Date: 2019-09-19 19:40 +0200
http://bitbucket.org/pypy/pypy/changeset/5a9168e9cb01/
Log: some comments, rename a few attributes
diff --git a/pypy/module/_pypyjson/interp_decoder.py
b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -88,15 +88,14 @@
# two caches, one for keys, one for general strings. they both have the
# form {hash-as-int: StringCacheEntry} and they don't deal with
- # collisions at all. For every hash there is simply one string stored.
- #
- # <antocuni> if I understand it correctly, the point is not only to
- # cache the wrapped strings, but also to avoid to memcpy them out of
- # the original string in the common case. Maybe this should be
- # explained here.
- self.cache = {}
- self.cache_wrapped = {}
+ # collisions at all. For every hash there is simply one string stored
+ # and we ignore collisions.
+ self.cache_keys = {}
+ self.cache_values = {}
+ # we don't cache *all* non-key strings, that would be too expensive.
+ # instead, keep a cache of the last 16 string hashes around and add a
+ # string to the cache only if its hash is seen a second time
self.lru_cache = [0] * self.LRU_SIZE
self.lru_index = 0
@@ -553,7 +552,8 @@
def decode_string(self, i, contextmap=None):
""" Decode a string at position i (which is right after the opening ").
Optionally pass a contextmap, if the value is decoded as the value of a
- dict. """
+ dict."""
+
ll_chars = self.ll_chars
start = i
ch = ll_chars[i]
@@ -563,6 +563,14 @@
cache = True
if contextmap is not None:
+ # keep some statistics about the usefulness of the string cache on
+ # the contextmap
+ # the intuition about the contextmap is as follows:
+ # often there are string values stored in dictionaries that can
+ # never be usefully cached, like unique ids of objects. Then the
+ # strings *in those fields* of all objects should never be cached.
+ # However, the content of other fields can still be useful to
+ # cache.
contextmap.decoded_strings += 1
if not contextmap.should_cache_strings():
cache = False
@@ -589,17 +597,19 @@
# check cache first:
try:
- entry = self.cache_wrapped[strhash]
+ entry = self.cache_values[strhash]
except KeyError:
w_res = self._create_string_wrapped(start, i, nonascii)
# only add *some* strings to the cache, because keeping them all is
- # way too expensive
+ # way too expensive. first we check if the contextmap has caching
+ # disabled completely. if not, we check whether we have recently
+ # seen the same hash already, if yes, we cache the string.
if ((contextmap is not None and
contextmap.decoded_strings <
self.STRING_CACHE_EVALUATION_SIZE) or
strhash in self.lru_cache):
entry = StringCacheEntry(
self.getslice(start, start + length), w_res)
- self.cache_wrapped[strhash] = entry
+ self.cache_values[strhash] = entry
else:
self.lru_cache[self.lru_index] = strhash
self.lru_index = (self.lru_index + 1) & self.LRU_MASK
@@ -620,7 +630,7 @@
def _decode_key_map(self, i, currmap):
ll_chars = self.ll_chars
- # first try to see whether we happen to find currmap.single_nextmap
+ # first try to see whether we happen to find currmap.nextmap_first
nextmap = currmap.fast_path_key_parse(self, i)
if nextmap is not None:
return nextmap
@@ -653,12 +663,12 @@
self.pos = i + 1
# check cache first:
try:
- entry = self.cache[strhash]
+ entry = self.cache_keys[strhash]
except KeyError:
w_res = self._create_string_wrapped(start, i, nonascii)
entry = StringCacheEntry(
self.getslice(start, start + length), w_res)
- self.cache[strhash] = entry
+ self.cache_keys[strhash] = entry
return w_res
if not entry.compare(ll_chars, start, length):
# collision! hopefully rare
@@ -677,8 +687,8 @@
class StringCacheEntry(object):
- """ A cache entry, bundling the encoded version of a string, and its
wrapped
- decoded variant. """
+ """ A cache entry, bundling the encoded version of a string as it appears
+ in the input string, and its wrapped decoded variant. """
def __init__(self, repr, w_uni):
# repr is the escaped string
self.repr = repr
@@ -725,7 +735,7 @@
# preliminary maps. When we have too many fringe maps, we remove the least
# commonly instantiated fringe map and mark it as blocked.
- # allowed graph edges or nodes in all_next:
+ # allowed graph edges or nodes in nextmap_all:
# USEFUL -------
# / \ \
# v v v
@@ -747,7 +757,7 @@
# v v
# BLOCKED
- # the single_nextmap edge can only be these graph edges:
+ # the nextmap_first edge can only be these graph edges:
# USEFUL
# |
# v
@@ -773,14 +783,12 @@
def __init__(self, space):
self.space = space
- # <antocuni> what about calling this "first_nextmap"? Or, even better:
- # nextmap_first and nextmap_all </antocuni>
- # a single transition is stored in .single_nextmap
- self.single_nextmap = None
+ # a single transition is stored in .nextmap_first
+ self.nextmap_first = None
- # all_next is only initialized after seeing the *second* transition
- # but then it also contains .single_nextmap
- self.all_next = None # later dict {key: nextmap}
+ # nextmap_all is only initialized after seeing the *second* transition
+ # but then it also contains .nextmap_first
+ self.nextmap_all = None # later dict {key: nextmap}
# keep some statistics about every map: how often it was instantiated
# and how many non-blocked leaves the map transition tree has, starting
@@ -789,42 +797,42 @@
self.number_of_leaves = 1
def _check_invariants(self):
- if self.all_next:
- for next in self.all_next.itervalues():
+ if self.nextmap_all:
+ for next in self.nextmap_all.itervalues():
next._check_invariants()
- elif self.single_nextmap:
- self.single_nextmap._check_invariants()
+ elif self.nextmap_first:
+ self.nextmap_first._check_invariants()
def get_next(self, w_key, string, start, stop, terminator):
from pypy.objspace.std.dictmultiobject import unicode_hash, unicode_eq
if isinstance(self, JSONMap):
assert not self.state == MapBase.BLOCKED
- single_nextmap = self.single_nextmap
- if (single_nextmap is not None and
- single_nextmap.w_key.eq_w(w_key)):
- return single_nextmap
+ nextmap_first = self.nextmap_first
+ if (nextmap_first is not None and
+ nextmap_first.w_key.eq_w(w_key)):
+ return nextmap_first
assert stop >= 0
assert start >= 0
- if single_nextmap is None:
- # first transition ever seen, don't initialize all_next
+ if nextmap_first is None:
+ # first transition ever seen, don't initialize nextmap_all
next = self._make_next_map(w_key, string[start:stop])
- self.single_nextmap = next
+ self.nextmap_first = next
else:
- if self.all_next is None:
+ if self.nextmap_all is None:
# 2nd transition ever seen
- self.all_next = objectmodel.r_dict(unicode_eq, unicode_hash,
+ self.nextmap_all = objectmodel.r_dict(unicode_eq, unicode_hash,
force_non_null=True, simple_hash_eq=True)
- self.all_next[single_nextmap.w_key] = single_nextmap
+ self.nextmap_all[nextmap_first.w_key] = nextmap_first
else:
- next = self.all_next.get(w_key, None)
+ next = self.nextmap_all.get(w_key, None)
if next is not None:
return next
# if we are at this point we didn't find the transition yet, so
# create a new one
next = self._make_next_map(w_key, string[start:stop])
- self.all_next[w_key] = next
+ self.nextmap_all[w_key] = next
# one new leaf has been created
self.change_number_of_leaves(1)
@@ -846,13 +854,13 @@
""" Fast path when parsing the next key: We speculate that we will
always see a commonly seen next key, and use strcmp (implemented in
key_repr_cmp) to check whether that is the case. """
- single_nextmap = self.single_nextmap
- if single_nextmap:
+ nextmap_first = self.nextmap_first
+ if nextmap_first:
ll_chars = decoder.ll_chars
- assert isinstance(single_nextmap, JSONMap)
- if single_nextmap.key_repr_cmp(ll_chars, position):
- decoder.pos = position + len(single_nextmap.key_repr)
- return single_nextmap
+ assert isinstance(nextmap_first, JSONMap)
+ if nextmap_first.key_repr_cmp(ll_chars, position):
+ decoder.pos = position + len(nextmap_first.key_repr)
+ return nextmap_first
return None
def observe_transition(self, newmap, terminator):
@@ -871,18 +879,18 @@
def _all_dot(self, output):
identity = objectmodel.compute_unique_id(self)
output.append('%s [shape=box%s];' % (identity, self._get_dot_text()))
- if self.all_next:
- for w_key, value in self.all_next.items():
+ if self.nextmap_all:
+ for w_key, value in self.nextmap_all.items():
assert isinstance(value, JSONMap)
- if value is self.single_nextmap:
+ if value is self.nextmap_first:
color = ", color=blue"
else:
color = ""
output.append('%s -> %s [label="%s"%s];' % (
identity, objectmodel.compute_unique_id(value),
value.w_key._utf8, color))
value._all_dot(output)
- elif self.single_nextmap is not None:
- value = self.single_nextmap
+ elif self.nextmap_first is not None:
+ value = self.nextmap_first
output.append('%s -> %s [label="%s", color=blue];' % (
identity, objectmodel.compute_unique_id(value),
value.w_key._utf8))
value._all_dot(output)
@@ -1000,8 +1008,8 @@
assert False, "should be unreachable"
if self.state == MapBase.BLOCKED:
- assert self.single_nextmap is None
- assert self.all_next is None
+ assert self.nextmap_first is None
+ assert self.nextmap_all is None
elif self.state == MapBase.FRINGE:
assert self in self._get_terminator().current_fringe
@@ -1016,19 +1024,19 @@
if was_fringe:
terminator.remove_from_fringe(self)
# find the most commonly instantiated child, store it into
- # single_nextmap and mark it useful, recursively
- maxchild = self.single_nextmap
- if self.all_next is not None:
- for child in self.all_next.itervalues():
+ # nextmap_first and mark it useful, recursively
+ maxchild = self.nextmap_first
+ if self.nextmap_all is not None:
+ for child in self.nextmap_all.itervalues():
if child.instantiation_count > maxchild.instantiation_count:
maxchild = child
if maxchild is not None:
maxchild.mark_useful(terminator)
- if self.all_next:
- for child in self.all_next.itervalues():
+ if self.nextmap_all:
+ for child in self.nextmap_all.itervalues():
if child is not maxchild:
terminator.register_potential_fringe(child)
- self.single_nextmap = maxchild
+ self.nextmap_first = maxchild
def mark_blocked(self, terminator):
""" mark self and recursively all its children as blocked."""
@@ -1036,13 +1044,13 @@
self.state = MapBase.BLOCKED
if was_fringe:
terminator.remove_from_fringe(self)
- if self.all_next:
- for next in self.all_next.itervalues():
+ if self.nextmap_all:
+ for next in self.nextmap_all.itervalues():
next.mark_blocked(terminator)
- elif self.single_nextmap:
- self.single_nextmap.mark_blocked(terminator)
- self.single_nextmap = None
- self.all_next = None
+ elif self.nextmap_first:
+ self.nextmap_first.mark_blocked(terminator)
+ self.nextmap_first = None
+ self.nextmap_all = None
self.change_number_of_leaves(-self.number_of_leaves + 1)
def is_state_blocked(self):
@@ -1123,10 +1131,10 @@
# _____________________________________________________
def _get_dot_text(self):
- if self.all_next is None:
- l = int(self.single_nextmap is not None)
+ if self.nextmap_all is None:
+ l = int(self.nextmap_first is not None)
else:
- l = len(self.all_next)
+ l = len(self.nextmap_all)
extra = ""
if self.decoded_strings:
extra = "\\n%s/%s (%s%%)" % (self.cache_hits,
self.decoded_strings, self.cache_hits/float(self.decoded_strings))
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py
b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -21,26 +21,26 @@
w_c = self.space.newutf8("c", 1)
m1 = m.get_next(w_a, '"a"', 0, 3, m)
assert m1.w_key == w_a
- assert m1.single_nextmap is None
+ assert m1.nextmap_first is None
assert m1.key_repr == '"a"'
assert m1.key_repr_cmp('"a": 123', 0)
assert not m1.key_repr_cmp('b": 123', 0)
- assert m.single_nextmap.w_key == w_a
+ assert m.nextmap_first.w_key == w_a
m2 = m.get_next(w_a, '"a"', 0, 3, m)
assert m2 is m1
m3 = m.get_next(w_b, '"b"', 0, 3, m)
assert m3.w_key == w_b
- assert m3.single_nextmap is None
+ assert m3.nextmap_first is None
assert m3.key_repr == '"b"'
- assert m.single_nextmap is m1
+ assert m.nextmap_first is m1
m4 = m3.get_next(w_c, '"c"', 0, 3, m)
assert m4.w_key == w_c
- assert m4.single_nextmap is None
+ assert m4.nextmap_first is None
assert m4.key_repr == '"c"'
- assert m3.single_nextmap is m4
+ assert m3.nextmap_first is m4
def test_json_map_get_index(self):
m = Terminator(self.space)
@@ -121,7 +121,7 @@
assert m3.state == MapBase.PRELIMINARY
m3.instantiation_count = 2
- assert m2.single_nextmap is m3
+ assert m2.nextmap_first is m3
assert m4.state == MapBase.PRELIMINARY
m4.instantiation_count = 4
@@ -131,7 +131,7 @@
assert m2.state == MapBase.USEFUL
assert m3.state == MapBase.FRINGE
assert m4.state == MapBase.USEFUL
- assert m2.single_nextmap is m4
+ assert m2.nextmap_first is m4
assert m1.number_of_leaves == 2
base._check_invariants()
@@ -194,11 +194,11 @@
base.cleanup_fringe()
assert base.current_fringe == dict.fromkeys([m1, m2, m3])
assert m4.state == MapBase.BLOCKED
- assert m4.single_nextmap is None
- assert m4.all_next is None
+ assert m4.nextmap_first is None
+ assert m4.nextmap_all is None
assert m5.state == MapBase.BLOCKED
- assert m5.single_nextmap is None
- assert m5.all_next is None
+ assert m5.nextmap_first is None
+ assert m5.nextmap_all is None
def test_deal_with_blocked(self):
w_a = self.space.newutf8("a", 1)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit