D4502: util: allow lrucachedict to track cost of entries
This revision was automatically updated to reflect the committed changes. Closed by commit rHGee087f0d7db5: util: allow lrucachedict to track cost of entries (authored by indygreg, committed by ). REPOSITORY rHG Mercurial CHANGES SINCE LAST UPDATE https://phab.mercurial-scm.org/D4502?vs=10832=10941 REVISION DETAIL https://phab.mercurial-scm.org/D4502 AFFECTED FILES contrib/perf.py mercurial/util.py tests/test-lrucachedict.py CHANGE DETAILS diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py --- a/tests/test-lrucachedict.py +++ b/tests/test-lrucachedict.py @@ -12,27 +12,33 @@ def testsimple(self): d = util.lrucachedict(4) self.assertEqual(d.capacity, 4) -d['a'] = 'va' +d.insert('a', 'va', cost=2) d['b'] = 'vb' d['c'] = 'vc' -d['d'] = 'vd' +d.insert('d', 'vd', cost=42) self.assertEqual(d['a'], 'va') self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') self.assertEqual(d['d'], 'vd') +self.assertEqual(d.totalcost, 44) + # 'a' should be dropped because it was least recently used. d['e'] = 've' self.assertNotIn('a', d) - self.assertIsNone(d.get('a')) +self.assertEqual(d.totalcost, 42) self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') self.assertEqual(d['d'], 'vd') self.assertEqual(d['e'], 've') +# Replacing item with different cost adjusts totalcost. +d.insert('e', 've', cost=4) +self.assertEqual(d.totalcost, 46) + # Touch entries in some order (both get and set). d['e'] d['c'] = 'vc2' @@ -63,12 +69,13 @@ def testcopypartial(self): d = util.lrucachedict(4) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=4) +d.insert('b', 'vb', cost=2) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 6) for key in ('a', 'b'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -80,8 +87,10 @@ d['c'] = 'vc' del d['b'] +self.assertEqual(d.totalcost, 4) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 4) for key in ('a', 'c'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -93,7 +102,7 @@ def testcopyfull(self): d = util.lrucachedict(4) -d['a'] = 'va' +d.insert('a', 'va', cost=42) d['b'] = 'vb' d['c'] = 'vc' d['d'] = 'vd' @@ -104,13 +113,19 @@ self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) +self.assertEqual(d.totalcost, 42) +self.assertEqual(dc.totalcost, 42) + # 'a' should be dropped because it was least recently used. dc['e'] = 've' self.assertNotIn('a', dc) for key in ('b', 'c', 'd', 'e'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) +self.assertEqual(d.totalcost, 42) +self.assertEqual(dc.totalcost, 0) + # Contents and order of original dict should remain unchanged. dc['b'] = 'vb_new' @@ -120,25 +135,28 @@ def testcopydecreasecapacity(self): d = util.lrucachedict(5) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=4) +d.insert('b', 'vb', cost=2) d['c'] = 'vc' d['d'] = 'vd' dc = d.copy(2) +self.assertEqual(dc.totalcost, 0) for key in ('a', 'b'): self.assertNotIn(key, dc) for key in ('c', 'd'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) -dc['e'] = 've' +dc.insert('e', 've', cost=7) +self.assertEqual(dc.totalcost, 7) self.assertNotIn('c', dc) for key in ('d', 'e'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) # Original should remain unchanged. +self.assertEqual(d.totalcost, 6) for key in ('a', 'b', 'c', 'd'): self.assertIn(key, d) self.assertEqual(d[key], 'v%s' % key) @@ -174,14 +192,16 @@ def testpopoldest(self): d = util.lrucachedict(4) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=10) +d.insert('b', 'vb', cost=5) self.assertEqual(len(d), 2) self.assertEqual(d.popoldest(), ('a', 'va')) self.assertEqual(len(d), 1) +self.assertEqual(d.totalcost, 5) self.assertEqual(d.popoldest(), ('b', 'vb')) self.assertEqual(len(d), 0) +self.assertEqual(d.totalcost, 0) self.assertIsNone(d.popoldest()) d['a'] = 'va' diff --git a/mercurial/util.py b/mercurial/util.py ---
D4502: util: allow lrucachedict to track cost of entries
lothiraldan added inline comments. INLINE COMMENTS > indygreg wrote in util.py:1277 > Good catch! I'll send a revised patch. > > FWIW, cost accounting on this data structure opens up a lot of potential > around caching on revlogs. I have some alpha-quality commits to replace the > full revision cache on the revlog with an `lrucachedict` and to add a > decompressed chunk cache to revlogs. Such caches can speed up certain > operations drastically. However, we need to be careful about adding always-on > caches to revlogs because they can result in memory bloat. I was thinking > about adding context manager methods to revlogs to temporarily activate > certain aggressive caches in order to facilitate certain operations. e.g. a > fulltext or chunk cache when applying delta groups could make it drastically > faster to compute deltas during bulk insertion. A chunk cache could make > reverse walks significantly faster. Etc. I figured you'd be interested given > recent work in this area :) Having a weighted cache would combine well with our work on intermediate snapshots. If we can keep the right intermediate snapshots in the cache we will get a lot of useful cache hit. REPOSITORY rHG Mercurial REVISION DETAIL https://phab.mercurial-scm.org/D4502 To: indygreg, #hg-reviewers Cc: lothiraldan, mercurial-devel ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
D4502: util: allow lrucachedict to track cost of entries
indygreg updated this revision to Diff 10832. REPOSITORY rHG Mercurial CHANGES SINCE LAST UPDATE https://phab.mercurial-scm.org/D4502?vs=10821=10832 REVISION DETAIL https://phab.mercurial-scm.org/D4502 AFFECTED FILES contrib/perf.py mercurial/util.py tests/test-lrucachedict.py CHANGE DETAILS diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py --- a/tests/test-lrucachedict.py +++ b/tests/test-lrucachedict.py @@ -12,27 +12,33 @@ def testsimple(self): d = util.lrucachedict(4) self.assertEqual(d.capacity, 4) -d['a'] = 'va' +d.insert('a', 'va', cost=2) d['b'] = 'vb' d['c'] = 'vc' -d['d'] = 'vd' +d.insert('d', 'vd', cost=42) self.assertEqual(d['a'], 'va') self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') self.assertEqual(d['d'], 'vd') +self.assertEqual(d.totalcost, 44) + # 'a' should be dropped because it was least recently used. d['e'] = 've' self.assertNotIn('a', d) - self.assertIsNone(d.get('a')) +self.assertEqual(d.totalcost, 42) self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') self.assertEqual(d['d'], 'vd') self.assertEqual(d['e'], 've') +# Replacing item with different cost adjusts totalcost. +d.insert('e', 've', cost=4) +self.assertEqual(d.totalcost, 46) + # Touch entries in some order (both get and set). d['e'] d['c'] = 'vc2' @@ -63,12 +69,13 @@ def testcopypartial(self): d = util.lrucachedict(4) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=4) +d.insert('b', 'vb', cost=2) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 6) for key in ('a', 'b'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -80,8 +87,10 @@ d['c'] = 'vc' del d['b'] +self.assertEqual(d.totalcost, 4) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 4) for key in ('a', 'c'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -93,7 +102,7 @@ def testcopyfull(self): d = util.lrucachedict(4) -d['a'] = 'va' +d.insert('a', 'va', cost=42) d['b'] = 'vb' d['c'] = 'vc' d['d'] = 'vd' @@ -104,13 +113,19 @@ self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) +self.assertEqual(d.totalcost, 42) +self.assertEqual(dc.totalcost, 42) + # 'a' should be dropped because it was least recently used. dc['e'] = 've' self.assertNotIn('a', dc) for key in ('b', 'c', 'd', 'e'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) +self.assertEqual(d.totalcost, 42) +self.assertEqual(dc.totalcost, 0) + # Contents and order of original dict should remain unchanged. dc['b'] = 'vb_new' @@ -120,25 +135,28 @@ def testcopydecreasecapacity(self): d = util.lrucachedict(5) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=4) +d.insert('b', 'vb', cost=2) d['c'] = 'vc' d['d'] = 'vd' dc = d.copy(2) +self.assertEqual(dc.totalcost, 0) for key in ('a', 'b'): self.assertNotIn(key, dc) for key in ('c', 'd'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) -dc['e'] = 've' +dc.insert('e', 've', cost=7) +self.assertEqual(dc.totalcost, 7) self.assertNotIn('c', dc) for key in ('d', 'e'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) # Original should remain unchanged. +self.assertEqual(d.totalcost, 6) for key in ('a', 'b', 'c', 'd'): self.assertIn(key, d) self.assertEqual(d[key], 'v%s' % key) @@ -174,14 +192,16 @@ def testpopoldest(self): d = util.lrucachedict(4) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=10) +d.insert('b', 'vb', cost=5) self.assertEqual(len(d), 2) self.assertEqual(d.popoldest(), ('a', 'va')) self.assertEqual(len(d), 1) +self.assertEqual(d.totalcost, 5) self.assertEqual(d.popoldest(), ('b', 'vb')) self.assertEqual(len(d), 0) +self.assertEqual(d.totalcost, 0) self.assertIsNone(d.popoldest()) d['a'] = 'va' diff --git a/mercurial/util.py b/mercurial/util.py --- a/mercurial/util.py +++ b/mercurial/util.py @@ -1209,18 +1209,21 @@ Holds a reference to nodes on either side as well as a key-value pair for the
D4502: util: allow lrucachedict to track cost of entries
indygreg planned changes to this revision. indygreg added inline comments. INLINE COMMENTS > lothiraldan wrote in util.py:1277 > I'm not sure this line is tested, I didnd't see a test where we replace an > entry with an associated cost Good catch! I'll send a revised patch. FWIW, cost accounting on this data structure opens up a lot of potential around caching on revlogs. I have some alpha-quality commits to replace the full revision cache on the revlog with an `lrucachedict` and to add a decompressed chunk cache to revlogs. Such caches can speed up certain operations drastically. However, we need to be careful about adding always-on caches to revlogs because they can result in memory bloat. I was thinking about adding context manager methods to revlogs to temporarily activate certain aggressive caches in order to facilitate certain operations. e.g. a fulltext or chunk cache when applying delta groups could make it drastically faster to compute deltas during bulk insertion. A chunk cache could make reverse walks significantly faster. Etc. I figured you'd be interested given recent work in this area :) REPOSITORY rHG Mercurial REVISION DETAIL https://phab.mercurial-scm.org/D4502 To: indygreg, #hg-reviewers Cc: lothiraldan, mercurial-devel ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
D4502: util: allow lrucachedict to track cost of entries
lothiraldan added inline comments. INLINE COMMENTS > util.py:1277 > if node is not None: > +self.totalcost -= node.cost > node.value = v I'm not sure this line is tested, I didnd't see a test where we replace an entry with an associated cost REPOSITORY rHG Mercurial REVISION DETAIL https://phab.mercurial-scm.org/D4502 To: indygreg, #hg-reviewers Cc: lothiraldan, mercurial-devel ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
D4502: util: allow lrucachedict to track cost of entries
indygreg created this revision. Herald added a subscriber: mercurial-devel. Herald added a reviewer: hg-reviewers. REVISION SUMMARY Currently, lrucachedict allows tracking of arbitrary items with the only limit being the total number of items in the cache. Caches can be a lot more useful when they are bound by the size of the items in them rather than the number of elements in the cache. In preparation for teaching lrucachedict to enforce a max size of cached items, we teach lrucachedict to optionally associate a numeric cost value with each node. We purposefully let the caller define their own cost for nodes. This does introduce some overhead. Most of it comes from __setitem__, since that function now calls into insert(), thus introducing Python function call overhead. $ hg perflrucachedict --size 4 --gets 100 --sets 100 --mixed 100 ! gets ! wall 0.599552 comb 0.60 user 0.60 sys 0.00 (best of 17) ! wall 0.614643 comb 0.61 user 0.61 sys 0.00 (best of 17) ! inserts ! ! wall 0.655817 comb 0.65 user 0.65 sys 0.00 (best of 16) ! sets ! wall 0.540448 comb 0.54 user 0.54 sys 0.00 (best of 18) ! wall 0.805644 comb 0.81 user 0.81 sys 0.00 (best of 13) ! mixed ! wall 0.651556 comb 0.66 user 0.66 sys 0.00 (best of 15) ! wall 0.781357 comb 0.78 user 0.78 sys 0.00 (best of 13) $ hg perflrucachedict --size 1000 --gets 100 --sets 100 --mixed 100 ! gets ! wall 0.621014 comb 0.62 user 0.62 sys 0.00 (best of 16) ! wall 0.615146 comb 0.62 user 0.62 sys 0.00 (best of 17) ! inserts ! ! wall 0.698115 comb 0.70 user 0.70 sys 0.00 (best of 15) ! sets ! wall 0.560247 comb 0.56 user 0.56 sys 0.00 (best of 18) ! wall 0.832495 comb 0.83 user 0.83 sys 0.00 (best of 12) ! mixed ! wall 0.686172 comb 0.68 user 0.68 sys 0.00 (best of 15) ! wall 0.841359 comb 0.84 user 0.84 sys 0.00 (best of 12) We're still under 1us per insert, which seems like reasonable performance for a cache. If we comment out updating of self.totalcost during insert(), performance of insert() is identical to __setitem__ before. However, I don't want to make total cost evaluation lazy because it has significant performance implications for when we need to evaluate the total cost at mutation time (it requires a cache traversal, which could be expensive for large caches). REPOSITORY rHG Mercurial REVISION DETAIL https://phab.mercurial-scm.org/D4502 AFFECTED FILES contrib/perf.py mercurial/util.py tests/test-lrucachedict.py CHANGE DETAILS diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py --- a/tests/test-lrucachedict.py +++ b/tests/test-lrucachedict.py @@ -12,21 +12,23 @@ def testsimple(self): d = util.lrucachedict(4) self.assertEqual(d.capacity, 4) -d['a'] = 'va' +d.insert('a', 'va', cost=2) d['b'] = 'vb' d['c'] = 'vc' -d['d'] = 'vd' +d.insert('d', 'vd', cost=42) self.assertEqual(d['a'], 'va') self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') self.assertEqual(d['d'], 'vd') +self.assertEqual(d.totalcost, 44) + # 'a' should be dropped because it was least recently used. d['e'] = 've' self.assertNotIn('a', d) - self.assertIsNone(d.get('a')) +self.assertEqual(d.totalcost, 42) self.assertEqual(d['b'], 'vb') self.assertEqual(d['c'], 'vc') @@ -63,12 +65,13 @@ def testcopypartial(self): d = util.lrucachedict(4) -d['a'] = 'va' -d['b'] = 'vb' +d.insert('a', 'va', cost=4) +d.insert('b', 'vb', cost=2) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 6) for key in ('a', 'b'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -80,8 +83,10 @@ d['c'] = 'vc' del d['b'] +self.assertEqual(d.totalcost, 4) dc = d.copy() self.assertEqual(len(dc), 2) +self.assertEqual(dc.totalcost, 4) for key in ('a', 'c'): self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) @@ -93,7 +98,7 @@ def testcopyfull(self): d = util.lrucachedict(4) -d['a'] = 'va' +d.insert('a', 'va', cost=42) d['b'] = 'vb' d['c'] = 'vc' d['d'] = 'vd' @@ -104,13 +109,19 @@ self.assertIn(key, dc) self.assertEqual(dc[key], 'v%s' % key) +self.assertEqual(d.totalcost, 42) +self.assertEqual(dc.totalcost, 42) + # 'a' should be dropped because it was least recently used. dc['e'] = 've' self.assertNotIn('a', dc) for key in