D4502: util: allow lrucachedict to track cost of entries

2018-09-12 Thread indygreg (Gregory Szorc)
This revision was automatically updated to reflect the committed changes.
Closed by commit rHGee087f0d7db5: util: allow lrucachedict to track cost of 
entries (authored by indygreg, committed by ).

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D4502?vs=10832=10941

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

AFFECTED FILES
  contrib/perf.py
  mercurial/util.py
  tests/test-lrucachedict.py

CHANGE DETAILS

diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py
--- a/tests/test-lrucachedict.py
+++ b/tests/test-lrucachedict.py
@@ -12,27 +12,33 @@
 def testsimple(self):
 d = util.lrucachedict(4)
 self.assertEqual(d.capacity, 4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=2)
 d['b'] = 'vb'
 d['c'] = 'vc'
-d['d'] = 'vd'
+d.insert('d', 'vd', cost=42)
 
 self.assertEqual(d['a'], 'va')
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
 self.assertEqual(d['d'], 'vd')
 
+self.assertEqual(d.totalcost, 44)
+
 # 'a' should be dropped because it was least recently used.
 d['e'] = 've'
 self.assertNotIn('a', d)
-
 self.assertIsNone(d.get('a'))
+self.assertEqual(d.totalcost, 42)
 
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
 self.assertEqual(d['d'], 'vd')
 self.assertEqual(d['e'], 've')
 
+# Replacing item with different cost adjusts totalcost.
+d.insert('e', 've', cost=4)
+self.assertEqual(d.totalcost, 46)
+
 # Touch entries in some order (both get and set).
 d['e']
 d['c'] = 'vc2'
@@ -63,12 +69,13 @@
 
 def testcopypartial(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=4)
+d.insert('b', 'vb', cost=2)
 
 dc = d.copy()
 
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 6)
 for key in ('a', 'b'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -80,8 +87,10 @@
 
 d['c'] = 'vc'
 del d['b']
+self.assertEqual(d.totalcost, 4)
 dc = d.copy()
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 4)
 for key in ('a', 'c'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -93,7 +102,7 @@
 
 def testcopyfull(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=42)
 d['b'] = 'vb'
 d['c'] = 'vc'
 d['d'] = 'vd'
@@ -104,13 +113,19 @@
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
+self.assertEqual(d.totalcost, 42)
+self.assertEqual(dc.totalcost, 42)
+
 # 'a' should be dropped because it was least recently used.
 dc['e'] = 've'
 self.assertNotIn('a', dc)
 for key in ('b', 'c', 'd', 'e'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
+self.assertEqual(d.totalcost, 42)
+self.assertEqual(dc.totalcost, 0)
+
 # Contents and order of original dict should remain unchanged.
 dc['b'] = 'vb_new'
 
@@ -120,25 +135,28 @@
 
 def testcopydecreasecapacity(self):
 d = util.lrucachedict(5)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=4)
+d.insert('b', 'vb', cost=2)
 d['c'] = 'vc'
 d['d'] = 'vd'
 
 dc = d.copy(2)
+self.assertEqual(dc.totalcost, 0)
 for key in ('a', 'b'):
 self.assertNotIn(key, dc)
 for key in ('c', 'd'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
-dc['e'] = 've'
+dc.insert('e', 've', cost=7)
+self.assertEqual(dc.totalcost, 7)
 self.assertNotIn('c', dc)
 for key in ('d', 'e'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
 # Original should remain unchanged.
+self.assertEqual(d.totalcost, 6)
 for key in ('a', 'b', 'c', 'd'):
 self.assertIn(key, d)
 self.assertEqual(d[key], 'v%s' % key)
@@ -174,14 +192,16 @@
 
 def testpopoldest(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=10)
+d.insert('b', 'vb', cost=5)
 
 self.assertEqual(len(d), 2)
 self.assertEqual(d.popoldest(), ('a', 'va'))
 self.assertEqual(len(d), 1)
+self.assertEqual(d.totalcost, 5)
 self.assertEqual(d.popoldest(), ('b', 'vb'))
 self.assertEqual(len(d), 0)
+self.assertEqual(d.totalcost, 0)
 self.assertIsNone(d.popoldest())
 
 d['a'] = 'va'
diff --git a/mercurial/util.py b/mercurial/util.py
--- 

D4502: util: allow lrucachedict to track cost of entries

2018-09-08 Thread lothiraldan (Boris Feld)
lothiraldan added inline comments.

INLINE COMMENTS

> indygreg wrote in util.py:1277
> Good catch! I'll send a revised patch.
> 
> FWIW, cost accounting on this data structure opens up a lot of potential 
> around caching on revlogs. I have some alpha-quality commits to replace the 
> full revision cache on the revlog with an `lrucachedict` and to add a 
> decompressed chunk cache to revlogs. Such caches can speed up certain 
> operations drastically. However, we need to be careful about adding always-on 
> caches to revlogs because they can result in memory bloat. I was thinking 
> about adding context manager methods to revlogs to temporarily activate 
> certain aggressive caches in order to facilitate certain operations. e.g. a 
> fulltext or chunk cache when applying delta groups could make it drastically 
> faster to compute deltas during bulk insertion. A chunk cache could make 
> reverse walks significantly faster. Etc. I figured you'd be interested given 
> recent work in this area :)

Having a weighted cache would combine well with our work on intermediate 
snapshots. If we can keep the right intermediate snapshots in the cache we will 
get a lot of useful cache hit.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

To: indygreg, #hg-reviewers
Cc: lothiraldan, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D4502: util: allow lrucachedict to track cost of entries

2018-09-07 Thread indygreg (Gregory Szorc)
indygreg updated this revision to Diff 10832.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D4502?vs=10821=10832

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

AFFECTED FILES
  contrib/perf.py
  mercurial/util.py
  tests/test-lrucachedict.py

CHANGE DETAILS

diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py
--- a/tests/test-lrucachedict.py
+++ b/tests/test-lrucachedict.py
@@ -12,27 +12,33 @@
 def testsimple(self):
 d = util.lrucachedict(4)
 self.assertEqual(d.capacity, 4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=2)
 d['b'] = 'vb'
 d['c'] = 'vc'
-d['d'] = 'vd'
+d.insert('d', 'vd', cost=42)
 
 self.assertEqual(d['a'], 'va')
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
 self.assertEqual(d['d'], 'vd')
 
+self.assertEqual(d.totalcost, 44)
+
 # 'a' should be dropped because it was least recently used.
 d['e'] = 've'
 self.assertNotIn('a', d)
-
 self.assertIsNone(d.get('a'))
+self.assertEqual(d.totalcost, 42)
 
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
 self.assertEqual(d['d'], 'vd')
 self.assertEqual(d['e'], 've')
 
+# Replacing item with different cost adjusts totalcost.
+d.insert('e', 've', cost=4)
+self.assertEqual(d.totalcost, 46)
+
 # Touch entries in some order (both get and set).
 d['e']
 d['c'] = 'vc2'
@@ -63,12 +69,13 @@
 
 def testcopypartial(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=4)
+d.insert('b', 'vb', cost=2)
 
 dc = d.copy()
 
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 6)
 for key in ('a', 'b'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -80,8 +87,10 @@
 
 d['c'] = 'vc'
 del d['b']
+self.assertEqual(d.totalcost, 4)
 dc = d.copy()
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 4)
 for key in ('a', 'c'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -93,7 +102,7 @@
 
 def testcopyfull(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=42)
 d['b'] = 'vb'
 d['c'] = 'vc'
 d['d'] = 'vd'
@@ -104,13 +113,19 @@
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
+self.assertEqual(d.totalcost, 42)
+self.assertEqual(dc.totalcost, 42)
+
 # 'a' should be dropped because it was least recently used.
 dc['e'] = 've'
 self.assertNotIn('a', dc)
 for key in ('b', 'c', 'd', 'e'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
+self.assertEqual(d.totalcost, 42)
+self.assertEqual(dc.totalcost, 0)
+
 # Contents and order of original dict should remain unchanged.
 dc['b'] = 'vb_new'
 
@@ -120,25 +135,28 @@
 
 def testcopydecreasecapacity(self):
 d = util.lrucachedict(5)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=4)
+d.insert('b', 'vb', cost=2)
 d['c'] = 'vc'
 d['d'] = 'vd'
 
 dc = d.copy(2)
+self.assertEqual(dc.totalcost, 0)
 for key in ('a', 'b'):
 self.assertNotIn(key, dc)
 for key in ('c', 'd'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
-dc['e'] = 've'
+dc.insert('e', 've', cost=7)
+self.assertEqual(dc.totalcost, 7)
 self.assertNotIn('c', dc)
 for key in ('d', 'e'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
 # Original should remain unchanged.
+self.assertEqual(d.totalcost, 6)
 for key in ('a', 'b', 'c', 'd'):
 self.assertIn(key, d)
 self.assertEqual(d[key], 'v%s' % key)
@@ -174,14 +192,16 @@
 
 def testpopoldest(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=10)
+d.insert('b', 'vb', cost=5)
 
 self.assertEqual(len(d), 2)
 self.assertEqual(d.popoldest(), ('a', 'va'))
 self.assertEqual(len(d), 1)
+self.assertEqual(d.totalcost, 5)
 self.assertEqual(d.popoldest(), ('b', 'vb'))
 self.assertEqual(len(d), 0)
+self.assertEqual(d.totalcost, 0)
 self.assertIsNone(d.popoldest())
 
 d['a'] = 'va'
diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -1209,18 +1209,21 @@
 Holds a reference to nodes on either side as well as a key-value
 pair for the 

D4502: util: allow lrucachedict to track cost of entries

2018-09-07 Thread indygreg (Gregory Szorc)
indygreg planned changes to this revision.
indygreg added inline comments.

INLINE COMMENTS

> lothiraldan wrote in util.py:1277
> I'm not sure this line is tested, I didnd't see a test where we replace an 
> entry with an associated cost

Good catch! I'll send a revised patch.

FWIW, cost accounting on this data structure opens up a lot of potential around 
caching on revlogs. I have some alpha-quality commits to replace the full 
revision cache on the revlog with an `lrucachedict` and to add a decompressed 
chunk cache to revlogs. Such caches can speed up certain operations 
drastically. However, we need to be careful about adding always-on caches to 
revlogs because they can result in memory bloat. I was thinking about adding 
context manager methods to revlogs to temporarily activate certain aggressive 
caches in order to facilitate certain operations. e.g. a fulltext or chunk 
cache when applying delta groups could make it drastically faster to compute 
deltas during bulk insertion. A chunk cache could make reverse walks 
significantly faster. Etc. I figured you'd be interested given recent work in 
this area :)

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

To: indygreg, #hg-reviewers
Cc: lothiraldan, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D4502: util: allow lrucachedict to track cost of entries

2018-09-07 Thread lothiraldan (Boris Feld)
lothiraldan added inline comments.

INLINE COMMENTS

> util.py:1277
>  if node is not None:
> +self.totalcost -= node.cost
>  node.value = v

I'm not sure this line is tested, I didnd't see a test where we replace an 
entry with an associated cost

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

To: indygreg, #hg-reviewers
Cc: lothiraldan, mercurial-devel
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


D4502: util: allow lrucachedict to track cost of entries

2018-09-06 Thread indygreg (Gregory Szorc)
indygreg created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  Currently, lrucachedict allows tracking of arbitrary items with the
  only limit being the total number of items in the cache.
  
  Caches can be a lot more useful when they are bound by the size
  of the items in them rather than the number of elements in the
  cache.
  
  In preparation for teaching lrucachedict to enforce a max size of
  cached items, we teach lrucachedict to optionally associate a numeric
  cost value with each node.
  
  We purposefully let the caller define their own cost for nodes.
  
  This does introduce some overhead. Most of it comes from __setitem__,
  since that function now calls into insert(), thus introducing Python
  function call overhead.
  
  $ hg perflrucachedict --size 4 --gets 100 --sets 100 --mixed 100
  ! gets
  ! wall 0.599552 comb 0.60 user 0.60 sys 0.00 (best of 17)
  ! wall 0.614643 comb 0.61 user 0.61 sys 0.00 (best of 17)
  ! inserts
  ! 
  ! wall 0.655817 comb 0.65 user 0.65 sys 0.00 (best of 16)
  ! sets
  ! wall 0.540448 comb 0.54 user 0.54 sys 0.00 (best of 18)
  ! wall 0.805644 comb 0.81 user 0.81 sys 0.00 (best of 13)
  ! mixed
  ! wall 0.651556 comb 0.66 user 0.66 sys 0.00 (best of 15)
  ! wall 0.781357 comb 0.78 user 0.78 sys 0.00 (best of 13)
  
  $ hg perflrucachedict --size 1000 --gets 100 --sets 100 --mixed 
100
  ! gets
  ! wall 0.621014 comb 0.62 user 0.62 sys 0.00 (best of 16)
  ! wall 0.615146 comb 0.62 user 0.62 sys 0.00 (best of 17)
  ! inserts
  ! 
  ! wall 0.698115 comb 0.70 user 0.70 sys 0.00 (best of 15)
  ! sets
  ! wall 0.560247 comb 0.56 user 0.56 sys 0.00 (best of 18)
  ! wall 0.832495 comb 0.83 user 0.83 sys 0.00 (best of 12)
  ! mixed
  ! wall 0.686172 comb 0.68 user 0.68 sys 0.00 (best of 15)
  ! wall 0.841359 comb 0.84 user 0.84 sys 0.00 (best of 12)
  
  We're still under 1us per insert, which seems like reasonable
  performance for a cache.
  
  If we comment out updating of self.totalcost during insert(),
  performance of insert() is identical to __setitem__ before. However,
  I don't want to make total cost evaluation lazy because it has
  significant performance implications for when we need to evaluate the
  total cost at mutation time (it requires a cache traversal, which could
  be expensive for large caches).

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D4502

AFFECTED FILES
  contrib/perf.py
  mercurial/util.py
  tests/test-lrucachedict.py

CHANGE DETAILS

diff --git a/tests/test-lrucachedict.py b/tests/test-lrucachedict.py
--- a/tests/test-lrucachedict.py
+++ b/tests/test-lrucachedict.py
@@ -12,21 +12,23 @@
 def testsimple(self):
 d = util.lrucachedict(4)
 self.assertEqual(d.capacity, 4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=2)
 d['b'] = 'vb'
 d['c'] = 'vc'
-d['d'] = 'vd'
+d.insert('d', 'vd', cost=42)
 
 self.assertEqual(d['a'], 'va')
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
 self.assertEqual(d['d'], 'vd')
 
+self.assertEqual(d.totalcost, 44)
+
 # 'a' should be dropped because it was least recently used.
 d['e'] = 've'
 self.assertNotIn('a', d)
-
 self.assertIsNone(d.get('a'))
+self.assertEqual(d.totalcost, 42)
 
 self.assertEqual(d['b'], 'vb')
 self.assertEqual(d['c'], 'vc')
@@ -63,12 +65,13 @@
 
 def testcopypartial(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
-d['b'] = 'vb'
+d.insert('a', 'va', cost=4)
+d.insert('b', 'vb', cost=2)
 
 dc = d.copy()
 
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 6)
 for key in ('a', 'b'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -80,8 +83,10 @@
 
 d['c'] = 'vc'
 del d['b']
+self.assertEqual(d.totalcost, 4)
 dc = d.copy()
 self.assertEqual(len(dc), 2)
+self.assertEqual(dc.totalcost, 4)
 for key in ('a', 'c'):
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
@@ -93,7 +98,7 @@
 
 def testcopyfull(self):
 d = util.lrucachedict(4)
-d['a'] = 'va'
+d.insert('a', 'va', cost=42)
 d['b'] = 'vb'
 d['c'] = 'vc'
 d['d'] = 'vd'
@@ -104,13 +109,19 @@
 self.assertIn(key, dc)
 self.assertEqual(dc[key], 'v%s' % key)
 
+self.assertEqual(d.totalcost, 42)
+self.assertEqual(dc.totalcost, 42)
+
 # 'a' should be dropped because it was least recently used.
 dc['e'] = 've'
 self.assertNotIn('a', dc)
 for key in