Re: [PATCH 3 of 3] fileset: combine union of basic patterns into single matcher

2018-08-04 Thread Martin von Zweigbergk via Mercurial-devel
On Sat, Aug 4, 2018 at 5:15 AM Yuya Nishihara  wrote:

> # HG changeset patch
> # User Yuya Nishihara 
> # Date 1532161152 -32400
> #  Sat Jul 21 17:19:12 2018 +0900
> # Node ID 33247ffb2c71b954e229da8424798d52c1e0d8e1
> # Parent  db488b9c65596088550b2b96ed25aff60a938219
> fileset: combine union of basic patterns into single matcher
>

Queuing this, thanks.
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel


[PATCH 3 of 3] fileset: combine union of basic patterns into single matcher

2018-08-04 Thread Yuya Nishihara
# HG changeset patch
# User Yuya Nishihara 
# Date 1532161152 -32400
#  Sat Jul 21 17:19:12 2018 +0900
# Node ID 33247ffb2c71b954e229da8424798d52c1e0d8e1
# Parent  db488b9c65596088550b2b96ed25aff60a938219
fileset: combine union of basic patterns into single matcher

This appears to improve query performance in a big repository than I thought.
Writing less Python in a hot loop, faster computation we gain.

  $ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*'
  (orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000)
  (new)  time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)

diff --git a/mercurial/fileset.py b/mercurial/fileset.py
--- a/mercurial/fileset.py
+++ b/mercurial/fileset.py
@@ -50,6 +50,12 @@ def kindpatmatch(mctx, x, y):
 return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds,
  _("pattern must be a string")))
 
+def patternsmatch(mctx, *xs):
+allkinds = matchmod.allpatternkinds
+patterns = [getpattern(x, allkinds, _("pattern must be a string"))
+for x in xs]
+return mctx.matcher(patterns)
+
 def andmatch(mctx, x, y):
 xm = getmatch(mctx, x)
 ym = getmatch(mctx, y)
@@ -436,6 +442,7 @@ methods = {
 'string': stringmatch,
 'symbol': stringmatch,
 'kindpat': kindpatmatch,
+'patterns': patternsmatch,
 'and': andmatch,
 'or': ormatch,
 'minus': minusmatch,
diff --git a/mercurial/filesetlang.py b/mercurial/filesetlang.py
--- a/mercurial/filesetlang.py
+++ b/mercurial/filesetlang.py
@@ -185,6 +185,21 @@ def _optimizeandops(op, ta, tb):
 return ('minus', ta, tb[1])
 return (op, ta, tb)
 
+def _optimizeunion(xs):
+# collect string patterns so they can be compiled into a single regexp
+ws, ts, ss = [], [], []
+for x in xs:
+w, t = _optimize(x)
+if t is not None and t[0] in {'string', 'symbol', 'kindpat'}:
+ss.append(t)
+continue
+ws.append(w)
+ts.append(t)
+if ss:
+ws.append(WEIGHT_CHECK_FILENAME)
+ts.append(('patterns',) + tuple(ss))
+return ws, ts
+
 def _optimize(x):
 if x is None:
 return 0, x
@@ -206,7 +221,9 @@ def _optimize(x):
 else:
 return wb, _optimizeandops(op, tb, ta)
 if op == 'or':
-ws, ts = zip(*(_optimize(y) for y in x[1:]))
+ws, ts = _optimizeunion(x[1:])
+if len(ts) == 1:
+return ws[0], ts[0] # 'or' operation is fully optimized out
 ts = tuple(it[1] for it in sorted(enumerate(ts),
   key=lambda it: ws[it[0]]))
 return max(ws), (op,) + ts
diff --git a/mercurial/minifileset.py b/mercurial/minifileset.py
--- a/mercurial/minifileset.py
+++ b/mercurial/minifileset.py
@@ -40,7 +40,7 @@ def _compile(tree):
 return f
 raise error.ParseError(_("unsupported file pattern: %s") % name,
hint=_('paths must be prefixed with "path:"'))
-elif op == 'or':
+elif op in {'or', 'patterns'}:
 funcs = [_compile(x) for x in tree[1:]]
 return lambda n, s: any(f(n, s) for f in funcs)
 elif op == 'and':
diff --git a/tests/test-fileset.t b/tests/test-fileset.t
--- a/tests/test-fileset.t
+++ b/tests/test-fileset.t
@@ -53,9 +53,7 @@ Test operators and basic patterns
   (symbol 'glob')
   (symbol 'b?')))
   * matcher:
-  ,
-]>
+  
   a1
   b1
   b2
@@ -182,8 +180,9 @@ Show parsed tree at stages:
 None)))
   * optimized:
   (or
-(symbol 'a1')
-(symbol 'a2')
+(patterns
+  (symbol 'a1')
+  (symbol 'a2'))
 (and
   (func
 (symbol 'clean')
@@ -193,8 +192,7 @@ Show parsed tree at stages:
 (string 'b'
   * matcher:
   ,
-,
+,
 ,
   m2=>]>
@@ -203,13 +201,30 @@ Show parsed tree at stages:
   b1
   b2
 
+Union of basic patterns:
+
+  $ fileset -p optimized -s -r. 'a1 or a2 or path:b1'
+  * optimized:
+  (patterns
+(symbol 'a1')
+(symbol 'a2')
+(kindpat
+  (symbol 'path')
+  (symbol 'b1')))
+  * matcher:
+  
+  a1
+  a2
+  b1
+
 OR expression should be reordered by weight:
 
   $ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2'
   * optimized:
   (or
-(symbol 'a1')
-(symbol 'b2')
+(patterns
+  (symbol 'a1')
+  (symbol 'b2'))
 (func
   (symbol 'grep')
   (string 'a'))
@@ -218,8 +233,7 @@ OR expression should be reordered by wei
   (string 'b')))
   * matcher:
   ,
-,
+,
 ,
 ]>
   a1
___
Mercurial-devel mailing list
Mercurial-devel@mercurial-scm.org
https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel