I have wrote a benchmark for comparing different regular expression engines
available in Python. It uses tests and data from [1], that were itself inspired
by Boost's benchmark [2].
Tested engines are:
* re, standard regular expression module
* regex, alternative regular expression module [3]
* re2, Python wrapper for Google's RE2 [4]
* pcre, Python PCRE bindings [5]
Running tests for all 20MB text file takes too long time, here are results
(time in millisecons) for 2MB chunk (6000000:8000000):
re regex re2 pcre str.find
Twain 5 2.866 2.118 12.47 3.911 2.72
(?i)Twain 10 84.42 4.366 24.76 17.12
[a-z]shing 165 125 5.466 27.78 180.6
Huck[a-zA-Z]+|Saw[a-zA-Z]+ 52 57.11 72.16 23.87 234
\b\w+nn\b 32 239.5 427.6 23.18 251.9
[a-q][^u-z]{13}x 445 381.8 5.537 5843 224.9
Tom|Sawyer|Huckleberry|Finn 314 52.73 58.45 24.39 422.5
(?i)Tom|Sawyer|Huckleberry|Finn 477 445.6 522.1 27.73 415.4
.{0,2}(Tom|Sawyer|Huckleberry|Finn) 314 451.2 1113 24.38 1497
.{2,4}(Tom|Sawyer|Huckleberry|Finn) 237 450.1 1000 24.3 1549
Tom.{10,25}river|river.{10,25}Tom 1 61.55 58.11 24.97 233.8
[a-zA-Z]+ing 10079 189.4 350.3 47.41 357.6
\s[a-zA-Z]{0,12}ing\s 7160 115.7 23.65 37.74 237.6
([A-Za-z]awyer|[A-Za-z]inn)\s 50 153.7 430.4 27.86 425.3
["'][^"']{0,30}[?!\.]["'] 1618 83.12 77.39 26.96 157.6
There is no absolute leader. All engines have its weak spots. For re these are
case-insensitive search and search a pattern that starts with a set.
pcre is very data-sensitive. For other 2Mb chunk (8000000:10000000) results are
1-2 orders slower:
re regex re2 pcre str.find
Twain 33 2.671 2.209 16.6 413.6 2.75
(?i)Twain 35 90.21 4.36 27.65 459.4
[a-z]shing 120 112.7 2.667 30.94 1895
Huck[a-zA-Z]+|Saw[a-zA-Z]+ 61 57.12 49.9 26.76 1152
\b\w+nn\b 33 238 401.4 26.93 763.7
[a-q][^u-z]{13}x 481 387.7 5.694 5915 6979
Tom|Sawyer|Huckleberry|Finn 845 52.89 59.61 28.42 1.228e+04
(?i)Tom|Sawyer|Huckleberry|Finn 988 452.3 523.4 32.15 1.426e+04
.{0,2}(Tom|Sawyer|Huckleberry|Finn) 845 421.1 1105 29.01 1.343e+04
.{2,4}(Tom|Sawyer|Huckleberry|Finn) 625 398.6 985.6 29.19 9878
Tom.{10,25}river|river.{10,25}Tom 1 61.6 58.33 26.59 254.1
[a-zA-Z]+ing 10109 194.5 349.7 50.85 1.445e+05
\s[a-zA-Z]{0,12}ing\s 7286 120.1 23.73 42.04 1.051e+05
([A-Za-z]awyer|[A-Za-z]inn)\s 43 170.6 402.9 30.84 1119
["'][^"']{0,30}[?!\.]["'] 1686 86.5 110.2 30.62 2.369e+04
[1] http://sljit.sourceforge.net/regex_perf.html
[2] http://www.boost.org/doc/libs/1_36_0/libs/regex/doc/vc71-performance.html
[3] https://pypi.python.org/pypi/regex/2016.03.02
[4] https://pypi.python.org/pypi/re2/0.2.22
[5] https://pypi.python.org/pypi/python-pcre/0.7
#!/usr/bin/env python
import io
import os.path
import sys
# Download and unzip http://www.gutenberg.org/files/3200/old/mtent12.zip
TESTFILE = 'mtent12.txt'
# Tested engines are:
# * re, standard regular expression module
# * regex, alternative regular expression module: https://pypi.python.org/pypi/regex/
# * re2, Python wrapper for Google's RE2: https://pypi.python.org/pypi/re2/
# * pcre, Python PCRE bindings: https://pypi.python.org/pypi/python-pcre/
engines = 're', 'regex', 're2', 'pcre'
tests = (
r'Twain',
r'(?i)Twain',
r'[a-z]shing',
r'Huck[a-zA-Z]+|Saw[a-zA-Z]+',
r'\b\w+nn\b',
r'[a-q][^u-z]{13}x',
r'Tom|Sawyer|Huckleberry|Finn',
r'(?i)Tom|Sawyer|Huckleberry|Finn',
r'.{0,2}(Tom|Sawyer|Huckleberry|Finn)',
r'.{2,4}(Tom|Sawyer|Huckleberry|Finn)',
r'Tom.{10,25}river|river.{10,25}Tom',
r'[a-zA-Z]+ing',
r'\s[a-zA-Z]{0,12}ing\s',
r'([A-Za-z]awyer|[A-Za-z]inn)\s',
r'''["'][^"']{0,30}[?!\.]["']''',
)
class strfinder:
def __init__(self, s):
self.sub = s
def findall(self, text):
sub = self.sub
res = []
append = res.append
find = text.find
size = len(sub)
i = 0
while True:
i = find(sub, i)
if i < 0:
break
append(sub)
i += size
return res
def test_regex_twain(p, text, timer, iterations, maxtime=10):
# Warm up.
count = len(p.findall(text))
times = []
ts = 0
for i in range(iterations):
t0 = timer()
res = p.findall(text)
t = timer() - t0
assert len(res) == count
times.append(t)
ts += 1
if ts > maxtime:
break
return count, min(times)
if __name__ == '__main__':
try:
from time import perf_counter as timer
except ImportError:
from time import time as timer
# A workaround for re2 bug
try:
import builtins
except ImportError:
pass
else:
builtins.basestring = str
renames = []
remods = []
for name in engines:
try:
mod = __import__(name)
except (ImportError, NameError):
pass
else:
renames.append(name)
remods.append(mod)
basedir = os.path.dirname(__file__)
with io.open(os.path.join(basedir, TESTFILE), encoding='latin1') as f:
text = f.read()
assert len(text) == 19665221
text = text[6000000:8000000]
sys.stdout.write('%-35s' % '')
sys.stdout.write(' %5s ' % '')
for name in renames:
sys.stdout.write(' %6s' % name)
sys.stdout.write(' %6s' % 'str.find')
sys.stdout.write('\n\n')
for f in tests:
sys.stdout.write('%-35s' % f)
sys.stdout.flush()
count = None
for mod in remods:
c, t = test_regex_twain(mod.compile(f), text, timer, 5)
if count is None:
count = c
sys.stdout.write(' %5d ' % count)
else:
assert c == count
sys.stdout.write(' %6.4g' % (t * 1000))
sys.stdout.flush()
if f == 'Twain':
c, t = test_regex_twain(strfinder(f), text, timer, 5)
assert c == count
sys.stdout.write(' %6.4g' % (t * 1000))
sys.stdout.flush()
sys.stdout.write('\n')
sys.stdout.flush()
_______________________________________________
Speed mailing list
[email protected]
https://mail.python.org/mailman/listinfo/speed