I have wrote a benchmark for comparing different regular expression engines available in Python. It uses tests and data from [1], that were itself inspired by Boost's benchmark [2].
Tested engines are: * re, standard regular expression module * regex, alternative regular expression module [3] * re2, Python wrapper for Google's RE2 [4] * pcre, Python PCRE bindings [5] Running tests for all 20MB text file takes too long time, here are results (time in millisecons) for 2MB chunk (6000000:8000000): re regex re2 pcre str.find Twain 5 2.866 2.118 12.47 3.911 2.72 (?i)Twain 10 84.42 4.366 24.76 17.12 [a-z]shing 165 125 5.466 27.78 180.6 Huck[a-zA-Z]+|Saw[a-zA-Z]+ 52 57.11 72.16 23.87 234 \b\w+nn\b 32 239.5 427.6 23.18 251.9 [a-q][^u-z]{13}x 445 381.8 5.537 5843 224.9 Tom|Sawyer|Huckleberry|Finn 314 52.73 58.45 24.39 422.5 (?i)Tom|Sawyer|Huckleberry|Finn 477 445.6 522.1 27.73 415.4 .{0,2}(Tom|Sawyer|Huckleberry|Finn) 314 451.2 1113 24.38 1497 .{2,4}(Tom|Sawyer|Huckleberry|Finn) 237 450.1 1000 24.3 1549 Tom.{10,25}river|river.{10,25}Tom 1 61.55 58.11 24.97 233.8 [a-zA-Z]+ing 10079 189.4 350.3 47.41 357.6 \s[a-zA-Z]{0,12}ing\s 7160 115.7 23.65 37.74 237.6 ([A-Za-z]awyer|[A-Za-z]inn)\s 50 153.7 430.4 27.86 425.3 ["'][^"']{0,30}[?!\.]["'] 1618 83.12 77.39 26.96 157.6 There is no absolute leader. All engines have its weak spots. For re these are case-insensitive search and search a pattern that starts with a set. pcre is very data-sensitive. For other 2Mb chunk (8000000:10000000) results are 1-2 orders slower: re regex re2 pcre str.find Twain 33 2.671 2.209 16.6 413.6 2.75 (?i)Twain 35 90.21 4.36 27.65 459.4 [a-z]shing 120 112.7 2.667 30.94 1895 Huck[a-zA-Z]+|Saw[a-zA-Z]+ 61 57.12 49.9 26.76 1152 \b\w+nn\b 33 238 401.4 26.93 763.7 [a-q][^u-z]{13}x 481 387.7 5.694 5915 6979 Tom|Sawyer|Huckleberry|Finn 845 52.89 59.61 28.42 1.228e+04 (?i)Tom|Sawyer|Huckleberry|Finn 988 452.3 523.4 32.15 1.426e+04 .{0,2}(Tom|Sawyer|Huckleberry|Finn) 845 421.1 1105 29.01 1.343e+04 .{2,4}(Tom|Sawyer|Huckleberry|Finn) 625 398.6 985.6 29.19 9878 Tom.{10,25}river|river.{10,25}Tom 1 61.6 58.33 26.59 254.1 [a-zA-Z]+ing 10109 194.5 349.7 50.85 1.445e+05 \s[a-zA-Z]{0,12}ing\s 7286 120.1 23.73 42.04 1.051e+05 ([A-Za-z]awyer|[A-Za-z]inn)\s 43 170.6 402.9 30.84 1119 ["'][^"']{0,30}[?!\.]["'] 1686 86.5 110.2 30.62 2.369e+04 [1] http://sljit.sourceforge.net/regex_perf.html [2] http://www.boost.org/doc/libs/1_36_0/libs/regex/doc/vc71-performance.html [3] https://pypi.python.org/pypi/regex/2016.03.02 [4] https://pypi.python.org/pypi/re2/0.2.22 [5] https://pypi.python.org/pypi/python-pcre/0.7
#!/usr/bin/env python import io import os.path import sys # Download and unzip http://www.gutenberg.org/files/3200/old/mtent12.zip TESTFILE = 'mtent12.txt' # Tested engines are: # * re, standard regular expression module # * regex, alternative regular expression module: https://pypi.python.org/pypi/regex/ # * re2, Python wrapper for Google's RE2: https://pypi.python.org/pypi/re2/ # * pcre, Python PCRE bindings: https://pypi.python.org/pypi/python-pcre/ engines = 're', 'regex', 're2', 'pcre' tests = ( r'Twain', r'(?i)Twain', r'[a-z]shing', r'Huck[a-zA-Z]+|Saw[a-zA-Z]+', r'\b\w+nn\b', r'[a-q][^u-z]{13}x', r'Tom|Sawyer|Huckleberry|Finn', r'(?i)Tom|Sawyer|Huckleberry|Finn', r'.{0,2}(Tom|Sawyer|Huckleberry|Finn)', r'.{2,4}(Tom|Sawyer|Huckleberry|Finn)', r'Tom.{10,25}river|river.{10,25}Tom', r'[a-zA-Z]+ing', r'\s[a-zA-Z]{0,12}ing\s', r'([A-Za-z]awyer|[A-Za-z]inn)\s', r'''["'][^"']{0,30}[?!\.]["']''', ) class strfinder: def __init__(self, s): self.sub = s def findall(self, text): sub = self.sub res = [] append = res.append find = text.find size = len(sub) i = 0 while True: i = find(sub, i) if i < 0: break append(sub) i += size return res def test_regex_twain(p, text, timer, iterations, maxtime=10): # Warm up. count = len(p.findall(text)) times = [] ts = 0 for i in range(iterations): t0 = timer() res = p.findall(text) t = timer() - t0 assert len(res) == count times.append(t) ts += 1 if ts > maxtime: break return count, min(times) if __name__ == '__main__': try: from time import perf_counter as timer except ImportError: from time import time as timer # A workaround for re2 bug try: import builtins except ImportError: pass else: builtins.basestring = str renames = [] remods = [] for name in engines: try: mod = __import__(name) except (ImportError, NameError): pass else: renames.append(name) remods.append(mod) basedir = os.path.dirname(__file__) with io.open(os.path.join(basedir, TESTFILE), encoding='latin1') as f: text = f.read() assert len(text) == 19665221 text = text[6000000:8000000] sys.stdout.write('%-35s' % '') sys.stdout.write(' %5s ' % '') for name in renames: sys.stdout.write(' %6s' % name) sys.stdout.write(' %6s' % 'str.find') sys.stdout.write('\n\n') for f in tests: sys.stdout.write('%-35s' % f) sys.stdout.flush() count = None for mod in remods: c, t = test_regex_twain(mod.compile(f), text, timer, 5) if count is None: count = c sys.stdout.write(' %5d ' % count) else: assert c == count sys.stdout.write(' %6.4g' % (t * 1000)) sys.stdout.flush() if f == 'Twain': c, t = test_regex_twain(strfinder(f), text, timer, 5) assert c == count sys.stdout.write(' %6.4g' % (t * 1000)) sys.stdout.flush() sys.stdout.write('\n') sys.stdout.flush()
_______________________________________________ Speed mailing list Speed@python.org https://mail.python.org/mailman/listinfo/speed