I have wrote a benchmark for comparing different regular expression engines 
available in Python. It uses tests and data from [1], that were itself inspired 
by Boost's benchmark [2].

Tested engines are:

* re, standard regular expression module
* regex, alternative regular expression module [3]
* re2, Python wrapper for Google's RE2 [4]
* pcre, Python PCRE bindings [5]

Running tests for all 20MB text file takes too long time, here are results 
(time in millisecons) for 2MB chunk (6000000:8000000):

                                               re  regex    re2   pcre str.find

Twain                                   5   2.866  2.118  12.47  3.911   2.72
(?i)Twain                              10   84.42  4.366  24.76  17.12
[a-z]shing                            165     125  5.466  27.78  180.6
Huck[a-zA-Z]+|Saw[a-zA-Z]+             52   57.11  72.16  23.87    234
\b\w+nn\b                              32   239.5  427.6  23.18  251.9
[a-q][^u-z]{13}x                      445   381.8  5.537   5843  224.9
Tom|Sawyer|Huckleberry|Finn           314   52.73  58.45  24.39  422.5
(?i)Tom|Sawyer|Huckleberry|Finn       477   445.6  522.1  27.73  415.4
.{0,2}(Tom|Sawyer|Huckleberry|Finn)   314   451.2   1113  24.38   1497
.{2,4}(Tom|Sawyer|Huckleberry|Finn)   237   450.1   1000   24.3   1549
Tom.{10,25}river|river.{10,25}Tom       1   61.55  58.11  24.97  233.8
[a-zA-Z]+ing                        10079   189.4  350.3  47.41  357.6
\s[a-zA-Z]{0,12}ing\s                7160   115.7  23.65  37.74  237.6
([A-Za-z]awyer|[A-Za-z]inn)\s          50   153.7  430.4  27.86  425.3
["'][^"']{0,30}[?!\.]["']            1618   83.12  77.39  26.96  157.6

There is no absolute leader. All engines have its weak spots. For re these are 
case-insensitive search and search a pattern that starts with a set.

pcre is very data-sensitive. For other 2Mb chunk (8000000:10000000) results are 
1-2 orders slower:

                                               re  regex    re2   pcre str.find

Twain                                  33   2.671  2.209   16.6  413.6   2.75
(?i)Twain                              35   90.21   4.36  27.65  459.4
[a-z]shing                            120   112.7  2.667  30.94   1895
Huck[a-zA-Z]+|Saw[a-zA-Z]+             61   57.12   49.9  26.76   1152
\b\w+nn\b                              33     238  401.4  26.93  763.7
[a-q][^u-z]{13}x                      481   387.7  5.694   5915   6979
Tom|Sawyer|Huckleberry|Finn           845   52.89  59.61  28.42 1.228e+04
(?i)Tom|Sawyer|Huckleberry|Finn       988   452.3  523.4  32.15 1.426e+04
.{0,2}(Tom|Sawyer|Huckleberry|Finn)   845   421.1   1105  29.01 1.343e+04
.{2,4}(Tom|Sawyer|Huckleberry|Finn)   625   398.6  985.6  29.19   9878
Tom.{10,25}river|river.{10,25}Tom       1    61.6  58.33  26.59  254.1
[a-zA-Z]+ing                        10109   194.5  349.7  50.85 1.445e+05
\s[a-zA-Z]{0,12}ing\s                7286   120.1  23.73  42.04 1.051e+05
([A-Za-z]awyer|[A-Za-z]inn)\s          43   170.6  402.9  30.84   1119
["'][^"']{0,30}[?!\.]["']            1686    86.5  110.2  30.62 2.369e+04

[1] http://sljit.sourceforge.net/regex_perf.html
[2] http://www.boost.org/doc/libs/1_36_0/libs/regex/doc/vc71-performance.html
[3] https://pypi.python.org/pypi/regex/2016.03.02
[4] https://pypi.python.org/pypi/re2/0.2.22
[5] https://pypi.python.org/pypi/python-pcre/0.7
#!/usr/bin/env python

import io
import os.path
import sys

# Download and unzip http://www.gutenberg.org/files/3200/old/mtent12.zip
TESTFILE = 'mtent12.txt'

# Tested engines are:
# * re, standard regular expression module
# * regex, alternative regular expression module: https://pypi.python.org/pypi/regex/
# * re2, Python wrapper for Google's RE2: https://pypi.python.org/pypi/re2/
# * pcre, Python PCRE bindings: https://pypi.python.org/pypi/python-pcre/
engines = 're', 'regex', 're2', 'pcre'

tests = (
    r'Twain',
    r'(?i)Twain',
    r'[a-z]shing',
    r'Huck[a-zA-Z]+|Saw[a-zA-Z]+',
    r'\b\w+nn\b',
    r'[a-q][^u-z]{13}x',
    r'Tom|Sawyer|Huckleberry|Finn',
    r'(?i)Tom|Sawyer|Huckleberry|Finn',
    r'.{0,2}(Tom|Sawyer|Huckleberry|Finn)',
    r'.{2,4}(Tom|Sawyer|Huckleberry|Finn)',
    r'Tom.{10,25}river|river.{10,25}Tom',
    r'[a-zA-Z]+ing',
    r'\s[a-zA-Z]{0,12}ing\s',
    r'([A-Za-z]awyer|[A-Za-z]inn)\s',
    r'''["'][^"']{0,30}[?!\.]["']''',
)


class strfinder:
    def __init__(self, s):
        self.sub = s

    def findall(self, text):
        sub = self.sub
        res = []
        append = res.append
        find = text.find
        size = len(sub)
        i = 0
        while True:
            i = find(sub, i)
            if i < 0:
                break
            append(sub)
            i += size
        return res


def test_regex_twain(p, text, timer, iterations, maxtime=10):
    # Warm up.
    count = len(p.findall(text))
    times = []
    ts = 0
    for i in range(iterations):
        t0 = timer()
        res = p.findall(text)
        t = timer() - t0
        assert len(res) == count
        times.append(t)
        ts += 1
        if ts > maxtime:
            break
    return count, min(times)

if __name__ == '__main__':
    try:
        from time import perf_counter as timer
    except ImportError:
        from time import time as timer

    # A workaround for re2 bug
    try:
        import builtins
    except ImportError:
        pass
    else:
        builtins.basestring = str

    renames = []
    remods = []
    for name in engines:
        try:
            mod = __import__(name)
        except (ImportError, NameError):
            pass
        else:
            renames.append(name)
            remods.append(mod)

    basedir = os.path.dirname(__file__)
    with io.open(os.path.join(basedir, TESTFILE), encoding='latin1') as f:
        text = f.read()
    assert len(text) == 19665221
    text = text[6000000:8000000]

    sys.stdout.write('%-35s' % '')
    sys.stdout.write(' %5s ' % '')
    for name in renames:
        sys.stdout.write(' %6s' % name)
    sys.stdout.write(' %6s' % 'str.find')
    sys.stdout.write('\n\n')

    for f in tests:
        sys.stdout.write('%-35s' % f)
        sys.stdout.flush()
        count = None
        for mod in remods:
            c, t = test_regex_twain(mod.compile(f), text, timer, 5)
            if count is None:
                count = c
                sys.stdout.write(' %5d ' % count)
            else:
                assert c == count
            sys.stdout.write(' %6.4g' % (t * 1000))
            sys.stdout.flush()
        if f == 'Twain':
            c, t = test_regex_twain(strfinder(f), text, timer, 5)
            assert c == count
            sys.stdout.write(' %6.4g' % (t * 1000))
            sys.stdout.flush()
        sys.stdout.write('\n')
        sys.stdout.flush()
_______________________________________________
Speed mailing list
Speed@python.org
https://mail.python.org/mailman/listinfo/speed

Reply via email to