Hi mclow.lists, chandlerc, danalbert,

Hi All,

This is the initial commit for the benchmark testing framework I plan to use in 
libc++. It functions similarly to the existing LIT setup.

The benchmarks use the Google benchmark library found here: 
http://github.com/google/benchmark.
To enable building the benchmark library, use the CMake option 
`-DLIBCXX_ENABLE_BENCHMARKS=ON`. This option checks the library out from 
GitHub and builds it in the build/external directory.

Once the library is built, the benchmarks can be run in one of two ways.

1. Standalone (without baseline comparison): This simply runs the benchmarks 
and reports their output. Use lit's `-o /path/to/OUTPUT` option to save the 
results; the saved file can later serve as a baseline.
Example usage:
```
lit -v -o /path/to/baseline.txt /path/to/benchmarks 
```
2. Comparison against a baseline: This runs the benchmarks and compares the 
results against the specified baseline file. If the current results are slower 
by more than the allowed difference, the test fails and the results are 
reported.
Example usage:
```
lit -sv -o /path/to/current_results.txt --param=baseline=/path/to/baseline.txt \
    --param=allowed_difference=2.5 /path/to/benchmarks
```
The `allowed_difference` parameter takes the percentage by which the current 
results may be slower than the baseline before a test fails. The default is 5%.
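
Concretely, the comparison boils down to checking the CPU-time ratio of the 
current run against the baseline. Below is a simplified sketch of what 
`LibcxxBenchmarkFormat._compare_results` does; the variable names and numbers 
are only illustrative:
```
# Simplified sketch of the per-benchmark check done by
# LibcxxBenchmarkFormat._compare_results. Names and numbers are illustrative.
allowed_difference = 2.5            # percent, from --param=allowed_difference
baseline_cpu_time = 100.0           # per-iteration time from the baseline file
current_cpu_time = 104.0            # per-iteration time from the current run

ratio = current_cpu_time / baseline_cpu_time
percent_slower = ratio * 100 - 100  # 4.0% slower than the baseline
if percent_slower > allowed_difference:
    print 'FAIL: %.1f%% slower than the baseline' % percent_slower
```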

The benchmark tests are not run as part of the regular test suite. They are too 
time-consuming and provide little value unless compared against a baseline, so 
they are kept as an entirely separate test suite.
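
For reference, the file written by `-o` is lit's normal JSON results file; the 
baseline comparison reads it back in with `loadTestResults()` from 
test/libcxx/test/benchmark.py. A minimal sketch of inspecting such a file by 
hand (the paths are only examples):
```
# Inspect a saved results file using the loader added in
# test/libcxx/test/benchmark.py. The paths below are only examples.
import site
site.addsitedir('/path/to/libcxx/test')  # make the libcxx package importable
import libcxx.test.benchmark as benchcxx

results = benchcxx.loadTestResults('/path/to/baseline.txt')
for test_name, test in results.iteritems():
    for bench_name, bench in test['benchmarks'].iteritems():
        print test_name, bench_name, bench['cpu_time'], bench['iterations']
```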

http://reviews.llvm.org/D8107

Files:
  CMakeLists.txt
  external/CMakeLists.txt
  external/Toolchain.cmake.in
  test/benchmark/lit.cfg
  test/benchmark/test.bench.cpp
  test/libcxx/test/benchmark.py
  test/libcxx/test/config.py
  test/libcxx/test/format.py

Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -68,6 +68,7 @@
   set(LLVM_USE_SANITIZER "" CACHE STRING
       "Define the sanitizer used to build the library and tests")
 endif()
+option(LIBCXX_ENABLE_BENCHMARKS "Enable the benchmark tests." ON)
 
 if (LIBCXX_ENABLE_STATIC_ABI_LIBRARY)
   if (APPLE)
@@ -296,6 +297,7 @@
 # Add source code. This also contains all of the logic for deciding linker flags
 # soname, etc...
 add_subdirectory(lib)
+add_subdirectory(external)
 
 #===============================================================================
 # Setup Tests
Index: external/CMakeLists.txt
===================================================================
--- /dev/null
+++ external/CMakeLists.txt
@@ -0,0 +1,19 @@
+
+if (LIBCXX_ENABLE_BENCHMARKS)
+  configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/Toolchain.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/Toolchain.cmake
+    @ONLY)
+
+  include(ExternalProject)
+
+  ExternalProject_Add(
+    Benchmark
+    SVN_REPOSITORY https://github.com/google/benchmark/branches/api-merge
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_CURRENT_BINARY_DIR}
+               -DCMAKE_TOOLCHAIN_FILE:PATH=${CMAKE_CURRENT_BINARY_DIR}/Toolchain.cmake
+               -DCMAKE_BUILD_TYPE=RELEASE
+               -DBENCHMARK_ENABLE_TESTING:BOOL=OFF
+               -DBENCHMARK_ENABLE_SHARED:BOOL=ON
+  )
+endif()
Index: external/Toolchain.cmake.in
===================================================================
--- /dev/null
+++ external/Toolchain.cmake.in
@@ -0,0 +1,10 @@
+
+set(CMAKE_CXX_COMPILER @CMAKE_CXX_COMPILER@)
+set(CMAKE_C_COMPILER   @CMAKE_C_COMPILER@)
+
+# Try to statically link the C++ standard library so that we don't end up with
+# both libstdc++ and libc++ dynamically linked into our tests.
+if (NOT APPLE AND NOT "@CMAKE_SYSTEM_NAME@" STREQUAL "FreeBSD")
+  set(CMAKE_SHARED_LINKER_FLAGS "-static-libgcc -static-libstdc++" CACHE STRING "")
+  set(CMAKE_MODULE_LINKER_FLAGS "-static-libgcc -static-libstdc++" CACHE STRING "")
+endif()
Index: test/benchmark/lit.cfg
===================================================================
--- /dev/null
+++ test/benchmark/lit.cfg
@@ -0,0 +1,42 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+import os
+import site
+import sys
+
+site.addsitedir(os.path.join(os.path.dirname(__file__), '..'))
+import libcxx.test.config
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+# name: The name of this test suite.
+config.name = 'libc++-benchmark'
+
+config.suffixes = ['.bench.cpp']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# Infer the test_exec_root from the libcxx object root (libcxx_obj_root).
+obj_root = getattr(config, 'libcxx_obj_root', None)
+
+# Check that the test exec root is known.
+if obj_root is None:
+    import libcxx.test.config
+    libcxx.test.config.loadSiteConfig(
+        lit_config, config, 'libcxx_site_config', 'LIBCXX_SITE_CONFIG')
+    obj_root = getattr(config, 'libcxx_obj_root', None)
+    if obj_root is None:
+        import tempfile
+        obj_root = tempfile.mkdtemp(prefix='libcxx-benchmark-')
+        lit_config.warning('Creating temporary directory for object root: %s' %
+                           obj_root)
+
+config.test_exec_root = os.path.join(obj_root, 'test')
+
+configuration = libcxx.test.config.BenchmarkConfiguration(lit_config, config)
+configuration.configure()
+config.test_format = configuration.get_test_format()
Index: test/benchmark/test.bench.cpp
===================================================================
--- /dev/null
+++ test/benchmark/test.bench.cpp
@@ -0,0 +1,8 @@
+#include "benchmark/minimal_benchmark.h"
+
+static void BM_test_empty(benchmark::State& state) {
+    while (state.KeepRunning()) {}
+}
+BENCHMARK(BM_test_empty);
+
+BENCHMARK_MAIN()
Index: test/libcxx/test/benchmark.py
===================================================================
--- /dev/null
+++ test/libcxx/test/benchmark.py
@@ -0,0 +1,205 @@
+import json
+import re
+
+import lit
+import lit.Test
+
+
+def stringToCode(str_code):
+    if str_code == 'PASS':
+        return lit.Test.PASS
+    elif str_code == 'XFAIL':
+        return lit.Test.XFAIL
+    elif str_code == 'FAIL':
+        return lit.Test.FAIL
+    elif str_code == 'XPASS':
+        return lit.Test.XPASS
+    elif str_code == 'UNRESOLVED':
+        return lit.Test.UNRESOLVED
+    elif str_code == 'UNSUPPORTED':
+        return lit.Test.UNSUPPORTED
+    else:
+        assert False
+
+
+def loadTestResults(from_file):
+    """
+    Read in the output of a benchmark test run.
+    """
+    with open(from_file, 'r') as output_file:
+        output = json.load(output_file)
+    raw_tests = output['tests']
+    tests = {}
+    for rt in raw_tests:
+        test = {
+            'name': rt['name'],
+            'code': stringToCode(rt['code']),
+            'output': rt['output'],
+            'benchmarks': rt['metrics']['benchmarks']
+        }
+        tests[rt['name']] = test
+    return tests
+
+
+# Regex to parse a single line of a benchmarks output. The basic format is as
+# follows: <name> <time> <cpu_time> <iterations> (<extra fields>...)\n
+kbench_line_re = re.compile(
+    r'^\s*([^\s]+)\s+([-0-9]+)\s+([-0-9]+)\s+([0-9]+)([^\n]*)')
+
+
+def parseBenchmarkLine(line):
+    """
+    Parse the output of a single benchmark
+    """
+    assert line  # Assert non-empty and non-null
+    if line.startswith('DEBUG: '):
+        line = line[len('DEBUG: '):]
+    # TODO(ericwf): This is a hack because the benchmark name can contain
+    # spaces if it names a template: ex BM_Foo<int, long>. Remove this.
+    new_line = line.replace(', ', ',$')
+    match = kbench_line_re.match(new_line)
+    assert match is not None
+    parsed_bench = {
+        'name':       match.group(1).replace(',$', ', '),
+        'time':       max(int(match.group(2)), 1),  # Ensure non-zero
+        'cpu_time':   max(int(match.group(3)), 1),  # Ensure non-zero
+        'iterations': int(match.group(4)),
+    }
+    parsed_bench['total_cpu_time'] = (parsed_bench['cpu_time'] *
+                                      parsed_bench['iterations'])
+    parsed_bench['total_time'] = (parsed_bench['time'] *
+                                  parsed_bench['iterations'])
+    return parsed_bench
+
+
+def removeRepeatedBenchmarks(benchmark_list):
+    """
+    Some benchmarks are run multiple times and report a mean and stddev at the
+    end. This function removes all of the repeated runs and combines the mean
+    and stddev into a single benchmark result.
+    Example Output:
+      Name               Time(ns)  Iterations
+      BM_my_test         11         95
+      BM_my_test         10        100
+      BM_my_test         9         105
+      BM_my_test_mean    10        100
+      BM_my_test_stddev  1         5
+      BM_different_test (...)
+    """
+    has_repeats = (len(benchmark_list) >= 4 and
+                   benchmark_list[0]['name'] == benchmark_list[1]['name'])
+    if not has_repeats:
+        return benchmark_list
+    new_benchmark_list = []
+    for i in range(len(benchmark_list)):
+        possible_mean = benchmark_list[i]
+        name = possible_mean['name']
+        is_mean = name.endswith('_mean')
+        if not is_mean:
+            continue
+        real_name = name[:-len('_mean')]
+        new_bench = dict(possible_mean)
+        new_bench['name'] = real_name
+        assert len(benchmark_list) > i+1
+        stddev_bench = benchmark_list[i+1]
+        new_bench['time_stddev'] = stddev_bench['time']
+        new_bench['cpu_time_stddev'] = stddev_bench['cpu_time']
+        new_bench['iterations_stddev'] = stddev_bench['iterations']
+        new_benchmark_list += [new_bench]
+    return new_benchmark_list
+
+
+# Regex to split benchmark output header and results.
+# The header and results are split by a line containing only "-" characters.
+ksplit_line_re = re.compile('\n[-]+\n')
+
+
+def parseBenchmarkOutput(output):
+    """
+    Parse the output of the entire benchmark
+    """
+    # Split the benchmark output header and results based on a line containing
+    # only '-' characters.
+    parts = ksplit_line_re.split(output, maxsplit=1)
+    assert len(parts) == 2
+    benchmark_list = [parseBenchmarkLine(l.strip())
+                      for l in parts[1].split('\n') if l.strip()]
+    benchmark_list = removeRepeatedBenchmarks(benchmark_list)
+    benchmark_dict = {}
+    benchmark_index = 0
+    for b in benchmark_list:
+        benchmark_index += 1
+        b['index'] = benchmark_index
+        benchmark_dict[b['name']] = b
+    return benchmark_dict
+
+
+def createBenchmarkDiff(first, second):
+    """
+    diff two benchmarks and return the difference.
+    """
+    def diff_fn(first, second):
+        return second / float(first)
+    return {
+        'name': first['name'],
+        'iterations': diff_fn(
+            first['iterations'], second['iterations']),
+        'cpu_time': diff_fn(
+            second['cpu_time'], first['cpu_time']),
+        'time': diff_fn(
+            second['time'], first['time'])
+    }
+
+
+def DiffBenchmarkResults(baseline, current):
+    """
+    Diff every benchmark in current against baseline and return
+    the results. If there is no matching benchmark in baseline that benchmark
+    is skipped.
+    """
+    diff_map = {}
+    for curr_k, curr_v in current.iteritems():
+        matching_baseline = baseline.get(curr_k)
+        if not matching_baseline:
+            continue
+        diff = createBenchmarkDiff(curr_v, matching_baseline)
+        diff_map[curr_k] = diff
+    return diff_map
+
+
+def formatDiffString(key, baseline, curr, diff):
+    """
+    Format a user readable string that reports the difference between one
+    value of a benchmarks output.
+    """
+    cmp_str = 'FASTER' if diff[key] < 1.0 else 'SLOWER'
+    fmt_str = '{0:11} {1:8} {2} (current={3}, baseline={4}, diff={5})'
+    label = '%s:' % key
+    diff_v = abs(diff[key])
+    # Print the change as a multiplier if it is >= 2. Otherwise print it as
+    # a percentage.
+    if diff_v >= 2:
+        change = '%.3fx' % diff_v
+    else:
+        change = '%.3f%%' % abs((diff_v * 100) - 100)
+    return fmt_str.format(label, change, cmp_str, curr[key], baseline[key],
+                          abs(curr[key]-baseline[key]))
+
+
+def formatFailDiff(baseline, curr, diff):
+    """
+    Format a user readable string that reports the difference between all
+    values of a benchmark output.
+    """
+    return ('%s failed:\n    %s\n    %s\n    %s\n' %
+            (curr['name'],
+             formatDiffString('cpu_time', baseline, curr, diff),
+             formatDiffString('iterations', baseline, curr, diff),
+             formatDiffString('time', baseline, curr, diff)))
+
+def formatPassDiff(baseline, curr, diff):
+    return ('%s passed:\n    %s\n    %s\n    %s\n' %
+            (curr['name'],
+             formatDiffString('cpu_time', baseline, curr, diff),
+             formatDiffString('iterations', baseline, curr, diff),
+             formatDiffString('time', baseline, curr, diff)))
Index: test/libcxx/test/config.py
===================================================================
--- test/libcxx/test/config.py
+++ test/libcxx/test/config.py
@@ -10,11 +10,12 @@
 import lit.Test  # pylint: disable=import-error,no-name-in-module
 import lit.util  # pylint: disable=import-error,no-name-in-module
 
-from libcxx.test.format import LibcxxTestFormat
+from libcxx.test.format import LibcxxTestFormat, LibcxxBenchmarkFormat
 from libcxx.compiler import CXXCompiler
 from libcxx.test.executor import *
 from libcxx.test.tracing import *
 
+
 def loadSiteConfig(lit_config, config, param_name, env_name):
     # We haven't loaded the site specific configuration (the user is
     # probably trying to run on a test file directly, and either the site
@@ -639,3 +640,59 @@
                 cxx_library_root = self.cxx_library_root
             if cxx_library_root:
                 self.env['DYLD_LIBRARY_PATH'] = cxx_library_root
+
+
+class BenchmarkConfiguration(Configuration):
+    def __init__(self, lit_config, config):
+        super(BenchmarkConfiguration, self).__init__(lit_config, config)
+        self.baseline = None
+        self.allowed_difference = None
+
+    def get_test_format(self):
+        return LibcxxBenchmarkFormat(
+            self.baseline,
+            self.allowed_difference,
+            self.cxx,
+            self.use_clang_verify,
+            self.execute_external,
+            self.executor,
+            exec_env=self.env)
+
+    def configure(self):
+        super(BenchmarkConfiguration, self).configure()
+        self.configure_benchmark_flags()
+        self.configure_baseline()
+        self.configure_allowed_difference()
+        self.print_config_info()
+
+    def configure_baseline(self):
+        res = self.get_lit_conf('baseline')
+        if not res:
+            return
+        if not os.path.isfile(res):
+            self.lit_config.fatal('Invalid baseline file: %s' % res)
+        self.lit_config.note('Comparing against baseline file: %s' % res)
+        import libcxx.test.benchmark as benchcxx
+        self.baseline = benchcxx.loadTestResults(res)
+
+    def configure_allowed_difference(self):
+        allowed_diff = self.get_lit_conf('allowed_difference', '5.0')
+        self.allowed_difference = float(allowed_diff)
+
+    def configure_benchmark_flags(self):
+        external_dir = os.path.join(self.libcxx_obj_root, 'external')
+        self.cxx.compile_flags += [
+            '-I' + external_dir + '/include',
+            '-I' + self.libcxx_src_root + '/test/benchmark/support'
+        ]
+        lib_path = external_dir + '/lib'
+        self.cxx.link_flags = ['-L' + lib_path,
+                               '-Wl,-rpath,' + lib_path] + self.cxx.link_flags
+        self.cxx.link_flags += ['-lbenchmark']
+        if sys.platform == 'darwin':
+            dyn_path = self.env.get('DYLD_LIBRARY_PATH')
+            if dyn_path is None:
+                dyn_path = lib_path
+            else:
+                dyn_path = dyn_path + ':' + lib_path
+            self.env['DYLD_LIBRARY_PATH'] = dyn_path
Index: test/libcxx/test/format.py
===================================================================
--- test/libcxx/test/format.py
+++ test/libcxx/test/format.py
@@ -1,13 +1,15 @@
 import errno
 import os
+import re
+import tempfile
 import time
 
 import lit.Test        # pylint: disable=import-error
 import lit.TestRunner  # pylint: disable=import-error
 import lit.util        # pylint: disable=import-error
 
+import libcxx.test.benchmark as benchcxx
 from libcxx.test.executor import LocalExecutor as LocalExecutor
-import libcxx.test.executor
 import libcxx.util
 
 
@@ -41,8 +43,7 @@
 
             filepath = os.path.join(source_path, filename)
             if not os.path.isdir(filepath):
-                if any([filename.endswith(ext)
-                        for ext in localConfig.suffixes]):
+                if any([filepath.endswith(s) for s in localConfig.suffixes]):
                     yield lit.Test.Test(testSuite, path_in_suite + (filename,),
                                         localConfig)
 
@@ -148,3 +149,124 @@
             report = libcxx.util.makeReport(cmd, out, err, rc)
             return (lit.Test.FAIL,
                     report + 'Expected compilation to fail!\n')
+
+
+class LibcxxBenchmarkFormat(LibcxxTestFormat):
+    def __init__(self, baseline, allowed_difference, *args, **kwargs):
+        super(LibcxxBenchmarkFormat, self).__init__(*args, **kwargs)
+        self.baseline = baseline
+        self.allowed_difference = allowed_difference
+
+    def _execute(self, test, lit_config):
+        res = lit.TestRunner.parseIntegratedTestScript(
+            test, require_script=False)
+        # Check if a result for the test was returned. If so return that
+        # result.
+        if isinstance(res, lit.Test.Result):
+            return res
+        if lit_config.noExecute:
+            return lit.Test.Result(lit.Test.PASS)
+        # res is not an instance of lit.Test.Result. Expand res into its parts.
+        script, tmpBase, execDir = res
+        # Benchmark tests do not support RUN lines; reject them.
+        if len(script) != 0:
+            lit_config.fatal('Unsupported RUN line found in test %s'
+                             % test.getFullName())
+        res = self._benchmark_test(test, tmpBase, execDir, lit_config)
+        if not isinstance(res, lit.Test.Result):
+            code, output = res
+            res = lit.Test.Result(code, output)
+        return res
+
+    def _benchmark_test(self, test, tmpBase, execDir, lit_config):
+        source_path = test.getSourcePath()
+        exec_path = tmpBase + '.exe'
+        object_path = tmpBase + '.o'
+        # Create the output directory if it does not already exist.
+        lit.util.mkdir_p(os.path.dirname(tmpBase))
+        try:
+            # Compile the test
+            cmd, out, err, rc = self.cxx.compileLinkTwoSteps(
+                source_path, out=exec_path, object_file=object_path,
+                cwd=execDir)
+            compile_cmd = cmd
+            if rc != 0:
+                report = libcxx.util.makeReport(cmd, out, err, rc)
+                report += "Compilation failed unexpectedly!"
+                return lit.Test.FAIL, report
+            # Run the test
+            cmd = [exec_path, '--benchmark_repetitions=3']
+            out, err, rc = self.executor.run(
+                None, cmd=cmd, work_dir=os.path.dirname(source_path),
+                env=self.exec_env)
+            if rc != 0:
+                report = libcxx.util.makeReport(cmd, out, err, rc)
+                report = "Compiled With: %s\n%s" % (compile_cmd, report)
+                report += "Compiled test failed unexpectedly!"
+                return lit.Test.FAIL, report
+            scale_warning = ('CPU scaling is enabled: ' +
+                             'Benchmark timings may be noisy.')
+            if scale_warning in out:
+                lit_config.warning(scale_warning)
+            result = lit.Test.Result(lit.Test.PASS, '')
+            benchmark_data = benchcxx.parseBenchmarkOutput(out)
+            result.addMetric('benchmarks',
+                             lit.Test.toMetricValue(benchmark_data))
+            # Check for a benchmark that looks like it does nothing.
+            # This is likely a problem.
+            bad_results_str = self._detect_bad_results(benchmark_data)
+            if bad_results_str:
+                result.code = lit.Test.FAIL
+                result.output = bad_results_str
+                return result
+            # Compare the results to the baseline if the baseline is present.
+            if self.baseline:
+                failing_bench_str = self._compare_results(
+                    test.getFullName(), result)
+                if failing_bench_str:
+                    result.code = lit.Test.FAIL
+                    result.output = failing_bench_str
+                    result.metrics = {}
+            return result
+        finally:
+            # Note that cleanup of the executable happens in `_clean()`. If you
+            # override this method, cleanup is your responsibility.
+            self._clean(exec_path)
+
+    def _detect_bad_results(self, benches):
+        bad_results_str = ''
+        for k, v in benches.iteritems():
+            if v['cpu_time'] < 10 and k != 'BM_test_empty':
+                bad_results_str += ('Test %s runs too quickly! cpu_time=%s\n'
+                                    % (k, v['cpu_time']))
+        return bad_results_str
+
+    def _compare_results(self, test_name, result):
+        baseline_results = self.baseline.get(test_name)
+        if baseline_results is None:
+            return None
+        this_bench = result.metrics['benchmarks'].value
+        baseline_bench = baseline_results['benchmarks']
+        # Calculate the timing and iteration differences.
+        diff_metrics = benchcxx.DiffBenchmarkResults(
+            baseline_bench, this_bench)
+        result.addMetric(
+            'benchmark_diff', lit.Test.toMetricValue(diff_metrics))
+        # Collect all of the failing test result strings, keyed by index
+        # so that they are printed in the order they were run.
+        failing_bench_map = {}
+        passing_bench_map = {}
+        for diff_name, diff in diff_metrics.items():
+            curr_b = this_bench[diff_name]
+            baseline_b = baseline_bench[diff_name]
+            if diff['cpu_time'] * 100 - 100 <= self.allowed_difference:
+                passing_bench_map[curr_b['index']] = benchcxx.formatPassDiff(
+                    baseline_b, curr_b, diff)
+            else:
+                failing_bench_map[curr_b['index']] = benchcxx.formatFailDiff(
+                    baseline_b, curr_b, diff)
+        if failing_bench_map:
+            for k, v in passing_bench_map.iteritems():
+                failing_bench_map[k] = v
+        # Sort by index so the results are printed in the order they were run.
+        return '\n'.join([v for _, v in sorted(failing_bench_map.items())])