This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new c3511db ARROW-4827: [C++] Implement benchmark comparison c3511db is described below commit c3511db97e981fd24367916e45fd1d1edd83bb73 Author: François Saint-Jacques <fsaintjacq...@gmail.com> AuthorDate: Thu Apr 25 17:54:09 2019 +0200 ARROW-4827: [C++] Implement benchmark comparison This script/library allows comparing revisions/builds. Author: François Saint-Jacques <fsaintjacq...@gmail.com> Closes #4141 from fsaintjacques/ARROW-4827-benchmark-comparison and squashes the following commits: a047ae4ed <François Saint-Jacques> Satisfy flake8 e95baf317 <François Saint-Jacques> Add comments and move stuff ee39a1feb <François Saint-Jacques> Move cpp_runner_from_rev_or_path in CppRunner 2a953f180 <François Saint-Jacques> Missing files d8e3c1c85 <François Saint-Jacques> Review 514e8e428 <François Saint-Jacques> Introduce RegressionSetArgs 280c93be4 <François Saint-Jacques> Update gitignore dc031bde7 <François Saint-Jacques> Support conda toolchain 28254676c <François Saint-Jacques> Add --cmake-extras to benchmark-diff command e6762899c <François Saint-Jacques> Typo 048ba0ede <François Saint-Jacques> Add verbose_third_party 71b10e98a <François Saint-Jacques> Disable python in benchmarks c3719214c <François Saint-Jacques> Fix flake8 warnings 8845e3e78 <François Saint-Jacques> Remove empty __init__.py 1949f749c <François Saint-Jacques> Supports HEAD revisions 96f999748 <François Saint-Jacques> Add gitignore entry d9692bc8f <François Saint-Jacques> Fix splitlines 90578af61 <François Saint-Jacques> Add --cmake-extras to build command 7696202ba <François Saint-Jacques> Add doc for bin attribute. 
a281ae8e6 <François Saint-Jacques> Various language fixes 1b028390c <François Saint-Jacques> Rename --cxx_flags to --cxx-flags bc111b2d3 <François Saint-Jacques> Removes copied stuff d6733b6f4 <François Saint-Jacques> Formatting 21b2e14fc <François Saint-Jacques> Add doc and fix bugs 2a81744cf <François Saint-Jacques> Ooops. c85661cf3 <François Saint-Jacques> Add documentation 703cf987a <François Saint-Jacques> commit 2c0d512f8 <François Saint-Jacques> Checkpoint a38f49cd9 <François Saint-Jacques> checkpoint a5ad76d11 <François Saint-Jacques> Fix syntax 712d2ed3c <François Saint-Jacques> initial commit --- .gitignore | 2 + cpp/src/arrow/compute/benchmark-util.h | 13 + .../arrow/compute/kernels/aggregate-benchmark.cc | 4 +- dev/archery/archery/benchmark/compare.py | 122 +++++++++ .../archery/archery/benchmark/core.py | 72 +++--- dev/archery/archery/benchmark/google.py | 162 ++++++++++++ dev/archery/archery/benchmark/runner.py | 114 +++++++++ dev/archery/archery/cli.py | 274 +++++++++++++++++++++ dev/archery/archery/lang/cpp.py | 130 ++++++++++ dev/archery/archery/utils/cmake.py | 213 ++++++++++++++++ .gitignore => dev/archery/archery/utils/codec.py | 69 ++---- dev/archery/archery/utils/command.py | 71 ++++++ dev/archery/archery/utils/git.py | 73 ++++++ .gitignore => dev/archery/archery/utils/logger.py | 45 +--- dev/archery/archery/utils/source.py | 141 +++++++++++ .gitignore => dev/archery/setup.py | 58 ++--- .gitignore => dev/archery/tests/test_benchmarks.py | 55 ++--- docs/source/developers/benchmarks.rst | 127 ++++++++++ docs/source/developers/index.rst | 1 + python/.gitignore | 2 - 20 files changed, 1543 insertions(+), 205 deletions(-) diff --git a/.gitignore b/.gitignore index 6bb237a..4a03020 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,8 @@ docs/example1.dat docs/example3.dat python/.eggs/ python/doc/ +# Egg metadata +*.egg-info .vscode .idea/ diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h index 
1678f8d..865da66 100644 --- a/cpp/src/arrow/compute/benchmark-util.h +++ b/cpp/src/arrow/compute/benchmark-util.h @@ -55,5 +55,18 @@ void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) { bench->Args({static_cast<ArgsType>(size), nulls}); } +void RegressionSetArgs(benchmark::internal::Benchmark* bench) { + // Benchmark changed its parameter type between releases from + // int to int64_t. As it doesn't have version macros, we need + // to apply C++ template magic. + using ArgsType = + typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type; + bench->Unit(benchmark::kMicrosecond); + + // Regressions should only bench L1 data for better stability + for (auto nulls : std::vector<ArgsType>({0, 1, 10, 50})) + bench->Args({static_cast<ArgsType>(kL1Size), nulls}); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc index e81f879..bbc923f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc @@ -309,7 +309,7 @@ BENCHMARK_TEMPLATE(BenchSum, SumBitmapNaive<int64_t>)->Apply(BenchmarkSetArgs); BENCHMARK_TEMPLATE(BenchSum, SumBitmapReader<int64_t>)->Apply(BenchmarkSetArgs); BENCHMARK_TEMPLATE(BenchSum, SumBitmapVectorizeUnroll<int64_t>)->Apply(BenchmarkSetArgs); -static void BenchSumKernel(benchmark::State& state) { +static void RegressionSumKernel(benchmark::State& state) { const int64_t array_size = state.range(0) / sizeof(int64_t); const double null_percent = static_cast<double>(state.range(1)) / 100.0; auto rand = random::RandomArrayGenerator(1923); @@ -328,7 +328,7 @@ static void BenchSumKernel(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); } -BENCHMARK(BenchSumKernel)->Apply(BenchmarkSetArgs); +BENCHMARK(RegressionSumKernel)->Apply(RegressionSetArgs); } // namespace compute } // namespace arrow diff 
--git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py new file mode 100644 index 0000000..bf9811f --- /dev/null +++ b/dev/archery/archery/benchmark/compare.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# Define a global regression threshold as 5%. This is purely subjective and +# flawed. This does not track cumulative regression. +DEFAULT_THRESHOLD = 0.05 + + +class BenchmarkComparator: + """ Compares two benchmarks. + + Encodes the logic of comparing two benchmarks and taking a decision on + if it induce a regression. 
+ """ + + def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD, + suite_name=None): + self.contender = contender + self.baseline = baseline + self.threshold = threshold + self.suite_name = suite_name + + @property + def name(self): + return self.baseline.name + + @property + def less_is_better(self): + return self.baseline.less_is_better + + @property + def unit(self): + return self.baseline.unit + + @property + def change(self): + new = self.contender.value + old = self.baseline.value + + if old == 0 and new == 0: + return 0.0 + if old == 0: + return 0.0 + + return float(new - old) / abs(old) + + @property + def confidence(self): + """ Indicate if a comparison of benchmarks should be trusted. """ + return True + + @property + def regression(self): + change = self.change + adjusted_change = change if self.less_is_better else -change + return (self.confidence and adjusted_change > self.threshold) + + def compare(self, comparator=None): + return { + "benchmark": self.name, + "change": self.change, + "regression": self.regression, + "baseline": self.baseline.value, + "contender": self.contender.value, + "unit": self.unit, + "less_is_better": self.less_is_better, + } + + def __call__(self, **kwargs): + return self.compare(**kwargs) + + +def pairwise_compare(contender, baseline): + dict_contender = {e.name: e for e in contender} + dict_baseline = {e.name: e for e in baseline} + + for name in (dict_contender.keys() & dict_baseline.keys()): + yield name, (dict_contender[name], dict_baseline[name]) + + +class RunnerComparator: + """ Compares suites/benchmarks from runners. + + It is up to the caller that ensure that runners are compatible (both from + the same language implementation). 
+ """ + + def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD): + self.contender = contender + self.baseline = baseline + self.threshold = threshold + + def comparisons(self, suite_filter=None, benchmark_filter=None): + """ + """ + contender = self.contender.suites(suite_filter, benchmark_filter) + baseline = self.baseline.suites(suite_filter, benchmark_filter) + suites = pairwise_compare(contender, baseline) + + for suite_name, (suite_cont, suite_base) in suites: + benchmarks = pairwise_compare( + suite_cont.benchmarks, suite_base.benchmarks) + + for bench_name, (bench_cont, bench_base) in benchmarks: + yield BenchmarkComparator(bench_cont, bench_base, + threshold=self.threshold, + suite_name=suite_name) diff --git a/.gitignore b/dev/archery/archery/benchmark/core.py similarity index 50% copy from .gitignore copy to dev/archery/archery/benchmark/core.py index 6bb237a..83bc273 100644 --- a/.gitignore +++ b/dev/archery/archery/benchmark/core.py @@ -15,46 +15,32 @@ # specific language governing permissions and limitations # under the License. 
-apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz - -# Compiled source -*.a -*.dll -*.o -*.py[ocd] -*.so -*.so.* -*.dylib -.build_cache_dir -dependency-reduced-pom.xml -MANIFEST -compile_commands.json -build.ninja - -# Generated Visual Studio files -*.vcxproj -*.vcxproj.* -*.sln -*.iml - -# Linux perf sample data -perf.data -perf.data.old - -cpp/.idea/ -cpp/apidoc/xml/ -docs/example.gz -docs/example1.dat -docs/example3.dat -python/.eggs/ -python/doc/ - -.vscode -.idea/ -.pytest_cache/ -pkgs -.Rproj.user -arrow.Rcheck/ -docker_cache +import pandas as pa + + +class Benchmark: + def __init__(self, name, unit, less_is_better, values, stats=None): + self.name = name + self.unit = unit + self.less_is_better = less_is_better + self.values = pa.Series(values) + self.statistics = self.values.describe() + + @property + def value(self): + median = "50%" + return float(self.statistics[median]) + + def __repr__(self): + return f"Benchmark[name={self.name},value={self.value}]" + + +class BenchmarkSuite: + def __init__(self, name, benchmarks): + self.name = name + self.benchmarks = benchmarks + + def __repr__(self): + name = self.name + benchmarks = self.benchmarks + return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]" diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py new file mode 100644 index 0000000..d6efb77 --- /dev/null +++ b/dev/archery/archery/benchmark/google.py @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from itertools import filterfalse, groupby, tee +import json +import subprocess + +from .core import Benchmark +from ..utils.command import Command + + +def partition(pred, iterable): + # adapted from python's examples + t1, t2 = tee(iterable) + return list(filter(pred, t1)), list(filterfalse(pred, t2)) + + +class GoogleBenchmarkCommand(Command): + """ Run a google benchmark binary. + + This assumes the binary supports the standard command line options, + notably `--benchmark_filter`, `--benchmark_format`, etc... + """ + + def __init__(self, benchmark_bin, benchmark_filter=None): + self.bin = benchmark_bin + self.benchmark_filter = benchmark_filter + + def list_benchmarks(self): + argv = ["--benchmark_list_tests"] + if self.benchmark_filter: + argv.append(f"--benchmark_filter={self.benchmark_filter}") + result = self.run(*argv, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + return str.splitlines(result.stdout.decode("utf-8")) + + def results(self): + argv = ["--benchmark_format=json", "--benchmark_repetitions=20"] + + if self.benchmark_filter: + argv.append(f"--benchmark_filter={self.benchmark_filter}") + + return json.loads(self.run(*argv, stdout=subprocess.PIPE, + stderr=subprocess.PIPE).stdout) + + +class GoogleBenchmarkObservation: + """ Represents one run of a single (google c++) benchmark. + + Observations are found when running with `--benchmark_repetitions`. Sadly, + the format mixes values and aggregates, e.g. 
+ + RegressionSumKernel/32768/0 1 us 1 us 25.8077GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.7066GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.1481GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.846GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.6453GB/s + RegressionSumKernel/32768/0_mean 1 us 1 us 25.6307GB/s + RegressionSumKernel/32768/0_median 1 us 1 us 25.7066GB/s + RegressionSumKernel/32768/0_stddev 0 us 0 us 288.046MB/s + + As from benchmark v1.4.1 (2019-04-24), the only way to differentiate an + actual run from the aggregates, is to match on the benchmark name. The + aggregates will be appended with `_$agg_name`. + + This class encapsulate the logic to separate runs from aggregate . This is + hopefully avoided in benchmark's master version with a separate json + attribute. + """ + + def __init__(self, name, real_time, cpu_time, time_unit, size=None, + bytes_per_second=None, **kwargs): + self._name = name + self.real_time = real_time + self.cpu_time = cpu_time + self.time_unit = time_unit + self.size = size + self.bytes_per_second = bytes_per_second + + @property + def is_agg(self): + """ Indicate if the observation is a run or an aggregate. """ + suffixes = ["_mean", "_median", "_stddev"] + return any(map(lambda x: self._name.endswith(x), suffixes)) + + @property + def is_realtime(self): + """ Indicate if the preferred value is realtime instead of cputime. 
""" + return self.name.find("/realtime") != -1 + + @property + def name(self): + name = self._name + return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name + + @property + def time(self): + return self.real_time if self.is_realtime else self.cpu_time + + @property + def value(self): + """ Return the benchmark value.""" + return self.bytes_per_second if self.size else self.time + + @property + def unit(self): + return "bytes_per_second" if self.size else self.time_unit + + def __repr__(self): + return f"{self.value}" + + +class GoogleBenchmark(Benchmark): + """ A set of GoogleBenchmarkObservations. """ + + def __init__(self, name, runs): + """ Initialize a GoogleBenchmark. + + Parameters + ---------- + name: str + Name of the benchmark + runs: list(GoogleBenchmarkObservation) + Repetitions of GoogleBenchmarkObservation run. + + """ + self.name = name + # exclude google benchmark aggregate artifacts + _, runs = partition(lambda b: b.is_agg, runs) + self.runs = sorted(runs, key=lambda b: b.value) + unit = self.runs[0].unit + # If `size` is found in the json dict, then the benchmark is reported + # in bytes per second + less_is_better = self.runs[0].size is None + values = [b.value for b in self.runs] + super().__init__(name, unit, less_is_better, values) + + def __repr__(self): + return f"GoogleBenchmark[name={self.name},runs={self.runs}]" + + @classmethod + def from_json(cls, payload): + def group_key(x): + return x.name + + benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload) + groups = groupby(sorted(benchmarks, key=group_key), group_key) + return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py new file mode 100644 index 0000000..7dc56bd --- /dev/null +++ b/dev/archery/archery/benchmark/runner.py @@ -0,0 +1,114 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import glob +import os +import re + +from .core import BenchmarkSuite +from .google import GoogleBenchmarkCommand, GoogleBenchmark +from ..lang.cpp import CppCMakeDefinition +from ..utils.cmake import CMakeBuild +from ..utils.logger import logger + + +def regex_filter(re_expr): + if re_expr is None: + return lambda s: True + re_comp = re.compile(re_expr) + return lambda s: re_comp.search(s) + + +class BenchmarkRunner: + def suites(self, suite_filter=None, benchmark_filter=None): + raise NotImplementedError("BenchmarkRunner must implement suites") + + +class CppBenchmarkRunner(BenchmarkRunner): + def __init__(self, build): + """ Initialize a CppBenchmarkRunner. """ + self.build = build + + @property + def suites_binaries(self): + """ Returns a list of benchmark binaries for this build. """ + # Ensure build is up-to-date to run benchmarks + self.build() + # Not the best method, but works for now + glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark") + return {os.path.basename(b): b for b in glob.glob(glob_expr)} + + def suite(self, name, suite_bin, benchmark_filter): + """ Returns the resulting benchmarks for a given suite. 
""" + suite_cmd = GoogleBenchmarkCommand(suite_bin, benchmark_filter) + + # Ensure there will be data + benchmark_names = suite_cmd.list_benchmarks() + if not benchmark_names: + return None + + results = suite_cmd.results() + benchmarks = GoogleBenchmark.from_json(results.get("benchmarks")) + return BenchmarkSuite(name, benchmarks) + + def suites(self, suite_filter=None, benchmark_filter=None): + """ Returns all suite for a runner. """ + suite_matcher = regex_filter(suite_filter) + + suite_and_binaries = self.suites_binaries + for suite_name in suite_and_binaries: + if not suite_matcher(suite_name): + logger.debug(f"Ignoring suite {suite_name}") + continue + + suite_bin = suite_and_binaries[suite_name] + suite = self.suite(suite_name, suite_bin, + benchmark_filter=benchmark_filter) + + # Filter may exclude all benchmarks + if not suite: + logger.debug(f"Suite {suite_name} executed but no results") + continue + + yield suite + + @staticmethod + def from_rev_or_path(src, root, rev_or_path, cmake_conf): + """ Returns a CppBenchmarkRunner from a path or a git revision. + + First, it checks if `rev_or_path` points to a valid CMake build + directory. If so, it creates a CppBenchmarkRunner with this existing + CMakeBuild. + + Otherwise, it assumes `rev_or_path` is a revision and clone/checkout + the given revision and create a fresh CMakeBuild. + """ + build = None + if CMakeBuild.is_build_dir(rev_or_path): + build = CMakeBuild.from_path(rev_or_path) + else: + root_rev = os.path.join(root, rev_or_path) + os.mkdir(root_rev) + + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to + # perform cleanup on cloned repository as root_rev is reclaimed. 
+ src_rev, _ = src.at_revision(rev_or_path, clone_dir) + cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) + build = cmake_def.build(os.path.join(root_rev, "build")) + + return CppBenchmarkRunner(build) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py new file mode 100644 index 0000000..4fa8896 --- /dev/null +++ b/dev/archery/archery/cli.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import click +from contextlib import contextmanager +import json +import logging +import sys +from tempfile import mkdtemp, TemporaryDirectory + +from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD +from .benchmark.runner import CppBenchmarkRunner +from .lang.cpp import CppCMakeDefinition, CppConfiguration +from .utils.codec import JsonEncoder +from .utils.logger import logger, ctx as log_ctx +from .utils.source import ArrowSources + +# Set default logging to INFO in command line. 
+logging.basicConfig(level=logging.INFO) + + +@click.group() +@click.option("--debug", type=bool, is_flag=True, default=False, + help="Increase logging with debugging output.") +@click.option("-q", "--quiet", type=bool, is_flag=True, default=False, + help="Silence executed commands.") +@click.pass_context +def archery(ctx, debug, quiet): + """ Apache Arrow developer utilities. + + See sub-commands help with `archery <cmd> --help`. + + """ + # Ensure ctx.obj exists + ctx.ensure_object(dict) + + log_ctx.quiet = quiet + if debug: + logger.setLevel(logging.DEBUG) + + +def validate_arrow_sources(ctx, param, src): + """ Ensure a directory contains Arrow cpp sources. """ + if isinstance(src, str): + if not ArrowSources.valid(src): + raise click.BadParameter(f"No Arrow C++ sources found in {src}.") + src = ArrowSources(src) + return src + + +build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True) +# Supported build types +build_type = click.Choice(["debug", "relwithdebinfo", "release"], + case_sensitive=False) +# Supported warn levels +warn_level_type = click.Choice(["everything", "checkin", "production"], + case_sensitive=False) + + +@archery.command(short_help="Initialize an Arrow C++ build") +@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(), + callback=validate_arrow_sources, + help="Specify Arrow source directory") +# toolchain +@click.option("--cc", metavar="<compiler>", help="C compiler.") +@click.option("--cxx", metavar="<compiler>", help="C++ compiler.") +@click.option("--cxx-flags", help="C++ compiler flags.") +@click.option("--build-type", default="release", type=build_type, + help="CMake's CMAKE_BUILD_TYPE") +@click.option("--warn-level", default="production", type=warn_level_type, + help="Controls compiler warnings -W(no-)error.") +# components +@click.option("--with-tests", default=True, type=bool, + help="Build with tests.") +@click.option("--with-benchmarks", default=False, type=bool, + help="Build with 
benchmarks.") +@click.option("--with-python", default=True, type=bool, + help="Build with python extension.") +@click.option("--with-parquet", default=False, type=bool, + help="Build with parquet file support.") +@click.option("--with-gandiva", default=False, type=bool, + help="Build with Gandiva expression compiler support.") +@click.option("--with-plasma", default=False, type=bool, + help="Build with Plasma object store support.") +@click.option("--with-flight", default=False, type=bool, + help="Build with Flight rpc support.") +@click.option("--cmake-extras", type=str, multiple=True, + help="Extra flags/options to pass to cmake invocation. " + "Can be stacked") +# misc +@click.option("-f", "--force", type=bool, is_flag=True, default=False, + help="Delete existing build directory if found.") +@click.option("--targets", type=str, multiple=True, + help="Generator targets to run. Can be stacked.") +@click.argument("build_dir", type=build_dir_type) +@click.pass_context +def build(ctx, src, build_dir, force, targets, **kwargs): + """ Initialize a C++ build directory. + + The build command creates a directory initialized with Arrow's cpp source + cmake and configuration. It can also optionally invoke the generator to + test the build (and used in scripts). + + Note that archery will carry the caller environment. It will also not touch + an existing directory, one must use the `--force` option to remove the + existing directory. + + Examples: + + \b + # Initialize build with clang7 and avx2 support in directory `clang7-build` + \b + archery build --cc=clang-7 --cxx=clang++-7 --cxx-flags=-mavx2 clang7-build + + \b + # Builds and run test + archery build --targets=all --targets=test build + """ + # Arrow's cpp cmake configuration + conf = CppConfiguration(**kwargs) + # This is a closure around cmake invocation, e.g. 
calling `def.build()` + # yields a directory ready to be run with the generator + cmake_def = CppCMakeDefinition(src.cpp, conf) + # Create build directory + build = cmake_def.build(build_dir, force=force) + + for target in targets: + build.run(target) + + +@contextmanager +def tmpdir(preserve, prefix="arrow-bench-"): + if preserve: + yield mkdtemp(prefix=prefix) + else: + with TemporaryDirectory(prefix=prefix) as tmp: + yield tmp + + +# Running all benchmarks would be prohibitive. Benchmark who needs to be +# monitored for regression should be named with this prefix. +DEFAULT_BENCHMARK_FILTER = "^Regression" + + +@archery.group() +@click.pass_context +def benchmark(ctx): + """ Arrow benchmarking. + + Use the diff sub-command to benchmake revisions, and/or build directories. + """ + pass + + +@benchmark.command(name="diff", short_help="Run the C++ benchmark suite") +@click.option("--src", metavar="<arrow_src>", show_default=True, + default=ArrowSources.find(), + callback=validate_arrow_sources, + help="Specify Arrow source directory") +@click.option("--suite-filter", metavar="<regex>", show_default=True, + type=str, default=None, help="Regex filtering benchmark suites.") +@click.option("--benchmark-filter", metavar="<regex>", show_default=True, + type=str, default=DEFAULT_BENCHMARK_FILTER, + help="Regex filtering benchmark suites.") +@click.option("--preserve", type=bool, default=False, show_default=True, + is_flag=True, help="Preserve workspace for investigation.") +@click.option("--threshold", type=float, default=DEFAULT_THRESHOLD, + show_default=True, + help="Regression failure threshold in percentage.") +@click.option("--cmake-extras", type=str, multiple=True, + help="Extra flags/options to pass to cmake invocation. 
" + "Can be stacked") +@click.argument("contender", metavar="[<contender>", + default=ArrowSources.WORKSPACE, required=False) +@click.argument("baseline", metavar="[<baseline>]]", default="master", + required=False) +@click.pass_context +def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, + threshold, cmake_extras, contender, baseline): + """ Compare (diff) benchmark runs. + + This command acts like git-diff but for benchmark results. + + The caller can optionally specify both the contender and the baseline. If + unspecified, the contender will default to the current workspace (like git) + and the baseline will default to master. + + Each target (contender or baseline) can either be a git revision + (commit, tag, special values like HEAD) or a cmake build directory. This + allow comparing git commits, and/or different compilers and/or compiler + flags. + + When a commit is referenced, a local clone of the arrow sources (specified + via --src) is performed and the proper branch is created. This is done in + a temporary directory which can be left intact with the `---preserve` flag. + + The special token "WORKSPACE" is reserved to specify the current git + workspace. This imply that no clone will be performed. 
+ + Examples: + + \b + # Compare workspace (contender) with master (baseline) + \b + archery benchmark diff + + \b + # Compare master (contender) with latest version (baseline) + \b + export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1) + \b + archery benchmark diff master "$LAST" + + \b + # Compare g++7 (contender) with clang++-7 (baseline) builds + \b + archery build --with-benchmarks=true \\ + --cxx-flags=-ftree-vectorize \\ + --cc=gcc-7 --cxx=g++-7 gcc7-build + \b + archery build --with-benchmarks=true \\ + --cxx-flags=-flax-vector-conversions \\ + --cc=clang-7 --cxx=clang++-7 clang7-build + \b + archery benchmark diff gcc7-build clang7-build + + \b + # Compare default targets but scoped to the suites matching + # `^arrow-compute-aggregate` and benchmarks matching `(Sum|Mean)Kernel`. + \b + archery benchmark diff --suite-filter="^arrow-compute-aggregate" \\ + --benchmark-filter="(Sum|Mean)Kernel" + """ + with tmpdir(preserve) as root: + logger.debug(f"Comparing {contender} (contender) with " + f"{baseline} (baseline)") + + conf = CppConfiguration( + build_type="release", with_tests=True, with_benchmarks=True, + with_python=False, cmake_extras=cmake_extras) + + runner_cont = CppBenchmarkRunner.from_rev_or_path( + src, root, contender, conf) + runner_base = CppBenchmarkRunner.from_rev_or_path( + src, root, baseline, conf) + + runner_comp = RunnerComparator(runner_cont, runner_base, threshold) + comparisons = runner_comp.comparisons(suite_filter, benchmark_filter) + + regressions = 0 + for comparator in comparisons: + regressions += comparator.regression + print(json.dumps(comparator, cls=JsonEncoder)) + + sys.exit(regressions) + + +if __name__ == "__main__": + archery(obj={}) diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py new file mode 100644 index 0000000..84b6346 --- /dev/null +++ b/dev/archery/archery/lang/cpp.py @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from ..utils.cmake import CMakeDefinition + + +def truthifier(value): + return "ON" if value else "OFF" + + +def or_else(value, default): + return value if value else default + + +class CppConfiguration: + def __init__(self, + # toolchain + cc=None, cxx=None, cxx_flags=None, + build_type=None, warn_level=None, + install_prefix=None, use_conda=None, + # components + with_tests=True, with_benchmarks=False, with_python=True, + with_parquet=False, with_gandiva=False, with_plasma=False, + with_flight=False, cmake_extras=None): + self.cc = cc + self.cxx = cxx + self.cxx_flags = cxx_flags + + self.build_type = build_type + self.warn_level = warn_level + self._install_prefix = install_prefix + self._use_conda = use_conda + + self.with_tests = with_tests + self.with_benchmarks = with_benchmarks + self.with_python = with_python + self.with_parquet = with_parquet + self.with_gandiva = with_gandiva + self.with_plasma = with_plasma + self.with_flight = with_flight + self.cmake_extras = cmake_extras + + def _gen_defs(self): + if self.cxx_flags: + yield ("ARROW_CXXFLAGS", self.cxx_flags) + + yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) + yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) + + # if not ctx.quiet: + # yield 
("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + + maybe_prefix = self.install_prefix + if maybe_prefix: + yield ("CMAKE_INSTALL_PREFIX", maybe_prefix) + + yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests)) + yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks)) + + yield ("ARROW_PYTHON", truthifier(self.with_python)) + yield ("ARROW_PARQUET", truthifier(self.with_parquet)) + yield ("ARROW_GANDIVA", truthifier(self.with_gandiva)) + yield ("ARROW_PLASMA", truthifier(self.with_plasma)) + yield ("ARROW_FLIGHT", truthifier(self.with_flight)) + + # Detect custom conda toolchain + if self.use_conda: + for d, v in [('CMAKE_AR', 'AR'), ('CMAKE_RANLIB', 'RANLIB')]: + v = os.environ.get(v) + if v: + yield (d, v) + + @property + def install_prefix(self): + if self._install_prefix: + return self._install_prefix + + if self.use_conda: + return os.environ.get("CONDA_PREFIX") + + return None + + @property + def use_conda(self): + # If the user didn't specify a preference, guess via environment + if self._use_conda is None: + return os.environ.get("CONDA_PREFIX") is not None + + return self._use_conda + + @property + def definitions(self): + extras = list(self.cmake_extras) if self.cmake_extras else [] + return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] + extras + + @property + def environment(self): + env = os.environ.copy() + + if self.cc: + env["CC"] = self.cc + + if self.cxx: + env["CXX"] = self.cxx + + return env + + +class CppCMakeDefinition(CMakeDefinition): + def __init__(self, source, conf, **kwargs): + self.configuration = conf + super().__init__(source, **kwargs, + definitions=conf.definitions, env=conf.environment, + build_type=conf.build_type) diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py new file mode 100644 index 0000000..38aedab --- /dev/null +++ b/dev/archery/archery/utils/cmake.py @@ -0,0 +1,213 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import re +from shutil import rmtree, which + +from .command import Command + + +class CMake(Command): + def __init__(self, cmake_bin=None): + self.bin = cmake_bin if cmake_bin else os.environ.get("CMAKE", "cmake") + + @staticmethod + def default_generator(): + """ Infer default generator. + + Gives precedence to ninja if there exists an executable named `ninja` + in the search path. + """ + found_ninja = which("ninja") + return "Ninja" if found_ninja else "Make" + + +cmake = CMake() + + +class CMakeDefinition: + """ CMakeDefinition captures the cmake invocation arguments. + + It allows creating build directories with the same definition, e.g. + ``` + build_1 = cmake_def.build("/tmp/build-1") + build_2 = cmake_def.build("/tmp/build-2") + + ... + + build1.all() + build2.all() + """ + + def __init__(self, source, build_type="release", generator=None, + definitions=None, env=None): + """ Initialize a CMakeDefinition + + Parameters + ---------- + source : str + Source directory where the top-level CMakeLists.txt is + located. This is usually the root of the project. + generator : str, optional + definitions: list(str), optional + env : dict(str,str), optional + Environment to use when invoking cmake. This can be required to + work around cmake deficiencies, e.g. CC and CXX. 
+ """ + self.source = os.path.abspath(source) + self.build_type = build_type + self.generator = generator if generator else cmake.default_generator() + self.definitions = definitions if definitions else [] + self.env = env + + @property + def arguments(self): + """" Return the arguments to cmake invocation. """ + arguments = [ + f"-G{self.generator}", + ] + self.definitions + [ + self.source + ] + return arguments + + def build(self, build_dir, force=False, **kwargs): + """ Invoke cmake into a build directory. + + Parameters + ---------- + build_dir : str + Directory in which the CMake build will be instanciated. + force : bool + If the build folder exists, delete it before. Otherwise if it's + present, an error will be returned. + """ + if os.path.exists(build_dir): + # Extra safety to ensure we're deleting a build folder. + if not CMakeBuild.is_build_dir(build_dir): + raise FileExistsError(f"{build_dir} is not a cmake build") + if not force: + raise FileExistsError(f"{build_dir} exists use force=True") + rmtree(build_dir) + + os.mkdir(build_dir) + + cmake(*self.arguments, cwd=build_dir, env=self.env) + return CMakeBuild(build_dir, self.generator.lower(), self.build_type, + definition=self, **kwargs) + + def __repr__(self): + return f"CMakeDefinition[source={self.source}]" + + +CMAKE_BUILD_TYPE_RE = re.compile("CMAKE_BUILD_TYPE:STRING=([a-zA-Z]+)") + + +class CMakeBuild(Command): + """ CMakeBuild represents a build directory initialized by cmake. + + The build instance can be used to build/test/install. It alleviates the + user to know which generator is used. + """ + + def __init__(self, build_dir, generator, build_type, definition=None): + """ Initialize a CMakeBuild. + + The caller must ensure that cmake was invoked in the build directory. + + Parameters + ---------- + definition : CMakeDefinition + The definition to build from. + build_dir : str + The build directory to setup into. 
+ """ + assert CMakeBuild.is_build_dir(build_dir) + self.build_dir = os.path.abspath(build_dir) + self.bin = generator + self.build_type = build_type + self.definition = definition + + @property + def binaries_dir(self): + return os.path.join(self.build_dir, self.build_type) + + def run(self, *argv, verbose=False, **kwargs): + extra = [] + if verbose: + extra.append("-v" if self.bin.endswith("ninja") else "VERBOSE=1") + # Commands must be ran under the build directory + super().run(*extra, *argv, **kwargs, cwd=self.build_dir) + return self + + def all(self): + return self.run("all") + + def clean(self): + return self.run("clean") + + def install(self): + return self.run("install") + + def test(self): + return self.run("test") + + @staticmethod + def is_build_dir(path): + """ Indicate if a path is CMake build directory. + + This method only checks for the existence of paths and does not do any + validation whatsoever. + """ + cmake_cache = os.path.join(path, "CMakeCache.txt") + cmake_files = os.path.join(path, "CMakeFiles") + return os.path.exists(cmake_cache) and os.path.exists(cmake_files) + + @staticmethod + def from_path(path): + """ Instantiate a CMakeBuild from a path. + + This is used to recover from an existing physical directory (created + with or without CMakeBuild). + + Note that this method is not idempotent as the original definition will + be lost. Only some parameters are recovered (generator and build_type). 
+ """ + if not CMakeBuild.is_build_dir(path): + raise ValueError(f"Not a valid CMakeBuild path: {path}") + + generator = "make" + if os.path.exists(os.path.join(path, "build.ninja")): + generator = "ninja" + + build_type = None + # Infer build_type by looking at CMakeCache.txt and looking for a magic + # definition + cmake_cache_path = os.path.join(path, "CMakeCache.txt") + with open(cmake_cache_path, "r") as cmake_cache: + candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read()) + build_type = candidates[0].lower() if candidates else "release" + + return CMakeBuild(path, generator, build_type) + + def __repr__(self): + return ("CMakeBuild[" + "build = {}," + "build_type = {}," + "definition = {}]".format(self.build_dir, + self.build_type, + self.definition)) diff --git a/.gitignore b/dev/archery/archery/utils/codec.py similarity index 52% copy from .gitignore copy to dev/archery/archery/utils/codec.py index 6bb237a..612f2df 100644 --- a/.gitignore +++ b/dev/archery/archery/utils/codec.py @@ -15,46 +15,29 @@ # specific language governing permissions and limitations # under the License. 
-apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz - -# Compiled source -*.a -*.dll -*.o -*.py[ocd] -*.so -*.so.* -*.dylib -.build_cache_dir -dependency-reduced-pom.xml -MANIFEST -compile_commands.json -build.ninja - -# Generated Visual Studio files -*.vcxproj -*.vcxproj.* -*.sln -*.iml - -# Linux perf sample data -perf.data -perf.data.old - -cpp/.idea/ -cpp/apidoc/xml/ -docs/example.gz -docs/example1.dat -docs/example3.dat -python/.eggs/ -python/doc/ - -.vscode -.idea/ -.pytest_cache/ -pkgs -.Rproj.user -arrow.Rcheck/ -docker_cache + +import json + +from ..benchmark.compare import BenchmarkComparator + + +class JsonEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, BenchmarkComparator): + comparator = { + "benchmark": o.name, + "change": o.change, + "regression": o.regression, + "baseline": o.baseline.value, + "contender": o.contender.value, + "unit": o.unit, + "less_is_better": o.less_is_better, + } + + suite_name = o.suite_name + if suite_name: + comparator["suite"] = suite_name + + return comparator + + return json.JSONEncoder.default(self, o) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py new file mode 100644 index 0000000..46d0066 --- /dev/null +++ b/dev/archery/archery/utils/command.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import shutil +import subprocess + +from .logger import logger, ctx + + +def find_exec(executable): + exec_exists = os.path.exists(executable) + return executable if exec_exists else shutil.which(executable) + + +# Decorator running a command and returning stdout +class capture_stdout: + def __init__(self, strip=False): + self.strip = strip + + def __call__(self, f): + def strip_it(x): + return x.strip() if self.strip else x + + def wrapper(*argv, **kwargs): + # Ensure stdout is captured + kwargs["stdout"] = subprocess.PIPE + return strip_it(f(*argv, **kwargs).stdout) + return wrapper + + +class Command: + """ A runnable command. + + Class inheriting from the Command class must provide the bin + property/attribute. + """ + + def run(self, *argv, **kwargs): + assert(hasattr(self, "bin")) + invocation = [find_exec(self.bin)] + invocation.extend(argv) + + for key in ["stdout", "stderr"]: + # Preserve caller intention, otherwise silence + if key not in kwargs and ctx.quiet: + kwargs[key] = subprocess.PIPE + + # Prefer safe by default + if "check" not in kwargs: + kwargs["check"] = True + + logger.debug(f"Executing `{invocation}`") + return subprocess.run(invocation, **kwargs) + + def __call__(self, *argv, **kwargs): + self.run(*argv, **kwargs) diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py new file mode 100644 index 0000000..c611352 --- /dev/null +++ b/dev/archery/archery/utils/git.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .command import Command, capture_stdout + + +# Decorator prepending argv with the git sub-command found with the method +# name. +def git_cmd(fn): + # function name is the subcommand + sub_cmd = fn.__name__.replace("_", "-") + + def wrapper(self, *argv, **kwargs): + return fn(self, sub_cmd, *argv, **kwargs) + return wrapper + + +class Git(Command): + def __init__(self, git_bin=None): + self.bin = git_bin if git_bin else os.environ.get("GIT", "git") + + def run_cmd(self, cmd, *argv, git_dir=None, **kwargs): + """ Inject flags before sub-command in argv. """ + opts = [] + if git_dir and isinstance(git_dir, str): + opts.extend(("-C", git_dir)) + + return self.run(*opts, cmd, *argv, **kwargs) + + @git_cmd + def clone(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) + + @git_cmd + def checkout(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) + + @git_cmd + def log(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) + + @git_cmd + def rev_parse(self, *argv, **kwargs): + print(self.head()) + return self.run_cmd(*argv, **kwargs) + + @capture_stdout(strip=True) + def head(self, **kwargs): + """ Return commit pointed by HEAD. 
""" + return self.rev_parse("HEAD", **kwargs) + + @capture_stdout(strip=True) + def current_branch(self, **kwargs): + return self.rev_parse("--abbrev-ref", "HEAD", **kwargs) + + +git = Git() diff --git a/.gitignore b/dev/archery/archery/utils/logger.py similarity index 61% copy from .gitignore copy to dev/archery/archery/utils/logger.py index 6bb237a..9d0feda 100644 --- a/.gitignore +++ b/dev/archery/archery/utils/logger.py @@ -15,46 +15,15 @@ # specific language governing permissions and limitations # under the License. -apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz +import logging -# Compiled source -*.a -*.dll -*.o -*.py[ocd] -*.so -*.so.* -*.dylib -.build_cache_dir -dependency-reduced-pom.xml -MANIFEST -compile_commands.json -build.ninja +""" Global logger. """ +logger = logging.getLogger("archery") -# Generated Visual Studio files -*.vcxproj -*.vcxproj.* -*.sln -*.iml -# Linux perf sample data -perf.data -perf.data.old +class LoggingContext: + def __init__(self, quiet=False): + self.quiet = quiet -cpp/.idea/ -cpp/apidoc/xml/ -docs/example.gz -docs/example1.dat -docs/example3.dat -python/.eggs/ -python/doc/ -.vscode -.idea/ -.pytest_cache/ -pkgs -.Rproj.user -arrow.Rcheck/ -docker_cache +ctx = LoggingContext() diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py new file mode 100644 index 0000000..12dc735 --- /dev/null +++ b/dev/archery/archery/utils/source.py @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .git import git + + +class ArrowSources: + """ ArrowSources is a companion class representing a directory containing + Apache Arrow's sources. + """ + # Note that WORKSPACE is a reserved git revision name by this module to + # reference the current git workspace. In other words, this indicates to + # ArrowSources.at_revision that no cloning/checkout is required. + WORKSPACE = "WORKSPACE" + + def __init__(self, path): + """ Initialize an ArrowSources + + The caller must ensure that path is valid arrow source directory (can + be checked with ArrowSources.valid) + + Parameters + ---------- + path : src + """ + assert isinstance(path, str) and ArrowSources.valid(path) + self.path = path + + @property + def cpp(self): + """ Returns the cpp directory of an Arrow sources. """ + return os.path.join(self.path, "cpp") + + @property + def python(self): + """ Returns the python directory of an Arrow sources. """ + return os.path.join(self.path, "python") + + @property + def git_backed(self): + """ Indicate if the sources are backed by git. """ + git_path = os.path.join(self.path, ".git") + return os.path.exists(git_path) + + def at_revision(self, revision, clone_dir): + """ Return a copy of the current sources for a specified git revision. + + This method may return the current object if no checkout is required. + The caller is responsible to remove the cloned repository directory. + + The user can use the special WORKSPACE token to mean the current git + workspace (no checkout performed). 
+ + The second value of the returned tuple indicates if a clone was + performed. + + Parameters + ---------- + revision : str + Revision to checkout sources at. + clone_dir : str + Path to checkout the local clone. + """ + if not self.git_backed: + raise ValueError(f"{self} is not backed by git") + + if revision == ArrowSources.WORKSPACE: + return self, False + + # A local clone is required to leave the current sources intact such + # that builds depending on said sources are not invalidated (or worse + # slightly affected when re-invoking the generator). + git.clone("--local", self.path, clone_dir) + git.checkout(revision, git_dir=clone_dir) + + return ArrowSources(clone_dir), True + + @staticmethod + def valid(src): + """ Indicate if current sources are valid. """ + if isinstance(src, ArrowSources): + return True + if isinstance(src, str): + cpp_path = os.path.join(src, "cpp") + cmake_path = os.path.join(cpp_path, "CMakeLists.txt") + return os.path.exists(cmake_path) + return False + + @staticmethod + def find(path=None): + """ Infer Arrow sources directory from various method. + + The following guesses are done in order until a valid match is found: + + 1. Checks the given optional parameter. + + 2. Checks if the environment variable `ARROW_SRC` is defined and use + this. + + 3. Checks if the current working directory (cwd) is an Arrow source + directory. + + 4. Checks if this file (cli.py) is still in the original source + repository. If so, returns the relative path to the source + directory. 
+ """ + + # Explicit via environment + env = os.environ.get("ARROW_SRC") + + # Implicit via cwd + cwd = os.getcwd() + + # Implicit via current file + this_dir = os.path.dirname(os.path.realpath(__file__)) + this = os.path.join(this_dir, "..", "..", "..", "..") + + for p in [path, env, cwd, this]: + if ArrowSources.valid(p): + return ArrowSources(p) + + return None + + def __repr__(self): + return f"{self.path}" diff --git a/.gitignore b/dev/archery/setup.py similarity index 60% copy from .gitignore copy to dev/archery/setup.py index 6bb237a..2cf692c 100644 --- a/.gitignore +++ b/dev/archery/setup.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,46 +16,25 @@ # specific language governing permissions and limitations # under the License. -apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz +import sys +from setuptools import setup -# Compiled source -*.a -*.dll -*.o -*.py[ocd] -*.so -*.so.* -*.dylib -.build_cache_dir -dependency-reduced-pom.xml -MANIFEST -compile_commands.json -build.ninja -# Generated Visual Studio files -*.vcxproj -*.vcxproj.* -*.sln -*.iml +if sys.version_info < (3, 5): + sys.exit('Python < 3.5 is not supported') -# Linux perf sample data -perf.data -perf.data.old -cpp/.idea/ -cpp/apidoc/xml/ -docs/example.gz -docs/example1.dat -docs/example3.dat -python/.eggs/ -python/doc/ - -.vscode -.idea/ -.pytest_cache/ -pkgs -.Rproj.user -arrow.Rcheck/ -docker_cache +setup( + name='archery', + version="0.1.0", + description='Apache Arrow Developers Tools', + url='http://github.com/apache/arrow', + maintainer='Arrow Developers', + maintainer_email='d...@arrow.apache.org', + packages=['archery'], + install_requires=['click', 'pandas'], + entry_points=''' + [console_scripts] + archery=archery.cli:archery + ''', +) diff --git a/.gitignore b/dev/archery/tests/test_benchmarks.py similarity 
index 51% copy from .gitignore copy to dev/archery/tests/test_benchmarks.py index 6bb237a..d199a40 100644 --- a/.gitignore +++ b/dev/archery/tests/test_benchmarks.py @@ -15,46 +15,25 @@ # specific language governing permissions and limitations # under the License. -apache-rat-*.jar -arrow-src.tar -arrow-src.tar.gz +from archery.benchmark.core import Benchmark +from archery.benchmark.compare import BenchmarkComparator -# Compiled source -*.a -*.dll -*.o -*.py[ocd] -*.so -*.so.* -*.dylib -.build_cache_dir -dependency-reduced-pom.xml -MANIFEST -compile_commands.json -build.ninja -# Generated Visual Studio files -*.vcxproj -*.vcxproj.* -*.sln -*.iml +def test_benchmark_comparator(): + unit = "micros" -# Linux perf sample data -perf.data -perf.data.old + assert not BenchmarkComparator( + Benchmark("contender", unit, True, [10]), + Benchmark("baseline", unit, True, [20])).regression -cpp/.idea/ -cpp/apidoc/xml/ -docs/example.gz -docs/example1.dat -docs/example3.dat -python/.eggs/ -python/doc/ + assert BenchmarkComparator( + Benchmark("contender", unit, False, [10]), + Benchmark("baseline", unit, False, [20])).regression -.vscode -.idea/ -.pytest_cache/ -pkgs -.Rproj.user -arrow.Rcheck/ -docker_cache + assert BenchmarkComparator( + Benchmark("contender", unit, True, [20]), + Benchmark("baseline", unit, True, [10])).regression + + assert not BenchmarkComparator( + Benchmark("contender", unit, False, [20]), + Benchmark("baseline", unit, False, [10])).regression diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst new file mode 100644 index 0000000..d0e6f1b --- /dev/null +++ b/docs/source/developers/benchmarks.rst @@ -0,0 +1,127 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. 
"License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _benchmarks: + +********** +Benchmarks +********** + +Archery +======= + +``archery`` is a python library and command line utility made to interact with +Arrow's sources. The main feature is the benchmarking process. + +Installation +~~~~~~~~~~~~ + +The simplest way to install archery is with pip from the top-level directory. +It is recommended to use the ``-e,--editable`` flag so that pip don't copy +the module files but uses the actual sources. + +.. code-block:: shell + + pip install -e dev/archery + archery --help + + # optional: enable bash/zsh autocompletion + eval "$(_ARCHERY_COMPLETE=source archery)" + +Comparison +========== + +One goal with benchmarking is to detect performance regressions. To this end, +``archery`` implements a benchmark comparison facility via the ``benchmark +diff`` command. + +In the default invocation, it will compare the current source (known as the +current workspace in git) with local master branch. + +For more information, invoke the ``archery benchmark diff --help`` command for +multiple examples of invocation. + +Iterating efficiently +~~~~~~~~~~~~~~~~~~~~~ + +Iterating with benchmark development can be a tedious process due to long +build time and long run times. ``archery benchmark diff`` provides 2 methods +to reduce this overhead. + +First, the benchmark command supports comparing existing +build directories, This can be paired with the ``--preserve`` flag to +avoid rebuilding sources from zero. + +.. 
code-block:: shell + + # The first invocation clones and checks out in a temporary directory. The + # directory is preserved with --preserve + archery benchmark diff --preserve + + # Modify C++ sources + + # Re-run benchmark in the previously created build directory. + archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build + +Second, the benchmark command supports filtering suites (``--suite-filter``) +and benchmarks (``--benchmark-filter``); both options support regular +expressions. + +.. code-block:: shell + + # Taking over a previous run, but only filtering for benchmarks matching + # `Kernel` and suite matching `compute-aggregate`. + archery benchmark diff \ + --suite-filter=compute-aggregate --benchmark-filter=Kernel \ + /tmp/arrow-bench*/{WORKSPACE,master}/build + +Both methods can be combined. + +Regression detection +==================== + +Writing a benchmark +~~~~~~~~~~~~~~~~~~~ + +1. The benchmark command will filter (by default) benchmarks with the regular + expression ``^Regression``. This way, not all benchmarks are run by default. + Thus, if you want your benchmark to be verified for regression + automatically, the name must match. + +2. The benchmark command will run with the ``--benchmark_repetitions=K`` + option for statistical significance. Thus, a benchmark should not override + the repetitions in the (C++) benchmark's arguments definition. + +3. Due to #2, a benchmark should run sufficiently fast. Often, when the input + does not fit in cache (L2/L3), the benchmark will be memory bound instead + of CPU bound. In this case, the input can be downsized. + +Scripting +========= + +``archery`` is written as a python library with a command line frontend. The +library can be imported to automate some tasks. + +Some invocations of the command line interface can be quite verbose due to build +output. This can be controlled/avoided with the ``--quiet`` option, e.g. + +.. 
code-block:: shell + + archery --quiet benchmark diff --benchmark-filter=Kernel + {"benchmark": "BenchSumKernel/32768/0", "change": -0.6498, "regression": true, ... + {"benchmark": "BenchSumKernel/32768/1", "change": 0.01553, "regression": false, ... + ... diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst index a58f969..d309638 100644 --- a/docs/source/developers/index.rst +++ b/docs/source/developers/index.rst @@ -22,4 +22,5 @@ cpp python integration + benchmarks documentation diff --git a/python/.gitignore b/python/.gitignore index 3346aa6..8f08f93 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -25,8 +25,6 @@ pyarrow/include build # setup.py dist directory dist -# Egg metadata -*.egg-info # Coverage .coverage coverage.xml