This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
     new 3330d66 ARROW-4118: [Python] Fix benchmark setup for "asv run"
3330d66 is described below

commit 3330d660643a034168b472b52aebfe0fea84b8cf
Author: Antoine Pitrou <anto...@python.org>
AuthorDate: Wed Jan 9 16:14:25 2019 -0600

    ARROW-4118: [Python] Fix benchmark setup for "asv run"

    "conda activate" unfortunately isn't available from a non-interactive shell, and running bash as interactive doesn't look like a workable solution.

    Also fix a setup slowness issue in the Parquet benchmarks, and fix a C++ ABI issue by downloading packages from Anaconda rather than conda-forge.

    Author: Antoine Pitrou <anto...@python.org>

    Closes #3357 from pitrou/ARROW-4118-fix-asv-run and squashes the following commits:

    b07b68e61 <Antoine Pitrou> ARROW-4118: Fix benchmark setup for "asv run"
---
 docs/source/python/benchmarks.rst | 24 +++++++++++++-----------
 python/asv-build.sh               | 17 ++++++++++++-----
 python/asv.conf.json              |  4 +++-
 python/benchmarks/parquet.py      | 16 +++++++++-------
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst
index 7672294..12205c5 100644
--- a/docs/source/python/benchmarks.rst
+++ b/docs/source/python/benchmarks.rst
@@ -19,35 +19,37 @@ Benchmarks
 ==========

 The ``pyarrow`` package comes with a suite of benchmarks meant to
-run with `asv`_. You'll need to install the ``asv`` package first
+run with `ASV`_. You'll need to install the ``asv`` package first
 (``pip install asv`` or ``conda install -c conda-forge asv``).

-The benchmarks are run using `asv`_ which is also their only requirement.
-
 Running the benchmarks
 ----------------------

-To run the benchmarks, call ``asv run --python=same``. You cannot use the
-plain ``asv run`` command at the moment as asv cannot handle python packages
-in subdirectories of a repository.
+To run the benchmarks for a locally-built Arrow, run ``asv dev`` or
+``asv run --python=same``.

-Running with arbitrary revisions
---------------------------------
+Running for arbitrary Git revisions
+-----------------------------------

 ASV allows to store results and generate graphs of the benchmarks over
-the project's evolution. For this you have the latest development version of ASV:
+the project's evolution. You need to have the latest development version of ASV:

 .. code::

     pip install git+https://github.com/airspeed-velocity/asv

+The build scripts assume that Conda's ``activate`` script is on the PATH
+(the ``conda activate`` command unfortunately isn't available from
+non-interactive scripts).
+
 Now you should be ready to run ``asv run`` or whatever other command
-suits your needs.
+suits your needs. Note that this can be quite long, as each Arrow needs
+to be rebuilt for each Git revision you're running the benchmarks for.

 Compatibility
 -------------

 We only expect the benchmarking setup to work with Python 3.6 or later,
-on a Unix-like system.
+on a Unix-like system with bash.

 .. _asv: https://asv.readthedocs.org/
diff --git a/python/asv-build.sh b/python/asv-build.sh
index 7b55456..90c7872 100755
--- a/python/asv-build.sh
+++ b/python/asv-build.sh
@@ -21,7 +21,9 @@ set -e

 # ASV doesn't activate its conda environment for us
 if [ -z "$ASV_ENV_DIR" ]; then exit 1; fi
-conda activate $ASV_ENV_DIR
+# Avoid "conda activate" because it's only set up in interactive shells
+# (https://github.com/conda/conda/issues/8072)
+source activate $ASV_ENV_DIR
 echo "== Conda Prefix for benchmarks: " $CONDA_PREFIX " =="

 # Build Arrow C++ libraries
@@ -32,6 +34,8 @@ export ORC_HOME=$CONDA_PREFIX
 export PROTOBUF_HOME=$CONDA_PREFIX
 export BOOST_ROOT=$CONDA_PREFIX

+export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=1"
+
 pushd ../cpp
 mkdir -p build
 pushd build
@@ -40,9 +44,11 @@ cmake -GNinja \
       -DCMAKE_BUILD_TYPE=release \
       -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
       -DARROW_CXXFLAGS=$CXXFLAGS \
-      -DARROW_PYTHON=ON \
-      -DARROW_PLASMA=ON \
-      -DARROW_BUILD_TESTS=OFF \
+      -DARROW_USE_GLOG=off \
+      -DARROW_PARQUET=on \
+      -DARROW_PYTHON=on \
+      -DARROW_PLASMA=on \
+      -DARROW_BUILD_TESTS=off \
       ..
 cmake --build . --target install

@@ -52,7 +58,8 @@ popd
 # Build pyarrow wrappers
 export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1
 export PYARROW_BUILD_TYPE=release
-export PYARROW_PARALLEL=4
+export PYARROW_PARALLEL=8
+export PYARROW_WITH_PARQUET=1
 export PYARROW_WITH_PLASMA=1

 python setup.py clean
diff --git a/python/asv.conf.json b/python/asv.conf.json
index 40938ee..09031c8 100644
--- a/python/asv.conf.json
+++ b/python/asv.conf.json
@@ -35,6 +35,7 @@
     // of the repository.
     "repo_subdir": "python",

+    // Custom build commands for Arrow.
     "build_command": ["/bin/bash {build_dir}/asv-build.sh"],
     "install_command": ["/bin/bash {build_dir}/asv-install.sh"],
     "uninstall_command": ["/bin/bash {build_dir}/asv-uninstall.sh"],
@@ -56,7 +57,8 @@
     // determined by looking for tools on the PATH environment
     // variable.
     "environment_type": "conda",
-    "conda_channels": ["conda-forge", "defaults"],
+    // Avoid conda-forge to avoid C++ ABI issues
+    "conda_channels": ["defaults"],

     // the base URL to show a commit for the project.
     "show_commit_url": "https://github.com/apache/arrow/commit/",
diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py
index fd61793..4f55587 100644
--- a/python/benchmarks/parquet.py
+++ b/python/benchmarks/parquet.py
@@ -15,11 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.

-import pandas as pd
-import random
 import shutil
 import tempfile

+import numpy as np
+import pandas as pd
+
 import pyarrow as pa
 try:
     import pyarrow.parquet as pq
@@ -38,18 +39,19 @@ class ParquetManifestCreation(object):

     def setup(self, num_partitions, num_threads):
         if pq is None:
-            raise NotImplementedError
+            raise NotImplementedError("Parquet support not enabled")

         self.tmpdir = tempfile.mkdtemp('benchmark_parquet')
-        num1 = [random.choice(range(0, num_partitions))
-                for _ in range(self.size)]
-        num2 = [random.choice(range(0, 1000)) for _ in range(self.size)]
+        rnd = np.random.RandomState(42)
+        num1 = rnd.randint(0, num_partitions, size=self.size)
+        num2 = rnd.randint(0, 1000, size=self.size)
         output_df = pd.DataFrame({'num1': num1, 'num2': num2})
         output_table = pa.Table.from_pandas(output_df)
         pq.write_to_dataset(output_table, self.tmpdir, ['num1'])

     def teardown(self, num_partitions, num_threads):
-        shutil.rmtree(self.tmpdir)
+        if self.tmpdir is not None:
+            shutil.rmtree(self.tmpdir)

     def time_manifest_creation(self, num_partitions, num_threads):
         pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads)