This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new a9ef7459 chore(dev/benchmarks): Add Python/asv benchmarking setup
(#412)
a9ef7459 is described below
commit a9ef7459dec2da8a28b4e00fd9cec5b825905432
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Mar 27 12:24:30 2024 -0300
chore(dev/benchmarks): Add Python/asv benchmarking setup (#412)
This PR adds a basic asv benchmarking setup for the Python bindings with
a small suite of high-level benchmarks tracking a few of the
opportunities I happen to know involve some Python looping.
---
.github/workflows/benchmarks.yaml | 14 +-
.gitignore | 1 +
dev/benchmarks/.gitignore | 3 +
dev/benchmarks/asv.conf.json | 194 ++++++++++++++++++++++
dev/benchmarks/{.gitignore => python/__init__.py} | 4 -
dev/benchmarks/python/array.py | 112 +++++++++++++
dev/benchmarks/python/ipc.py | 64 +++++++
dev/benchmarks/python/schema.py | 41 +++++
8 files changed, 426 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/benchmarks.yaml
b/.github/workflows/benchmarks.yaml
index de528200..537eeeb4 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -39,17 +39,25 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
+ with:
+ cache: 'pip'
- - name: Install pyarrow
+ - name: Install Python dependencies
run: |
- pip install pyarrow
+ pip install pyarrow asv
- name: Generate fixtures
run: |
cd dev/benchmarks
python generate-fixtures.py
- - name: Run benchmarks
+ - name: Check C benchmarks
run: |
cd dev/benchmarks
./benchmark-run-all.sh
+
+ - name: Check Python benchmarks
+ run: |
+ cd dev/benchmarks
+ asv machine --yes
+ asv run --quick --verbose ${{ github.sha }}^!
diff --git a/.gitignore b/.gitignore
index ab5821a7..12399570 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ CMakeUserPresets.json
.Rproj.user
.cache
*.swp
+__pycache__
diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/.gitignore
index 71a6b327..f6a96866 100644
--- a/dev/benchmarks/.gitignore
+++ b/dev/benchmarks/.gitignore
@@ -18,3 +18,6 @@
.Rhistory
benchmark-report.md
fixtures/
+asv_results
+asv_env
+asv_html
diff --git a/dev/benchmarks/asv.conf.json b/dev/benchmarks/asv.conf.json
new file mode 100644
index 00000000..619d44b7
--- /dev/null
+++ b/dev/benchmarks/asv.conf.json
@@ -0,0 +1,194 @@
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "nanoarrow-python-asv",
+
+ // The project's homepage
+ "project_url": "https://arrow.apache.org/nanoarrow/",
+
+ // The URL or local path of the source code repository for the
+ // project being benchmarked
+ "repo": "../..",
+
+ // The Python project's subdirectory in your repo. If missing or
+ // the empty string, the project is assumed to be located at the root
+ // of the repository.
+ "repo_subdir": "python",
+
+ // Customizable commands for building the project.
+ // See asv.conf.json documentation.
+ // To build the package using pyproject.toml (PEP518), uncomment the
following lines
+ // "build_command": [
+ // "python -m pip install build",
+ // "python -m build",
+ // "python -mpip wheel -w {build_cache_dir} {build_dir}"
+ // ],
+ // To build the package using setuptools and a setup.py file, uncomment
the following lines
+ // "build_command": [
+ // "python setup.py build",
+ // "python -mpip wheel -w {build_cache_dir} {build_dir}"
+ // ],
+
+ // Customizable commands for installing and uninstalling the project.
+ // See asv.conf.json documentation.
+ // "install_command": ["in-dir={env_dir} python -mpip install
{wheel_file}"],
+ // "uninstall_command": ["return-code=any python -mpip uninstall -y
{project}"],
+
+ // List of branches to benchmark. If not provided, defaults to "main"
+ // (for git) or "default" (for mercurial).
+ // "branches": ["main"], // for git
+ // "branches": ["default"], // for mercurial
+
+ // The DVCS being used. If not set, it will be automatically
+ // determined from "repo" by looking at the protocol in the URL
+ // (if remote), or by looking for special directories, such as
+ // ".git" (if local).
+ // "dvcs": "git",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv", "mamba" (above 3.8)
+ // or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "virtualenv",
+
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ //"install_timeout": 600,
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "http://github.com/apache/arrow-nanoarrow/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["3.8", "3.12"],
+
+ // The list of conda channel names to be searched for benchmark
+ // dependency packages in the specified order
+ // "conda_channels": ["conda-forge", "defaults"],
+
+ // A conda environment file that is used for environment creation.
+ // "conda_environment_file": "environment.yml",
+
+ // The matrix of dependencies to test. Each key of the "req"
+ // requirements dictionary is the name of a package (in PyPI) and
+ // the values are version numbers. An empty list or empty string
+ // indicates to just test against the default (latest)
+ // version. null indicates that the package is to not be
+ // installed. If the package to be tested is only available from
+ // PyPi, and the 'environment_type' is conda, then you can preface
+ // the package name by 'pip+', and the package will be installed
+ // via pip (with all the conda available packages installed first,
+ // followed by the pip installed packages).
+ //
+ // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
+ // environment variables to pass to build and benchmark commands.
+ // An environment will be created for every combination of the
+ // cartesian product of the "@env" variables in this matrix.
+ // Variables in "@env_nobuild" will be passed to every environment
+ // during the benchmark phase, but will not trigger creation of
+ // new environments. A value of ``null`` means that the variable
+ // will not be set for the current combination.
+ //
+ // "matrix": {
+ // "req": {
+ // "numpy": ["1.6", "1.7"],
+ // "six": ["", null], // test with and without six installed
+ // "pip+emcee": [""] // emcee is only available for install with
pip.
+ // },
+ // "env": {"ENV_VAR_1": ["val1", "val2"]},
+ // "env_nobuild": {"ENV_VAR_2": ["val3", null]},
+ // },
+
+ // Combinations of libraries/python versions can be excluded/included
+ // from the set to test. Each entry is a dictionary containing additional
+ // key-value pairs to include/exclude.
+ //
+ // An exclude entry excludes entries where all values match. The
+ // values are regexps that should match the whole string.
+ //
+ // An include entry adds an environment. Only the packages listed
+ // are installed. The 'python' key is required. The exclude rules
+ // do not apply to includes.
+ //
+ // In addition to package names, the following keys are available:
+ //
+ // - python
+ // Python version, as in the *pythons* variable above.
+ // - environment_type
+ // Environment type, as above.
+ // - sys_platform
+ // Platform, as in sys.platform. Possible values for the common
+ // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+ // - req
+ // Required packages
+ // - env
+ // Environment variables
+ // - env_nobuild
+ // Non-build environment variables
+ //
+ // "exclude": [
+ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+ // {"environment_type": "conda", "req": {"six": null}}, // don't run
without six on conda
+ // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
+ // ],
+ //
+ // "include": [
+ // // additional env for python3.12
+ // {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO":
"123"}},
+ // // additional env if run on windows+conda
+ // {"platform": "win32", "environment_type": "conda", "python":
"3.12", "req": {"libpython": ""}},
+ // ],
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ "benchmark_dir": "python",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ "env_dir": "asv_env",
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ "results_dir": "asv_results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ "html_dir": "asv_html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8,
+
+ // `asv` will cache results of the recent builds in each
+ // environment, making them faster to install next time. This is
+ // the number of builds to keep, per environment.
+ // "build_cache_size": 2,
+
+ // The commits after which the regression search in `asv publish`
+ // should start looking for regressions. Dictionary whose keys are
+ // regexps matching to benchmark names, and values corresponding to
+ // the commit (exclusive) after which to start looking for
+ // regressions. The default is to start from the first commit
+ // with results. If the commit is `null`, regression detection is
+ // skipped for the matching benchmark.
+ //
+ // "regressions_first_commits": {
+ // "some_benchmark": "352cdf", // Consider regressions only after this
commit
+ // "another_benchmark": null, // Skip regression detection altogether
+ // },
+
+ // The thresholds for relative change in results, after which `asv
+ // publish` starts reporting regressions. Dictionary of the same
+ // form as in ``regressions_first_commits``, with values
+ // indicating the thresholds. If multiple entries match, the
+ // maximum is taken. If no entry matches, the default is 5%.
+ //
+ // "regressions_thresholds": {
+ // "some_benchmark": 0.01, // Threshold of 1%
+ // "another_benchmark": 0.5, // Threshold of 50%
+ // },
+}
diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/python/__init__.py
similarity index 95%
copy from dev/benchmarks/.gitignore
copy to dev/benchmarks/python/__init__.py
index 71a6b327..13a83393 100644
--- a/dev/benchmarks/.gitignore
+++ b/dev/benchmarks/python/__init__.py
@@ -14,7 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-.Rhistory
-benchmark-report.md
-fixtures/
diff --git a/dev/benchmarks/python/array.py b/dev/benchmarks/python/array.py
new file mode 100644
index 00000000..0b4f796c
--- /dev/null
+++ b/dev/benchmarks/python/array.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import nanoarrow as na
+
+
+class CArrayBuilderSuite:
+ """
+ Benchmarks for building CArrays
+ """
+
+ def setup(self):
+ self.py_integers = list(range(int(1e6)))
+ self.py_bools = [False, True, True, False] * int(1e6 // 4)
+
+ self.wide_schema = na.c_schema(na.struct([na.int32()] * 10000))
+ self.children = [na.c_array(self.py_integers, na.int32())] * 10000
+
+ def time_build_c_array_int32(self):
+ """Create an int32 array from 1,000,000 Python integers"""
+ na.c_array(self.py_integers, na.int32())
+
+ def time_build_c_array_bool(self):
+ """Create a bool array from 1,000,000 Python booleans"""
+ na.c_array(self.py_bools, na.bool())
+
+ def time_build_c_array_struct_wide(self):
+ """Create a struct array with 10,000 columns"""
+ na.c_array_from_buffers(self.wide_schema, 1e6, [None],
children=self.children)
+
+
+class ArrayIterationSuite:
+ """Benchmarks for consuming an Array using various methods of iteration"""
+
+ def setup(self):
+ self.integers = na.Array(range(int(1e6)), na.int32())
+
+ n = int(1e6)
+ item_size = 7
+ alphabet = b"abcdefghijklmnopqrstuvwxyz"
+ n_alphabets = (item_size * n) // len(alphabet) + 1
+ data_buffer = alphabet * n_alphabets
+ offsets_buffer = na.c_buffer(
+ range(0, (n + 1) * item_size, item_size), na.int32()
+ )
+
+ c_strings = na.c_array_from_buffers(
+ na.string(), n, [None, offsets_buffer, data_buffer]
+ )
+ self.strings = na.Array(c_strings)
+
+ c_long_struct = na.c_array_from_buffers(
+ na.struct([na.int32()] * 100),
+ length=10000,
+ buffers=[None],
+ children=[na.c_array(range(10000), na.int32())] * 100,
+ )
+ self.long_struct = na.Array(c_long_struct)
+
+ c_wide_struct = na.c_array_from_buffers(
+ na.struct([na.int32()] * 10000),
+ length=100,
+ buffers=[None],
+ children=[na.c_array(range(100), na.int32())] * 10000,
+ )
+ self.wide_struct = na.Array(c_wide_struct)
+
+ def time_integers_to_list(self):
+ """Consume an int32 array with 1,000,000 elements into a Python list"""
+ list(self.integers.iter_py())
+
+ def time_strings_to_list(self):
+ """Consume a string array with 1,000,000 elements into a Python list"""
+ list(self.strings.iter_py())
+
+ def time_long_struct_to_dict_list(self):
+ """Consume a struct array with 10,000 elements and 100 columns into a
list
+ of dictionaries
+ """
+ list(self.long_struct.iter_py())
+
+ def time_long_struct_to_tuple_list(self):
+ """Consume a struct array with 10,000 elements and 100 columns into a
list
+ of tuples
+ """
+ list(self.long_struct.iter_tuples())
+
+ def time_wide_struct_to_dict_list(self):
+ """Consume a struct array with 100 elements and 10,000 columns into a
list
+ of dictionaries
+ """
+ list(self.wide_struct.iter_py())
+
+ def time_wide_struct_to_tuple_list(self):
+ """Consume a struct array with 100 elements and 10,000 columns into a
list
+ of tuples
+ """
+ list(self.wide_struct.iter_tuples())
diff --git a/dev/benchmarks/python/ipc.py b/dev/benchmarks/python/ipc.py
new file mode 100644
index 00000000..b841a1d6
--- /dev/null
+++ b/dev/benchmarks/python/ipc.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+import os
+
+import nanoarrow as na
+from nanoarrow import ipc
+
+
+class IpcReaderSuite:
+ """
+ Benchmarks for reading IPC streams
+ """
+
+ def setup(self):
+ self.fixtures_dir = os.path.join(os.path.dirname(__file__), "..",
"fixtures")
+ self.fixture_names = [
+ "float64_basic.arrows",
+ "float64_long.arrows",
+ "float64_wide.arrows",
+ ]
+ self.fixture_buffer = {}
+ for name in self.fixture_names:
+ with open(self.fixture_path(name), "rb") as f:
+ self.fixture_buffer[name] = f.read()
+
+ def fixture_path(self, name):
+ return os.path.join(self.fixtures_dir, name)
+
+ def read_fixture_file(self, name):
+ with ipc.Stream.from_path(self.fixture_path(name)) as in_stream:
+ list(na.c_array_stream(in_stream))
+
+ def read_fixture_buffer(self, name):
+ f = io.BytesIO(self.fixture_buffer[name])
+ with ipc.Stream.from_readable(f) as in_stream:
+ list(na.c_array_stream(in_stream))
+
+ def time_read_float64_basic_file(self):
+ self.read_fixture_file("float64_basic.arrows")
+
+ def time_read_float64_basic_buffer(self):
+ self.read_fixture_buffer("float64_basic.arrows")
+
+ def time_read_float64_long_buffer(self):
+ self.read_fixture_buffer("float64_long.arrows")
+
+ def time_read_float64_wide_buffer(self):
+ self.read_fixture_buffer("float64_wide.arrows")
diff --git a/dev/benchmarks/python/schema.py b/dev/benchmarks/python/schema.py
new file mode 100644
index 00000000..c997e94e
--- /dev/null
+++ b/dev/benchmarks/python/schema.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import nanoarrow as na
+
+
+class SchemaSuite:
+ """
+ Benchmarks of some Schema/CSchema operations
+ """
+
+ def setup(self):
+ self.children = [na.int32()] * 10000
+ self.c_children = [na.c_schema(child) for child in self.children]
+ self.c_wide_struct = na.c_schema(na.struct(self.children))
+
+ def time_create_wide_struct_from_schemas(self):
+ """Create a struct Schema with 10000 columns from a list of Schema"""
+ na.struct(self.children)
+
+ def time_create_wide_struct_from_c_schemas(self):
+ """Create a struct Schema with 10000 columns from a list of CSchema"""
+ na.struct(self.c_children)
+
+ def time_c_schema_protocol_wide_struct(self):
+ """Export a struct Schema with 10000 columns via the PyCapsule
protocol"""
+ self.c_wide_struct.__arrow_c_schema__()