This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new a9ef7459 chore(dev/benchmarks): Add Python/asv benchmarking setup
(#412)
a9ef7459 is described below
commit a9ef7459dec2da8a28b4e00fd9cec5b825905432
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Mar 27 12:24:30 2024 -0300
chore(dev/benchmarks): Add Python/asv benchmarking setup (#412)
This PR adds a basic asv benchmarking setup for the Python bindings with
a small suite of high-level benchmarks tracking a few of the
opportunities I happen to know involve some Python looping.
---
.github/workflows/benchmarks.yaml | 14 +-
.gitignore | 1 +
dev/benchmarks/.gitignore | 3 +
dev/benchmarks/asv.conf.json | 194 ++++++++++++++++++++++
dev/benchmarks/{.gitignore => python/__init__.py} | 4 -
dev/benchmarks/python/array.py | 112 +++++++++++++
dev/benchmarks/python/ipc.py | 64 +++++++
dev/benchmarks/python/schema.py | 41 +++++
8 files changed, 426 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/benchmarks.yaml
b/.github/workflows/benchmarks.yaml
index de528200..537eeeb4 100644
--- a/.github/workflows/benchmarks.yaml
+++ b/.github/workflows/benchmarks.yaml
@@ -39,17 +39,25 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
+ with:
+ cache: 'pip'
- - name: Install pyarrow
+ - name: Install Python dependencies
run: |
- pip install pyarrow
+ pip install pyarrow asv
- name: Generate fixtures
run: |
cd dev/benchmarks
python generate-fixtures.py
- - name: Run benchmarks
+ - name: Check C benchmarks
run: |
cd dev/benchmarks
./benchmark-run-all.sh
+
+ - name: Check Python benchmarks
+ run: |
+ cd dev/benchmarks
+ asv machine --yes
+ asv run --quick --verbose ${{ github.sha }}^!
diff --git a/.gitignore b/.gitignore
index ab5821a7..12399570 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ CMakeUserPresets.json
.Rproj.user
.cache
*.swp
+__pycache__
diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/.gitignore
index 71a6b327..f6a96866 100644
--- a/dev/benchmarks/.gitignore
+++ b/dev/benchmarks/.gitignore
@@ -18,3 +18,6 @@
.Rhistory
benchmark-report.md
fixtures/
+asv_results
+asv_env
+asv_html
diff --git a/dev/benchmarks/asv.conf.json b/dev/benchmarks/asv.conf.json
new file mode 100644
index 00000000..619d44b7
--- /dev/null
+++ b/dev/benchmarks/asv.conf.json
@@ -0,0 +1,194 @@
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "nanoarrow-python-asv",
+
+ // The project's homepage
+ "project_url": "https://arrow.apache.org/nanoarrow/",
+
+ // The URL or local path of the source code repository for the
+ // project being benchmarked
+ "repo": "../..",
+
+ // The Python project's subdirectory in your repo. If missing or
+ // the empty string, the project is assumed to be located at the root
+ // of the repository.
+ "repo_subdir": "python",
+
+ // Customizable commands for building the project.
+ // See asv.conf.json documentation.
+ // To build the package using pyproject.toml (PEP518), uncomment the
following lines
+ // "build_command": [
+ // "python -m pip install build",
+ // "python -m build",
+ // "python -mpip wheel -w {build_cache_dir} {build_dir}"
+ // ],
+ // To build the package using setuptools and a setup.py file, uncomment
the following lines
+ // "build_command": [
+ // "python setup.py build",
+ // "python -mpip wheel -w {build_cache_dir} {build_dir}"
+ // ],
+
+ // Customizable commands for installing and uninstalling the project.
+ // See asv.conf.json documentation.
+ // "install_command": ["in-dir={env_dir} python -mpip install
{wheel_file}"],
+ // "uninstall_command": ["return-code=any python -mpip uninstall -y
{project}"],
+
+ // List of branches to benchmark. If not provided, defaults to "main"
+ // (for git) or "default" (for mercurial).
+ // "branches": ["main"], // for git
+ // "branches": ["default"], // for mercurial
+
+ // The DVCS being used. If not set, it will be automatically
+ // determined from "repo" by looking at the protocol in the URL
+ // (if remote), or by looking for special directories, such as
+ // ".git" (if local).
+ // "dvcs": "git",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv", "mamba" (above 3.8)
+ // or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "virtualenv",
+
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ //"install_timeout": 600,
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "http://github.com/apache/arrow-nanoarrow/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["3.8", "3.12"],
+
+ // The list of conda channel names to be searched for benchmark
+ // dependency packages in the specified order
+ // "conda_channels": ["conda-forge", "defaults"],
+
+ // A conda environment file that is used for environment creation.
+ // "conda_environment_file": "environment.yml",
+
+ // The matrix of dependencies to test. Each key of the "req"
+ // requirements dictionary is the name of a package (in PyPI) and
+ // the values are version numbers. An empty list or empty string
+ // indicates to just test against the default (latest)
+ // version. null indicates that the package is to not be
+ // installed. If the package to be tested is only available from
+ // PyPi, and the 'environment_type' is conda, then you can preface
+ // the package name by 'pip+', and the package will be installed
+ // via pip (with all the conda available packages installed first,
+ // followed by the pip installed packages).
+ //
+ // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
+ // environment variables to pass to build and benchmark commands.
+ // An environment will be created for every combination of the
+ // cartesian product of the "@env" variables in this matrix.
+ // Variables in "@env_nobuild" will be passed to every environment
+ // during the benchmark phase, but will not trigger creation of
+ // new environments. A value of ``null`` means that the variable
+ // will not be set for the current combination.
+ //
+ // "matrix": {
+ // "req": {
+ // "numpy": ["1.6", "1.7"],
+ // "six": ["", null], // test with and without six installed
+ // "pip+emcee": [""] // emcee is only available for install with
pip.
+ // },
+ // "env": {"ENV_VAR_1": ["val1", "val2"]},
+ // "env_nobuild": {"ENV_VAR_2": ["val3", null]},
+ // },
+
+ // Combinations of libraries/python versions can be excluded/included
+ // from the set to test. Each entry is a dictionary containing additional
+ // key-value pairs to include/exclude.
+ //
+ // An exclude entry excludes entries where all values match. The
+ // values are regexps that should match the whole string.
+ //
+ // An include entry adds an environment. Only the packages listed
+ // are installed. The 'python' key is required. The exclude rules
+ // do not apply to includes.
+ //
+ // In addition to package names, the following keys are available:
+ //
+ // - python
+ // Python version, as in the *pythons* variable above.
+ // - environment_type
+ // Environment type, as above.
+ // - sys_platform
+ // Platform, as in sys.platform. Possible values for the common
+ // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+ // - req
+ // Required packages
+ // - env
+ // Environment variables
+ // - env_nobuild
+ // Non-build environment variables
+ //
+ // "exclude": [
+ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+ // {"environment_type": "conda", "req": {"six": null}}, // don't run
without six on conda
+ // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
+ // ],
+ //
+ // "include": [
+ // // additional env for python3.12
+ // {"python": "3.12", "req": {"numpy": "1.26"}, "env_nobuild": {"FOO":
"123"}},
+ // // additional env if run on windows+conda
+ // {"platform": "win32", "environment_type": "conda", "python":
"3.12", "req": {"libpython": ""}},
+ // ],
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ "benchmark_dir": "python",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ "env_dir": "asv_env",
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ "results_dir": "asv_results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ "html_dir": "asv_html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8,
+
+ // `asv` will cache results of the recent builds in each
+ // environment, making them faster to install next time. This is
+ // the number of builds to keep, per environment.
+ // "build_cache_size": 2,
+
+ // The commits after which the regression search in `asv publish`
+ // should start looking for regressions. Dictionary whose keys are
+ // regexps matching to benchmark names, and values corresponding to
+ // the commit (exclusive) after which to start looking for
+ // regressions. The default is to start from the first commit
+ // with results. If the commit is `null`, regression detection is
+ // skipped for the matching benchmark.
+ //
+ // "regressions_first_commits": {
+ // "some_benchmark": "352cdf", // Consider regressions only after this
commit
+ // "another_benchmark": null, // Skip regression detection altogether
+ // },
+
+ // The thresholds for relative change in results, after which `asv
+ // publish` starts reporting regressions. Dictionary of the same
+ // form as in ``regressions_first_commits``, with values
+ // indicating the thresholds. If multiple entries match, the
+ // maximum is taken. If no entry matches, the default is 5%.
+ //
+ // "regressions_thresholds": {
+ // "some_benchmark": 0.01, // Threshold of 1%
+ // "another_benchmark": 0.5, // Threshold of 50%
+ // },
+}
diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/python/__init__.py
similarity index 95%
copy from dev/benchmarks/.gitignore
copy to dev/benchmarks/python/__init__.py
index 71a6b327..13a83393 100644
--- a/dev/benchmarks/.gitignore
+++ b/dev/benchmarks/python/__init__.py
@@ -14,7 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-.Rhistory
-benchmark-report.md
-fixtures/
diff --git a/dev/benchmarks/python/array.py b/dev/benchmarks/python/array.py
new file mode 100644
index 00000000..0b4f796c
--- /dev/null
+++ b/dev/benchmarks/python/array.py
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import nanoarrow as na
+
+
+class CArrayBuilderSuite:
+ """
+ Benchmarks for building CArrays
+ """
+
+ def setup(self):
+ self.py_integers = list(range(int(1e6)))
+ self.py_bools = [False, True, True, False] * int(1e6 // 4)
+
+ self.wide_schema = na.c_schema(na.struct([na.int32()] * 10000))
+ self.children = [na.c_array(self.py_integers, na.int32())] * 10000
+
+ def time_build_c_array_int32(self):
+ """Create an int32 array from 1,000,000 Python integers"""
+ na.c_array(self.py_integers, na.int32())
+
+ def time_build_c_array_bool(self):
+ """Create a bool array from 1,000,000 Python booleans"""
+ na.c_array(self.py_bools, na.bool())
+
+ def time_build_c_array_struct_wide(self):
+ """Create a struct array with 10,000 columns"""
+ na.c_array_from_buffers(self.wide_schema, 1e6, [None],
children=self.children)
+
+
+class ArrayIterationSuite:
+ """Benchmarks for consuming an Array using various methods of iteration"""
+
+ def setup(self):
+ self.integers = na.Array(range(int(1e6)), na.int32())
+
+ n = int(1e6)
+ item_size = 7
+ alphabet = b"abcdefghijklmnopqrstuvwxyz"
+ n_alphabets = (item_size * n) // len(alphabet) + 1
+ data_buffer = alphabet * n_alphabets
+ offsets_buffer = na.c_buffer(
+ range(0, (n + 1) * item_size, item_size), na.int32()
+ )
+
+ c_strings = na.c_array_from_buffers(
+ na.string(), n, [None, offsets_buffer, data_buffer]
+ )
+ self.strings = na.Array(c_strings)
+
+ c_long_struct = na.c_array_from_buffers(
+ na.struct([na.int32()] * 100),
+ length=10000,
+ buffers=[None],
+ children=[na.c_array(range(10000), na.int32())] * 100,
+ )
+ self.long_struct = na.Array(c_long_struct)
+
+ c_wide_struct = na.c_array_from_buffers(
+ na.struct([na.int32()] * 10000),
+ length=100,
+ buffers=[None],
+ children=[na.c_array(range(100), na.int32())] * 10000,
+ )
+ self.wide_struct = na.Array(c_wide_struct)
+
+ def time_integers_to_list(self):
+ """Consume an int32 array with 1,000,000 elements into a Python list"""
+ list(self.integers.iter_py())
+
+ def time_strings_to_list(self):
+ """Consume a string array with 1,000,000 elements into a Python list"""
+ list(self.strings.iter_py())
+
+ def time_long_struct_to_dict_list(self):
+ """Consume a struct array with 10,000 elements and 100 columns into a
list
+ of dictionaries
+ """
+ list(self.long_struct.iter_py())
+
+ def time_long_struct_to_tuple_list(self):
+ """Consume a struct array with 10,000 elements and 100 columns into a
list
+ of tuples
+ """
+ list(self.long_struct.iter_tuples())
+
+ def time_wide_struct_to_dict_list(self):
+ """Consume a struct array with 100 elements and 10,000 columns into a
list
+ of dictionaries
+ """
+ list(self.wide_struct.iter_py())
+
+ def time_wide_struct_to_tuple_list(self):
+ """Consume a struct array with 100 elements and 10,000 columns into a
list
+ of tuples
+ """
+ list(self.wide_struct.iter_tuples())
diff --git a/dev/benchmarks/python/ipc.py b/dev/benchmarks/python/ipc.py
new file mode 100644
index 00000000..b841a1d6
--- /dev/null
+++ b/dev/benchmarks/python/ipc.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+import os
+
+import nanoarrow as na
+from nanoarrow import ipc
+
+
+class IpcReaderSuite:
+ """
+ Benchmarks for reading IPC streams
+ """
+
+ def setup(self):
+ self.fixtures_dir = os.path.join(os.path.dirname(__file__), "..",
"fixtures")
+ self.fixture_names = [
+ "float64_basic.arrows",
+ "float64_long.arrows",
+ "float64_wide.arrows",
+ ]
+ self.fixture_buffer = {}
+ for name in self.fixture_names:
+ with open(self.fixture_path(name), "rb") as f:
+ self.fixture_buffer[name] = f.read()
+
+ def fixture_path(self, name):
+ return os.path.join(self.fixtures_dir, name)
+
+ def read_fixture_file(self, name):
+ with ipc.Stream.from_path(self.fixture_path(name)) as in_stream:
+ list(na.c_array_stream(in_stream))
+
+ def read_fixture_buffer(self, name):
+ f = io.BytesIO(self.fixture_buffer[name])
+ with ipc.Stream.from_readable(f) as in_stream:
+ list(na.c_array_stream(in_stream))
+
+ def time_read_float64_basic_file(self):
+ self.read_fixture_file("float64_basic.arrows")
+
+ def time_read_float64_basic_buffer(self):
+ self.read_fixture_buffer("float64_basic.arrows")
+
+ def time_read_float64_long_buffer(self):
+ self.read_fixture_buffer("float64_long.arrows")
+
+ def time_read_float64_wide_buffer(self):
+ self.read_fixture_buffer("float64_wide.arrows")
diff --git a/dev/benchmarks/python/schema.py b/dev/benchmarks/python/schema.py
new file mode 100644
index 00000000..c997e94e
--- /dev/null
+++ b/dev/benchmarks/python/schema.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import nanoarrow as na
+
+
+class SchemaSuite:
+ """
+ Benchmarks of some Schema/CSchema operations
+ """
+
+ def setup(self):
+ self.children = [na.int32()] * 10000
+ self.c_children = [na.c_schema(child) for child in self.children]
+ self.c_wide_struct = na.c_schema(na.struct(self.children))
+
+ def time_create_wide_struct_from_schemas(self):
+ """Create a struct Schema with 10000 columns from a list of Schema"""
+ na.struct(self.children)
+
+ def time_create_wide_struct_from_c_schemas(self):
+ """Create a struct Schema with 10000 columns from a list of CSchema"""
+ na.struct(self.c_children)
+
+ def time_c_schema_protocol_wide_struct(self):
+ """Export a struct Schema with 10000 columns via the PyCapsule
protocol"""
+ self.c_wide_struct.__arrow_c_schema__()