This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new d311397 test(python/adbc_driver_postgres): add benchmark suite via
asv (#569)
d311397 is described below
commit d311397d965f5f15c9b40a98585e9cb10828fae5
Author: David Li <[email protected]>
AuthorDate: Wed Apr 19 09:15:02 2023 +0900
test(python/adbc_driver_postgres): add benchmark suite via asv (#569)
No setup for actually running them in CI for now (we also probably don't
want that).
- [x] Benchmark integers at different table sizes
- [x] Benchmark different integer types
- [x] Benchmark different float types
- [x] Benchmark different string types
- [x] Benchmark multiple columns of same type
- [x] Benchmark against Pandas 1.x/SQLAlchemy 1.x
Fixes #568.
---
ci/conda_env_benchmarking.txt | 18 ++
python/adbc_driver_postgresql/asv.conf.json | 201 +++++++++++++++++++++
.../adbc_driver_postgresql/benchmarks/__init__.py | 16 ++
.../benchmarks/benchmarks.py | 175 ++++++++++++++++++
4 files changed, 410 insertions(+)
diff --git a/ci/conda_env_benchmarking.txt b/ci/conda_env_benchmarking.txt
new file mode 100644
index 0000000..8853eda
--- /dev/null
+++ b/ci/conda_env_benchmarking.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+asv
diff --git a/python/adbc_driver_postgresql/asv.conf.json
b/python/adbc_driver_postgresql/asv.conf.json
new file mode 100644
index 0000000..6ade30d
--- /dev/null
+++ b/python/adbc_driver_postgresql/asv.conf.json
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "adbc-driver-postgresql",
+
+ // The project's homepage
+ "project_url": "https://arrow.apache.org/adbc/",
+
+ // The URL or local path of the source code repository for the
+ // project being benchmarked
+ "repo": "../..",
+
+ // The Python project's subdirectory in your repo. If missing or
+ // the empty string, the project is assumed to be located at the root
+ // of the repository.
+ "repo_subdir": "python/adbc_driver_postgresql",
+
+ // Customizable commands for building, installing, and
+ // uninstalling the project. See asv.conf.json documentation.
+ //
+ // "install_command": ["in-dir={env_dir} python -mpip install
{wheel_file}"],
+ // "uninstall_command": ["return-code=any python -mpip uninstall -y
{project}"],
+ // "build_command": [
+ // "python setup.py build",
+ // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps
--no-index -w {build_cache_dir} {build_dir}"
+ // ],
+
+ // List of branches to benchmark. If not provided, defaults to "master"
+ // (for git) or "default" (for mercurial).
+ "branches": ["main"], // for git
+
+ // The DVCS being used. If not set, it will be automatically
+ // determined from "repo" by looking at the protocol in the URL
+ // (if remote), or by looking for special directories, such as
+ // ".git" (if local).
+ "dvcs": "git",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv" or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "conda",
+
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ //"install_timeout": 600,
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "http://github.com/apache/arrow-adbc/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["2.7", "3.6"],
+
+ // The list of conda channel names to be searched for benchmark
+ // dependency packages in the specified order
+ "conda_channels": ["conda-forge"],
+
+ // A conda environment file that is used for environment creation.
+ // "conda_environment_file": "environment.yml",
+
+ // The matrix of dependencies to test. Each key of the "req"
+ // requirements dictionary is the name of a package (in PyPI) and
+ // the values are version numbers. An empty list or empty string
+ // indicates to just test against the default (latest)
+ // version. null indicates that the package is to not be
+ // installed. If the package to be tested is only available from
+ // PyPi, and the 'environment_type' is conda, then you can preface
+ // the package name by 'pip+', and the package will be installed
+ // via pip (with all the conda available packages installed first,
+ // followed by the pip installed packages).
+ //
+ // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
+ // environment variables to pass to build and benchmark commands.
+ // An environment will be created for every combination of the
+ // cartesian product of the "@env" variables in this matrix.
+ // Variables in "@env_nobuild" will be passed to every environment
+ // during the benchmark phase, but will not trigger creation of
+ // new environments. A value of ``null`` means that the variable
+ // will not be set for the current combination.
+ "matrix": {
+ "req": {
+ "asyncpg": ["0.27.0"],
+ "pandas": ["1.5.3"],
+ // "pip+pgeon": ["0.2.0a0"],
+ "pyarrow": ["11.0.0"],
+ "psycopg": ["3.1.8"],
+ "psycopg2": ["2.9.3"],
+ "python-duckdb": ["0.7.1"],
+ "sqlalchemy": ["1.4.46"],
+ },
+ },
+
+ // Combinations of libraries/python versions can be excluded/included
+ // from the set to test. Each entry is a dictionary containing additional
+ // key-value pairs to include/exclude.
+ //
+ // An exclude entry excludes entries where all values match. The
+ // values are regexps that should match the whole string.
+ //
+ // An include entry adds an environment. Only the packages listed
+ // are installed. The 'python' key is required. The exclude rules
+ // do not apply to includes.
+ //
+ // In addition to package names, the following keys are available:
+ //
+ // - python
+ // Python version, as in the *pythons* variable above.
+ // - environment_type
+ // Environment type, as above.
+ // - sys_platform
+ // Platform, as in sys.platform. Possible values for the common
+ // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+ // - req
+ // Required packages
+ // - env
+ // Environment variables
+ // - env_nobuild
+ // Non-build environment variables
+ //
+ // "exclude": [
+ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+ // {"environment_type": "conda", "req": {"six": null}}, // don't run
without six on conda
+ // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
+ // ],
+ //
+ // "include": [
+ // // additional env for python2.7
+ // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO":
"123"}},
+ // // additional env if run on windows+conda
+ // {"platform": "win32", "environment_type": "conda", "python": "2.7",
"req": {"libpython": ""}},
+ // ],
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ // "benchmark_dir": "benchmarks",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ "env_dir": ".asv/env",
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ "results_dir": ".asv/results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ "html_dir": ".asv/html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8,
+
+ // `asv` will cache results of the recent builds in each
+ // environment, making them faster to install next time. This is
+ // the number of builds to keep, per environment.
+ // "build_cache_size": 2,
+
+ // The commits after which the regression search in `asv publish`
+ // should start looking for regressions. Dictionary whose keys are
+ // regexps matching to benchmark names, and values corresponding to
+ // the commit (exclusive) after which to start looking for
+ // regressions. The default is to start from the first commit
+ // with results. If the commit is `null`, regression detection is
+ // skipped for the matching benchmark.
+ //
+ // "regressions_first_commits": {
+ // "some_benchmark": "352cdf", // Consider regressions only after this
commit
+ // "another_benchmark": null, // Skip regression detection altogether
+ // },
+
+ // The thresholds for relative change in results, after which `asv
+ // publish` starts reporting regressions. Dictionary of the same
+ // form as in ``regressions_first_commits``, with values
+ // indicating the thresholds. If multiple entries match, the
+ // maximum is taken. If no entry matches, the default is 5%.
+ //
+ // "regressions_thresholds": {
+ // "some_benchmark": 0.01, // Threshold of 1%
+ // "another_benchmark": 0.5, // Threshold of 50%
+ // },
+}
diff --git a/python/adbc_driver_postgresql/benchmarks/__init__.py
b/python/adbc_driver_postgresql/benchmarks/__init__.py
new file mode 100644
index 0000000..13a8339
--- /dev/null
+++ b/python/adbc_driver_postgresql/benchmarks/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/adbc_driver_postgresql/benchmarks/benchmarks.py
b/python/adbc_driver_postgresql/benchmarks/benchmarks.py
new file mode 100644
index 0000000..037bb30
--- /dev/null
+++ b/python/adbc_driver_postgresql/benchmarks/benchmarks.py
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import abc
+import asyncio
+import itertools
+import os
+
+import asyncpg
+import duckdb
+import pandas
+import psycopg
+import sqlalchemy
+
+import adbc_driver_postgresql.dbapi
+
+
+class BenchmarkBase(abc.ABC):
+ async_conn: asyncpg.Connection
+ async_runner: asyncio.Runner
+ conn: adbc_driver_postgresql.dbapi.Connection
+ duck: duckdb.DuckDBPyConnection
+ sqlalchemy_connection: sqlalchemy.engine.base.Connection
+
+ def setup(self, *args, **kwargs) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+
+ self.table = self._make_table_name(*args, **kwargs)
+
+ self.async_runner = asyncio.Runner()
+ self.async_conn = self.async_runner.run(asyncpg.connect(dsn=self.uri))
+
+ self.conn = adbc_driver_postgresql.dbapi.connect(self.uri)
+
+ self.duck = duckdb.connect()
+ self.duck.sql("INSTALL postgres_scanner")
+ self.duck.sql("LOAD postgres_scanner")
+ self.duck.sql(f"CALL postgres_attach('{self.uri}')")
+
+ uri = self.uri.replace("postgres://", "postgresql+psycopg2://")
+ self.sqlalchemy_connection = sqlalchemy.create_engine(uri).connect()
+
+ def teardown(self, *args, **kwargs) -> None:
+ self.async_runner.close()
+ self.conn.close()
+ self.sqlalchemy_connection.close()
+
+ @abc.abstractmethod
+ def _make_table_name(self, *args, **kwargs) -> str:
+ ...
+
+ def time_pandas_adbc(self, row_count: int, data_type: str) -> None:
+ with self.conn.cursor() as cursor:
+ cursor.execute(f"SELECT * FROM {self.table}")
+ cursor.fetch_df()
+
+ def time_pandas_asyncpg(self, row_count: int, data_type: str) -> None:
+ records = self.async_runner.run(
+ self.async_conn.fetch(f"SELECT * FROM {self.table}")
+ )
+ pandas.DataFrame(records)
+
+ # TODO: fails with 'undefined symbol' (probably need to get it into Conda)
+ # def time_pandas_pgeon(self, row_count: int) -> None:
+ # pgeon.copy_query(self.uri, f"SELECT * FROM {self.table}").to_pandas()
+
+ def time_pandas_psycopg2(self, row_count: int, data_type: str) -> None:
+ pandas.read_sql_table(self.table, self.sqlalchemy_connection)
+
+ def time_pandas_duckdb(self, row_count: int, data_type: str) -> None:
+ self.duck.sql(f"SELECT * FROM {self.table}").fetchdf()
+
+
+class OneColumnSuite(BenchmarkBase):
+ """Benchmark the time it takes to fetch a single column of a given type."""
+
+ SETUP_QUERIES = [
+ "DROP TABLE IF EXISTS {table_name}",
+ "CREATE TABLE {table_name} (items {data_type})",
+ """INSERT INTO {table_name} (items)
+ SELECT generated :: {data_type}
+ FROM GENERATE_SERIES(1, {row_count}) temp(generated)""",
+ # TODO: does an index matter, do we want to force PostgreSQL
+ # to update statistics?
+ ]
+
+ param_data = {
+ "row_count": [10_000, 100_000, 1_000_000],
+ "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
+ }
+
+ param_names = list(param_data.keys())
+ params = list(param_data.values())
+
+ def setup_cache(self) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+ with psycopg.connect(self.uri) as conn:
+ with conn.cursor() as cursor:
+ for row_count, data_type in itertools.product(*self.params):
+ table_name = self._make_table_name(row_count, data_type)
+ for query in self.SETUP_QUERIES:
+ cursor.execute(
+ query.format(
+ table_name=table_name,
+ row_count=row_count,
+ data_type=data_type,
+ )
+ )
+
+ def _make_table_name(self, row_count: int, data_type: str) -> str:
+ return (f"bench_{row_count}_{data_type.replace(' ', '_')}").lower()
+
+
+class MultiColumnSuite(BenchmarkBase):
+ """Benchmark the time it takes to fetch multiple columns of a given
type."""
+
+ SETUP_QUERIES = [
+ "DROP TABLE IF EXISTS {table_name}",
+ """
+ CREATE TABLE {table_name} (
+ a {data_type},
+ b {data_type},
+ c {data_type},
+ d {data_type}
+ )
+ """,
+ """
+ INSERT INTO {table_name} (a, b, c, d)
+ SELECT generated :: {data_type},
+ generated :: {data_type},
+ generated :: {data_type},
+ generated :: {data_type}
+ FROM GENERATE_SERIES(1, {row_count}) temp(generated)
+ """,
+ ]
+
+ param_data = {
+ "row_count": [10_000, 100_000, 1_000_000],
+ "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
+ }
+
+ param_names = list(param_data.keys())
+ params = list(param_data.values())
+
+ def setup_cache(self) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+ with psycopg.connect(self.uri) as conn:
+ with conn.cursor() as cursor:
+ for row_count, data_type in itertools.product(*self.params):
+ table_name = self._make_table_name(row_count, data_type)
+ for query in self.SETUP_QUERIES:
+ cursor.execute(
+ query.format(
+ table_name=table_name,
+ row_count=row_count,
+ data_type=data_type,
+ )
+ )
+
+ def _make_table_name(self, row_count: int, data_type: str) -> str:
+ return (f"bench_{row_count}_{data_type.replace(' ', '_')}").lower()