This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new d311397 test(python/adbc_driver_postgres): add benchmark suite via
asv (#569)
d311397 is described below
commit d311397d965f5f15c9b40a98585e9cb10828fae5
Author: David Li <[email protected]>
AuthorDate: Wed Apr 19 09:15:02 2023 +0900
test(python/adbc_driver_postgres): add benchmark suite via asv (#569)
No setup for actually running them in CI for now (we also probably don't
want that).
- [x] Benchmark integers at different table sizes
- [x] Benchmark different integer types
- [x] Benchmark different float types
- [x] Benchmark different string types
- [x] Benchmark multiple columns of same type
- [x] Benchmark against Pandas 1.x/SQLAlchemy 1.x
Fixes #568.
---
ci/conda_env_benchmarking.txt | 18 ++
python/adbc_driver_postgresql/asv.conf.json | 201 +++++++++++++++++++++
.../adbc_driver_postgresql/benchmarks/__init__.py | 16 ++
.../benchmarks/benchmarks.py | 175 ++++++++++++++++++
4 files changed, 410 insertions(+)
diff --git a/ci/conda_env_benchmarking.txt b/ci/conda_env_benchmarking.txt
new file mode 100644
index 0000000..8853eda
--- /dev/null
+++ b/ci/conda_env_benchmarking.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+asv
diff --git a/python/adbc_driver_postgresql/asv.conf.json
b/python/adbc_driver_postgresql/asv.conf.json
new file mode 100644
index 0000000..6ade30d
--- /dev/null
+++ b/python/adbc_driver_postgresql/asv.conf.json
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+{
+ // The version of the config file format. Do not change, unless
+ // you know what you are doing.
+ "version": 1,
+
+ // The name of the project being benchmarked
+ "project": "adbc-driver-postgresql",
+
+ // The project's homepage
+ "project_url": "https://arrow.apache.org/adbc/",
+
+ // The URL or local path of the source code repository for the
+ // project being benchmarked
+ "repo": "../..",
+
+ // The Python project's subdirectory in your repo. If missing or
+ // the empty string, the project is assumed to be located at the root
+ // of the repository.
+ "repo_subdir": "python/adbc_driver_postgresql",
+
+ // Customizable commands for building, installing, and
+ // uninstalling the project. See asv.conf.json documentation.
+ //
+ // "install_command": ["in-dir={env_dir} python -mpip install
{wheel_file}"],
+ // "uninstall_command": ["return-code=any python -mpip uninstall -y
{project}"],
+ // "build_command": [
+ // "python setup.py build",
+ // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps
--no-index -w {build_cache_dir} {build_dir}"
+ // ],
+
+ // List of branches to benchmark. If not provided, defaults to "master"
+ // (for git) or "default" (for mercurial).
+ "branches": ["main"], // for git
+
+ // The DVCS being used. If not set, it will be automatically
+ // determined from "repo" by looking at the protocol in the URL
+ // (if remote), or by looking for special directories, such as
+ // ".git" (if local).
+ "dvcs": "git",
+
+ // The tool to use to create environments. May be "conda",
+ // "virtualenv" or other value depending on the plugins in use.
+ // If missing or the empty string, the tool will be automatically
+ // determined by looking for tools on the PATH environment
+ // variable.
+ "environment_type": "conda",
+
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ //"install_timeout": 600,
+
+ // the base URL to show a commit for the project.
+ "show_commit_url": "http://github.com/apache/arrow-adbc/commit/",
+
+ // The Pythons you'd like to test against. If not provided, defaults
+ // to the current version of Python used to run `asv`.
+ // "pythons": ["2.7", "3.6"],
+
+ // The list of conda channel names to be searched for benchmark
+ // dependency packages in the specified order
+ "conda_channels": ["conda-forge"],
+
+ // A conda environment file that is used for environment creation.
+ // "conda_environment_file": "environment.yml",
+
+ // The matrix of dependencies to test. Each key of the "req"
+ // requirements dictionary is the name of a package (in PyPI) and
+ // the values are version numbers. An empty list or empty string
+ // indicates to just test against the default (latest)
+ // version. null indicates that the package is to not be
+ // installed. If the package to be tested is only available from
+ // PyPi, and the 'environment_type' is conda, then you can preface
+ // the package name by 'pip+', and the package will be installed
+ // via pip (with all the conda available packages installed first,
+ // followed by the pip installed packages).
+ //
+ // The ``@env`` and ``@env_nobuild`` keys contain the matrix of
+ // environment variables to pass to build and benchmark commands.
+ // An environment will be created for every combination of the
+ // cartesian product of the "@env" variables in this matrix.
+ // Variables in "@env_nobuild" will be passed to every environment
+ // during the benchmark phase, but will not trigger creation of
+ // new environments. A value of ``null`` means that the variable
+ // will not be set for the current combination.
+ "matrix": {
+ "req": {
+ "asyncpg": ["0.27.0"],
+ "pandas": ["1.5.3"],
+ // "pip+pgeon": ["0.2.0a0"],
+ "pyarrow": ["11.0.0"],
+ "psycopg": ["3.1.8"],
+ "psycopg2": ["2.9.3"],
+ "python-duckdb": ["0.7.1"],
+ "sqlalchemy": ["1.4.46"],
+ },
+ },
+
+ // Combinations of libraries/python versions can be excluded/included
+ // from the set to test. Each entry is a dictionary containing additional
+ // key-value pairs to include/exclude.
+ //
+ // An exclude entry excludes entries where all values match. The
+ // values are regexps that should match the whole string.
+ //
+ // An include entry adds an environment. Only the packages listed
+ // are installed. The 'python' key is required. The exclude rules
+ // do not apply to includes.
+ //
+ // In addition to package names, the following keys are available:
+ //
+ // - python
+ // Python version, as in the *pythons* variable above.
+ // - environment_type
+ // Environment type, as above.
+ // - sys_platform
+ // Platform, as in sys.platform. Possible values for the common
+ // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+ // - req
+ // Required packages
+ // - env
+ // Environment variables
+ // - env_nobuild
+ // Non-build environment variables
+ //
+ // "exclude": [
+ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+ // {"environment_type": "conda", "req": {"six": null}}, // don't run
without six on conda
+ // {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
+ // ],
+ //
+ // "include": [
+ // // additional env for python2.7
+ // {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO":
"123"}},
+ // // additional env if run on windows+conda
+ // {"platform": "win32", "environment_type": "conda", "python": "2.7",
"req": {"libpython": ""}},
+ // ],
+
+ // The directory (relative to the current directory) that benchmarks are
+ // stored in. If not provided, defaults to "benchmarks"
+ // "benchmark_dir": "benchmarks",
+
+ // The directory (relative to the current directory) to cache the Python
+ // environments in. If not provided, defaults to "env"
+ "env_dir": ".asv/env",
+
+ // The directory (relative to the current directory) that raw benchmark
+ // results are stored in. If not provided, defaults to "results".
+ "results_dir": ".asv/results",
+
+ // The directory (relative to the current directory) that the html tree
+ // should be written to. If not provided, defaults to "html".
+ "html_dir": ".asv/html",
+
+ // The number of characters to retain in the commit hashes.
+ // "hash_length": 8,
+
+ // `asv` will cache results of the recent builds in each
+ // environment, making them faster to install next time. This is
+ // the number of builds to keep, per environment.
+ // "build_cache_size": 2,
+
+ // The commits after which the regression search in `asv publish`
+ // should start looking for regressions. Dictionary whose keys are
+ // regexps matching to benchmark names, and values corresponding to
+ // the commit (exclusive) after which to start looking for
+ // regressions. The default is to start from the first commit
+ // with results. If the commit is `null`, regression detection is
+ // skipped for the matching benchmark.
+ //
+ // "regressions_first_commits": {
+ // "some_benchmark": "352cdf", // Consider regressions only after this
commit
+ // "another_benchmark": null, // Skip regression detection altogether
+ // },
+
+ // The thresholds for relative change in results, after which `asv
+ // publish` starts reporting regressions. Dictionary of the same
+ // form as in ``regressions_first_commits``, with values
+ // indicating the thresholds. If multiple entries match, the
+ // maximum is taken. If no entry matches, the default is 5%.
+ //
+ // "regressions_thresholds": {
+ // "some_benchmark": 0.01, // Threshold of 1%
+ // "another_benchmark": 0.5, // Threshold of 50%
+ // },
+}
diff --git a/python/adbc_driver_postgresql/benchmarks/__init__.py
b/python/adbc_driver_postgresql/benchmarks/__init__.py
new file mode 100644
index 0000000..13a8339
--- /dev/null
+++ b/python/adbc_driver_postgresql/benchmarks/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/adbc_driver_postgresql/benchmarks/benchmarks.py
b/python/adbc_driver_postgresql/benchmarks/benchmarks.py
new file mode 100644
index 0000000..037bb30
--- /dev/null
+++ b/python/adbc_driver_postgresql/benchmarks/benchmarks.py
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import abc
+import asyncio
+import itertools
+import os
+
+import asyncpg
+import duckdb
+import pandas
+import psycopg
+import sqlalchemy
+
+import adbc_driver_postgresql.dbapi
+
+
+class BenchmarkBase(abc.ABC):
+ async_conn: asyncpg.Connection
+ async_runner: asyncio.Runner
+ conn: adbc_driver_postgresql.dbapi.Connection
+ duck: duckdb.DuckDBPyConnection
+ sqlalchemy_connection: sqlalchemy.engine.base.Connection
+
+ def setup(self, *args, **kwargs) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+
+ self.table = self._make_table_name(*args, **kwargs)
+
+ self.async_runner = asyncio.Runner()
+ self.async_conn = self.async_runner.run(asyncpg.connect(dsn=self.uri))
+
+ self.conn = adbc_driver_postgresql.dbapi.connect(self.uri)
+
+ self.duck = duckdb.connect()
+ self.duck.sql("INSTALL postgres_scanner")
+ self.duck.sql("LOAD postgres_scanner")
+ self.duck.sql(f"CALL postgres_attach('{self.uri}')")
+
+ uri = self.uri.replace("postgres://", "postgresql+psycopg2://")
+ self.sqlalchemy_connection = sqlalchemy.create_engine(uri).connect()
+
+ def teardown(self, *args, **kwargs) -> None:
+ self.async_runner.close()
+ self.conn.close()
+ self.sqlalchemy_connection.close()
+
+ @abc.abstractmethod
+ def _make_table_name(self, *args, **kwargs) -> str:
+ ...
+
+ def time_pandas_adbc(self, row_count: int, data_type: str) -> None:
+ with self.conn.cursor() as cursor:
+ cursor.execute(f"SELECT * FROM {self.table}")
+ cursor.fetch_df()
+
+ def time_pandas_asyncpg(self, row_count: int, data_type: str) -> None:
+ records = self.async_runner.run(
+ self.async_conn.fetch(f"SELECT * FROM {self.table}")
+ )
+ pandas.DataFrame(records)
+
+ # TODO: fails with 'undefined symbol' (probably need to get it into Conda)
+ # def time_pandas_pgeon(self, row_count: int) -> None:
+ # pgeon.copy_query(self.uri, f"SELECT * FROM {self.table}").to_pandas()
+
+ def time_pandas_psycopg2(self, row_count: int, data_type: str) -> None:
+ pandas.read_sql_table(self.table, self.sqlalchemy_connection)
+
+ def time_pandas_duckdb(self, row_count: int, data_type: str) -> None:
+ self.duck.sql(f"SELECT * FROM {self.table}").fetchdf()
+
+
+class OneColumnSuite(BenchmarkBase):
+ """Benchmark the time it takes to fetch a single column of a given type."""
+
+ SETUP_QUERIES = [
+ "DROP TABLE IF EXISTS {table_name}",
+ "CREATE TABLE {table_name} (items {data_type})",
+ """INSERT INTO {table_name} (items)
+ SELECT generated :: {data_type}
+ FROM GENERATE_SERIES(1, {row_count}) temp(generated)""",
+ # TODO: does an index matter, do we want to force PostgreSQL
+ # to update statistics?
+ ]
+
+ param_data = {
+ "row_count": [10_000, 100_000, 1_000_000],
+ "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
+ }
+
+ param_names = list(param_data.keys())
+ params = list(param_data.values())
+
+ def setup_cache(self) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+ with psycopg.connect(self.uri) as conn:
+ with conn.cursor() as cursor:
+ for row_count, data_type in itertools.product(*self.params):
+ table_name = self._make_table_name(row_count, data_type)
+ for query in self.SETUP_QUERIES:
+ cursor.execute(
+ query.format(
+ table_name=table_name,
+ row_count=row_count,
+ data_type=data_type,
+ )
+ )
+
+ def _make_table_name(self, row_count: int, data_type: str) -> str:
+ return (f"bench_{row_count}_{data_type.replace(' ', '_')}").lower()
+
+
+class MultiColumnSuite(BenchmarkBase):
+ """Benchmark the time it takes to fetch multiple columns of a given
type."""
+
+ SETUP_QUERIES = [
+ "DROP TABLE IF EXISTS {table_name}",
+ """
+ CREATE TABLE {table_name} (
+ a {data_type},
+ b {data_type},
+ c {data_type},
+ d {data_type}
+ )
+ """,
+ """
+ INSERT INTO {table_name} (a, b, c, d)
+ SELECT generated :: {data_type},
+ generated :: {data_type},
+ generated :: {data_type},
+ generated :: {data_type}
+ FROM GENERATE_SERIES(1, {row_count}) temp(generated)
+ """,
+ ]
+
+ param_data = {
+ "row_count": [10_000, 100_000, 1_000_000],
+ "data_type": ["INT", "BIGINT", "FLOAT", "DOUBLE PRECISION"],
+ }
+
+ param_names = list(param_data.keys())
+ params = list(param_data.values())
+
+ def setup_cache(self) -> None:
+ self.uri = os.environ["ADBC_POSTGRESQL_TEST_URI"]
+ with psycopg.connect(self.uri) as conn:
+ with conn.cursor() as cursor:
+ for row_count, data_type in itertools.product(*self.params):
+ table_name = self._make_table_name(row_count, data_type)
+ for query in self.SETUP_QUERIES:
+ cursor.execute(
+ query.format(
+ table_name=table_name,
+ row_count=row_count,
+ data_type=data_type,
+ )
+ )
+
+ def _make_table_name(self, row_count: int, data_type: str) -> str:
+ return (f"bench_{row_count}_{data_type.replace(' ', '_')}").lower()