This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new d4933e410 chore(dev/bench): add a simple python benchmark script for
ADBC (#1879)
d4933e410 is described below
commit d4933e4104e08cf1154b6870af442072ffa7b46e
Author: Matt Topol <[email protected]>
AuthorDate: Tue May 21 20:33:30 2024 -0400
chore(dev/bench): add a simple python benchmark script for ADBC (#1879)
Just a starting point to hopefully continue building up some
benchmarking suite stuff.
---
.pre-commit-config.yaml | 4 +-
dev/bench/README.md | 30 ++++++++
dev/bench/run_bench.py | 177 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 209 insertions(+), 2 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2b726f1b4..fafc20df4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -40,7 +40,7 @@ repos:
- id: trailing-whitespace
exclude: "^r/.*?/_snaps/.*?.md$"
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: "v18.1.4"
+ rev: "v18.1.5"
hooks:
- id: clang-format
types_or: [c, c++]
@@ -59,7 +59,7 @@ repos:
- "--linelength=90"
- "--verbose=2"
- repo: https://github.com/golangci/golangci-lint
- rev: v1.57.2
+ rev: v1.58.2
hooks:
- id: golangci-lint
entry: bash -c 'cd go/adbc && golangci-lint run --fix --timeout 5m'
diff --git a/dev/bench/README.md b/dev/bench/README.md
new file mode 100644
index 000000000..1c332885d
--- /dev/null
+++ b/dev/bench/README.md
@@ -0,0 +1,30 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Simple Python Benchmark script
+
+Connection parameters need to be filled in before the script can be run. The intent
+is for this to be a simple enough script to provide iterations on running a `SELECT`
+query solely for testing data transfer and memory usage rates for simple queries.
+
+The initial sample here is designed for testing against Snowflake, and so contains
+functions for testing the ADBC Snowflake driver, the [snowflake-python-connector](https://pypi.org/project/snowflake-connector-python/), and using ODBC via pyodbc.
+
+If `matplotlib` is installed, it will also draw the timing and memory usage up as
+charts which can be saved.
diff --git a/dev/bench/run_bench.py b/dev/bench/run_bench.py
new file mode 100644
index 000000000..0ce2fbcf3
--- /dev/null
+++ b/dev/bench/run_bench.py
@@ -0,0 +1,177 @@
+# #!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import time
+
+import psutil
+
+import adbc_driver_snowflake.dbapi
+
+# import pyodbc
+# import snowflake.connector
+
+
+# Handle used to sample this process's resident memory after each task run.
+process = psutil.Process()
+
+SAMPLE_RATE = 10  # record data every SAMPLE_RATE execution
+
+# Plotting is optional: without matplotlib the script only writes/prints data.
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    print("graphs cannot be drawn as matplotlib is not installed")
+    can_draw = False
+else:
+    can_draw = True
+
+
+def task_execution_decorator(func, perf_file, memory_file):
+    """Wrap ``func`` so each call is timed and its memory footprint sampled.
+
+    Every ``SAMPLE_RATE``-th call (starting with the first) prints the
+    measurements, appends the elapsed seconds to ``perf_file`` and appends the
+    process's resident set size in MB to ``memory_file``, one value per line.
+
+    Args:
+        func: The benchmark task to wrap.
+        perf_file: Writable text file that receives execution times.
+        memory_file: Writable text file that receives memory usage figures.
+
+    Returns:
+        A wrapper that forwards its arguments to ``func`` and returns
+        whatever ``func`` returned.
+    """
+    count = 0
+
+    def wrapper(*args, **kwargs):
+        nonlocal count
+        # perf_counter is monotonic, so wall-clock adjustments cannot skew
+        # the measured duration.
+        start = time.perf_counter()
+        result = func(*args, **kwargs)
+        # Stop the clock before sampling memory so the cost of the psutil
+        # call is not billed to the task being measured.
+        period = time.perf_counter() - start
+        # rss is in bytes, we convert to MB
+        memory_usage = process.memory_info().rss / 1024 / 1024
+        if count % SAMPLE_RATE == 0:
+            perf_file.write(str(period) + "\n")
+            print(f"execution {count}")
+            print(f"memory usage: {memory_usage} MB")
+            print(f"execution time: {period} s")
+            memory_file.write(str(memory_usage) + "\n")
+        count += 1
+        return result
+
+    return wrapper
+
+
+def task_fetch_arrow_batches(cursor, table_name, row_count_limit=50000):
+    """Run a limited ``select *`` and drain it via ``fetch_arrow_batches``.
+
+    The batches are discarded as they are produced; the function exists only
+    to drive the fetch so it can be measured.
+
+    Args:
+        cursor: Cursor whose ``execute`` result exposes
+            ``fetch_arrow_batches()``.
+        table_name: Table to select from (interpolated into the SQL text).
+        row_count_limit: Value used for the query's ``limit`` clause.
+    """
+    query = f"select * from {table_name} limit {row_count_limit}"
+    result = cursor.execute(query)
+    # interface for snowflake-python-connector
+    for _batch in result.fetch_arrow_batches():
+        continue
+
+
+def task_fetch_record_batch(cursor, table_name, row_count_limit=50000):
+    """Run a limited ``select *`` and drain it via ``fetch_record_batch``.
+
+    The batches are discarded as they are produced; the function exists only
+    to drive the fetch so it can be measured.
+
+    Args:
+        cursor: Cursor providing ``execute`` and ``fetch_record_batch()``.
+        table_name: Table to select from (interpolated into the SQL text).
+        row_count_limit: Value used for the query's ``limit`` clause.
+    """
+    query = f"select * from {table_name} limit {row_count_limit}"
+    cursor.execute(query)
+    # interface we provide for dbapi
+    reader = cursor.fetch_record_batch()
+    for _batch in reader:
+        continue
+
+
+def task_fetch_odbc(cursor, table_name, row_count_limit=50000):
+    """Run a limited ``select *`` and drain it with repeated ``fetchmany``.
+
+    Rows are fetched in chunks of one thousandth of ``row_count_limit`` and
+    discarded; the function exists only to drive the fetch so it can be
+    measured.
+
+    Args:
+        cursor: DB-API style cursor providing ``execute`` and ``fetchmany``.
+        table_name: Table to select from (interpolated into the SQL text).
+        row_count_limit: Value used for the query's ``limit`` clause.
+    """
+    cursor.execute(f"select * from {table_name} limit {row_count_limit}")
+    # fetchmany() takes an integer row count, so use floor division, and keep
+    # the batch at least 1 so limits below 1000 still drain the result set.
+    batch_size = max(1, row_count_limit // 1000)
+    while cursor.fetchmany(batch_size):
+        pass
+
+
+def execute_task(task, cursor, table_name, iteration_cnt, row_count_limit=50000):
+    """Invoke ``task`` ``iteration_cnt`` times with the same arguments.
+
+    Args:
+        task: Callable taking ``(cursor, table_name, row_count_limit)``.
+        cursor: Cursor handed to every invocation.
+        table_name: Table name handed to every invocation.
+        iteration_cnt: Number of times ``task`` is called.
+        row_count_limit: Row limit handed to every invocation.
+    """
+    for _iteration in range(iteration_cnt):
+        task(cursor, table_name, row_count_limit)
+
+
+# Placeholder credentials: replace every "..." before running the script.
+
+# Keyword arguments unpacked into adbc_driver_snowflake.dbapi.connect().
+ADBC_CONNECTION_PARAMETERS = {
+    "user": "...",
+    "password": "...",
+    "adbc.snowflake.sql.account": "...",
+}
+
+# Connection string for the (commented-out) pyodbc variant.
+PYODBC_CONNECTION_STR = "DSN=..."
+
+# Keyword arguments for the (commented-out) snowflake.connector variant.
+CONNECTION_PARAMETERS = {
+    "user": "...",
+    "password": "...",
+    "account": "...",
+}
+
+if __name__ == "__main__":
+    # Command-line options controlling the benchmark run.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--iteration_cnt",
+        type=int,
+        default=50,
+        help="how many times to run the test function, default is 50",
+    )
+    parser.add_argument(
+        "--row_count",
+        type=int,
+        default=10000,
+        help="how many rows of data to fetch",
+    )
+    parser.add_argument(
+        "--test_table_name",
+        type=str,
+        default="SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM",
+        help="an existing test table that has data prepared",
+    )
+    args = parser.parse_args()
+
+    test_table_name = args.test_table_name
+    # Output files: one measurement per line, written by the decorated task.
+    perf_record_file = "stress_perf_record"
+    memory_record_file = "stress_memory_record"
+
+    # To benchmark another client, swap in one of the commented-out
+    # connections below together with the matching task further down.
+    # with pyodbc.connect(PYODBC_CONNECTION_STR) as conn, conn.cursor() as cursor:
+    # with snowflake.connector.connect(
+    #     **CONNECTION_PARAMETERS
+    # ) as conn, conn.cursor() as cursor:
+    with adbc_driver_snowflake.dbapi.connect(
+        **ADBC_CONNECTION_PARAMETERS
+    ) as conn, conn.cursor() as cursor:
+        cursor.adbc.statement.set_options(
+            **{
+                "adbc.snowflake.rpc.prefetch_concurrency": 4,
+                "adbc.rpc.result_queue_size": 100,
+            }
+        )
+
+        with open(perf_record_file, "w") as perf_file, open(
+            memory_record_file, "w"
+        ) as memory_file:
+            # task = task_execution_decorator(  # snowflake python connector
+            #     task_fetch_arrow_batches, perf_file, memory_file)
+            # task = task_execution_decorator(  # pyodbc
+            #     task_fetch_odbc, perf_file, memory_file)
+            task = task_execution_decorator(
+                task_fetch_record_batch, perf_file, memory_file
+            )
+            execute_task(
+                task, cursor, test_table_name, args.iteration_cnt, args.row_count
+            )
+
+    if can_draw:
+        # Read back the recorded samples (one float per line in each file).
+        with open(perf_record_file) as perf_file, open(
+            memory_record_file
+        ) as memory_file:
+            perf_records = [float(line) for line in perf_file]
+            memory_records = [float(line) for line in memory_file]
+
+        # First figure is shown non-blocking so both windows appear together;
+        # the final blocking show() keeps them open until closed.
+        plt.plot(range(len(perf_records)), perf_records)
+        plt.title("per iteration execution time")
+        plt.show(block=False)
+        plt.figure()
+        plt.plot(range(len(memory_records)), memory_records)
+        plt.title("memory usage")
+        plt.show(block=True)