(arrow-adbc) branch main updated: chore(dev/bench): add a simple python benchmark script for ADBC (#1879)

lidavidm Tue, 21 May 2024 17:34:08 -0700

This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git



The following commit(s) were added to refs/heads/main by this push:
     new d4933e410 chore(dev/bench): add a simple python benchmark script for ADBC (#1879)
d4933e410 is described below

commit d4933e4104e08cf1154b6870af442072ffa7b46e
Author: Matt Topol <[email protected]>
AuthorDate: Tue May 21 20:33:30 2024 -0400

    chore(dev/bench): add a simple python benchmark script for ADBC (#1879)
    
    Just a starting point to hopefully continue building up some
    benchmarking suite stuff.
---
 .pre-commit-config.yaml |   4 +-
 dev/bench/README.md     |  30 ++++++++
 dev/bench/run_bench.py  | 177 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 209 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2b726f1b4..fafc20df4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -40,7 +40,7 @@ repos:
     - id: trailing-whitespace
       exclude: "^r/.*?/_snaps/.*?.md$"
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: "v18.1.4"
+    rev: "v18.1.5"
     hooks:
       - id: clang-format
         types_or: [c, c++]
@@ -59,7 +59,7 @@ repos:
         - "--linelength=90"
         - "--verbose=2"
   - repo: https://github.com/golangci/golangci-lint
-    rev: v1.57.2
+    rev: v1.58.2
     hooks:
     - id: golangci-lint
       entry: bash -c 'cd go/adbc && golangci-lint run --fix --timeout 5m'
diff --git a/dev/bench/README.md b/dev/bench/README.md
new file mode 100644
index 000000000..1c332885d
--- /dev/null
+++ b/dev/bench/README.md
@@ -0,0 +1,30 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Simple Python Benchmark script
+
+Connection parameters need to be filled in before the script can be run. The intent
+is for this to be a simple script that repeatedly runs a `SELECT`
+query, solely for testing data transfer and memory usage rates for simple queries.
+
+The initial sample here is designed for testing against Snowflake, and so contains
+functions for testing the ADBC Snowflake driver, the [snowflake-python-connector](https://pypi.org/project/snowflake-connector-python/), and ODBC via pyodbc.
+
+If `matplotlib` is installed, it will also plot the timing and memory usage as
+charts which can be saved.
diff --git a/dev/bench/run_bench.py b/dev/bench/run_bench.py
new file mode 100644
index 000000000..0ce2fbcf3
--- /dev/null
+++ b/dev/bench/run_bench.py
@@ -0,0 +1,177 @@
+# #!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import time
+
+import psutil
+
+import adbc_driver_snowflake.dbapi
+
+# import pyodbc
+# import snowflake.connector
+
+
+process = psutil.Process()
+
+SAMPLE_RATE = 10  # record data every SAMPLE_RATE execution
+
+can_draw = True
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    print("graphs cannot be drawn as matplotlib is not installed")
+    can_draw = False
+
+
+def task_execution_decorator(func, perf_file, memory_file):
+    count = 0
+
+    def wrapper(*args, **kwargs):
+        start = time.time()
+        func(*args, **kwargs)
+        memory_usage = (
+            process.memory_info().rss / 1024 / 1024
+        )  # rss is in bytes, we convert to MB
+        period = time.time() - start
+        nonlocal count
+        if count % SAMPLE_RATE == 0:
+            perf_file.write(str(period) + "\n")
+            print(f"execution {count}")
+            print(f"memory usage: {memory_usage} MB")
+            print(f"execution time: {period} s")
+            memory_file.write(str(memory_usage) + "\n")
+        count += 1
+
+    return wrapper
+
+
+def task_fetch_arrow_batches(cursor, table_name, row_count_limit=50000):
+    ret = cursor.execute(
+        f"select * from {table_name} limit {row_count_limit}"
+    ).fetch_arrow_batches()  # interface for snowflake-python-connector
+    for _ in ret:
+        pass
+
+
+def task_fetch_record_batch(cursor, table_name, row_count_limit=50000):
+    cursor.execute(f"select * from {table_name} limit {row_count_limit}")
+    ret = cursor.fetch_record_batch()  # interface we provide for dbapi
+    for _ in ret:
+        pass
+
+
+def task_fetch_odbc(cursor, table_name, row_count_limit=50000):
+    cursor.execute(f"select * from {table_name} limit {row_count_limit}")
+    batch_size = row_count_limit / 1000
+    while True:
+        rows = cursor.fetchmany(batch_size)
+        if len(rows) == 0:
+            break
+
+
+def execute_task(task, cursor, table_name, iteration_cnt, 
row_count_limit=50000):
+    for _ in range(iteration_cnt):
+        task(cursor, table_name, row_count_limit)
+
+
+ADBC_CONNECTION_PARAMETERS = {  # kwargs for adbc_driver_snowflake.dbapi.connect()
+    "user": "...",  # "..." values are placeholders to fill in before running
+    "password": "...",
+    "adbc.snowflake.sql.account": "...",
+}
+PYODBC_CONNECTION_STR = "DSN=..."  # for the commented-out pyodbc.connect() variant
+CONNECTION_PARAMETERS = {  # for the commented-out snowflake.connector.connect() variant
+    "user": "...",
+    "password": "...",
+    "account": "...",
+}
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--iteration_cnt",
+        type=int,
+        default=50,
+        help="how many times to run the test function, default is 5000",
+    )
+    parser.add_argument(
+        "--row_count",
+        type=int,
+        default=10000,
+        help="how many rows of data to fetch",
+    )
+    parser.add_argument(
+        "--test_table_name",
+        type=str,
+        default="SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM",
+        hep="an existing test table that has data prepared",
+    )
+    args = parser.parse_args()
+
+    test_table_name = args.test_table_name
+    perf_record_file = "stress_perf_record"
+    memory_record_file = "stress_memory_record"
+
+    # with pyodbc.connect(PYODBC_CONNECTION_STR) as conn, conn.cursor() as 
cursor:
+    # with snowflake.connector.connect(
+    #  **CONNECTION_PARAMETERS
+    # ) as conn, conn.cursor() as cursor:
+    with adbc_driver_snowflake.dbapi.connect(
+        **ADBC_CONNECTION_PARAMETERS
+    ) as conn, conn.cursor() as cursor:
+        cursor.adbc.statement.set_options(
+            **{
+                "adbc.snowflake.rpc.prefetch_concurrency": 4,
+                "adbc.rpc.result_queue_size": 100,
+            }
+        )
+
+        with open(perf_record_file, "w") as perf_file, open(
+            memory_record_file, "w"
+        ) as memory_file:
+            # task = task_execution_decorator( # snowflake python connector
+            #  task_fetch_arrow_batches, perf_file, memory_file)
+            # task = task_execution_decorator( # pyodbc
+            #  task_fetch_odbc, perf_file, memory_file)
+            task = task_execution_decorator(
+                task_fetch_record_batch, perf_file, memory_file
+            )
+            execute_task(
+                task, cursor, test_table_name, args.iteration_cnt, 
args.row_count
+            )
+
+        if can_draw:
+            with open(perf_record_file) as perf_file, open(
+                memory_record_file
+            ) as memory_file:
+                # sample rate
+                perf_lines = perf_file.readlines()
+                perf_records = [float(line) for line in perf_lines]
+
+                memory_lines = memory_file.readlines()
+                memory_records = [float(line) for line in memory_lines]
+
+                plt.plot([i for i in range(len(perf_records))], perf_records)
+                plt.title("per iteration execution time")
+                plt.show(block=False)
+                plt.figure()
+                plt.plot([i for i in range(len(memory_records))], 
memory_records)
+                plt.title("memory usage")
+                plt.show(block=True)

(arrow-adbc) branch main updated: chore(dev/bench): add a simple python benchmark script for ADBC (#1879)

Reply via email to