andygrove commented on code in PR #26:
URL: https://github.com/apache/datafusion-benchmarks/pull/26#discussion_r2653693480


##########
microbenchmarks/string_functions_benchmark.py:
##########
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+"""
+Microbenchmark comparing DataFusion and DuckDB performance
+for SQL string functions on Parquet files.
+"""
+
+import tempfile
+import time
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import datafusion
+import duckdb
+
+
+@dataclass
+class BenchmarkResult:
+    """Stores benchmark results for a single function."""
+    function_name: str
+    datafusion_time_ms: float
+    duckdb_time_ms: float
+    rows: int
+
+    @property
+    def speedup(self) -> float:
+        """DuckDB time / DataFusion time (>1 means DataFusion is faster)."""
+        if self.datafusion_time_ms == 0:
+            return float('inf')
+        return self.duckdb_time_ms / self.datafusion_time_ms
+
+
+@dataclass
+class StringFunction:
+    """Defines a string function with syntax for both engines."""
+    name: str
+    datafusion_expr: str  # Expression using {col} as placeholder for column name
+    duckdb_expr: str      # Expression using {col} as placeholder for column name
+
+
+# String functions to benchmark
+# {col} will be replaced with the actual column name
+STRING_FUNCTIONS = [
+    StringFunction("trim", "trim({col})", "trim({col})"),
+    StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+    StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+    StringFunction("lower", "lower({col})", "lower({col})"),
+    StringFunction("upper", "upper({col})", "upper({col})"),
+    StringFunction("length", "length({col})", "length({col})"),
+    StringFunction("char_length", "char_length({col})", "length({col})"),
+    StringFunction("reverse", "reverse({col})", "reverse({col})"),
+    StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+    StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+    StringFunction("concat_ws", "concat_ws('-', {col}, {col})", 
"concat_ws('-', {col}, {col})"),
+    StringFunction("substring_1_5", "substring({col}, 1, 5)", 
"substring({col}, 1, 5)"),
+    StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+    StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+    StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+    StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+    StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 
'X')"),
+    StringFunction("translate", "translate({col}, 'aeiou', '12345')", 
"translate({col}, 'aeiou', '12345')"),
+    StringFunction("ascii", "ascii({col})", "ascii({col})"),
+    StringFunction("md5", "md5({col})", "md5({col})"),
+    StringFunction("sha256", "sha256({col})", "sha256({col})"),
+    StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+    StringFunction("split_part", "split_part({col}, ' ', 1)", 
"split_part({col}, ' ', 1)"),
+    StringFunction("starts_with", "starts_with({col}, 'test')", 
"starts_with({col}, 'test')"),
+    StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 
'data')"),
+    StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+    StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", 
"regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_test_data(num_rows: int = 1_000_000) -> pa.Table:
+    """Generate test data with various string patterns."""
+    import random
+    import string
+
+    random.seed(42)  # For reproducibility
+
+    # Generate diverse string data
+    strings = []
+    for i in range(num_rows):
+        # Mix of different string patterns
+        pattern_type = i % 5
+        if pattern_type == 0:
+            # Short strings with spaces
+            s = f"  test_{i % 1000}  "
+        elif pattern_type == 1:
+            # Longer strings
+            s = ''.join(random.choices(string.ascii_lowercase, k=20))
+        elif pattern_type == 2:
+            # Mixed case with numbers
+            s = f"TestData_{i}_Value"
+        elif pattern_type == 3:
+            # Strings with special patterns
+            s = f"hello world {i % 100} data"
+        else:
+            # Random length strings
+            length = random.randint(5, 50)
+            s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
+        strings.append(s)
+
+    table = pa.table({
+        'str_col': pa.array(strings, type=pa.string())
+    })
+
+    return table
+
+
+def setup_datafusion(parquet_path: str) -> datafusion.SessionContext:
+    """Create and configure DataFusion context."""
+    ctx = datafusion.SessionContext()
+    ctx.register_parquet('test_data', parquet_path)
+    return ctx
+
+
+def setup_duckdb(parquet_path: str) -> duckdb.DuckDBPyConnection:
+    """Create and configure DuckDB connection."""
+    conn = duckdb.connect(':memory:')
+    conn.execute(f"CREATE VIEW test_data AS SELECT * FROM 
read_parquet('{parquet_path}')")
+    return conn
+
+
+def benchmark_datafusion(ctx: datafusion.SessionContext, expr: str,
+                         warmup: int = 2, iterations: int = 5) -> float:
+    """Benchmark a query in DataFusion, return average time in ms."""
+    query = f"SELECT {expr} FROM test_data"
+
+    # Warmup runs
+    for _ in range(warmup):
+        ctx.sql(query).collect()
+
+    # Timed runs
+    times = []
+    for _ in range(iterations):
+        start = time.perf_counter()
+        ctx.sql(query).collect()
+        end = time.perf_counter()
+        times.append((end - start) * 1000)  # Convert to ms
+
+    return sum(times) / len(times)
+
+
+def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str,
+                     warmup: int = 2, iterations: int = 5) -> float:
+    """Benchmark a query in DuckDB, return average time in ms."""
+    query = f"SELECT {expr} FROM test_data"
+
+    # Use fetch_arrow_table() for fair comparison with DataFusion's collect()
+    # Both return Arrow data without Python object conversion overhead
+    for _ in range(warmup):
+        conn.execute(query).fetch_arrow_table()
+
+    # Timed runs
+    times = []
+    for _ in range(iterations):
+        start = time.perf_counter()
+        conn.execute(query).fetch_arrow_table()
+        end = time.perf_counter()
+        times.append((end - start) * 1000)  # Convert to ms
+
+    return sum(times) / len(times)
+
+
+def run_benchmarks(num_rows: int = 1_000_000,
+                   warmup: int = 2,
+                   iterations: int = 5) -> list[BenchmarkResult]:
+    """Run all benchmarks and return results."""
+    results = []
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        parquet_path = os.path.join(tmpdir, 'test_data.parquet')
+
+        # Generate and save test data
+        print(f"Generating {num_rows:,} rows of test data...")
+        table = generate_test_data(num_rows)
+        pq.write_table(table, parquet_path)
+        print(f"Parquet file written to: {parquet_path}")
+        print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} 
MB")
+
+        # Setup engines
+        print("\nSetting up DataFusion...")
+        df_ctx = setup_datafusion(parquet_path)
+
+        print("Setting up DuckDB...")
+        duck_conn = setup_duckdb(parquet_path)
+
+        # Run benchmarks
+        print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations 
each)...\n")
+
+        col = 'str_col'
+        for func in STRING_FUNCTIONS:
+            df_expr = func.datafusion_expr.format(col=col)
+            duck_expr = func.duckdb_expr.format(col=col)
+
+            print(f"  Benchmarking: {func.name}...", end=" ", flush=True)
+
+            try:
+                df_time = benchmark_datafusion(df_ctx, df_expr, warmup, iterations)
+            except Exception as e:
+                print(f"DataFusion error: {e}")
+                df_time = float('nan')
+
+            try:
+                duck_time = benchmark_duckdb(duck_conn, duck_expr, warmup, iterations)
+            except Exception as e:
+                print(f"DuckDB error: {e}")
+                duck_time = float('nan')
+
+            result = BenchmarkResult(
+                function_name=func.name,
+                datafusion_time_ms=df_time,
+                duckdb_time_ms=duck_time,
+                rows=num_rows
+            )
+            results.append(result)
+
+            # Print progress
+            if df_time == df_time and duck_time == duck_time:  # NaN != NaN, so this skips errored runs
+                faster = "DataFusion" if df_time < duck_time else "DuckDB"
+                ratio = max(df_time, duck_time) / min(df_time, duck_time)
+                print(f"done ({faster} {ratio:.2f}x faster)")
+            else:
+                print("done (with errors)")
+
+        duck_conn.close()
+
+    return results
+
+
+def format_results_markdown(results: list[BenchmarkResult]) -> str:
+    """Format benchmark results as a markdown table."""
+    lines = [
+        "# String Function Microbenchmarks: DataFusion vs DuckDB",

Review Comment:
   Thanks, I updated this. The header now shows:
   
   ```
   | Function | DataFusion 50.0.0 (ms) | DuckDB 1.4.3 (ms) | Speedup | Faster |
   ```
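   
   For context, here is a minimal sketch of how that header and the result rows could be rendered from the `BenchmarkResult` objects. The `format_table` helper and the `__version__` lookups are illustrative assumptions (the diff above is truncated before the table-building code), not the actual implementation:
   
   ```python
   # Hypothetical sketch; only the header shape is confirmed above.
   # datafusion.__version__ / duckdb.__version__ are assumed to be the
   # source of the "50.0.0" / "1.4.3" version strings.
   def format_table(results: list[BenchmarkResult]) -> str:
       header = (f"| Function | DataFusion {datafusion.__version__} (ms) "
                 f"| DuckDB {duckdb.__version__} (ms) | Speedup | Faster |")
       lines = [header, "|---|---|---|---|---|"]
       for r in results:
           # speedup > 1 means DataFusion ran faster (see the speedup property)
           faster = "DataFusion" if r.speedup > 1 else "DuckDB"
           lines.append(f"| {r.function_name} | {r.datafusion_time_ms:.2f} "
                        f"| {r.duckdb_time_ms:.2f} | {r.speedup:.2f}x | {faster} |")
       return "\n".join(lines)
   ```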



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

