This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new adb867c feature: add a benchmark github action to compare a few spatial libs (#72)
adb867c is described below
commit adb867ccc4b18c80a82a05e202e6eb54e0471aa1
Author: Jia Yu <[email protected]>
AuthorDate: Wed Jan 14 20:22:55 2026 -0700
feature: add a benchmark github action to compare a few spatial libs (#72)
* Add initial code
* Use cached huggingface data
* fix
* Fix
* Fix
* fix
* fix
* Fix
---
.github/workflows/benchmark.yml | 368 +++++++++++++
README.md | 2 +-
benchmark/run_benchmark.py | 607 +++++++++++++++++++++
benchmark/summarize_results.py | 313 +++++++++++
.../geopandas_queries.py | 4 +-
.../print_queries.py | 9 +-
.../spatial_polars.py | 4 +-
7 files changed, 1297 insertions(+), 10 deletions(-)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..8b5c81a
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,368 @@
+name: SpatialBench Benchmark
+
+on:
+ # Run at 6:00 UTC on the first and third Monday of each month
+ schedule:
+ - cron: '0 6 1-7,15-21 * 1'
+ # Run on PRs that modify benchmark code
+ pull_request:
+ paths:
+ - 'benchmark/**'
+ - 'spatialbench-queries/**'
+ - '.github/workflows/benchmark.yml'
+ # Run on pushes to main that modify benchmark code
+ push:
+ branches: ["main"]
+ paths:
+ - 'benchmark/**'
+ - 'spatialbench-queries/**'
+ - '.github/workflows/benchmark.yml'
+ # Allow manual triggering with extended options
+ workflow_dispatch:
+ inputs:
+ scale_factor:
+ description: 'Scale factor for benchmark'
+ required: false
+ default: '1'
+ type: choice
+ options:
+ - '0.1'
+ - '1'
+ - '10'
+ engines:
+ description: 'Engines to benchmark (comma-separated)'
+ required: false
+ default: 'duckdb,geopandas,sedonadb'
+ type: string
+ timeout:
+ description: 'Query timeout in seconds (default: 60, increase for full benchmark)'
+ required: false
+ default: '60'
+ type: string
+ sedonadb_version:
+ description: 'SedonaDB version (e.g., 1.0.0, leave empty for latest)'
+ required: false
+ default: ''
+ type: string
+ duckdb_version:
+ description: 'DuckDB version (e.g., 1.0.0, leave empty for latest)'
+ required: false
+ default: ''
+ type: string
+ geopandas_version:
+ description: 'GeoPandas version (e.g., 1.0.0, leave empty for latest)'
+ required: false
+ default: ''
+ type: string
+ runs:
+ description: 'Number of runs per query (average taken for fair comparison)'
+ required: false
+ default: '3'
+ type: choice
+ options:
+ - '1'
+ - '3'
+ - '5'
+
+concurrency:
+ group: ${{ github.repository }}-${{ github.ref }}-benchmark
+ cancel-in-progress: true
+
+env:
+ CARGO_TERM_COLOR: always
+ SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
+ BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb' }}
+ QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
+ BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }}
+ # Package versions (empty = latest, can be overridden via workflow_dispatch)
+ SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }}
+ DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }}
+ GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }}
+ # Hugging Face dataset for benchmark data
+ HF_DATASET: apache-sedona/spatialbench
+ HF_DATA_VERSION: v0.1.0
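+ # Layout on Hugging Face (illustrative, based on the download step below):
+ #   <HF_DATASET>/<HF_DATA_VERSION>/sf<SF>/<table>/*.parquet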
+
+jobs:
+ # Download benchmark data from Hugging Face
+ download-data:
+ name: Download Data (SF${{ github.event.inputs.scale_factor || '1' }})
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Cache benchmark data
+ id: cache-data
+ uses: actions/cache@v4
+ with:
+ path: benchmark-data-sf${{ env.SCALE_FACTOR }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+
+ - name: Setup Python
+ if: steps.cache-data.outputs.cache-hit != 'true'
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install huggingface-hub
+ if: steps.cache-data.outputs.cache-hit != 'true'
+ run: pip install huggingface-hub
+
+ - name: Download benchmark data from Hugging Face
+ if: steps.cache-data.outputs.cache-hit != 'true'
+ run: |
+ # Map scale factor to HF folder name
+ SF="${{ env.SCALE_FACTOR }}"
+ if [ "$SF" = "0.1" ]; then
+ HF_SF="sf0.1"
+ else
+ HF_SF="sf${SF}"
+ fi
+
+ echo "Downloading data from HF: ${{ env.HF_DATASET }}/${{
env.HF_DATA_VERSION }}/${HF_SF}"
+
+ python -c "
+ from huggingface_hub import snapshot_download
+ import os
+
+ sf = os.environ['SCALE_FACTOR']
+ hf_sf = 'sf0.1' if sf == '0.1' else f'sf{sf}'
+
+ snapshot_download(
+ repo_id='${{ env.HF_DATASET }}',
+ repo_type='dataset',
+ local_dir='hf-data',
+ allow_patterns=[f'${{ env.HF_DATA_VERSION }}/{hf_sf}/**'],
+ )
+ "
+
+ # Move data to expected location
+ mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }}
+
+ SF="${{ env.SCALE_FACTOR }}"
+ if [ "$SF" = "0.1" ]; then
+ HF_SF="sf0.1"
+ else
+ HF_SF="sf${SF}"
+ fi
+
+ cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ env.SCALE_FACTOR }}/
+
+ echo "Downloaded data structure:"
+ find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name "*.parquet" | head -20
+ echo ""
+ echo "Directory contents:"
+ ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ echo ""
+ echo "Total size:"
+ du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+
+ - name: Show cached data info
+ if: steps.cache-data.outputs.cache-hit == 'true'
+ run: |
+ echo "Using cached benchmark data"
+ echo "Directory contents:"
+ ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ echo ""
+ echo "Total size:"
+ du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+
+ benchmark-duckdb:
+ name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }})
+ needs: download-data
+ runs-on: ubuntu-latest
+ if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'duckdb')
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Restore benchmark data from cache
+ uses: actions/cache/restore@v4
+ with:
+ path: benchmark-data-sf${{ env.SCALE_FACTOR }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+ fail-on-cache-miss: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install dependencies
+ run: |
+ if [ -n "${{ env.DUCKDB_VERSION }}" ]; then
+ pip install "duckdb==${{ env.DUCKDB_VERSION }}" pyarrow pandas
+ else
+ pip install duckdb pyarrow pandas
+ fi
+ echo "Installed DuckDB version: $(python -c 'import duckdb;
print(duckdb.__version__)')"
+
+ - name: Pre-install DuckDB spatial extension
+ run: |
+ python -c "import duckdb; con = duckdb.connect();
con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"
+
+ - name: Run DuckDB benchmark
+ run: |
+ python benchmark/run_benchmark.py \
+ --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --engines duckdb \
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --runs ${{ env.BENCHMARK_RUNS }} \
+ --scale-factor ${{ env.SCALE_FACTOR }} \
+ --output duckdb_results.json
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: duckdb-results-sf${{ env.SCALE_FACTOR }}
+ path: duckdb_results.json
+ retention-days: 30
+
+ benchmark-geopandas:
+ name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1' }})
+ needs: download-data
+ runs-on: ubuntu-latest
+ if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'geopandas')
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Restore benchmark data from cache
+ uses: actions/cache/restore@v4
+ with:
+ path: benchmark-data-sf${{ env.SCALE_FACTOR }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+ fail-on-cache-miss: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install dependencies
+ run: |
+ if [ -n "${{ env.GEOPANDAS_VERSION }}" ]; then
+ pip install "geopandas==${{ env.GEOPANDAS_VERSION }}" pandas
pyarrow shapely
+ else
+ pip install geopandas pandas pyarrow shapely
+ fi
+ echo "Installed GeoPandas version: $(python -c 'from
importlib.metadata import version; print(version(\"geopandas\"))')"
+
+ - name: Run GeoPandas benchmark
+ run: |
+ python benchmark/run_benchmark.py \
+ --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --engines geopandas \
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --runs ${{ env.BENCHMARK_RUNS }} \
+ --scale-factor ${{ env.SCALE_FACTOR }} \
+ --output geopandas_results.json
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: geopandas-results-sf${{ env.SCALE_FACTOR }}
+ path: geopandas_results.json
+ retention-days: 30
+
+ benchmark-sedonadb:
+ name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }})
+ needs: download-data
+ runs-on: ubuntu-latest
+ if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'sedonadb')
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Restore benchmark data from cache
+ uses: actions/cache/restore@v4
+ with:
+ path: benchmark-data-sf${{ env.SCALE_FACTOR }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+ fail-on-cache-miss: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Install dependencies
+ run: |
+ if [ -n "${{ env.SEDONADB_VERSION }}" ]; then
+ pip install "sedonadb[geopandas]==${{ env.SEDONADB_VERSION }}"
pandas pyarrow pyproj
+ else
+ pip install "sedonadb[geopandas]" pandas pyarrow pyproj
+ fi
+ echo "Installed SedonaDB version: $(python -c 'from
importlib.metadata import version; print(version(\"sedonadb\"))')"
+
+ - name: Run SedonaDB benchmark
+ run: |
+ python benchmark/run_benchmark.py \
+ --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --engines sedonadb \
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --runs ${{ env.BENCHMARK_RUNS }} \
+ --scale-factor ${{ env.SCALE_FACTOR }} \
+ --output sedonadb_results.json
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
+ path: sedonadb_results.json
+ retention-days: 30
+
+ summarize-results:
+ name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }})
+ needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb]
+ if: always() && (needs.benchmark-duckdb.result == 'success' ||
needs.benchmark-geopandas.result == 'success' ||
needs.benchmark-sedonadb.result == 'success')
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Download DuckDB results
+ if: needs.benchmark-duckdb.result == 'success'
+ uses: actions/download-artifact@v4
+ with:
+ name: duckdb-results-sf${{ env.SCALE_FACTOR }}
+ path: results
+ continue-on-error: true
+
+ - name: Download GeoPandas results
+ if: needs.benchmark-geopandas.result == 'success'
+ uses: actions/download-artifact@v4
+ with:
+ name: geopandas-results-sf${{ env.SCALE_FACTOR }}
+ path: results
+ continue-on-error: true
+
+ - name: Download SedonaDB results
+ uses: actions/download-artifact@v4
+ with:
+ name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
+ path: results
+ continue-on-error: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Generate summary
+ run: |
+ python benchmark/summarize_results.py \
+ --results-dir results \
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --runs ${{ env.BENCHMARK_RUNS }} \
+ --output benchmark_summary.md
+
+ - name: Display summary
+ run: cat benchmark_summary.md
+
+ - name: Add summary to job output
+ run: cat benchmark_summary.md >> $GITHUB_STEP_SUMMARY
+
+ - name: Upload combined results
+ uses: actions/upload-artifact@v4
+ with:
+ name: benchmark-summary-sf${{ env.SCALE_FACTOR }}
+ path: |
+ results/
+ benchmark_summary.md
+ retention-days: 90
diff --git a/README.md b/README.md
index 9020cf3..24680ea 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ We welcome contributions and civil discussions on how to improve the queries and
You can print the queries in your dialect of choice using the following command:
```bash
-./print_queries.py <dialect>
+./spatialbench-queries/print_queries.py <dialect>
```
## Data Model
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
new file mode 100644
index 0000000..4b459f4
--- /dev/null
+++ b/benchmark/run_benchmark.py
@@ -0,0 +1,607 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+SpatialBench Benchmark Runner
+
+This script runs spatial benchmarks comparing SedonaDB, DuckDB, and GeoPandas
+on the SpatialBench queries at a specified scale factor.
+"""
+
+import argparse
+import json
+import multiprocessing
+import signal
+import sys
+import time
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Callable
+
+# Add spatialbench-queries directory to path to import query modules
+sys.path.insert(0, str(Path(__file__).parent.parent / "spatialbench-queries"))
+
+# Constants
+QUERY_COUNT = 12
+TABLES = ["building", "customer", "driver", "trip", "vehicle", "zone"]
+
+
+@dataclass
+class BenchmarkResult:
+ """Result of a single query benchmark."""
+ query: str
+ engine: str
+ time_seconds: float | None
+ row_count: int | None
+ status: str # "success", "error", "timeout"
+ error_message: str | None = None
+
+
+@dataclass
+class BenchmarkSuite:
+ """Complete benchmark suite results."""
+ engine: str
+ scale_factor: float
+ results: list[BenchmarkResult] = field(default_factory=list)
+ total_time: float = 0.0
+ timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+ version: str = "unknown"
+
+ def to_dict(self) -> dict[str, Any]:
+ return {
+ "engine": self.engine,
+ "version": self.version,
+ "scale_factor": self.scale_factor,
+ "timestamp": self.timestamp,
+ "total_time": self.total_time,
+ "results": [
+ {
+ "query": r.query,
+ "time_seconds": r.time_seconds,
+ "row_count": r.row_count,
+ "status": r.status,
+ "error_message": r.error_message,
+ }
+ for r in self.results
+ ],
+ }
+
+
+class QueryTimeoutError(Exception):
+ """Raised when a query times out."""
+ pass
+
+
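+# Assumed helper: BaseBenchmark.run_query() below references timeout_handler,
+# which this file does not otherwise define. This is a minimal SIGALRM-based
+# sketch consistent with the signal/contextmanager imports above.
+@contextmanager
+def timeout_handler(timeout: int, query_name: str):
+    """Raise QueryTimeoutError if the wrapped block runs longer than `timeout` seconds."""
+    def _raise_timeout(signum, frame):
+        raise QueryTimeoutError(f"Query {query_name} timed out after {timeout}s")
+    old_handler = signal.signal(signal.SIGALRM, _raise_timeout)
+    signal.alarm(timeout)
+    try:
+        yield
+    finally:
+        signal.alarm(0)  # cancel any pending alarm
+        signal.signal(signal.SIGALRM, old_handler)
+
+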
+def _run_query_in_process(
+ result_queue: multiprocessing.Queue,
+ engine_class: type,
+ data_paths: dict[str, str],
+ query_name: str,
+ query_sql: str | None,
+):
+ """Worker function to run a query in a separate process.
+
+ This allows us to forcefully terminate queries that hang or consume
+ too much memory, which SIGALRM cannot do for native code.
+ """
+ try:
+ benchmark = engine_class(data_paths)
+ benchmark.setup()
+ try:
+ start_time = time.perf_counter()
+ row_count, _ = benchmark.execute_query(query_name, query_sql)
+ elapsed = time.perf_counter() - start_time
+ result_queue.put({
+ "status": "success",
+ "time_seconds": round(elapsed, 2),
+ "row_count": row_count,
+ "error_message": None,
+ })
+ finally:
+ benchmark.teardown()
+ except Exception as e:
+ result_queue.put({
+ "status": "error",
+ "time_seconds": None,
+ "row_count": None,
+ "error_message": str(e),
+ })
+
+
+def get_data_paths(data_dir: str) -> dict[str, str]:
+ """Get paths to all data tables.
+
+ Supports two data formats:
+ 1. Directory format: table_name/*.parquet (e.g., building/building.1.parquet)
+ 2. Single file format: table_name.parquet (e.g., building.parquet)
+
+ Returns directory paths for directories containing parquet files.
+ DuckDB, pandas, and SedonaDB can all read every parquet file in a directory.
+ """
+ data_path = Path(data_dir)
+ paths = {}
+
+ for table in TABLES:
+ table_path = data_path / table
+ # Check for directory format first (from HF: building/building.1.parquet)
+ if table_path.is_dir():
+ # Return the directory path; DuckDB, pandas, and SedonaDB all support
+ # reading every parquet file in a directory
+ paths[table] = str(table_path)
+ # Then check for single file format (building.parquet)
+ elif (data_path / f"{table}.parquet").exists():
+ paths[table] = str(data_path / f"{table}.parquet")
+ # Finally check for any matching parquet files
+ else:
+ matches = list(data_path.glob(f"{table}*.parquet"))
+ if matches:
+ paths[table] = str(matches[0])
+
+ return paths
+
+
+class BaseBenchmark(ABC):
+ """Base class for benchmark runners."""
+
+ def __init__(self, data_paths: dict[str, str], engine_name: str):
+ self.data_paths = data_paths
+ self.engine_name = engine_name
+
+ @abstractmethod
+ def setup(self) -> None:
+ """Initialize the benchmark environment."""
+ pass
+
+ @abstractmethod
+ def teardown(self) -> None:
+ """Cleanup the benchmark environment."""
+ pass
+
+ @abstractmethod
+ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]:
+ """Execute a query and return (row_count, result)."""
+ pass
+
+ def run_query(self, query_name: str, query: str | None = None, timeout: int = 1200) -> BenchmarkResult:
+ """Run a single query with timeout handling."""
+ start_time = time.perf_counter()
+ try:
+ with timeout_handler(timeout, query_name):
+ row_count, _ = self.execute_query(query_name, query)
+ elapsed = time.perf_counter() - start_time
+ return BenchmarkResult(
+ query=query_name,
+ engine=self.engine_name,
+ time_seconds=round(elapsed, 2),
+ row_count=row_count,
+ status="success",
+ )
+ except (TimeoutError, QueryTimeoutError) as e:
+ return BenchmarkResult(
+ query=query_name,
+ engine=self.engine_name,
+ time_seconds=timeout,
+ row_count=None,
+ status="timeout",
+ error_message=str(e),
+ )
+ except Exception as e:
+ elapsed = time.perf_counter() - start_time
+ # If elapsed time is close to or exceeds timeout, treat as timeout
+ # This handles cases where native code (Rust/C) throws a different exception
+ # when interrupted by SIGALRM
+ if elapsed >= timeout * 0.95: # 95% of timeout to account for timing variance
+ return BenchmarkResult(
+ query=query_name,
+ engine=self.engine_name,
+ time_seconds=timeout,
+ row_count=None,
+ status="timeout",
+ error_message=f"Query timed out after {timeout}s (original
error: {e})",
+ )
+ return BenchmarkResult(
+ query=query_name,
+ engine=self.engine_name,
+ time_seconds=None,
+ row_count=None,
+ status="error",
+ error_message=str(e),
+ )
+
+
+class DuckDBBenchmark(BaseBenchmark):
+ """DuckDB benchmark runner."""
+
+ def __init__(self, data_paths: dict[str, str]):
+ super().__init__(data_paths, "duckdb")
+ self._conn = None
+
+ def setup(self) -> None:
+ import duckdb
+ self._conn = duckdb.connect()
+ self._conn.execute("LOAD spatial;")
+ self._conn.execute("SET enable_external_file_cache = false;")
+ for table, path in self.data_paths.items():
+ # DuckDB needs glob pattern for directories, add /*.parquet if path is a directory
+ parquet_path = path
+ if Path(path).is_dir():
+ parquet_path = str(Path(path) / "*.parquet")
+ self._conn.execute(f"CREATE VIEW {table} AS SELECT * FROM
read_parquet('{parquet_path}')")
+
+ def teardown(self) -> None:
+ if self._conn:
+ self._conn.close()
+ self._conn = None
+
+ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]:
+ result = self._conn.execute(query).fetchall()
+ return len(result), result
+
+
+class GeoPandasBenchmark(BaseBenchmark):
+ """GeoPandas benchmark runner."""
+
+ def __init__(self, data_paths: dict[str, str]):
+ super().__init__(data_paths, "geopandas")
+ self._queries = None
+
+ def setup(self) -> None:
+ import importlib.util
+ geopandas_path = Path(__file__).parent.parent / "spatialbench-queries" / "geopandas_queries.py"
+ spec = importlib.util.spec_from_file_location("geopandas_queries", geopandas_path)
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1,
QUERY_COUNT + 1)}
+
+ def teardown(self) -> None:
+ self._queries = None
+
+ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]:
+ if query_name not in self._queries:
+ raise ValueError(f"Query {query_name} not found")
+ result = self._queries[query_name](self.data_paths)
+ return len(result), result
+
+
+class SedonaDBBenchmark(BaseBenchmark):
+ """SedonaDB benchmark runner."""
+
+ def __init__(self, data_paths: dict[str, str]):
+ super().__init__(data_paths, "sedonadb")
+ self._sedona = None
+
+ def setup(self) -> None:
+ import sedonadb
+ self._sedona = sedonadb.connect()
+ for table, path in self.data_paths.items():
+ # SedonaDB needs glob pattern for directories
+ parquet_path = path
+ if Path(path).is_dir():
+ parquet_path = str(Path(path) / "*.parquet")
+ self._sedona.read_parquet(parquet_path).to_view(table, overwrite=True)
+
+ def teardown(self) -> None:
+ self._sedona = None
+
+ def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]:
+ result = self._sedona.sql(query).to_pandas()
+ return len(result), result
+
+
+def get_sql_queries(dialect: str) -> dict[str, str]:
+ """Get SQL queries for a specific dialect from print_queries.py."""
+ from print_queries import DuckDBSpatialBenchBenchmark, SedonaDBSpatialBenchBenchmark
+
+ dialects = {
+ "duckdb": DuckDBSpatialBenchBenchmark,
+ "sedonadb": SedonaDBSpatialBenchBenchmark,
+ }
+ return dialects[dialect]().queries()
+
+
+def run_query_isolated(
+ engine_class: type,
+ engine_name: str,
+ data_paths: dict[str, str],
+ query_name: str,
+ query_sql: str | None,
+ timeout: int,
+) -> BenchmarkResult:
+ """Run a single query in an isolated subprocess with hard timeout.
+
+ This is more robust than SIGALRM because:
+ 1. Native code (C++/Rust) can be forcefully terminated
+ 2. Memory-hungry queries don't affect the main process
+ 3. Crashed queries don't invalidate the benchmark runner
+ """
+ result_queue = multiprocessing.Queue()
+ process = multiprocessing.Process(
+ target=_run_query_in_process,
+ args=(result_queue, engine_class, data_paths, query_name, query_sql),
+ )
+
+ process.start()
+ process.join(timeout=timeout)
+
+ if process.is_alive():
+ # Query exceeded timeout - forcefully terminate
+ process.terminate()
+ process.join(timeout=5) # Give it 5 seconds to terminate gracefully
+
+ if process.is_alive():
+ # Still alive - kill it
+ process.kill()
+ process.join(timeout=2)
+
+ return BenchmarkResult(
+ query=query_name,
+ engine=engine_name,
+ time_seconds=timeout,
+ row_count=None,
+ status="timeout",
+ error_message=f"Query {query_name} timed out after {timeout}
seconds (process killed)",
+ )
+
+ # Process completed - get result from queue
+ try:
+ result_data = result_queue.get_nowait()
+ return BenchmarkResult(
+ query=query_name,
+ engine=engine_name,
+ time_seconds=result_data["time_seconds"],
+ row_count=result_data["row_count"],
+ status=result_data["status"],
+ error_message=result_data["error_message"],
+ )
+ except Exception:
+ # Process died without putting result in queue
+ return BenchmarkResult(
+ query=query_name,
+ engine=engine_name,
+ time_seconds=None,
+ row_count=None,
+ status="error",
+ error_message=f"Query {query_name} crashed (process exit code:
{process.exitcode})",
+ )
+
+
+def run_benchmark(
+ engine: str,
+ data_paths: dict[str, str],
+ queries: list[str] | None,
+ timeout: int,
+ scale_factor: float,
+ runs: int = 3,
+) -> BenchmarkSuite:
+ """Generic benchmark runner for any engine.
+
+ Each query runs in an isolated subprocess to ensure:
+ - Hard timeout enforcement (process can be killed)
+ - Memory isolation (one query can't OOM the runner)
+ - Crash isolation (one query crash doesn't affect others)
+
+ If runs > 1 and the first run succeeds, additional runs are performed
+ and the average time is reported for fair comparison.
+ """
+
+ from importlib.metadata import version as pkg_version
+
+ # Engine configurations
+ configs = {
+ "duckdb": {
+ "class": DuckDBBenchmark,
+ "version_getter": lambda: __import__("duckdb").__version__,
+ "queries_getter": lambda: get_sql_queries("duckdb"),
+ },
+ "geopandas": {
+ "class": GeoPandasBenchmark,
+ "version_getter": lambda: pkg_version("geopandas"),
+ "queries_getter": lambda: {f"q{i}": None for i in range(1,
QUERY_COUNT + 1)},
+ },
+ "sedonadb": {
+ "class": SedonaDBBenchmark,
+ "version_getter": lambda: pkg_version("sedonadb"),
+ "queries_getter": lambda: get_sql_queries("sedonadb"),
+ },
+ }
+
+ config = configs[engine]
+ version = config["version_getter"]()
+
+ print(f"\n{'=' * 60}")
+ print(f"Running {engine.title()} Benchmark")
+ print(f"{'=' * 60}")
+ print(f"{engine.title()} version: {version}")
+ if runs > 1:
+ print(f"Runs per query: {runs} (average will be reported)")
+
+ suite = BenchmarkSuite(engine=engine, scale_factor=scale_factor, version=version)
+ all_queries = config["queries_getter"]()
+ engine_class = config["class"]
+
+ for query_name, query_sql in all_queries.items():
+ if queries and query_name not in queries:
+ continue
+
+ print(f" Running {query_name}...", end=" ", flush=True)
+
+ # First run
+ result = run_query_isolated(
+ engine_class=engine_class,
+ engine_name=engine,
+ data_paths=data_paths,
+ query_name=query_name,
+ query_sql=query_sql,
+ timeout=timeout,
+ )
+
+ # If first run succeeded and we want multiple runs, do additional runs
+ if result.status == "success" and runs > 1:
+ run_times = [result.time_seconds]
+
+ for run_num in range(2, runs + 1):
+ additional_result = run_query_isolated(
+ engine_class=engine_class,
+ engine_name=engine,
+ data_paths=data_paths,
+ query_name=query_name,
+ query_sql=query_sql,
+ timeout=timeout,
+ )
+ if additional_result.status == "success":
+ run_times.append(additional_result.time_seconds)
+ else:
+ # If any subsequent run fails, just use successful runs
+ break
+
+ # Calculate average of all successful runs
+ avg_time = round(sum(run_times) / len(run_times), 2)
+ result = BenchmarkResult(
+ query=query_name,
+ engine=engine,
+ time_seconds=avg_time,
+ row_count=result.row_count,
+ status="success",
+ error_message=None,
+ )
+ print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count}
rows)")
+ elif result.status == "success":
+ print(f"{result.time_seconds}s ({result.row_count} rows)")
+ else:
+ print(f"{result.status.upper()}: {result.error_message}")
+
+ suite.results.append(result)
+ if result.status == "success":
+ suite.total_time += result.time_seconds
+
+ return suite
+
+
+def print_summary(results: list[BenchmarkSuite]) -> None:
+ """Print a summary comparison table."""
+ print(f"\n{'=' * 80}")
+ print("BENCHMARK SUMMARY")
+ print("=" * 80)
+
+ all_queries = sorted(
+ {r.query for suite in results for r in suite.results},
+ key=lambda x: int(x[1:])
+ )
+
+ data = {
+ suite.engine: {
+ r.query: f"{r.time_seconds:.2f}s" if r.status == "success" else
r.status.upper()
+ for r in suite.results
+ }
+ for suite in results
+ }
+
+ engines = [s.engine for s in results]
+ header = f"{'Query':<10}" + "".join(f"{e:<15}" for e in engines)
+ print(header)
+ print("-" * len(header))
+
+ for query in all_queries:
+ row = f"{query:<10}" + "".join(f"{data.get(e, {}).get(query,
'N/A'):<15}" for e in engines)
+ print(row)
+
+ print("-" * len(header))
+ print(f"{'Total':<10}" + "".join(f"{s.total_time:.2f}s{'':<9}" for s in
results))
+
+
+def save_results(results: list[BenchmarkSuite], output_file: str) -> None:
+ """Save results to JSON file."""
+ output = {
+ "benchmark": "spatialbench",
+ "version": "0.1.0",
+ "generated_at": datetime.now(timezone.utc).isoformat(),
+ "results": [suite.to_dict() for suite in results],
+ }
+
+ with open(output_file, "w") as f:
+ json.dump(output, f, indent=2)
+
+ print(f"\nResults saved to {output_file}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Run SpatialBench benchmarks comparing SedonaDB, DuckDB,
and GeoPandas"
+ )
+ parser.add_argument("--data-dir", type=str, required=True,
+ help="Path to directory containing benchmark data
(parquet files)")
+ parser.add_argument("--engines", type=str, default="duckdb,geopandas",
+ help="Comma-separated list of engines to benchmark")
+ parser.add_argument("--queries", type=str, default=None,
+ help="Comma-separated list of queries to run (e.g.,
q1,q2,q3)")
+ parser.add_argument("--timeout", type=int, default=10,
+ help="Query timeout in seconds (default: 10)")
+ parser.add_argument("--runs", type=int, default=3,
+ help="Number of runs per query for averaging (default:
3)")
+ parser.add_argument("--output", type=str, default="benchmark_results.json",
+ help="Output file for results")
+ parser.add_argument("--scale-factor", type=float, default=1,
+ help="Scale factor of the data (for reporting only)")
+
+ args = parser.parse_args()
+
+ engines = [e.strip().lower() for e in args.engines.split(",")]
+ valid_engines = {"duckdb", "geopandas", "sedonadb"}
+
+ for e in engines:
+ if e not in valid_engines:
+ print(f"Error: Unknown engine '{e}'. Valid options:
{valid_engines}")
+ sys.exit(1)
+
+ queries = [q.strip().lower() for q in args.queries.split(",")] if args.queries else None
+
+ data_paths = get_data_paths(args.data_dir)
+ if not data_paths:
+ print(f"Error: No data files found in {args.data_dir}")
+ sys.exit(1)
+
+ print("Data paths:")
+ for table, path in data_paths.items():
+ print(f" {table}: {path}")
+
+ results = [
+ run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs)
+ for engine in engines
+ ]
+
+ print_summary(results)
+ save_results(results, args.output)
+
+
+if __name__ == "__main__":
+ # Use 'spawn' on macOS to avoid issues with forking and native code
+ # On Linux (GitHub Actions), 'fork' is default and usually works fine
+ import platform
+ if platform.system() == 'Darwin':
+ try:
+ multiprocessing.set_start_method('spawn', force=True)
+ except RuntimeError:
+ pass # Already set
+ main()
diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py
new file mode 100644
index 0000000..d324e96
--- /dev/null
+++ b/benchmark/summarize_results.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Summarize benchmark results from multiple engines into a markdown report.
+"""
+
+import argparse
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def load_results(results_dir: str) -> dict:
+ """Load all JSON result files from a directory."""
+ results = {}
+ results_path = Path(results_dir)
+
+ for json_file in results_path.glob("*_results.json"):
+ with open(json_file) as f:
+ data = json.load(f)
+ for suite in data.get("results", []):
+ engine = suite["engine"]
+ results[engine] = suite
+
+ return results
+
+
+def format_time(seconds: float | None) -> str:
+ """Format time in seconds to a readable string."""
+ if seconds is None:
+ return "N/A"
+ if seconds < 0.01:
+ return "<0.01s"
+ return f"{seconds:.2f}s"
+
+
+def get_winner(query: str, data: dict, engines: list) -> str | None:
+ """Get the fastest engine for a query."""
+ times = {}
+ for engine in engines:
+ result = data.get(engine, {}).get(query, {})
+ if result.get("status") == "success" and result.get("time_seconds") is
not None:
+ times[engine] = result["time_seconds"]
+
+ if not times:
+ return None
+ return min(times, key=times.get)
+
+
+def generate_markdown_summary(results: dict, output_file: str, query_timeout: int | None = None, runs: int | None = None) -> str:
+ """Generate a markdown summary of benchmark results for GitHub Actions."""
+ engines = sorted(results.keys())
+
+ if not engines:
+ markdown = "# š SpatialBench Benchmark Results\n\nā ļø No results found."
+ with open(output_file, "w") as f:
+ f.write(markdown)
+ return markdown
+
+ # Get scale factor from first result
+ scale_factor = results[engines[0]].get("scale_factor", 1)
+ timestamp = results[engines[0]].get("timestamp", datetime.now(timezone.utc).isoformat())
+
+ # Collect all queries
+ all_queries = set()
+ for engine_data in results.values():
+ for r in engine_data.get("results", []):
+ all_queries.add(r["query"])
+ all_queries = sorted(all_queries, key=lambda x: int(x[1:]))
+
+ # Build result lookup
+ data = {}
+ for engine, engine_data in results.items():
+ data[engine] = {}
+ for r in engine_data.get("results", []):
+ data[engine][r["query"]] = r
+
+ # Get version info
+ versions = {engine: results[engine].get("version", "unknown") for engine in engines}
+
+ # Engine display names with icons
+ engine_icons = {
+ "sedonadb": "šµ SedonaDB",
+ "duckdb": "š¦ DuckDB",
+ "geopandas": "š¼ GeoPandas",
+ }
+
+ # Generate markdown
+ lines = [
+ "# š SpatialBench Benchmark Results",
+ "",
+ "| Parameter | Value |",
+ "|-----------|-------|",
+ f"| **Scale Factor** | {scale_factor} |",
+ f"| **Query Timeout** | {query_timeout}s |",
+ f"| **Runs per Query** | {runs} |",
+ f"| **Timestamp** | {timestamp} |",
+ f"| **Queries** | {len(all_queries)} |",
+ "",
+ "## š§ Software Versions",
+ "",
+ "| Engine | Version |",
+ "|--------|---------|",
+ ]
+
+ for engine in engines:
+ icon_name = engine_icons.get(engine, engine.title())
+ lines.append(f"| {icon_name} | `{versions[engine]}` |")
+
+ # Main results table
+ lines.extend([
+ "",
+ "## š Results Comparison",
+ "",
+ "| Query | " + " | ".join(engine_icons.get(e, e.title()) for e in
engines) + " |",
+ "|:------|" + "|".join(":---:" for _ in engines) + "|",
+ ])
+
+ # Add rows for each query with winner highlighting
+ for query in all_queries:
+ winner = get_winner(query, data, engines)
+ row = f"| **{query.upper()}** |"
+ for engine in engines:
+ result = data.get(engine, {}).get(query, {})
+ status = result.get("status", "N/A")
+ if status == "success":
+ time_val = result.get("time_seconds")
+ time_str = format_time(time_val)
+ if engine == winner:
+ row += f" **{time_str}** |"
+ else:
+ row += f" {time_str} |"
+ elif status == "timeout":
+ row += " ā±ļø TIMEOUT |"
+ elif status == "error":
+ row += " ā ERROR |"
+ else:
+ row += " ā |"
+ lines.append(row)
+
+ # Win count summary
+ win_counts = {engine: 0 for engine in engines}
+ for query in all_queries:
+ winner = get_winner(query, data, engines)
+ if winner:
+ win_counts[winner] += 1
+
+ lines.extend([
+ "",
+ "## š„ Performance Summary",
+ "",
+ "| Engine | Wins |",
+ "|--------|:----:|",
+ ])
+
+ for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True):
+ icon_name = engine_icons.get(engine, engine.title())
+ wins = win_counts[engine]
+ lines.append(f"| {icon_name} | {wins} |")
+
+ # Detailed results section (collapsible)
+ lines.extend([
+ "",
+ "## š Detailed Results",
+ "",
+ ])
+
+ for engine in engines:
+ icon_name = engine_icons.get(engine, engine.title())
+ lines.extend([
+ f"<details>",
+ f"<summary><b>{icon_name}</b> - Click to expand</summary>",
+ "",
+ "| Query | Time | Status | Rows |",
+ "|:------|-----:|:------:|-----:|",
+ ])
+
+ for query in all_queries:
+ result = data.get(engine, {}).get(query, {})
+ time_str = format_time(result.get("time_seconds"))
+ status = result.get("status", "N/A")
+ rows = result.get("row_count")
+ row_str = f"{rows:,}" if rows is not None else "ā"
+
+ status_emoji = {
+ "success": "✅",
+ "error": "❌",
+ "timeout": "⏱️",
+ }.get(status, "❓")
+
+ lines.append(f"| {query.upper()} | {time_str} | {status_emoji} |
{row_str} |")
+
+ lines.extend([
+ "",
+ "</details>",
+ "",
+ ])
+
+ # Add error details if any
+ has_errors = False
+ error_lines = ["## ā ļø Errors and Timeouts", ""]
+
+ for engine in engines:
+ engine_errors = []
+ for query in all_queries:
+ result = data.get(engine, {}).get(query, {})
+ if result.get("status") in ("error", "timeout"):
+ error_msg = result.get("error_message", "No details available")
+ # Truncate long error messages
+ if len(error_msg) > 200:
+ error_msg = error_msg[:200] + "..."
+ engine_errors.append(f"- **{query.upper()}**: `{error_msg}`")
+
+ if engine_errors:
+ has_errors = True
+ icon_name = engine_icons.get(engine, engine.title())
+ error_lines.append(f"### {icon_name}")
+ error_lines.append("")
+ error_lines.extend(engine_errors)
+ error_lines.append("")
+
+ if has_errors:
+ lines.extend(error_lines)
+
+ # Footer
+ lines.extend([
+ "---",
+ "",
+ "| Legend | Meaning |",
+ "|--------|---------|",
+ "| **bold** | Fastest for this query |",
+ "| ā±ļø TIMEOUT | Query exceeded timeout |",
+ "| ā ERROR | Query failed |",
+ "",
+ f"*Generated by
[SpatialBench](https://github.com/apache/sedona-spatialbench) on
{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*",
+ ])
+
+ markdown = "\n".join(lines)
+
+ # Write to file
+ with open(output_file, "w") as f:
+ f.write(markdown)
+
+ return markdown
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Summarize SpatialBench benchmark results"
+ )
+ parser.add_argument(
+ "--results-dir",
+ type=str,
+ required=True,
+ help="Directory containing *_results.json files",
+ )
+ parser.add_argument(
+ "--output",
+ type=str,
+ default="benchmark_summary.md",
+ help="Output markdown file",
+ )
+ parser.add_argument(
+ "--timeout",
+ type=int,
+ default=60,
+ help="Query timeout in seconds (for reporting)",
+ )
+ parser.add_argument(
+ "--runs",
+ type=int,
+ default=3,
+ help="Number of runs per query (for reporting)",
+ )
+
+ args = parser.parse_args()
+
+ results = load_results(args.results_dir)
+
+ if not results:
+ print(f"No results found in {args.results_dir}")
+ # Write empty summary
+ with open(args.output, "w") as f:
+ f.write("# SpatialBench Benchmark Results\n\nNo results found.")
+ return
+
+ markdown = generate_markdown_summary(results, args.output, args.timeout, args.runs)
+ print(f"Summary written to {args.output}")
+ print("\nPreview:")
+ print("-" * 60)
+ print(markdown[:2000])
+ if len(markdown) > 2000:
+ print("...")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/geopandas.py b/spatialbench-queries/geopandas_queries.py
similarity index 99%
rename from geopandas.py
rename to spatialbench-queries/geopandas_queries.py
index ac149f0..1cf3337 100644
--- a/geopandas.py
+++ b/spatialbench-queries/geopandas_queries.py
@@ -212,10 +212,10 @@ def q5(data_paths: dict[str, str]) -> DataFrame: # type: ignore[override]
def q6(data_paths: dict[str, str]) -> DataFrame: # type: ignore[override]
- """Q6 (GeoPandas): Zone statistics for trips within 50km bounding box
around Sedona.
+ """Q6 (GeoPandas): Zone statistics for trips intersecting a bounding box.
Mirrors original SQL intent:
- * Filter zones fully contained in the provided bounding box polygon.
+ * Filter zones intersecting the provided bounding box polygon.
* Count trips whose pickup point lies within each zone (inner semantics: zones with 0 pickups excluded).
* Compute:
total_pickups = COUNT(t_tripkey)
diff --git a/print_queries.py b/spatialbench-queries/print_queries.py
similarity index 96%
rename from print_queries.py
rename to spatialbench-queries/print_queries.py
index 10d38ed..6d4777f 100755
--- a/print_queries.py
+++ b/spatialbench-queries/print_queries.py
@@ -135,13 +135,13 @@ ORDER BY dropoff_count DESC, c.c_custkey ASC
@staticmethod
def q6() -> str:
return """
--- Q6: Zone statistics for trips within 50km radius of Sedona city center
+-- Q6: Zone statistics for trips intersecting a bounding box
SELECT
z.z_zonekey, z.z_name,
COUNT(t.t_tripkey) AS total_pickups, AVG(t.t_totalamount) AS avg_distance,
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration
FROM trip t, zone z
-WHERE ST_Contains(ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'), ST_GeomFromWKB(z.z_boundary)) -- 50km bounding box around Sedona
+WHERE ST_Intersects(ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'), ST_GeomFromWKB(z.z_boundary))
AND ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY total_pickups DESC, z.z_zonekey ASC
@@ -404,12 +404,11 @@ class SedonaDBSpatialBenchBenchmark(SpatialBenchBenchmark):
@staticmethod
def q5() -> str:
return """
--- Q5 (SedonaDB): In SedonaDB ST_Collect is an aggregate function so no need to use ARRAY_AGG first.
--- ST_Collect does not accept an array as input so we cannot use the query with ARRAY_AGG.
+-- Q5 (SedonaDB): SedonaDB uses ST_Collect_Agg (with _Agg suffix) for aggregate functions.
SELECT
c.c_custkey, c.c_name AS customer_name,
DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
- ST_Area(ST_ConvexHull(ST_Collect(ST_GeomFromWKB(t.t_dropoffloc)))) AS monthly_travel_hull_area,
+ ST_Area(ST_ConvexHull(ST_Collect_Agg(ST_GeomFromWKB(t.t_dropoffloc)))) AS monthly_travel_hull_area,
COUNT(*) as dropoff_count
FROM trip t JOIN customer c ON t.t_custkey = c.c_custkey
GROUP BY c.c_custkey, c.c_name, pickup_month
diff --git a/spatial_polars.py b/spatialbench-queries/spatial_polars.py
similarity index 99%
rename from spatial_polars.py
rename to spatialbench-queries/spatial_polars.py
index bf53a18..79149df 100644
--- a/spatial_polars.py
+++ b/spatialbench-queries/spatial_polars.py
@@ -244,10 +244,10 @@ def q5(data_paths: dict[str, str]) -> DataFrame:
def q6(data_paths: dict[str, str]) -> DataFrame:
- """Q6 (Spatial Polars): Zone statistics for trips within 50km bounding box
around Sedona.
+ """Q6 (Spatial Polars): Zone statistics for trips intersecting a bounding
box.
Mirrors original SQL intent:
- * Filter zones fully contained in the provided bounding box polygon.
+ * Filter zones intersecting the provided bounding box polygon.
* Count trips whose pickup point lies within each zone (inner semantics: zones with 0 pickups excluded).
* Compute:
total_pickups = COUNT(t_tripkey)