This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch spatial-polars in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
commit 61eb3c8443511a7c95c76a2df30cebf6c03b9f0f Author: Jia Yu <[email protected]> AuthorDate: Wed Jan 14 19:57:44 2026 -0800 Add SpatialPolars --- .github/workflows/benchmark.yml | 75 +++++++++++++++++++++++++++++++++++++---- benchmark/run_benchmark.py | 54 +++++++++++++++++++++++++---- benchmark/summarize_results.py | 1 + 3 files changed, 117 insertions(+), 13 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8b5c81a..a68289c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -32,7 +32,7 @@ on: engines: description: 'Engines to benchmark (comma-separated)' required: false - default: 'duckdb,geopandas,sedonadb' + default: 'duckdb,geopandas,sedonadb,spatial_polars' type: string timeout: description: 'Query timeout in seconds (default: 60, increase for full benchmark)' @@ -54,6 +54,11 @@ on: required: false default: '' type: string + spatial_polars_version: + description: 'Spatial Polars version (e.g., 1.0.0, leave empty for latest)' + required: false + default: '' + type: string runs: description: 'Number of runs per query (average taken for fair comparison)' required: false @@ -71,13 +76,14 @@ concurrency: env: CARGO_TERM_COLOR: always SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }} - BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb' }} + BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars' }} QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }} BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }} # Package versions (empty = latest, can be overridden via workflow_dispatch) SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }} DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }} GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }} + SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }} # Hugging Face dataset for benchmark data HF_DATASET: apache-sedona/spatialbench HF_DATA_VERSION: v0.1.0 @@ -170,7 +176,7 @@ jobs: name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }}) needs: download-data runs-on: ubuntu-latest - if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'duckdb') + if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb') steps: - uses: actions/checkout@v4 @@ -220,7 +226,7 @@ jobs: name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1' }}) needs: download-data runs-on: ubuntu-latest - if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'geopandas') + if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas') steps: - uses: actions/checkout@v4 @@ -266,7 +272,7 @@ jobs: name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }}) needs: download-data runs-on: ubuntu-latest - if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb', 'sedonadb') + if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb') steps: - uses: actions/checkout@v4 @@ -308,10 +314,56 @@ jobs: path: sedonadb_results.json retention-days: 30 + benchmark-spatial-polars: + name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor || '1' }}) + needs: download-data + runs-on: ubuntu-latest + if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars') + steps: + - uses: actions/checkout@v4 + + - name: Restore benchmark data from cache + uses: actions/cache/restore@v4 + with: + path: benchmark-data-sf${{ env.SCALE_FACTOR }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + fail-on-cache-miss: true + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + if [ -n "${{ env.SPATIAL_POLARS_VERSION }}" ]; then + pip install "spatial-polars[knn]==${{ env.SPATIAL_POLARS_VERSION }}" pyarrow + else + pip install "spatial-polars[knn]" pyarrow + fi + echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version(\"spatial-polars\"))')" + + - name: Run Spatial Polars benchmark + run: | + python benchmark/run_benchmark.py \ + --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \ + --engines spatial_polars \ + --timeout ${{ env.QUERY_TIMEOUT }} \ + --runs ${{ env.BENCHMARK_RUNS }} \ + --scale-factor ${{ env.SCALE_FACTOR }} \ + --output spatial_polars_results.json + + - name: Upload results + uses: actions/upload-artifact@v4 + with: + name: spatial_polars-results-sf${{ env.SCALE_FACTOR }} + path: spatial_polars_results.json + retention-days: 30 + summarize-results: name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb] - if: always() && (needs.benchmark-duckdb.result == 'success' || needs.benchmark-geopandas.result == 'success' || needs.benchmark-sedonadb.result == 'success') + needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars] + if: always() && (needs.benchmark-duckdb.result == 'success' || needs.benchmark-geopandas.result == 'success' || needs.benchmark-sedonadb.result == 'success' || needs.benchmark-spatial-polars.result == 'success') runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -333,12 +385,21 @@ jobs: continue-on-error: true - name: Download SedonaDB results + if: needs.benchmark-sedonadb.result == 'success' uses: actions/download-artifact@v4 with: name: sedonadb-results-sf${{ env.SCALE_FACTOR }} path: results continue-on-error: true + - name: Download Spatial Polars results + if: needs.benchmark-spatial-polars.result == 'success' + uses: actions/download-artifact@v4 + with: + name: spatial_polars-results-sf${{ env.SCALE_FACTOR }} + path: results + continue-on-error: true + - name: Setup Python uses: actions/setup-python@v5 with: diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py index 4b459f4..fca3c8e 100644 --- a/benchmark/run_benchmark.py +++ b/benchmark/run_benchmark.py @@ -37,7 +37,8 @@ from pathlib import Path from typing import Any, Callable # Add spatialbench-queries directory to path to import query modules -sys.path.insert(0, str(Path(__file__).parent.parent / "spatialbench-queries")) +# Use append (not insert) so installed packages like spatial_polars are found first +sys.path.append(str(Path(__file__).parent.parent / "spatialbench-queries")) # Constants QUERY_COUNT = 12 @@ -103,6 +104,10 @@ def _run_query_in_process( too much memory, which SIGALRM cannot do for native code. """ try: + # For Spatial Polars, ensure the package is imported first to register namespace + if engine_class.__name__ == "SpatialPolarsBenchmark": + import spatial_polars as _sp # noqa: F401 + benchmark = engine_class(data_paths) benchmark.setup() try: @@ -310,6 +315,35 @@ class SedonaDBBenchmark(BaseBenchmark): return len(result), result +class SpatialPolarsBenchmark(BaseBenchmark): + """Spatial Polars benchmark runner.""" + + def __init__(self, data_paths: dict[str, str]): + super().__init__(data_paths, "spatial_polars") + self._queries = None + + def setup(self) -> None: + # spatial_polars package is already imported in _run_query_in_process + # to register .spatial namespace before any module loading + + # Load query functions directly from the module + import importlib.util + query_file = Path(__file__).parent.parent / "spatialbench-queries" / "spatial_polars.py" + spec = importlib.util.spec_from_file_location("spatial_polars_queries", query_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + self._queries = {f"q{i}": getattr(module, f"q{i}") for i in range(1, QUERY_COUNT + 1)} + + def teardown(self) -> None: + self._queries = None + + def execute_query(self, query_name: str, query: str | None) -> tuple[int, Any]: + if query_name not in self._queries: + raise ValueError(f"Query {query_name} not found") + result = self._queries[query_name](self.data_paths) + return len(result), result + + def get_sql_queries(dialect: str) -> dict[str, str]: """Get SQL queries for a specific dialect from print_queries.py.""" from print_queries import DuckDBSpatialBenchBenchmark, SedonaDBSpatialBenchBenchmark @@ -425,15 +459,23 @@ def run_benchmark( "version_getter": lambda: pkg_version("sedonadb"), "queries_getter": lambda: get_sql_queries("sedonadb"), }, + "spatial_polars": { + "class": SpatialPolarsBenchmark, + "version_getter": lambda: pkg_version("spatial-polars"), + "queries_getter": lambda: {f"q{i}": None for i in range(1, QUERY_COUNT + 1)}, + }, } config = configs[engine] version = config["version_getter"]() + # Format engine name for display + display_name = engine.replace("_", " ").title() + print(f"\n{'=' * 60}") - print(f"Running {engine.title()} Benchmark") + print(f"Running {display_name} Benchmark") print(f"{'=' * 60}") - print(f"{engine.title()} version: {version}") + print(f"{display_name} version: {version}") if runs > 1: print(f"Runs per query: {runs} (average will be reported)") @@ -548,11 +590,11 @@ def save_results(results: list[BenchmarkSuite], output_file: str) -> None: def main(): parser = argparse.ArgumentParser( - description="Run SpatialBench benchmarks comparing SedonaDB, DuckDB, and GeoPandas" + description="Run SpatialBench benchmarks comparing SedonaDB, DuckDB, GeoPandas, and Spatial Polars" ) parser.add_argument("--data-dir", type=str, required=True, help="Path to directory containing benchmark data (parquet files)") - parser.add_argument("--engines", type=str, default="duckdb,geopandas", + parser.add_argument("--engines", type=str, default="duckdb,geopandas,sedonadb,spatial_polars", help="Comma-separated list of engines to benchmark") parser.add_argument("--queries", type=str, default=None, help="Comma-separated list of queries to run (e.g., q1,q2,q3)") @@ -568,7 +610,7 @@ def main(): args = parser.parse_args() engines = [e.strip().lower() for e in args.engines.split(",")] - valid_engines = {"duckdb", "geopandas", "sedonadb"} + valid_engines = {"duckdb", "geopandas", "sedonadb", "spatial_polars"} for e in engines: if e not in valid_engines: diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py index d324e96..5c08707 100644 --- a/benchmark/summarize_results.py +++ b/benchmark/summarize_results.py @@ -99,6 +99,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "sedonadb": "🌵 SedonaDB", "duckdb": "🦆 DuckDB", "geopandas": "🐼 GeoPandas", + "spatial_polars": "🐻❄️ Spatial Polars", } # Generate markdown
