This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new c9b7cdc add queries + queries CLI (#21)
c9b7cdc is described below
commit c9b7cdc06c5cec80a66d494ddfe0fd3ce9c6d9aa
Author: James Willis <[email protected]>
AuthorDate: Fri Sep 19 13:43:27 2025 -0700
add queries + queries CLI (#21)
* add queries
* update readme
* Update print_queries.py
Co-authored-by: Copilot <[email protected]>
* fix q3 sql
---------
Co-authored-by: Copilot <[email protected]>
---
README.md | 26 +++
geopandas.py | 624 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
print_queries.py | 459 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 1109 insertions(+)
diff --git a/README.md b/README.md
index fc0330b..c844daa 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,32 @@ Goals:
- Help users compare engines and frameworks across different data scales.
- Support open standards and foster collaboration in the spatial computing
community.
+## Queries
+SpatialBench includes a set of 12 SQL queries. Because spatial SQL syntax varies widely across systems, we provide a CLI
+to print all 12 queries in the dialect of your choice. Currently supported dialects are:
+
+- Databricks Spatial SQL
+- DuckDB
+- Geopandas (distinct case)
+- SedonaDB
+- SedonaSpark
+
+
+We tried to vary the queries only as much as necessary to accommodate dialect
differences.
+
+Geopandas is obviously a distinct case, as it is not SQL-based; however, due to its popularity, we felt it was important
+to include it. Pandas/Geopandas users often hand-optimize their queries in ways that SQL engines handle automatically.
+We felt hand-tuning the queries was unfair for this exercise, and tried to do
as little of that as possible while still
+writing "idiomatic" pandas code. We would be interested in hearing feedback on
this approach as well as seeing a "fully
+hand-optimized" version of the queries.
+
+We welcome contributions and civil discussions on how to improve the queries
and their implementations.
+
+You can print the queries in your dialect of choice using the following
command:
+```bash
+./print_queries.py <dialect>
+```
+
## Data Model
SpatialBench defines a spatial star schema with the following tables:
diff --git a/geopandas.py b/geopandas.py
new file mode 100644
index 0000000..ac149f0
--- /dev/null
+++ b/geopandas.py
@@ -0,0 +1,624 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import cast
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+
+from pandas import DataFrame
+from shapely import wkb
+from shapely.geometry import LineString, MultiPoint, Point, Polygon
+
+
def q1(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q1 (GeoPandas): Trips starting within 50km of Sedona city center.

    Decodes the pickup WKB geometry, measures planar distance (degrees) to
    the Sedona center point, keeps rows within 0.45 degrees (~50km), and
    returns them ordered by distance, then trip key.
    """
    wanted_cols = ["t_tripkey", "t_pickuploc", "t_pickuptime"]
    trips = pd.read_parquet(data_paths["trip"])[wanted_cols]
    trips["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trips["t_pickuploc"], crs="EPSG:4326"
    )
    gdf = gpd.GeoDataFrame(trips, geometry="pickup_geom", crs="EPSG:4326")
    gdf["pickup_lon"] = gdf.geometry.x
    gdf["pickup_lat"] = gdf.geometry.y

    sedona_center = Point(-111.7610, 34.8697)
    gdf["distance_to_center"] = gdf.geometry.distance(sedona_center)

    # 0.45 degrees ~ 50km radius; drop rows with no computable distance.
    within_radius = gdf["distance_to_center"].notna() & (
        gdf["distance_to_center"] <= 0.45
    )
    ordered = gdf[within_radius].sort_values(
        ["distance_to_center", "t_tripkey"], ascending=[True, True]
    )
    output_cols = [
        "t_tripkey",
        "pickup_lon",
        "pickup_lat",
        "t_pickuptime",
        "distance_to_center",
    ]
    return ordered[output_cols].reset_index(drop=True)  # type: ignore[no-any-return]
+
+
def q2(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q2 (GeoPandas): Count trips starting within the Coconino County zone.

    Looks up the first zone row named 'Coconino County' and counts trips
    whose pickup point intersects that polygon. Returns a single-row
    DataFrame with column trip_count_in_coconino_county.
    """
    trips = pd.read_parquet(data_paths["trip"])
    zones = pd.read_parquet(data_paths["zone"])

    matches = zones[zones["z_name"] == "Coconino County"].head(1)
    if matches.empty:
        # No such zone: the count is trivially zero.
        return pd.DataFrame({"trip_count_in_coconino_county": [0]})

    county_poly = wkb.loads(matches.iloc[0]["z_boundary"])
    trips["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trips["t_pickuploc"], crs="EPSG:4326"
    )
    # Wrap in a GeoSeries so intersects() is the vectorized spatial predicate.
    pickups = gpd.GeoSeries(trips["pickup_geom"], crs="EPSG:4326")
    inside_count = int(pickups.intersects(county_poly).sum())
    return pd.DataFrame({"trip_count_in_coconino_county": [inside_count]})
+
+
def q3(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q3 (GeoPandas): Monthly trip stats within 15km (10km box + 5km buffer) of Sedona center.

    Filters trips whose pickup location is within 0.045 degrees (~5km) of the
    10km bounding box polygon (approximating ST_DWithin(pickup_point, polygon,
    0.045)). Then aggregates monthly:
      * total_trips = COUNT(t_tripkey)
      * avg_distance = AVG(t_distance)
      * avg_duration = AVG(t_dropofftime - t_pickuptime) in seconds
      * avg_fare = AVG(t_fare)
    Ordered by pickup_month ASC.
    Returns columns: pickup_month, total_trips, avg_distance, avg_duration, avg_fare
    """
    trip_df = pd.read_parquet(data_paths["trip"])

    trip_df["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_pickuploc"], crs="EPSG:4326"
    )
    trips_gdf = gpd.GeoDataFrame(trip_df, geometry="pickup_geom", crs="EPSG:4326")

    # 10km bounding box around the Sedona city center (degrees, EPSG:4326).
    base_poly = Polygon(
        [
            (-111.9060, 34.7347),
            (-111.6160, 34.7347),
            (-111.6160, 35.0047),
            (-111.9060, 35.0047),
            (-111.9060, 34.7347),
        ]
    )

    distances = trips_gdf["pickup_geom"].distance(base_poly)
    mask = distances <= 0.045  # additional ~5km buffer in degrees
    # Bug fix: take an explicit copy before adding derived columns. The
    # original assigned new columns to a .loc[] slice, which raises pandas'
    # SettingWithCopyWarning and can silently fail to write the columns.
    filtered = trips_gdf.loc[mask].copy()

    filtered["_duration_seconds"] = (
        filtered["t_dropofftime"] - filtered["t_pickuptime"]
    ).dt.total_seconds()

    # Truncate to month start, mirroring DATE_TRUNC('month', ...).
    filtered["pickup_month"] = (
        filtered["t_pickuptime"].dt.to_period("M").dt.to_timestamp()
    )

    agg = (
        filtered.groupby("pickup_month", as_index=False)
        .agg(
            total_trips=("t_tripkey", "count"),
            avg_distance=("t_distance", "mean"),
            avg_duration=("_duration_seconds", "mean"),
            avg_fare=("t_fare", "mean"),
        )
        .sort_values("pickup_month")
        .reset_index(drop=True)
    )
    return cast(DataFrame, agg)
+
+
def q4(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q4 (GeoPandas): Zone distribution of the top 1000 trips by tip amount.

    Ranks trips by t_tip DESC (ties by t_tripkey ASC), keeps the top 1000,
    spatially joins their pickup points to zone polygons (point within
    polygon), and counts trips per zone.
    Returns columns: z_zonekey, z_name, trip_count, ordered by trip_count
    DESC, z_zonekey ASC.
    """
    trips = pd.read_parquet(data_paths["trip"])
    if "t_tip" not in trips.columns:
        # Without tip data there is nothing to rank; return an empty frame.
        return pd.DataFrame(columns=["z_zonekey", "z_name", "trip_count"])

    ranked = trips.sort_values(["t_tip", "t_tripkey"], ascending=[False, True])
    top_trips = ranked.head(1000)
    top_trips["pickup_geom"] = gpd.GeoSeries.from_wkb(
        top_trips["t_pickuploc"], crs="EPSG:4326"
    )
    top_gdf = gpd.GeoDataFrame(top_trips, geometry="pickup_geom", crs="EPSG:4326")

    zones = pd.read_parquet(data_paths["zone"])[
        ["z_zonekey", "z_name", "z_boundary"]
    ]
    zones["zone_geom"] = gpd.GeoSeries.from_wkb(
        zones["z_boundary"], crs="EPSG:4326"
    )
    zones_gdf = gpd.GeoDataFrame(zones, geometry="zone_geom", crs="EPSG:4326")[
        ["z_zonekey", "z_name", "zone_geom"]
    ]

    joined = gpd.sjoin(top_gdf, zones_gdf, how="inner", predicate="within")
    counts = joined.groupby(["z_zonekey", "z_name"], as_index=False).size()
    result = (
        counts.rename(columns={"size": "trip_count"})
        .sort_values(["trip_count", "z_zonekey"], ascending=[False, True])
        .reset_index(drop=True)
    )
    return result  # type: ignore[no-any-return]
+
+
def q5(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q5 (GeoPandas): Monthly travel patterns for repeat customers (convex hull of dropoff points).

    For each (customer, month) group with more than 5 trips, computes the
    area of the convex hull of that month's dropoff points. Geometries stay
    in EPSG:4326, so the area is planar, in square degrees.
    Returns columns: c_custkey, customer_name, pickup_month,
    monthly_travel_hull_area, ordered by trip_count DESC, c_custkey ASC.
    """
    trip_df = pd.read_parquet(data_paths["trip"])
    cust_df = pd.read_parquet(data_paths["customer"])
    # Decode dropoff WKB into shapely geometries.
    trip_df["dropoff_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_dropoffloc"], crs="EPSG:4326"
    )
    joined = trip_df.merge(
        cust_df[["c_custkey", "c_name"]],
        left_on="t_custkey",
        right_on="c_custkey",
        how="inner",
    )
    # Truncate pickup timestamps to the first of the month
    # (mirrors the SQL's DATE_TRUNC('month', ...)).
    joined["pickup_month"] = (
        joined["t_pickuptime"].dt.to_period("M").dt.to_timestamp()
    )
    grouped = (
        joined.groupby(["c_custkey", "c_name", "pickup_month"], as_index=False)
        .agg(
            trip_count=("t_tripkey", "count"),
            # Collect each group's dropoff geometries into a plain Python list.
            dropoff_points=("dropoff_geom", lambda x: list(x)),
        )
        # HAVING COUNT(*) > 5: keep only repeat customers for meaningful hulls.
        .loc[lambda d: d["trip_count"] > 5]
    )
    # List of points -> MultiPoint -> convex hull -> planar area.
    grouped["monthly_travel_hull_area"] = gpd.GeoSeries(
        grouped["dropoff_points"].map(MultiPoint), crs="EPSG:4326"
    ).convex_hull.area

    result = (
        grouped.sort_values(["trip_count", "c_custkey"], ascending=[False, True])[
            ["c_custkey", "c_name", "pickup_month", "monthly_travel_hull_area"]
        ]
        .rename(columns={"c_name": "customer_name"})
        .reset_index(drop=True)
    )
    return result
+
+
def q6(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q6 (GeoPandas): Zone statistics for trips within 50km bounding box around Sedona.

    Mirrors original SQL intent:
      * Filter zones fully contained in the provided bounding box polygon.
      * Count trips whose pickup point lies within each zone (inner semantics:
        zones with 0 pickups excluded).
      * Compute:
          total_pickups = COUNT(t_tripkey)
          avg_distance  = AVG(t_totalamount) (matches original aliasing; falls
                          back to t_distance if needed)
          avg_duration  = AVG(t_dropofftime - t_pickuptime) in seconds
      * Order by total_pickups DESC, z_zonekey ASC.
    Returns DataFrame with columns: z_zonekey, z_name, total_pickups,
    avg_distance, avg_duration
    """
    trip_df = pd.read_parquet(data_paths["trip"])
    zone_df = pd.read_parquet(data_paths["zone"])

    trip_df["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_pickuploc"], crs="EPSG:4326"
    )
    pickup_points = gpd.GeoDataFrame(
        trip_df, geometry="pickup_geom", crs="EPSG:4326"
    )

    zone_df["zone_geom"] = gpd.GeoSeries.from_wkb(
        zone_df["z_boundary"], crs="EPSG:4326"
    )
    zones_gdf = gpd.GeoDataFrame(zone_df, geometry="zone_geom", crs="EPSG:4326")[
        ["z_zonekey", "z_name", "zone_geom"]
    ]

    # 50km bounding box around Sedona (degrees, EPSG:4326).
    bbox_poly = Polygon(
        [
            (-112.2110, 34.4197),
            (-111.3110, 34.4197),
            (-111.3110, 35.3197),
            (-112.2110, 35.3197),
            (-112.2110, 34.4197),
        ]
    )

    # Bug fix: the SQL uses ST_Contains(bbox, zone) — the zone must be
    # *fully contained* in the bounding box — but this previously used
    # .intersects(), which also kept zones only partially inside the box.
    # zone.within(bbox) is equivalent to bbox.contains(zone).
    candidate_zones = zones_gdf[
        zones_gdf["zone_geom"].notna()
        & zones_gdf["zone_geom"].within(bbox_poly)
    ]

    # Preserve the SQL's (odd) aliasing: avg_distance averages t_totalamount
    # when present, falling back to t_distance, else NA.
    distance_col = (
        "t_totalamount"
        if "t_totalamount" in trip_df.columns
        else ("t_distance" if "t_distance" in trip_df.columns else None)
    )

    result = (
        gpd.sjoin(pickup_points, candidate_zones, how="inner", predicate="within")
        .assign(
            _duration_seconds=lambda d: (
                d["t_dropofftime"] - d["t_pickuptime"]
            ).dt.total_seconds(),
            _distance_metric=lambda d: d[distance_col] if distance_col else pd.NA,
        )
        .groupby(["z_zonekey", "z_name"], as_index=False)
        .agg(
            total_pickups=("t_tripkey", "count"),
            avg_distance=("_distance_metric", "mean"),
            avg_duration=("_duration_seconds", "mean"),
        )
        .sort_values(["total_pickups", "z_zonekey"], ascending=[False, True])
        .reset_index(drop=True)
    )
    return result  # type: ignore[no-any-return]
+
+
def q7(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q7 (GeoPandas): Detect potential route detours by comparing reported vs geometric distances.

    Mirrors the SQL's trip_lengths CTE:
      * reported_distance_m = t_distance (coerced to float)
      * line_distance_m = length of the straight pickup->dropoff line,
        converted from degrees via the 1 m = 0.000009 degree approximation
      * detour_ratio = reported_distance_m / line_distance_m (NaN if
        line_distance_m == 0, mirroring NULLIF)
      * Ordered by detour_ratio DESC, reported_distance_m DESC,
        t_tripkey ASC, with NaNs last
    """
    trip_df = pd.read_parquet(data_paths["trip"])
    trip_df["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_pickuploc"], crs="EPSG:4326"
    )
    trip_df["dropoff_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_dropoffloc"], crs="EPSG:4326"
    )
    trip_df["reported_distance_m"] = trip_df["t_distance"].astype(float)
    pickup_vals = trip_df["pickup_geom"].to_numpy()
    dropoff_vals = trip_df["dropoff_geom"].to_numpy()
    # Straight-line length per trip; np.fromiter consumes the generator
    # without materializing an intermediate Python list. Missing endpoints
    # yield NaN.
    line_lengths = np.fromiter(
        (
            LineString([pg, dg]).length / 0.000009  # 1 meter = 0.000009 degree
            if (pg is not None and dg is not None)
            else np.nan
            for pg, dg in zip(pickup_vals, dropoff_vals, strict=False)
        ),
        dtype=float,
        count=len(trip_df),
    )
    trip_df["line_distance_m"] = line_lengths
    # Vectorized safe division: rows where the line length is 0 keep the
    # NaN initial value (equivalent to SQL's NULLIF(line_distance_m, 0)).
    trip_df["detour_ratio"] = np.divide(
        trip_df["reported_distance_m"].to_numpy(dtype=float, copy=False),
        line_lengths,
        out=np.full_like(
            trip_df["reported_distance_m"].to_numpy(dtype=float, copy=False), np.nan
        ),
        where=(line_lengths != 0.0),
    )
    result = (
        trip_df[
            [
                "t_tripkey",
                "reported_distance_m",
                "line_distance_m",
                "detour_ratio",
            ]
        ]
        .sort_values(
            ["detour_ratio", "reported_distance_m", "t_tripkey"],
            ascending=[False, False, True],
            na_position="last",
        )
        .reset_index(drop=True)
    )
    return result
+
+
def q8(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q8 (GeoPandas): Count pickups within ~500m (0.0045 deg) of each building.

    Spatially joins buildings to pickup points with the 'dwithin' predicate
    and counts matches per building. Returns columns: b_buildingkey, b_name,
    nearby_pickup_count, ordered by count DESC, key ASC.
    """
    trips = pd.read_parquet(data_paths["trip"])
    trips["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trips["t_pickuploc"], crs="EPSG:4326"
    )
    pickups = gpd.GeoDataFrame(trips, geometry="pickup_geom", crs="EPSG:4326")

    buildings = pd.read_parquet(data_paths["building"])
    buildings["boundary_geom"] = gpd.GeoSeries.from_wkb(
        buildings["b_boundary"], crs="EPSG:4326"
    )
    buildings_gdf = gpd.GeoDataFrame(
        buildings, geometry="boundary_geom", crs="EPSG:4326"
    )

    radius_deg = 0.0045  # degrees (~500m)
    joined = buildings_gdf.sjoin(pickups, predicate="dwithin", distance=radius_deg)
    counts = joined.groupby(["b_buildingkey", "b_name"], as_index=False).size()
    result = (
        counts.rename(columns={"size": "nearby_pickup_count"})
        .sort_values(
            ["nearby_pickup_count", "b_buildingkey"], ascending=[False, True]
        )
        .reset_index(drop=True)
    )
    return result  # type: ignore[no-any-return]
+
+
def q9(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q9 (GeoPandas): Building conflation via IoU (intersection over union) detection.

    Uses a spatial self-join (predicate='intersects') to find intersecting
    building boundary polygons. Robust to differing GeoPandas suffix behaviors
    by detecting column names and falling back to index_right.
    Output columns: building_1, building_2, area1, area2, overlap_area, iou
    ordered by iou DESC, building_1 ASC, building_2 ASC.
    """
    buildings_df = pd.read_parquet(data_paths["building"])
    buildings_df["boundary_geom"] = gpd.GeoSeries.from_wkb(
        buildings_df["b_boundary"], crs="EPSG:4326"
    )
    bdf = gpd.GeoDataFrame(buildings_df, geometry="boundary_geom", crs="EPSG:4326")[
        ["b_buildingkey", "boundary_geom"]
    ].rename(columns={"b_buildingkey": "building_key"})

    # Self-join: every intersecting pair appears in both orders, plus
    # self-pairs; deduplicated below by the building_1 < building_2 filter.
    pairs = gpd.sjoin(bdf, bdf, how="inner", predicate="intersects")

    # GeoPandas versions differ in how they suffix duplicated column names
    # after a self-join, so probe the likely names rather than hard-coding.
    left_key_candidates = ["building_key_left", "building_key_1", "building_key"]
    right_key_candidates = ["building_key_right", "building_key_2"]
    left_key_col = next(c for c in left_key_candidates if c in pairs.columns)
    right_key_col = next(
        (c for c in right_key_candidates if c in pairs.columns), None
    )
    if right_key_col is None:
        # Fallback: resolve the right-side key through the index_right column.
        pairs["_building_key_right_temp"] = bdf.loc[
            pairs["index_right"], "building_key"
        ].to_numpy()
        right_key_col = "_building_key_right_temp"

    pairs = pairs.rename(
        columns={left_key_col: "building_1", right_key_col: "building_2"}
    ).rename_geometry("boundary_geom_1")
    # Recover the right-side geometry: sjoin keeps only the left geometry.
    pairs["boundary_geom_2"] = bdf.loc[
        pairs["index_right"], "boundary_geom"
    ].to_numpy()

    # Filter to only building_1 < building_2 (exclude self-pairs); this also
    # mirrors the SQL's b1.id < b2.id join condition.
    pairs = pairs[pairs["building_1"] < pairs["building_2"]]

    # Compute metrics
    boundary_geom_1_gs = gpd.GeoSeries(pairs["boundary_geom_1"], crs=pairs.crs)
    boundary_geom_2_gs = gpd.GeoSeries(pairs["boundary_geom_2"], crs=pairs.crs)
    pairs["area1"] = boundary_geom_1_gs.area
    pairs["area2"] = boundary_geom_2_gs.area
    intersection = boundary_geom_1_gs.intersection(boundary_geom_2_gs)
    pairs["overlap_area"] = intersection.area
    overlap = pairs["overlap_area"].to_numpy(dtype=float, copy=False)
    area1 = pairs["area1"].to_numpy(dtype=float, copy=False)
    area2 = pairs["area2"].to_numpy(dtype=float, copy=False)
    union = area1 + area2 - overlap
    # IoU with zero-union guarded to 0; zero-union-with-overlap pairs are
    # forced to 1.0, matching the SQL CASE expression.
    iou = np.divide(overlap, union, out=np.zeros_like(overlap), where=union != 0.0)
    mask_union_zero = (union == 0.0) & (overlap > 0.0)
    if mask_union_zero.any():
        iou[mask_union_zero] = 1.0
    pairs["iou"] = iou
    result = (
        pairs[["building_1", "building_2", "area1", "area2", "overlap_area", "iou"]]
        .sort_values(
            ["iou", "building_1", "building_2"], ascending=[False, True, True]
        )
        .reset_index(drop=True)
    )
    return cast(DataFrame, result)
+
+
def q10(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q10 (GeoPandas): Zone stats for trips starting within each zone.

    Produces columns: z_zonekey, pickup_zone (z_name), avg_duration (seconds),
    avg_distance, num_trips.
    Ordered by avg_duration DESC (NULLS last), z_zonekey ASC.
    Zones with zero trips retained (avg_* = NaN, num_trips = 0).
    """
    trip_df = pd.read_parquet(data_paths["trip"])
    zone_df = pd.read_parquet(data_paths["zone"])

    trip_df["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trip_df["t_pickuploc"], crs="EPSG:4326"
    )
    pickup_points = gpd.GeoDataFrame(
        trip_df, geometry="pickup_geom", crs="EPSG:4326"
    )

    zone_df["zone_geom"] = gpd.GeoSeries.from_wkb(
        zone_df["z_boundary"], crs="EPSG:4326"
    )
    zones_gdf = gpd.GeoDataFrame(zone_df, geometry="zone_geom", crs="EPSG:4326")

    aggregations = {
        "duration_seconds": "mean",
        "t_distance": "mean",
        "t_tripkey": "count",
    }
    result = (
        # how="right" keeps every zone, mirroring the SQL's LEFT JOIN from zone.
        gpd.sjoin(pickup_points, zones_gdf, how="right", predicate="within")
        .assign(
            duration_seconds=lambda d: (
                d["t_dropofftime"] - d["t_pickuptime"]
            ).dt.total_seconds()
        )
        # dropna=False keeps zone groups even when the join matched no trips.
        .groupby(["z_zonekey", "z_name"], dropna=False)
        .agg(aggregations)
        .rename(
            columns={
                "duration_seconds": "avg_duration",
                "t_distance": "avg_distance",
                "t_tripkey": "num_trips",
            }
        )
        .reset_index()
        # Coerce the trip count to a plain int (0 for empty zones).
        .assign(num_trips=lambda d: d["num_trips"].fillna(0).astype(int))
        .sort_values(
            by=["avg_duration", "z_zonekey"],
            ascending=[False, True],
            na_position="last",
        )
        .rename(columns={"z_name": "pickup_zone"})
        .reset_index(drop=True)
    )
    return result  # type: ignore[no-any-return]
+
+
def q11(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q11 (GeoPandas): Count trips whose pickup and dropoff are in different zones.

    Tags each trip's pickup point and dropoff point with the key of the zone
    containing it, then counts trips where both zones are known and differ.
    Returns a single-row DataFrame with column: cross_zone_trip_count
    """
    trip_df = pd.read_parquet(data_paths["trip"])
    zone_df = pd.read_parquet(data_paths["zone"])

    def _join_points_to_zones(loc_col: str, geom_col: str, zone_key: str):
        # Build a point GeoDataFrame from the trip WKB column and left-join
        # it to zone polygons (point within polygon); points in no zone keep
        # a missing zone key.
        pts = trip_df[["t_tripkey", loc_col]].copy()
        pts[geom_col] = gpd.GeoSeries.from_wkb(pts[loc_col], crs="EPSG:4326")
        pts_gdf = gpd.GeoDataFrame(pts, geometry=geom_col, crs="EPSG:4326")

        zones = (
            zone_df[["z_zonekey", "z_boundary"]]
            .rename(columns={"z_zonekey": zone_key})
            .copy()
        )
        zones["zone_geom"] = gpd.GeoSeries.from_wkb(
            zones["z_boundary"], crs="EPSG:4326"
        )
        zones_gdf = gpd.GeoDataFrame(zones, geometry="zone_geom", crs="EPSG:4326")

        return gpd.sjoin(pts_gdf, zones_gdf, how="left", predicate="within")

    pickup_join = _join_points_to_zones(
        "t_pickuploc", "pickup_geom", "pickup_zonekey"
    )
    dropoff_join = _join_points_to_zones(
        "t_dropoffloc", "dropoff_geom", "dropoff_zonekey"
    )

    merged = pickup_join[["t_tripkey", "pickup_zonekey"]].merge(
        dropoff_join[["t_tripkey", "dropoff_zonekey"]], on="t_tripkey", how="inner"
    )

    crosses_zones = (
        merged["pickup_zonekey"].notna()
        & merged["dropoff_zonekey"].notna()
        & (merged["pickup_zonekey"] != merged["dropoff_zonekey"])
    )
    return pd.DataFrame({"cross_zone_trip_count": [int(crosses_zones.sum())]})
+
+
def q12(data_paths: dict[str, str]) -> DataFrame:  # type: ignore[override]
    """Q12 (GeoPandas): Find 5 nearest buildings to each trip pickup location (NLJ, memory-efficient).

    Uses a Python loop (nested loop join) to avoid materializing the full
    cross join. For each pickup, computes distances to all buildings and
    selects the 5 closest (ties broken by building key ASC).
    Output columns: t_tripkey, t_pickuploc, b_buildingkey, building_name,
    distance_to_building
    """
    trips_df = pd.read_parquet(data_paths["trip"])
    buildings_df = pd.read_parquet(data_paths["building"])

    trips_df["pickup_geom"] = gpd.GeoSeries.from_wkb(
        trips_df["t_pickuploc"], crs="EPSG:4326"
    )
    buildings_df["boundary_geom"] = gpd.GeoSeries.from_wkb(
        buildings_df["b_boundary"], crs="EPSG:4326"
    )
    trips_gdf = gpd.GeoDataFrame(trips_df, geometry="pickup_geom", crs="EPSG:4326")
    buildings_gdf = gpd.GeoDataFrame(
        buildings_df, geometry="boundary_geom", crs="EPSG:4326"
    )

    pickup_geoms = trips_gdf["pickup_geom"].to_list()
    building_geoms = buildings_gdf["boundary_geom"].to_list()
    building_keys = buildings_gdf["b_buildingkey"].to_numpy()
    building_names = buildings_gdf["b_name"].to_numpy()
    # Perf fix: hoist per-trip scalar lookups out of the loop. The original
    # called trips_gdf.iloc[i][...] twice per result row (up to 10x per
    # pickup); each .iloc materializes an entire row as a Series.
    trip_keys = trips_gdf["t_tripkey"].to_numpy()
    pickup_locs = trips_gdf["t_pickuploc"].to_numpy()

    results = []
    # Since geopandas doesn't support KNN join, we had to choose either a
    # cross join + filter or a NLJ. The cross join would be more
    # pandas-esque, but would require too much memory. The NLJ is arguably
    # methodologically unfair (a hand optimization) but the only way to
    # actually get the query to run.
    for i, pt in enumerate(pickup_geoms):
        dists = [pt.distance(geom) for geom in building_geoms]
        # Sort by distance, then building key (lexsort's last key is primary).
        nearest_idx = np.lexsort((building_keys, dists))[:5]
        for idx in nearest_idx:
            results.append(
                {
                    "t_tripkey": trip_keys[i],
                    "t_pickuploc": pickup_locs[i],
                    "b_buildingkey": building_keys[idx],
                    "building_name": building_names[idx],
                    "distance_to_building": dists[idx],
                }
            )
    return (
        pd.DataFrame(results)
        .sort_values(
            ["distance_to_building", "b_buildingkey"], ascending=[True, True]
        )
        .reset_index(drop=True)
    )
diff --git a/print_queries.py b/print_queries.py
new file mode 100755
index 0000000..3636b2c
--- /dev/null
+++ b/print_queries.py
@@ -0,0 +1,459 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import inspect
+import re
+import sys
+
+
class SpatialBenchBenchmark:
    """A benchmark for the performance of analytical spatial queries on a spatial dataset.

    These queries are written in the Sedona/Spark SQL dialect. Because spatial functions are not as standardized as
    other analytical functions, many engines need specific implementations of a couple of these queries where dialects
    vary slightly.

    To deal with these differences, other engine-specific implementations of this benchmark subclass this class and
    override only the queries that need to be changed.

    """

    def queries(self) -> dict[str, str]:
        """Collect the SQL text of every query method on this (sub)class.

        Gathers all methods of the class whose names match 'q' followed by a
        number and that take no arguments, calls each one, and returns the
        resulting SQL strings keyed by query name, sorted numerically
        (q1, q2, ..., q12).

        Returns:
            Dict[str, str]: A dictionary mapping query names to their SQL text.

        Raises:
            ValueError: If a q<number> method declares any parameters.
        """

        queries = {}
        for name, method in inspect.getmembers(
            self.__class__, predicate=inspect.isfunction
        ):
            if re.fullmatch(r"q\d+", name):
                sig = inspect.signature(method)
                if len(sig.parameters) == 0:
                    queries[name] = method()
                else:
                    raise ValueError("Query methods must not take any arguments")

        # Sort queries numerically by extracting the number from the query name
        sorted_queries = dict(sorted(queries.items(), key=lambda x: int(x[0][1:])))

        return sorted_queries

    def dialect(self) -> str:
        """Return the dialect of the benchmark."""
        return "SedonaSpark"

    @staticmethod
    def q1() -> str:
        return """
-- Q1: Find trips starting within 50km of Sedona city center, ordered by distance
SELECT
  t.t_tripkey, ST_X(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lon, ST_Y(ST_GeomFromWKB(t.t_pickuploc)) AS pickup_lat, t.t_pickuptime,
  ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)')) AS distance_to_center
FROM trip t
WHERE ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromText('POINT (-111.7610 34.8697)'), 0.45) -- 50km radius around Sedona center
ORDER BY distance_to_center ASC, t.t_tripkey ASC
    """

    @staticmethod
    def q2() -> str:
        return """
-- Q2: Count trips starting within Coconino County (Arizona) zone
SELECT COUNT(*) AS trip_count_in_coconino_county
FROM trip t
WHERE ST_Intersects(ST_GeomFromWKB(t.t_pickuploc), (SELECT ST_GeomFromWKB(z.z_boundary) FROM zone z WHERE z.z_name = 'Coconino County' LIMIT 1))
    """

    @staticmethod
    def q3() -> str:
        return """
-- Q3: Monthly trip statistics within 15km radius of Sedona city center (10km base + 5km buffer)
SELECT
  DATE_TRUNC('month', t.t_pickuptime) AS pickup_month, COUNT(t.t_tripkey) AS total_trips,
  AVG(t.t_distance) AS avg_distance, AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
  AVG(t.t_fare) AS avg_fare
FROM trip t
WHERE ST_DWithin(
    ST_GeomFromWKB(t.t_pickuploc),
    ST_GeomFromText('POLYGON((-111.9060 34.7347, -111.6160 34.7347, -111.6160 35.0047, -111.9060 35.0047, -111.9060 34.7347))'), -- 10km bounding box around Sedona
    0.045 -- Additional 5km buffer
  )
GROUP BY pickup_month
ORDER BY pickup_month
"""

    @staticmethod
    def q4() -> str:
        return """
-- Q4: Zone distribution of top 1000 trips by tip amount
SELECT z.z_zonekey, z.z_name, COUNT(*) AS trip_count
FROM
  zone z
  JOIN (
    SELECT t.t_pickuploc
    FROM trip t
    ORDER BY t.t_tip DESC, t.t_tripkey ASC
    LIMIT 1000 -- Replace 1000 with x (how many top tips you want)
  ) top_trips ON ST_Within(ST_GeomFromWKB(top_trips.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY trip_count DESC, z.z_zonekey ASC
    """

    @staticmethod
    def q5() -> str:
        return """
-- Q5: Monthly travel patterns for repeat customers (convex hull of dropoff locations)
SELECT
  c.c_custkey, c.c_name AS customer_name,
  DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
  ST_Area(ST_ConvexHull(ST_Collect(ARRAY_AGG(ST_GeomFromWKB(t.t_dropoffloc))))) AS monthly_travel_hull_area
FROM trip t JOIN customer c ON t.t_custkey = c.c_custkey
GROUP BY c.c_custkey, c.c_name, pickup_month
HAVING COUNT(*) > 5 -- Only include repeat customers for meaningful hulls
ORDER BY COUNT(*) DESC, c.c_custkey ASC
    """

    @staticmethod
    def q6() -> str:
        return """
-- Q6: Zone statistics for trips within 50km radius of Sedona city center
SELECT
  z.z_zonekey, z.z_name,
  COUNT(t.t_tripkey) AS total_pickups, AVG(t.t_totalamount) AS avg_distance,
  AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration
FROM trip t, zone z
WHERE ST_Contains(ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'), ST_GeomFromWKB(z.z_boundary)) -- 50km bounding box around Sedona
  AND ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY total_pickups DESC, z.z_zonekey ASC
    """

    @staticmethod
    def q7() -> str:
        return """
-- Q7: Detect potential route detours by comparing reported vs. geometric distances
WITH trip_lengths AS (
    SELECT
        t.t_tripkey,
        t.t_distance AS reported_distance_m,
        ST_Length(
            ST_MakeLine(
                ST_GeomFromWKB(t.t_pickuploc),
                ST_GeomFromWKB(t.t_dropoffloc)
            )
        ) / 0.000009 AS line_distance_m -- 1 meter = 0.000009 degree
    FROM trip t
)
SELECT
    t.t_tripkey,
    t.reported_distance_m,
    t.line_distance_m,
    t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio
FROM trip_lengths t
ORDER BY detour_ratio DESC NULLS LAST, reported_distance_m DESC, t_tripkey ASC
    """

    @staticmethod
    def q8() -> str:
        return """
-- Q8: Count nearby pickups for each building within 500m radius
SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count
FROM trip t JOIN building b ON ST_DWithin(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary), 0.0045) -- ~500m
GROUP BY b.b_buildingkey, b.b_name
ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC
    """

    @staticmethod
    def q9() -> str:
        return """
-- Q9: Building Conflation (duplicate/overlap detection via IoU), deterministic order
WITH b1 AS (
    SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom
    FROM building
),
    b2 AS (
        SELECT b_buildingkey AS id, ST_GeomFromWKB(b_boundary) AS geom
        FROM building
    ),
    pairs AS (
        SELECT
            b1.id AS building_1,
            b2.id AS building_2,
            ST_Area(b1.geom) AS area1,
            ST_Area(b2.geom) AS area2,
            ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area
        FROM b1
        JOIN b2
            ON b1.id < b2.id
            AND ST_Intersects(b1.geom, b2.geom)
    )
SELECT
    building_1,
    building_2,
    area1,
    area2,
    overlap_area,
    CASE
        WHEN overlap_area = 0 THEN 0.0
        WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0
        ELSE overlap_area / (area1 + area2 - overlap_area)
    END AS iou
FROM pairs
ORDER BY iou DESC, building_1 ASC, building_2 ASC
    """

    @staticmethod
    def q10() -> str:
        return """
-- Q10: Zone statistics for trips starting within each zone
SELECT
  z.z_zonekey, z.z_name AS pickup_zone, AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
  AVG(t.t_distance) AS avg_distance, COUNT(t.t_tripkey) AS num_trips
FROM zone z LEFT JOIN trip t ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(z.z_boundary))
GROUP BY z.z_zonekey, z.z_name
ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC
    """

    @staticmethod
    def q11() -> str:
        return """
-- Q11: Count trips that cross between different zones
SELECT COUNT(*) AS cross_zone_trip_count
FROM
  trip t
  JOIN zone pickup_zone ON ST_Within(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(pickup_zone.z_boundary))
  JOIN zone dropoff_zone ON ST_Within(ST_GeomFromWKB(t.t_dropoffloc), ST_GeomFromWKB(dropoff_zone.z_boundary))
WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey
    """

    @staticmethod
    def q12() -> str:
        # There is some odd bug with missing columns in EMR. Using CTEs to work around it.
        return """
-- Q12: Find 5 nearest buildings to each trip pickup location using KNN join
WITH trip_with_geom AS (
    SELECT t_tripkey, t_pickuploc, ST_GeomFromWKB(t_pickuploc) as pickup_geom
    FROM trip
),
    building_with_geom AS (
        SELECT b_buildingkey, b_name, b_boundary, ST_GeomFromWKB(b_boundary) as boundary_geom
        FROM building
    )
SELECT
    t.t_tripkey,
    t.t_pickuploc,
    b.b_buildingkey,
    b.b_name AS building_name,
    ST_Distance(t.pickup_geom, b.boundary_geom) AS distance_to_building
FROM trip_with_geom t JOIN building_with_geom b
    ON ST_KNN(t.pickup_geom, b.boundary_geom, 5, FALSE)
ORDER BY distance_to_building ASC, b.b_buildingkey ASC
    """
+
+
class DatabricksSpatialBenchBenchmark(SpatialBenchBenchmark):
    """A Databricks-specific implementation of the SpatialBench benchmark.

    This class is used to run the SpatialBench benchmark using Databricks'
    spatial functions. It varies only as needed from the base class.

    """

    def dialect(self) -> str:
        """Return the dialect of the benchmark."""
        return "Databricks"

    @staticmethod
    def q5() -> str:
        return """
-- Q5 (Databricks): NO ST_Collect function, using ST_Union_Agg instead. This is more expensive, but should give the same results.
SELECT
  c.c_custkey, c.c_name AS customer_name,
  DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
  ST_Area(ST_ConvexHull(ST_Union_Agg(ST_GeomFromWKB(t.t_dropoffloc)))) AS monthly_travel_hull_area
FROM trip t JOIN customer c ON t.t_custkey = c.c_custkey
GROUP BY c.c_custkey, c.c_name, pickup_month
HAVING COUNT(*) > 5 -- Only include repeat customers for meaningful hulls
ORDER BY COUNT(*) DESC, c.c_custkey ASC
    """

    @staticmethod
    def q7() -> str:
        # Bug fix: this override was previously (mis)named q8 even though it
        # contains the *Q7* detour query. That left Q7 inherited with the
        # varargs ST_MakeLine call Databricks does not support, and clobbered
        # the real Q8 (nearby-pickup count) with detour SQL. Renamed to q7
        # so the base-class q8 is inherited unchanged.
        return """
-- Q7 (Databricks): ST_MakeLine takes an array of points rather than varargs
WITH trip_lengths AS (
    SELECT
        t.t_tripkey,
        t.t_distance AS reported_distance_m,
        ST_Length(
            ST_MakeLine(
                Array(
                    ST_GeomFromWKB(t.t_pickuploc),
                    ST_GeomFromWKB(t.t_dropoffloc)
                )
            )
        ) / 0.000009 AS line_distance_m -- 1 meter = 0.000009 degree
    FROM trip t
)
SELECT
    t.t_tripkey,
    t.reported_distance_m,
    t.line_distance_m,
    t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio
FROM trip_lengths t
ORDER BY detour_ratio DESC NULLS LAST, reported_distance_m DESC, t_tripkey ASC
    """

    @staticmethod
    def q12() -> str:
        return """
-- Q12 (Databricks): No KNN join, using cross join + ROW_NUMBER() window function instead.
-- Note: Databricks doesn't have cross join lateral support.
SELECT
    t_tripkey,
    t_pickuploc,
    b_buildingkey,
    building_name,
    distance_to_building
FROM (
    SELECT
        t.t_tripkey,
        t.t_pickuploc,
        b.b_buildingkey,
        b.b_name AS building_name,
        ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary)) AS distance_to_building,
        ROW_NUMBER() OVER (
            PARTITION BY t.t_tripkey
            ORDER BY ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary)) ASC
        ) AS rn
    FROM trip t
    JOIN building b
    ) AS ranked_buildings
WHERE rn <= 5
ORDER BY distance_to_building ASC, b_buildingkey ASC
    """
+
+
class DuckDBSpatialBenchBenchmark(SpatialBenchBenchmark):
    """SpatialBench benchmark variant targeting DuckDB's spatial extension.

    Only the queries that cannot run verbatim on DuckDB are overridden here;
    everything else is inherited unchanged from the base class.
    """

    def dialect(self) -> str:
        """Name of the SQL dialect this benchmark emits."""
        return "DuckDB"

    @staticmethod
    def q12() -> str:
        """Q12 rewritten for DuckDB, which has no KNN join operator.

        A ``CROSS JOIN LATERAL`` subquery picks the five nearest buildings
        for each trip pickup point instead.
        """
        sql = """
-- Q12 (DuckDB): No KNN join, using cross join lateral instead.
SELECT
  t.t_tripkey,
  t.t_pickuploc,
  nb.b_buildingkey,
  nb.building_name,
  nb.distance_to_building
FROM trip t
  CROSS JOIN LATERAL (
    SELECT
      b.b_buildingkey,
      b.b_name AS building_name,
      ST_Distance(ST_GeomFromWKB(t.t_pickuploc), ST_GeomFromWKB(b.b_boundary)) AS distance_to_building
    FROM building b
    ORDER BY distance_to_building
    LIMIT 5
) AS nb
ORDER BY nb.distance_to_building, nb.b_buildingkey
    """
        return sql
+
+
class SedonaDBSpatialBenchBenchmark(SpatialBenchBenchmark):
    """SpatialBench benchmark variant targeting SedonaDB's spatial functions.

    Inherits the base queries and overrides only those whose syntax differs
    in SedonaDB.
    """

    def dialect(self) -> str:
        """Name of the SQL dialect this benchmark emits."""
        return "SedonaDB"

    @staticmethod
    def q5() -> str:
        """Q5 rewritten for SedonaDB's aggregate form of ``ST_Collect``.

        SedonaDB treats ``ST_Collect`` as an aggregate (no ``ARRAY_AGG``
        needed), so the hull is computed in a CTE and filtered afterwards.
        """
        sql = """
-- Q5 (SedonaDB): In SedonaDB ST_Collect is an aggregate function so no need to use ARRAY_AGG first.
-- ST_Collect does not accept an array as input so we cannot use the query with ARRAY_AGG.
WITH per AS (
  SELECT
    c.c_custkey,
    c.c_name AS customer_name,
    DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
    COUNT(t.t_tripkey) AS n_trips,
    ST_Area(ST_ConvexHull(
      ST_Collect(ST_GeomFromWKB(t.t_dropoffloc))
    )) AS monthly_travel_hull_area
  FROM trip t
  JOIN customer c ON t.t_custkey = c.c_custkey
  GROUP BY c.c_custkey, c.c_name, pickup_month
)
SELECT *
FROM per
WHERE n_trips > 5
ORDER BY n_trips DESC, c_custkey ASC;
    """
        return sql
+
+
def main():
    """CLI entry point: print every benchmark query in the requested dialect.

    Reads the dialect name from ``sys.argv[1]``, validates it against the
    known dialects, and prints each query to stdout. Geopandas is handled
    as a special non-SQL case.
    """
    query_classes = {
        "SedonaSpark": SpatialBenchBenchmark,
        "Databricks": DatabricksSpatialBenchBenchmark,
        "DuckDB": DuckDBSpatialBenchBenchmark,
        "SedonaDB": SedonaDBSpatialBenchBenchmark,
        "Geopandas": None  # Special case, we will catch this below
    }
    available = ', '.join(query_classes.keys())

    # No dialect supplied on the command line: show usage and bail out.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <dialect>")
        print(f"Available dialects: {available}")
        sys.exit(1)

    chosen = sys.argv[1]

    # Unknown dialect: list the valid options and exit with an error code.
    # (Checking membership first is safe — "Geopandas" is a key, so the
    # special case below is still reached for it.)
    if chosen not in query_classes:
        print(f"Unknown dialect: {chosen}")
        print(f"Available dialects: {available}")
        sys.exit(1)

    # Geopandas has no SQL dialect; point the user at the companion script.
    if chosen == "Geopandas":
        print("Geopandas does not support SQL queries directly. Please use the provided Python script geopandas.py.")
        sys.exit(0)

    for sql in query_classes[chosen]().queries().values():
        print(sql)
+
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
\ No newline at end of file