This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 8566b758 feat(python/sedonadb): Add sort by geometry support in to_parquet() (#642)
8566b758 is described below
commit 8566b75860b0a111725be760c620ce4d4cb8fc91
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Feb 23 16:15:39 2026 -0600
feat(python/sedonadb): Add sort by geometry support in to_parquet() (#642)
---
python/sedonadb/src/dataframe.rs | 44 +++++++++++++++++++++++++++++---
python/sedonadb/tests/io/test_parquet.py | 29 +++++++++++++++++++++
2 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/python/sedonadb/src/dataframe.rs b/python/sedonadb/src/dataframe.rs
index 5de8c011..efceca98 100644
--- a/python/sedonadb/src/dataframe.rs
+++ b/python/sedonadb/src/dataframe.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
@@ -189,13 +189,49 @@ impl InternalDataFrame {
) -> Result<(), PySedonaError> {
// sort_by needs to be SortExpr. A Vec<String> can unambiguously be interpreted as
// field names (ascending), but other types of expressions aren't supported here yet.
+ // We need to special-case geometry columns until we have a logical optimizer rule or
+ // sorting for user-defined types is supported.
+ let geometry_column_indices = self.inner.schema().geometry_column_indices()?;
+ let geometry_column_names = geometry_column_indices
+ .iter()
+ .map(|i| self.inner.schema().field(*i).name().as_str())
+ .collect::<HashSet<&str>>();
+
+ #[cfg(feature = "s2geography")]
+ let has_geography = true;
+ #[cfg(not(feature = "s2geography"))]
+ let has_geography = false;
+
let sort_by_expr = sort_by
.into_iter()
.map(|name| {
- let column = Expr::Column(Column::new_unqualified(name));
- SortExpr::new(column, true, false)
+ let column = Expr::Column(Column::new_unqualified(name.clone()));
+ if geometry_column_names.contains(name.as_str()) {
+ // Create the call sd_order(column). If we're ordering by geometry but don't have
+ // the required feature for high quality sort output, give an error. This is mostly
+ // an issue when using maturin develop because geography is not a default feature.
+ if has_geography {
+ let state = ctx.inner.ctx.state();
+ let order_udf_opt = state.scalar_functions().get("sd_order");
+ if let Some(order_udf) = order_udf_opt {
+ Ok(SortExpr::new(order_udf.call(vec![column]), true, false))
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Can't order by geometry field when sd_order() is not available"
+ .to_string(),
+ ))
+ }
+ } else {
+ Err(PySedonaError::SedonaPython(
+ "Use maturin develop --features 's2geography,pyo3/extension-module' for dev geography support"
+ .to_string(),
+ ))
+ }
+ } else {
+ Ok(SortExpr::new(column, true, false))
+ }
})
- .collect::<Vec<_>>();
+ .collect::<Result<Vec<_>, _>>()?;
let write_options = SedonaWriteOptions::new()
.with_partition_by(partition_by)
diff --git a/python/sedonadb/tests/io/test_parquet.py b/python/sedonadb/tests/io/test_parquet.py
index f5980659..ccdbcd40 100644
--- a/python/sedonadb/tests/io/test_parquet.py
+++ b/python/sedonadb/tests/io/test_parquet.py
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
+def test_write_sort_by_geometry(con):
+ if "s2geography" not in sedonadb.__features__:
+ pytest.skip("Ordering currently requires build with feature s2geography")
+
+ con.funcs.table.sd_random_geometry(
+ "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+ ).to_view("pts", overwrite=True)
+ df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+ # Write sorted and unsorted output and ensure we have improved the locality
+ with tempfile.TemporaryDirectory() as td:
+ df.to_parquet(Path(td) / "unsorted.parquet")
+ df.to_parquet(Path(td) / "sorted.parquet", sort_by="geometry")
+
+ gdf_unsorted = geopandas.read_parquet(Path(td) / "unsorted.parquet").to_crs(
+ 3857
+ )
+ gdf_sorted = geopandas.read_parquet(Path(td) / "sorted.parquet").to_crs(3857)
+
+ neighbour_distance_unsorted = gdf_unsorted.geometry.distance(
+ gdf_unsorted.geometry.shift(1)
+ ).median()
+ neighbour_distance_sorted = gdf_sorted.geometry.distance(
+ gdf_sorted.geometry.shift(1)
+ ).median()
+
+ assert neighbour_distance_sorted < (neighbour_distance_unsorted / 20)
+
+
def test_write_geoparquet_1_1(con, geoarrow_data):
# Checks GeoParquet 1.1 support specifically
path = geoarrow_data / "ns-water" / "files" / "ns-water_water-junc_geo.parquet"