This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git


The following commit(s) were added to refs/heads/main by this push:
     new 8566b758 feat(python/sedonadb): Add sort by geometry support in to_parquet() (#642)
8566b758 is described below

commit 8566b75860b0a111725be760c620ce4d4cb8fc91
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Feb 23 16:15:39 2026 -0600

    feat(python/sedonadb): Add sort by geometry support in to_parquet() (#642)
---
 python/sedonadb/src/dataframe.rs         | 44 +++++++++++++++++++++++++++++---
 python/sedonadb/tests/io/test_parquet.py | 29 +++++++++++++++++++++
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/python/sedonadb/src/dataframe.rs b/python/sedonadb/src/dataframe.rs
index 5de8c011..efceca98 100644
--- a/python/sedonadb/src/dataframe.rs
+++ b/python/sedonadb/src/dataframe.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
@@ -189,13 +189,49 @@ impl InternalDataFrame {
     ) -> Result<(), PySedonaError> {
         // sort_by needs to be SortExpr. A Vec<String> can unambiguously be interpreted as
         // field names (ascending), but other types of expressions aren't supported here yet.
+        // We need to special-case geometry columns until we have a logical optimizer rule or
+        // sorting for user-defined types is supported.
+        let geometry_column_indices = self.inner.schema().geometry_column_indices()?;
+        let geometry_column_names = geometry_column_indices
+            .iter()
+            .map(|i| self.inner.schema().field(*i).name().as_str())
+            .collect::<HashSet<&str>>();
+
+        #[cfg(feature = "s2geography")]
+        let has_geography = true;
+        #[cfg(not(feature = "s2geography"))]
+        let has_geography = false;
+
         let sort_by_expr = sort_by
             .into_iter()
             .map(|name| {
-                let column = Expr::Column(Column::new_unqualified(name));
-                SortExpr::new(column, true, false)
+                let column = Expr::Column(Column::new_unqualified(name.clone()));
+                if geometry_column_names.contains(name.as_str()) {
+                    // Create the call sd_order(column). If we're ordering by geometry but don't have
+                    // the required feature for high quality sort output, give an error. This is mostly
+                    // an issue when using maturin develop because geography is not a default feature.
+                    if has_geography {
+                        let state = ctx.inner.ctx.state();
+                        let order_udf_opt = state.scalar_functions().get("sd_order");
+                        if let Some(order_udf) = order_udf_opt {
+                            Ok(SortExpr::new(order_udf.call(vec![column]), true, false))
+                        } else {
+                            Err(PySedonaError::SedonaPython(
+                                "Can't order by geometry field when sd_order() is not available"
+                                    .to_string(),
+                            ))
+                        }
+                    } else {
+                        Err(PySedonaError::SedonaPython(
+                                "Use maturin develop --features 's2geography,pyo3/extension-module' for dev geography support"
+                                    .to_string(),
+                            ))
+                    }
+                } else {
+                    Ok(SortExpr::new(column, true, false))
+                }
             })
-            .collect::<Vec<_>>();
+            .collect::<Result<Vec<_>, _>>()?;
 
         let write_options = SedonaWriteOptions::new()
             .with_partition_by(partition_by)
diff --git a/python/sedonadb/tests/io/test_parquet.py b/python/sedonadb/tests/io/test_parquet.py
index f5980659..ccdbcd40 100644
--- a/python/sedonadb/tests/io/test_parquet.py
+++ b/python/sedonadb/tests/io/test_parquet.py
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
         assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
 
 
+def test_write_sort_by_geometry(con):
+    if "s2geography" not in sedonadb.__features__:
+        pytest.skip("Ordering currently requires build with feature s2geography")
+
+    con.funcs.table.sd_random_geometry(
+        "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+    ).to_view("pts", overwrite=True)
+    df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+    # Write sorted and unsorted output and ensure we have improved the locality
+    with tempfile.TemporaryDirectory() as td:
+        df.to_parquet(Path(td) / "unsorted.parquet")
+        df.to_parquet(Path(td) / "sorted.parquet", sort_by="geometry")
+
+        gdf_unsorted = geopandas.read_parquet(Path(td) / "unsorted.parquet").to_crs(
+            3857
+        )
+        gdf_sorted = geopandas.read_parquet(Path(td) / "sorted.parquet").to_crs(3857)
+
+        neighbour_distance_unsorted = gdf_unsorted.geometry.distance(
+            gdf_unsorted.geometry.shift(1)
+        ).median()
+        neighbour_distance_sorted = gdf_sorted.geometry.distance(
+            gdf_sorted.geometry.shift(1)
+        ).median()
+
+        assert neighbour_distance_sorted < (neighbour_distance_unsorted / 20)
+
+
 def test_write_geoparquet_1_1(con, geoarrow_data):
     # Checks GeoParquet 1.1 support specifically
     path = geoarrow_data / "ns-water" / "files" / "ns-water_water-junc_geo.parquet"

Reply via email to