Copilot commented on code in PR #642:
URL: https://github.com/apache/sedona-db/pull/642#discussion_r2829094804


##########
python/sedonadb/src/dataframe.rs:
##########
@@ -189,13 +189,51 @@ impl InternalDataFrame {
     ) -> Result<(), PySedonaError> {
         // sort_by needs to be SortExpr. A Vec<String> can unambiguously be 
interpreted as
         // field names (ascending), but other types of expressions aren't 
supported here yet.
+        // We need to special-case geometry columns until we have a logical 
optimizer rule or
+        // sorting for user-defined types is supported.
+        let geometry_column_indices = 
self.inner.schema().geometry_column_indices()?;
+        let geometry_column_names = geometry_column_indices
+            .iter()
+            .map(|i| self.inner.schema().field(*i).name().as_str())
+            .collect::<HashSet<&str>>();
         let sort_by_expr = sort_by
             .into_iter()
             .map(|name| {
-                let column = Expr::Column(Column::new_unqualified(name));
-                SortExpr::new(column, true, false)
+                let column = 
Expr::Column(Column::new_unqualified(name.clone()));
+                if geometry_column_names.contains(name.as_str()) {
+                    // Create the call sd_order(column). If we're ordering by 
geometry but don't have
+                    // the required feature for high quality sort output, give 
an error. This is mostly
+                    // an issue when using maturin develop because geography 
is not a default feature.
+                    #[cfg(feature = "s2geography")]
+                    let has_geography = true;
+                    #[cfg(not(feature = "s2geography"))]
+                    let has_geography = false;
+
+                    if has_geography {
+                    let order_udf_opt: Option<ScalarUDF> = ctx
+                        .inner
+                        .functions
+                        .scalar_udf("sd_order")
+                        .map(|udf_opt| udf_opt.clone().into());
+                    if let Some(order_udf) = order_udf_opt {
+                        Ok(SortExpr::new(order_udf.call(vec![column]), true, 
false))
+                    } else {
+                        Err(PySedonaError::SedonaPython(
+                            "Can't order by geometry field when sd_order() is 
not available"
+                                .to_string(),
+                        ))
+                    }
+                } else {
+                    Err(PySedonaError::SedonaPython(
+                            "Use maturin develop --features 
's2geography,pyo3/extension-module' for dev geography support"
+                                .to_string(),
+                        ))
+                }
+                } else {

Review Comment:
   Incorrect indentation: Line 212 is missing proper indentation. The `if 
has_geography {` statement should be indented 4 more spaces to align with the 
code block it belongs to (specifically, to match the indentation of line 204's 
comment). While the logic and brace matching appear correct, this indentation 
issue significantly reduces code readability and makes the control flow harder 
to follow.
   ```suggestion
                           let order_udf_opt: Option<ScalarUDF> = ctx
                               .inner
                               .functions
                               .scalar_udf("sd_order")
                               .map(|udf_opt| udf_opt.clone().into());
                           if let Some(order_udf) = order_udf_opt {
                               Ok(SortExpr::new(order_udf.call(vec![column]), 
true, false))
                           } else {
                               Err(PySedonaError::SedonaPython(
                                   "Can't order by geometry field when 
sd_order() is not available"
                                       .to_string(),
                               ))
                           }
                       } else {
                           Err(PySedonaError::SedonaPython(
                               "Use maturin develop --features 
's2geography,pyo3/extension-module' for dev geography support"
                                   .to_string(),
                               ))
                       }
                   } else {
   ```



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
         assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
 
 
+def test_write_sort_by_geometry(con):
+    if "s2geography" not in sedonadb.__features__:
+        pytest.skip("Ordering currently requires build with feature 
s2geography")
+
+    con.funcs.table.sd_random_geometry(
+        "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+    ).to_view("pts", overwrite=True)
+    df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+    # Write sorted and unsorted output and ensure
+    with tempfile.TemporaryDirectory() as td:
+        df.to_parquet(Path(td) / "unsorted.parquet")
+        df.to_parquet(Path(td) / "sorted.parquet", sort_by="geometry")
+
+        gdf_unsorted = geopandas.read_parquet(Path(td) / 
"unsorted.parquet").to_crs(
+            3857
+        )
+        gdf_sorted = geopandas.read_parquet(Path(td) / 
"sorted.parquet").to_crs(3857)
+
+        neighbour_distance_unsorted = gdf_unsorted.geometry.distance(
+            gdf_unsorted.geometry.shift(1)
+        ).median()
+        neighbour_distance_sorted = gdf_sorted.geometry.distance(
+            gdf_sorted.geometry.shift(1)
+        ).median()
+
+        assert neighbour_distance_sorted < (neighbour_distance_unsorted / 20)
+
+

Review Comment:
   Missing test coverage: The implementation supports sorting by multiple 
columns (both geometry and non-geometry columns), but there's no test case that 
verifies this scenario works correctly. Consider adding a test that sorts by 
multiple columns, such as sorting by a geometry column and a non-geometry 
column together, to ensure the logic correctly handles mixed column types.
   ```suggestion
   
   def test_write_sort_by_geometry_and_id(con):
       if "s2geography" not in sedonadb.__features__:
           pytest.skip("Ordering currently requires build with feature 
s2geography")
   
       # Construct a small deterministic dataset with duplicate geometries
       df = con.sql(
           """
           SELECT id, ST_SetSRID(geom, 4326) AS geometry
           FROM (
               VALUES
                   (1, ST_GeomFromText('POINT (0 0)')),
                   (3, ST_GeomFromText('POINT (0 0)')),
                   (4, ST_GeomFromText('POINT (1 1)')),
                   (2, ST_GeomFromText('POINT (1 1)'))
           ) AS t(id, geom)
           """
       )
   
       with tempfile.TemporaryDirectory() as td:
           out_path = Path(td) / "sorted_multi.parquet"
           # Sort by geometry as primary key and id as secondary key
           df.to_parquet(out_path, sort_by=["geometry", "id"])
   
           gdf = geopandas.read_parquet(out_path)
   
           # For each distinct geometry, ids should be in ascending order
           geom_wkts = gdf.geometry.to_wkt()
           for wkt in geom_wkts.unique():
               ids = gdf.loc[geom_wkts == wkt, "id"].tolist()
               assert ids == sorted(ids)
   ```



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -356,6 +356,35 @@ def test_write_geoparquet_options(geoarrow_data):
         assert tmp_parquet.stat().st_size > (size_with_default_compression * 2)
 
 
+def test_write_sort_by_geometry(con):
+    if "s2geography" not in sedonadb.__features__:
+        pytest.skip("Ordering currently requires build with feature 
s2geography")
+
+    con.funcs.table.sd_random_geometry(
+        "Point", 10000, seed=948, bounds=[-50, -50, 50, 50]
+    ).to_view("pts", overwrite=True)
+    df = con.sql("SELECT id, ST_SetSRID(geometry, 4326) AS geometry FROM pts")
+
+    # Write sorted and unsorted output and ensure

Review Comment:
   Incomplete comment: The comment on line 368 says "Write sorted and unsorted 
output and ensure" but doesn't complete the thought. It should explain what is 
being ensured, for example: "Write sorted and unsorted output and ensure sorted 
output has better spatial locality".
   ```suggestion
       # Write sorted and unsorted output and ensure sorting by geometry 
improves spatial locality
   ```



##########
python/sedonadb/src/dataframe.rs:
##########
@@ -189,13 +189,51 @@ impl InternalDataFrame {
     ) -> Result<(), PySedonaError> {
         // sort_by needs to be SortExpr. A Vec<String> can unambiguously be 
interpreted as
         // field names (ascending), but other types of expressions aren't 
supported here yet.
+        // We need to special-case geometry columns until we have a logical 
optimizer rule or
+        // sorting for user-defined types is supported.
+        let geometry_column_indices = 
self.inner.schema().geometry_column_indices()?;
+        let geometry_column_names = geometry_column_indices
+            .iter()
+            .map(|i| self.inner.schema().field(*i).name().as_str())
+            .collect::<HashSet<&str>>();
         let sort_by_expr = sort_by
             .into_iter()
             .map(|name| {
-                let column = Expr::Column(Column::new_unqualified(name));
-                SortExpr::new(column, true, false)
+                let column = 
Expr::Column(Column::new_unqualified(name.clone()));
+                if geometry_column_names.contains(name.as_str()) {
+                    // Create the call sd_order(column). If we're ordering by 
geometry but don't have
+                    // the required feature for high quality sort output, give 
an error. This is mostly
+                    // an issue when using maturin develop because geography 
is not a default feature.
+                    #[cfg(feature = "s2geography")]
+                    let has_geography = true;
+                    #[cfg(not(feature = "s2geography"))]
+                    let has_geography = false;
+
+                    if has_geography {
+                    let order_udf_opt: Option<ScalarUDF> = ctx
+                        .inner
+                        .functions
+                        .scalar_udf("sd_order")
+                        .map(|udf_opt| udf_opt.clone().into());
+                    if let Some(order_udf) = order_udf_opt {
+                        Ok(SortExpr::new(order_udf.call(vec![column]), true, 
false))
+                    } else {
+                        Err(PySedonaError::SedonaPython(
+                            "Can't order by geometry field when sd_order() is 
not available"
+                                .to_string(),
+                        ))
+                    }
+                } else {
+                    Err(PySedonaError::SedonaPython(
+                            "Use maturin develop --features 
's2geography,pyo3/extension-module' for dev geography support"
+                                .to_string(),
+                        ))
+                }
+                } else {

Review Comment:
   Incorrect indentation: Lines 213-231 should be indented 4 more spaces to 
properly reflect their nesting level within the `if has_geography` block that 
starts on line 212. The current indentation makes it difficult to understand 
the code structure and control flow.
   ```suggestion
                           let order_udf_opt: Option<ScalarUDF> = ctx
                               .inner
                               .functions
                               .scalar_udf("sd_order")
                               .map(|udf_opt| udf_opt.clone().into());
                           if let Some(order_udf) = order_udf_opt {
                               Ok(SortExpr::new(order_udf.call(vec![column]), 
true, false))
                           } else {
                               Err(PySedonaError::SedonaPython(
                                   "Can't order by geometry field when 
sd_order() is not available"
                                       .to_string(),
                               ))
                           }
                       } else {
                           Err(PySedonaError::SedonaPython(
                               "Use maturin develop --features 
's2geography,pyo3/extension-module' for dev geography support"
                                   .to_string(),
                               ))
                       }
                   } else {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to