This is an automated email from the ASF dual-hosted git repository.
petern pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new a0d3a066 perf: Write wkb directly from geos for st_boundary, centroid,
concavehull, convexhull, simplify, snap (#493)
a0d3a066 is described below
commit a0d3a066a1ff99c219b5d987f4e29d334dca38a4
Author: Peter Nguyen <[email protected]>
AuthorDate: Tue Jan 6 22:55:55 2026 -0800
perf: Write wkb directly from geos for st_boundary, centroid, concavehull,
convexhull, simplify, snap (#493)
---
c/sedona-geos/benches/geos-functions.rs | 25 +++++++++++++-
c/sedona-geos/src/st_boundary.rs | 7 ++--
c/sedona-geos/src/st_centroid.rs | 7 ++--
c/sedona-geos/src/st_concavehull.rs | 7 ++--
c/sedona-geos/src/st_convexhull.rs | 7 ++--
c/sedona-geos/src/st_simplify.rs | 18 +++++++---
c/sedona-geos/src/st_snap.rs | 24 +++++++++-----
python/sedonadb/tests/functions/test_functions.py | 40 +++++++++++++++++++++++
8 files changed, 100 insertions(+), 35 deletions(-)
diff --git a/c/sedona-geos/benches/geos-functions.rs
b/c/sedona-geos/benches/geos-functions.rs
index 725f4de4..b8c0e278 100644
--- a/c/sedona-geos/benches/geos-functions.rs
+++ b/c/sedona-geos/benches/geos-functions.rs
@@ -49,6 +49,14 @@ fn criterion_benchmark(c: &mut Criterion) {
benchmark::scalar(c, &f, "geos", "st_centroid", Polygon(10));
benchmark::scalar(c, &f, "geos", "st_centroid", Polygon(500));
+ benchmark::scalar(
+ c,
+ &f,
+ "geos",
+ "st_concavehull",
+ ArrayScalar(Polygon(10), Float64(0.0, 1.0)),
+ );
+
benchmark::scalar(
c,
&f,
@@ -255,6 +263,21 @@ fn criterion_benchmark(c: &mut Criterion) {
benchmark::scalar(c, &f, "geos", "st_perimeter", Polygon(10));
benchmark::scalar(c, &f, "geos", "st_perimeter", Polygon(500));
+ benchmark::scalar(
+ c,
+ &f,
+ "geos",
+ "st_simplify",
+ ArrayScalar(Polygon(10), Float64(1.0, 10.0)),
+ );
+ benchmark::scalar(
+ c,
+ &f,
+ "geos",
+ "st_simplify",
+ ArrayScalar(Polygon(500), Float64(1.0, 10.0)),
+ );
+
benchmark::scalar(
c,
&f,
@@ -267,7 +290,7 @@ fn criterion_benchmark(c: &mut Criterion) {
&f,
"geos",
"st_snap",
- ArrayArrayScalar(Polygon(10), Polygon(500), Float64(1.0, 10.0)),
+ ArrayArrayScalar(Polygon(10), Polygon(50), Float64(1.0, 10.0)),
);
benchmark::scalar(
diff --git a/c/sedona-geos/src/st_boundary.rs b/c/sedona-geos/src/st_boundary.rs
index 323e146f..6b544b30 100644
--- a/c/sedona-geos/src/st_boundary.rs
+++ b/c/sedona-geos/src/st_boundary.rs
@@ -29,6 +29,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_Boundary() implementation using the geos crate
pub fn st_boundary_impl() -> ScalarKernelRef {
@@ -74,11 +75,7 @@ impl SedonaScalarKernel for STBoundary {
fn invoke_scalar(geos_geom: &geos::Geometry, writer: &mut BinaryBuilder) ->
Result<()> {
let result_geom = geos_boundary(geos_geom)?;
- let wkb = result_geom
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
wkb: {e}")))?;
-
- writer.append_value(&wkb);
+ write_geos_geometry(&result_geom, writer)?;
Ok(())
}
diff --git a/c/sedona-geos/src/st_centroid.rs b/c/sedona-geos/src/st_centroid.rs
index 3ee6ce39..db997e31 100644
--- a/c/sedona-geos/src/st_centroid.rs
+++ b/c/sedona-geos/src/st_centroid.rs
@@ -28,6 +28,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_Centroid() implementation using the geos crate
pub fn st_centroid_impl() -> ScalarKernelRef {
@@ -75,11 +76,7 @@ fn invoke_scalar(geos_geom: &geos::Geometry, writer: &mut
impl std::io::Write) -
.get_centroid()
.map_err(|e| DataFusionError::Execution(format!("Failed to calculate
centroid: {e}")))?;
- let wkb = geometry
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
wkb: {e}")))?;
-
- writer.write_all(wkb.as_ref())?;
+ write_geos_geometry(&geometry, writer)?;
Ok(())
}
diff --git a/c/sedona-geos/src/st_concavehull.rs
b/c/sedona-geos/src/st_concavehull.rs
index 3d275944..9369a49d 100644
--- a/c/sedona-geos/src/st_concavehull.rs
+++ b/c/sedona-geos/src/st_concavehull.rs
@@ -31,6 +31,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_ConcaveHull() implementation using the geos crate
pub fn st_concave_hull_allow_holes_impl() -> ScalarKernelRef {
@@ -140,11 +141,7 @@ fn invoke_scalar<W: std::io::Write>(
DataFusionError::Execution(format!("Failed to calculate concave
hull: {e}"))
})?;
- let geo_wkb = geometry
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
WKB: {e}")))?;
-
- writer.write_all(&geo_wkb)?;
+ write_geos_geometry(&geometry, writer)?;
Ok(())
}
diff --git a/c/sedona-geos/src/st_convexhull.rs
b/c/sedona-geos/src/st_convexhull.rs
index f015477a..af9a837a 100644
--- a/c/sedona-geos/src/st_convexhull.rs
+++ b/c/sedona-geos/src/st_convexhull.rs
@@ -28,6 +28,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_ConvexHull() implementation using the geos crate
pub fn st_convex_hull_impl() -> ScalarKernelRef {
@@ -75,11 +76,7 @@ fn invoke_scalar(geos_geom: &geos::Geometry, writer: &mut
impl std::io::Write) -
.convex_hull()
.map_err(|e| DataFusionError::Execution(format!("Failed to calculate
convex_hull: {e}")))?;
- let wkb = geometry
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
wkb: {e}")))?;
-
- writer.write_all(wkb.as_ref())?;
+ write_geos_geometry(&geometry, writer)?;
Ok(())
}
diff --git a/c/sedona-geos/src/st_simplify.rs b/c/sedona-geos/src/st_simplify.rs
index 4958c5ef..fe5fd36a 100644
--- a/c/sedona-geos/src/st_simplify.rs
+++ b/c/sedona-geos/src/st_simplify.rs
@@ -30,6 +30,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_Simplify() implementation using the geos crate
pub fn st_simplify_impl() -> ScalarKernelRef {
@@ -87,6 +88,17 @@ fn invoke_scalar(
tolerance: f64,
writer: &mut impl std::io::Write,
) -> Result<()> {
+ let is_empty = geos_geom.is_empty().map_err(|e| {
+ DataFusionError::Execution(format!("Failed to check if geometry is
empty: {e}"))
+ })?;
+
+ if is_empty {
+ // There's a bug in GEOS where simplify() adds a Z coordinate when
processing POINT EMPTY or LINESTRING EMPTY.
+ // See https://github.com/apache/sedona-db/pull/493.
+ // As a workaround, we handle it separately and write the input
geometry as-is
+ write_geos_geometry(geos_geom, writer)?;
+ return Ok(());
+ }
let initial_type = geos_geom
.geometry_type()
.map_err(|e| DataFusionError::Execution(format!("Failed to get
geometry type: {e}")))?;
@@ -124,11 +136,7 @@ fn invoke_scalar(
_ => geometry,
};
- let wkb = geometry
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
wkb: {e}")))?;
-
- writer.write_all(wkb.as_ref())?;
+ write_geos_geometry(&geometry, writer)?;
Ok(())
}
diff --git a/c/sedona-geos/src/st_snap.rs b/c/sedona-geos/src/st_snap.rs
index 8a0b901b..19270f63 100644
--- a/c/sedona-geos/src/st_snap.rs
+++ b/c/sedona-geos/src/st_snap.rs
@@ -30,6 +30,7 @@ use sedona_schema::{
};
use crate::executor::GeosExecutor;
+use crate::geos_to_wkb::write_geos_geometry;
/// ST_Snap() implementation using the geos crate
pub fn st_snap_impl() -> ScalarKernelRef {
@@ -93,15 +94,20 @@ fn invoke_scalar(
tolerance: f64,
writer: &mut impl std::io::Write,
) -> Result<()> {
- let geometry = geom_input
- .snap(geom_reference, tolerance)
- .map_err(|e| DataFusionError::Execution(format!("Failed to snap
geometry: {e}")))?;
-
- let wkb = geometry
- .to_wkb()
- .map_err(|e| DataFusionError::Execution(format!("Failed to convert to
wkb: {e}")))?;
-
- writer.write_all(wkb.as_ref())?;
+ let is_empty = geom_input.is_empty().map_err(|e| {
+ DataFusionError::Execution(format!("Failed to check if geometry is
empty: {e}"))
+ })?;
+ if is_empty {
+ // There's a bug in GEOS where snap() adds a Z coordinate when
processing POINT EMPTY or LINESTRING EMPTY.
+ // See https://github.com/apache/sedona-db/pull/493.
+ // As a workaround, we handle it separately and write the input
geometry as-is
+ write_geos_geometry(geom_input, writer)?;
+ } else {
+ let geometry = geom_input
+ .snap(geom_reference, tolerance)
+ .map_err(|e| DataFusionError::Execution(format!("Failed to snap
geometry: {e}")))?;
+ write_geos_geometry(&geometry, writer)?;
+ }
Ok(())
}
diff --git a/python/sedonadb/tests/functions/test_functions.py
b/python/sedonadb/tests/functions/test_functions.py
index d50606ea..af30785d 100644
--- a/python/sedonadb/tests/functions/test_functions.py
+++ b/python/sedonadb/tests/functions/test_functions.py
@@ -2913,6 +2913,9 @@ def test_st_isvalidreason(eng, geom, expected):
],
)
def test_st_simplify(eng, geom, tolerance, expected):
+ # PostGIS incorrectly returns LINESTRING EMPTY here, so we skip this case
for PostGIS.
+ if eng == PostGIS and geom == "POLYGON EMPTY":
+ pytest.skip("PostGIS's result for POLYGON EMPTY is incorrect")
eng = eng.create_or_skip()
eng.assert_query_result(
f"SELECT ST_Simplify({geom_or_null(geom)}, {val_or_null(tolerance)})",
@@ -2970,6 +2973,40 @@ def test_st_simplifypreservetopology(eng, geom,
tolerance, expected):
@pytest.mark.parametrize(
("input", "reference", "tolerance", "expected"),
[
+ (None, None, None, None),
+ (None, "POINT (1 2)", 0.5, None),
+ ("POINT (1 2)", None, 0.5, None),
+ ("POINT (1 2)", "POINT (1 2)", None, None),
+ (
+ "POINT EMPTY",
+ "POINT (1 2)",
+ 0.5,
+ "POINT (nan nan)",
+ ),
+ (
+ "LINESTRING EMPTY",
+ "POINT (1 2)",
+ 0.5,
+ "LINESTRING EMPTY",
+ ),
+ (
+ "POLYGON EMPTY",
+ "POINT (1 2)",
+ 0.5,
+ "POLYGON EMPTY",
+ ),
+ (
+ "MULTIPOLYGON EMPTY",
+ "POINT (1 2)",
+ 0.5,
+ "MULTIPOLYGON EMPTY",
+ ),
+ (
+ "GEOMETRYCOLLECTION EMPTY",
+ "POINT (1 2)",
+ 0.5,
+ "GEOMETRYCOLLECTION EMPTY",
+ ),
(
"MULTIPOLYGON(((26 125, 26 200, 126 200, 126 125, 26 125 ),( 51
150, 101 150, 76 175, 51 150 )),(( 151 100, 151 200, 176 175, 151 100 )))",
"LINESTRING (5 107, 54 84, 101 100)",
@@ -3082,6 +3119,9 @@ def test_st_simplifypreservetopology(eng, geom,
tolerance, expected):
],
)
def test_st_snap(eng, input, reference, tolerance, expected):
+ # PostGIS incorrectly returns LINESTRING EMPTY here, so we skip this case
for PostGIS.
+ if eng == PostGIS and input == "POLYGON EMPTY":
+ pytest.skip("PostGIS's result for POLYGON EMPTY is incorrect")
eng = eng.create_or_skip()
eng.assert_query_result(
f"SELECT ST_Snap({geom_or_null(input)}, {geom_or_null(reference)},
{val_or_null(tolerance)})",