This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git


The following commit(s) were added to refs/heads/main by this push:
     new 8d7d778  perf: Faster `st_geometrytype()` function (#90)
8d7d778 is described below

commit 8d7d77850f5aad0be9ca6be203ad7ee513f8aa77
Author: Yongting You <[email protected]>
AuthorDate: Thu Sep 18 00:23:55 2025 +0800

    perf: Faster `st_geometrytype()` function (#90)
---
 rust/sedona-functions/src/executor.rs        | 24 +++++++++++
 rust/sedona-functions/src/st_geometrytype.rs | 61 ++++++++++++++++------------
 2 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/rust/sedona-functions/src/executor.rs 
b/rust/sedona-functions/src/executor.rs
index 081b627..0a98e71 100644
--- a/rust/sedona-functions/src/executor.rs
+++ b/rust/sedona-functions/src/executor.rs
@@ -246,6 +246,30 @@ impl GeometryFactory for WkbGeometryFactory {
     }
 }
 
+/// A [GeometryFactory] whose geometry type is a raw WKB byte slice (`&[u8]`)
+///
+/// Using this geometry factory iterates over items as references to the raw 
underlying
+/// bytes, which is useful for writing optimized kernels that do not need the 
full buffer to
+/// be validated and/or parsed.
+#[derive(Default)]
+pub struct WkbBytesFactory {}
+
+impl GeometryFactory for WkbBytesFactory {
+    type Geom<'a> = &'a [u8];
+
+    fn try_from_wkb<'a>(&self, wkb_bytes: &'a [u8]) -> Result<Self::Geom<'a>> {
+        Ok(wkb_bytes)
+    }
+}
+
+/// Alias for an executor that iterates over geometries as unparsed, raw [Wkb] 
bytes.
+///
+/// This [GenericExecutor] implementation provides more optimization 
opportunities,
+/// but it requires additional manual processing of the raw [Wkb] bytes 
compared to
+/// the [WkbExecutor].
+pub(crate) type WkbBytesExecutor<'a, 'b> =
+    GenericExecutor<'a, 'b, WkbBytesFactory, WkbBytesFactory>;
+
 /// Trait for iterating over a container type as geometry scalars
 ///
 /// Currently the only scalar type supported is [Wkb]; however, for future
diff --git a/rust/sedona-functions/src/st_geometrytype.rs 
b/rust/sedona-functions/src/st_geometrytype.rs
index 0cdd9a8..6cccf0e 100644
--- a/rust/sedona-functions/src/st_geometrytype.rs
+++ b/rust/sedona-functions/src/st_geometrytype.rs
@@ -16,18 +16,16 @@
 // under the License.
 use std::sync::Arc;
 
-use crate::executor::WkbExecutor;
+use crate::executor::WkbBytesExecutor;
 use arrow_array::builder::StringBuilder;
 use arrow_schema::DataType;
 use datafusion_common::error::Result;
 use datafusion_expr::{
     scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, 
Volatility,
 };
-use geo_traits::GeometryTrait;
 use sedona_common::sedona_internal_err;
 use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
 use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
-use wkb::reader::Wkb;
 
 pub fn st_geometry_type_udf() -> SedonaScalarUDF {
     SedonaScalarUDF::new(
@@ -67,16 +65,16 @@ impl SedonaScalarKernel for STGeometryType {
         arg_types: &[SedonaType],
         args: &[ColumnarValue],
     ) -> Result<ColumnarValue> {
-        let executor = WkbExecutor::new(arg_types, args);
-        let min_output_size = "POINT".len() * executor.num_iterations();
+        let executor = WkbBytesExecutor::new(arg_types, args);
+        let min_output_size = "ST_POINT".len() * executor.num_iterations();
         let mut builder = 
StringBuilder::with_capacity(executor.num_iterations(), min_output_size);
 
-        // We can do quite a lot better than this with some vectorized WKB 
processing,
-        // but for now we just do a slow iteration
-        executor.execute_wkb_void(|maybe_item| {
-            match maybe_item {
-                Some(item) => {
-                    builder.append_option(invoke_scalar(&item)?);
+        // Iterate over raw WKB bytes for faster type inference
+        executor.execute_wkb_void(|maybe_bytes| {
+            match maybe_bytes {
+                Some(bytes) => {
+                    let name = infer_geometry_type_name(bytes)?;
+                    builder.append_value(name);
                 }
                 None => builder.append_null(),
             }
@@ -87,20 +85,33 @@ impl SedonaScalarKernel for STGeometryType {
     }
 }
 
-fn invoke_scalar(item: &Wkb) -> Result<Option<String>> {
-    match item.as_type() {
-        geo_traits::GeometryType::Point(_) => Ok(Some("ST_Point".to_string())),
-        geo_traits::GeometryType::LineString(_) => 
Ok(Some("ST_LineString".to_string())),
-        geo_traits::GeometryType::Polygon(_) => 
Ok(Some("ST_Polygon".to_string())),
-        geo_traits::GeometryType::MultiPoint(_) => 
Ok(Some("ST_MultiPoint".to_string())),
-        geo_traits::GeometryType::MultiLineString(_) => 
Ok(Some("ST_MultiLineString".to_string())),
-        geo_traits::GeometryType::MultiPolygon(_) => 
Ok(Some("ST_MultiPolygon".to_string())),
-        geo_traits::GeometryType::GeometryCollection(_) => {
-            Ok(Some("ST_GeometryCollection".to_string()))
-        }
-
-        // Other geometry types in geo that we should not get here: Rect, 
Triangle, Line
-        _ => sedona_internal_err!("unexpected geometry type"),
+/// Fast-path inference of the geometry type name from the 5-byte header of raw WKB bytes.
+/// ISO (+1000/+2000/+3000) and EWKB (high flag bits) dimension variants are both accepted;
+/// errors on a short buffer, unknown byte order, or a base type code outside 1..=7.
+/// Spec: https://libgeos.org/specifications/wkb/
+#[inline]
+fn infer_geometry_type_name(buf: &[u8]) -> Result<&'static str> {
+    if buf.len() < 5 {
+        return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len());
+    }
+
+    let byte_order = buf[0];
+    let code = match byte_order {
+        0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        other => return sedona_internal_err!("Unexpected byte order: {other}"),
+    };
+
+    // Drop EWKB flag bits, then ISO dimension thousands; a bare `& 0x7` would map 9 to Point
+    match (code & 0xFFFF) % 1000 {
+        1 => Ok("ST_Point"),
+        2 => Ok("ST_LineString"),
+        3 => Ok("ST_Polygon"),
+        4 => Ok("ST_MultiPoint"),
+        5 => Ok("ST_MultiLineString"),
+        6 => Ok("ST_MultiPolygon"),
+        7 => Ok("ST_GeometryCollection"),
+        _ => sedona_internal_err!("WKB type code out of range. Got: {}", code),
     }
 }
 

Reply via email to