paleolimbot commented on code in PR #90: URL: https://github.com/apache/sedona-db/pull/90#discussion_r2355657323
########## rust/sedona-functions/src/executor.rs: ########## @@ -246,6 +246,58 @@ impl GeometryFactory for WkbGeometryFactory { } } +#[derive(Default)] +pub(crate) struct WkbBytesFactory {} + +impl GeometryFactory for WkbBytesFactory { + type Geom<'a> = &'a [u8]; + + fn try_from_wkb<'a>(&self, wkb_bytes: &'a [u8]) -> Result<Self::Geom<'a>> { + Ok(wkb_bytes) + } +} + +/// Alias for an executor that iterates over geometries in their raw [Wkb] bytes. +/// +/// This [GenericExecutor] implementation provides more optimization opportunities, +/// but it requires additional manual processing of the raw [Wkb] bytes compared to +/// the [WkbExecutor]. +pub(crate) type WkbBytesExecutor<'a, 'b> = + GenericExecutor<'a, 'b, WkbBytesFactory, WkbBytesFactory>; + +impl<'b> GenericExecutor<'_, 'b, WkbBytesFactory, WkbBytesFactory> { + /// Execute a function by iterating over [Wkb]'s raw binary representation. + /// The provided `func` can assume its input bytes is a valid [Wkb] binary. + pub fn execute_wkb_bytes_void<F: FnMut(Option<&'b [u8]>) -> Result<()>>( + &self, + mut func: F, + ) -> Result<()> { + // Ensure the first argument of the executor is either Wkb, WkbView, or + // a Null type (to support columns of all-null values) + match &self.arg_types[0] { + SedonaType::Wkb(_, _) + | SedonaType::WkbView(_, _) + | SedonaType::Arrow(DataType::Null) => {} + other => { + return sedona_internal_err!( + "Expected SedonaType::Wkb or SedonaType::WkbView or SedonaType::Arrow(DataType::Null) for the first arg, got {}", + other + ) + } + } + + match &self.args[0] { + ColumnarValue::Array(array) => { + array.iter_as_wkb_bytes(&self.arg_types[0], self.num_iterations, func) + } + ColumnarValue::Scalar(scalar_value) => { + let maybe_bytes = scalar_value.scalar_as_wkb_bytes()?; + func(maybe_bytes) + } + } + } +} + Review Comment: (See the below change...I am almost positive that `executor.execute_wkb_void()` does exactly this already unless I'm missing something!) ```suggestion ``` ########## rust/sedona-functions/src/st_geometrytype.rs: ########## @@ -67,16 +65,16 @@ impl SedonaScalarKernel for STGeometryType { arg_types: &[SedonaType], args: &[ColumnarValue], ) -> Result<ColumnarValue> { - let executor = WkbExecutor::new(arg_types, args); - let min_output_size = "POINT".len() * executor.num_iterations(); + let executor = WkbBytesExecutor::new(arg_types, args); + let min_output_size = "ST_POINT".len() * executor.num_iterations(); let mut builder = StringBuilder::with_capacity(executor.num_iterations(), min_output_size); - // We can do quite a lot better than this with some vectorized WKB processing, - // but for now we just do a slow iteration - executor.execute_wkb_void(|maybe_item| { - match maybe_item { - Some(item) => { - builder.append_option(invoke_scalar(&item)?); + // Iterate over raw WKB bytes for faster type inference + executor.execute_wkb_bytes_void(|maybe_bytes| { Review Comment: ```suggestion executor.execute_wkb_void(|maybe_bytes| { ``` ########## rust/sedona-functions/src/executor.rs: ########## @@ -246,6 +246,58 @@ impl GeometryFactory for WkbGeometryFactory { } } +#[derive(Default)] +pub(crate) struct WkbBytesFactory {} Review Comment: Optional, but this will probably be useful in a number of places! ```suggestion /// A [GeometryFactory] whose geometry type are raw WKB bytes /// /// Using this geometry factory iterates over items as references to the raw underlying /// bytes, which is useful for writing optimized kernels that do not need the full buffer to /// be validated and/or parsed. #[derive(Default)] pub struct WkbBytesFactory {} ``` ########## rust/sedona-functions/src/st_geometrytype.rs: ########## @@ -87,20 +85,35 @@ impl SedonaScalarKernel for STGeometryType { } } -fn invoke_scalar(item: &Wkb) -> Result<Option<String>> { - match item.as_type() { - geo_traits::GeometryType::Point(_) => Ok(Some("ST_Point".to_string())), - geo_traits::GeometryType::LineString(_) => Ok(Some("ST_LineString".to_string())), - geo_traits::GeometryType::Polygon(_) => Ok(Some("ST_Polygon".to_string())), - geo_traits::GeometryType::MultiPoint(_) => Ok(Some("ST_MultiPoint".to_string())), - geo_traits::GeometryType::MultiLineString(_) => Ok(Some("ST_MultiLineString".to_string())), - geo_traits::GeometryType::MultiPolygon(_) => Ok(Some("ST_MultiPolygon".to_string())), - geo_traits::GeometryType::GeometryCollection(_) => { - Ok(Some("ST_GeometryCollection".to_string())) - } - - // Other geometry types in geo that we should not get here: Rect, Triangle, Line - _ => sedona_internal_err!("unexpected geometry type"), +/// Fast-path inference of geometry type name from raw WKB bytes +/// An error will be thrown for invalid WKB bytes input +/// +/// Spec: https://libgeos.org/specifications/wkb/ +/// +/// TODO: Move it to `Wkb` crate Review Comment: ```suggestion ``` (We have a lot of stuff like this, both in this crate and the internal sedona-geometry) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@sedona.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org