paleolimbot commented on code in PR #13: URL: https://github.com/apache/sedona-db/pull/13#discussion_r2317634582
########## rust/sedona-functions/src/st_point.rs: ########## @@ -38,53 +46,166 @@ pub fn st_point_udf() -> SedonaScalarUDF { "st_point", vec![Arc::new(STGeoFromPoint { out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT, })], Volatility::Immutable, - Some(doc("ST_Point", "Geometry")), + Some(doc( + "ST_Point", + "Geometry", + &["x", "y"], + "ST_Point(-64.36, 45.09)", + )), ) } /// ST_GeogPoint() scalar UDF implementation /// -/// Native implementation to create geometries from coordinates. -/// See [`st_geogpoint_udf`] for the corresponding geography constructor. +/// Native implementation to create geographies from coordinates. pub fn st_geogpoint_udf() -> SedonaScalarUDF { SedonaScalarUDF::new( "st_geogpoint", vec![Arc::new(STGeoFromPoint { out_type: WKB_GEOGRAPHY, + wkb_type: WKB_POINT, + })], + Volatility::Immutable, + Some(doc( + "st_geogpoint", + "Geography", + &["x", "y"], + "st_geogpoint(-64.36, 45.09)", + )), + ) +} + +/// ST_PointZ() scalar UDF implementation +/// +/// Native implementation to create Z geometries from coordinates. +pub fn st_pointz_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointz", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_Z, })], Volatility::Immutable, - Some(doc("st_geogpoint", "Geography")), + Some(doc( + "ST_PointZ", + "Geometry", + &["x", "y", "z"], + "ST_PointZ(-64.36, 45.09, 100.0)", + )), ) } -fn doc(name: &str, out_type_name: &str) -> Documentation { - Documentation::builder( - DOC_SECTION_OTHER, - format!( +/// ST_PointM() scalar UDF implementation +/// +/// Native implementation to create M geometries from coordinates. +pub fn st_pointm_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointm", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_M, + })], + Volatility::Immutable, + Some(doc( + "ST_PointM", + "Geometry", + &["x", "y", "m"], + "ST_PointM(-64.36, 45.09, 50.0)", + )), + ) +} + +/// ST_PointZM() scalar UDF implementation +/// +/// Native implementation to create ZM geometries from coordinates. +pub fn st_pointzm_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointzm", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_ZM, + })], + Volatility::Immutable, + Some(doc( + "ST_PointZM", + "Geometry", + &["x", "y", "z", "m"], + "ST_PointZM(-64.36, 45.09, 100.0, 50.0)", + )), + ) +} + +fn doc(name: &str, out_type_name: &str, params: &[&str], example: &str) -> Documentation { + let description = match params.len() { + 2 => format!( "Construct a Point {} from X and Y", out_type_name.to_lowercase() ), - format!("{name} (x: Double, y: Double)"), - ) - .with_argument("x", "double: X value") - .with_argument("y", "double: Y value") - .with_sql_example(format!("{name}(-64.36, 45.09)")) - .build() + 3 if params[2] == "z" => format!( + "Construct a Point {} from X, Y and Z", + out_type_name.to_lowercase() + ), Review Comment: Even though it's slightly repetitive, I think this would be better off just split into four `doc()` sections. ########## rust/sedona-functions/src/st_point.rs: ########## @@ -38,53 +46,166 @@ pub fn st_point_udf() -> SedonaScalarUDF { "st_point", vec![Arc::new(STGeoFromPoint { out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT, })], Volatility::Immutable, - Some(doc("ST_Point", "Geometry")), + Some(doc( + "ST_Point", + "Geometry", + &["x", "y"], + "ST_Point(-64.36, 45.09)", + )), ) } /// ST_GeogPoint() scalar UDF implementation /// -/// Native implementation to create geometries from coordinates. -/// See [`st_geogpoint_udf`] for the corresponding geography constructor. +/// Native implementation to create geographies from coordinates. pub fn st_geogpoint_udf() -> SedonaScalarUDF { SedonaScalarUDF::new( "st_geogpoint", vec![Arc::new(STGeoFromPoint { out_type: WKB_GEOGRAPHY, + wkb_type: WKB_POINT, + })], + Volatility::Immutable, + Some(doc( + "st_geogpoint", + "Geography", + &["x", "y"], + "st_geogpoint(-64.36, 45.09)", + )), + ) +} + +/// ST_PointZ() scalar UDF implementation +/// +/// Native implementation to create Z geometries from coordinates. +pub fn st_pointz_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointz", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_Z, })], Volatility::Immutable, - Some(doc("st_geogpoint", "Geography")), + Some(doc( + "ST_PointZ", + "Geometry", + &["x", "y", "z"], + "ST_PointZ(-64.36, 45.09, 100.0)", + )), ) } -fn doc(name: &str, out_type_name: &str) -> Documentation { - Documentation::builder( - DOC_SECTION_OTHER, - format!( +/// ST_PointM() scalar UDF implementation +/// +/// Native implementation to create M geometries from coordinates. +pub fn st_pointm_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointm", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_M, + })], + Volatility::Immutable, + Some(doc( + "ST_PointM", + "Geometry", + &["x", "y", "m"], + "ST_PointM(-64.36, 45.09, 50.0)", + )), + ) +} + +/// ST_PointZM() scalar UDF implementation +/// +/// Native implementation to create ZM geometries from coordinates. +pub fn st_pointzm_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_pointzm", + vec![Arc::new(STGeoFromPoint { + out_type: WKB_GEOMETRY, + wkb_type: WKB_POINT_ZM, + })], + Volatility::Immutable, + Some(doc( + "ST_PointZM", + "Geometry", + &["x", "y", "z", "m"], + "ST_PointZM(-64.36, 45.09, 100.0, 50.0)", + )), + ) +} + +fn doc(name: &str, out_type_name: &str, params: &[&str], example: &str) -> Documentation { + let description = match params.len() { + 2 => format!( "Construct a Point {} from X and Y", out_type_name.to_lowercase() ), - format!("{name} (x: Double, y: Double)"), - ) - .with_argument("x", "double: X value") - .with_argument("y", "double: Y value") - .with_sql_example(format!("{name}(-64.36, 45.09)")) - .build() + 3 if params[2] == "z" => format!( + "Construct a Point {} from X, Y and Z", + out_type_name.to_lowercase() + ), + 3 if params[2] == "m" => format!( + "Construct a Point {} from X, Y and M", + out_type_name.to_lowercase() + ), + 4 => format!( + "Construct a Point {} from X, Y, Z and M", + out_type_name.to_lowercase() + ), + _ => unreachable!(), + }; + + let signature = format!( + "{name} ({})", + params + .iter() + .map(|p| format!("{}: Double", p)) + .collect::<Vec<_>>() + .join(", ") + ); + + let mut builder = Documentation::builder(DOC_SECTION_OTHER, description, signature) + .with_argument("x", "double: X coordinate") + .with_argument("y", "double: Y coordinate"); + + if params.len() >= 3 { + if params[2] == "z" { + builder = builder.with_argument("z", "double: Z coordinate"); + } else if params[2] == "m" { + builder = builder.with_argument("m", "double: M coordinate"); + } + } + + if params.len() == 4 { + builder = builder + .with_argument("z", "double: Z coordinate") + .with_argument("m", "double: M coordinate"); + } + + builder.with_sql_example(example).build() } #[derive(Debug)] struct STGeoFromPoint { out_type: SedonaType, + wkb_type: u32, Review Comment: Probably the right thing to use as a parameter here is `Dimensions`. You can use `GeometryTypeAndDimensions` to get the WKB value (or use `write_wkb_xxx()` from sedona-geometry, which I believe takes a Dimensions). You can use `dimensions.size()` to get the count. ########## rust/sedona-functions/src/st_point.rs: ########## @@ -93,60 +214,108 @@ impl SedonaScalarKernel for STGeoFromPoint { arg_types: &[SedonaType], args: &[ColumnarValue], ) -> Result<ColumnarValue> { + let num_coords: usize = match self.wkb_type { + WKB_POINT => 2, + WKB_POINT_Z => 3, + WKB_POINT_M => 3, + WKB_POINT_ZM => 4, + _ => sedona_internal_err!("Invalid WKB type")?, + }; let executor = WkbExecutor::new(arg_types, args); - let x = &args[0].cast_to(&DataType::Float64, None)?; - let y = &args[1].cast_to(&DataType::Float64, None)?; + // Cast all arguments to Float64 + let coord_values: Result<Vec<_>> = args + .iter() + .map(|arg| arg.cast_to(&DataType::Float64, None)) + .collect(); + let coord_values = coord_values?; - let mut item: [u8; 21] = [0x00; 21]; + // Calculate WKB item size based on coordinates: endian(1) + type(4) + coords(8 each) + let wkb_size = WKB_HEADER_SIZE + (num_coords * 8); + let mut item = vec![0u8; wkb_size]; + // Little endian item[0] = 0x01; - item[1] = 0x01; - - // Handle the Scalar case to ensure that Scalar + Scalar -> Scalar - if let ( - ColumnarValue::Scalar(ScalarValue::Float64(x_float)), - ColumnarValue::Scalar(ScalarValue::Float64(y_float)), - ) = (x, y) - { - if let (Some(x), Some(y)) = (x_float, y_float) { - populate_wkb_item(&mut item, x, y); - return Ok(ScalarValue::Binary(Some(item.to_vec())).into()); - } else { + // Geometry type + item[1..WKB_HEADER_SIZE].copy_from_slice(&self.wkb_type.to_le_bytes()); + + // Check if all arguments are scalars + let all_scalars = coord_values + .iter() + .all(|v| matches!(v, ColumnarValue::Scalar(_))); + + if all_scalars { + let scalar_coords: Result<Vec<_>> = coord_values + .iter() + .map(|v| match v { + ColumnarValue::Scalar(ScalarValue::Float64(val)) => Ok(*val), + _ => Err(datafusion_common::DataFusionError::Internal( + "Expected Float64 scalar".to_string(), + )), + }) + .collect(); + let scalar_coords = scalar_coords?; + + // Check if any coordinate is null + if scalar_coords.iter().any(|coord| coord.is_none()) { return Ok(ScalarValue::Binary(None).into()); } + + // Populate WKB with coordinates + let coord_values: Vec<f64> = scalar_coords.into_iter().map(|c| c.unwrap()).collect(); + populate_wkb_item(&mut item, &coord_values); + return Ok(ScalarValue::Binary(Some(item)).into()); } - // Ensure both sides are arrays before iterating - let x_array = x.to_array(executor.num_iterations())?; - let y_array = y.to_array(executor.num_iterations())?; - let x_f64 = as_float64_array(&x_array)?; - let y_f64 = as_float64_array(&y_array)?; + // Handle array case + let coord_arrays: Result<Vec<_>> = coord_values + .iter() + .map(|v| v.to_array(executor.num_iterations())) + .collect(); + let coord_arrays = coord_arrays?; + + let coord_f64_arrays: Result<Vec<_>> = coord_arrays + .iter() + .map(|array| as_float64_array(array)) + .collect(); + let coord_f64_arrays = coord_f64_arrays?; let mut builder = BinaryBuilder::with_capacity( executor.num_iterations(), - item.len() * executor.num_iterations(), + wkb_size * executor.num_iterations(), ); - for (x_elem, y_elem) in zip(x_f64, y_f64) { - match (x_elem, y_elem) { - (Some(x), Some(y)) => { - populate_wkb_item(&mut item, &x, &y); - builder.append_value(item); - } - _ => { - builder.append_null(); + for i in 0..executor.num_iterations() { + let mut coords = Vec::with_capacity(num_coords); + let mut has_null = false; + + for array in &coord_f64_arrays { + if array.is_null(i) { + has_null = true; + break; + } else { + coords.push(array.value(i)); } } + + if has_null { + builder.append_null(); + } else { + populate_wkb_item(&mut item, &coords); + builder.append_value(&item); + } Review Comment: The inside of this loop is a very performance-sensitive section and I think there are a few things we can do to help: ```rust for ... { let values = (0..num_dimensions).iter().map(|j| coord_f64_arrays[j].value(i).collect::<Vec<_>>(); let values_non_null = values.iter().take(num_dimensions).flatten().collect::<Vec<_>>(); if values_non_null.len() == num_dimensions { write_wkb_point_header(&mut builder, dimensions)?; // I forget exactly how to write the f64s as little endian doubles to the builder but it's // the same as in write_wkb_xxxx() builder.append_value([]); } else { builder.append_null(); } } ``` This was the first ever function I wrote in sedonadb and we now have `write_wkb_point_xx()`. The `builder` implements `Write`, so you can write directly into it (pass it directly to `write_wkb_xxx()` functions). Make sure you run `cd rust/sedona-functions` + `cargo bench -- st_point` to make sure your implementation doesn't slow down the XY case (the most common). It is probably worth separating the implementation completely for Z, M, and ZM (where performance is not as critical). ########## rust/sedona-testing/src/testers.rs: ########## @@ -368,7 +368,7 @@ impl ScalarUdfTester { } } - fn invoke_arrays(&self, arrays: Vec<ArrayRef>) -> Result<ArrayRef> { + pub fn invoke_arrays(&self, arrays: Vec<ArrayRef>) -> Result<ArrayRef> { Review Comment: I like your approach! (Although this needs a tiny docstring now that it's public) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@sedona.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org