This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 54ce8ae Making `sd_format` show geospatial values in arrays and
structs properly (#1)
54ce8ae is described below
commit 54ce8ae5a54ca00c146013aa5a1f942e8ea89021
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Sat Aug 30 04:05:15 2025 +0800
Making `sd_format` show geospatial values in arrays and structs properly
(#1)
* Start to implement a more generic sd_format function
* Now it is working
* Make tests more comprehensive
* Support list view
* Fix clippy warnings
* Update rust/sedona-functions/src/sd_format.rs
Co-authored-by: Copilot <[email protected]>
---------
Co-authored-by: Copilot <[email protected]>
---
rust/sedona-expr/src/scalar_udf.rs | 14 +
rust/sedona-functions/src/sd_format.rs | 829 ++++++++++++++++++++++++++++++---
2 files changed, 791 insertions(+), 52 deletions(-)
diff --git a/rust/sedona-expr/src/scalar_udf.rs
b/rust/sedona-expr/src/scalar_udf.rs
index c6926c3..f3f699e 100644
--- a/rust/sedona-expr/src/scalar_udf.rs
+++ b/rust/sedona-expr/src/scalar_udf.rs
@@ -182,6 +182,11 @@ impl ArgMatcher {
arg_iter.next().is_none()
}
+ /// Returns a matcher that accepts every argument type.
+ ///
+ /// Intended for kernels that inspect the argument type themselves instead of
+ /// relying on the matcher to filter it.
+ pub fn is_any() -> Arc<dyn TypeMatcher + Send + Sync> {
+ let matcher = IsAny {};
+ Arc::new(matcher)
+ }
+
/// Matches the given Arrow type using PartialEq
pub fn is_arrow(data_type: DataType) -> Arc<dyn TypeMatcher + Send + Sync>
{
Arc::new(IsExact {
@@ -239,6 +244,15 @@ pub trait TypeMatcher: Debug {
}
}
+/// Type matcher that unconditionally accepts whatever argument it is given.
+#[derive(Debug)]
+struct IsAny;
+
+impl TypeMatcher for IsAny {
+ /// Always reports a match; the argument type is never inspected.
+ fn match_type(&self, _sedona_type: &SedonaType) -> bool {
+ true
+ }
+}
+
#[derive(Debug)]
struct IsExact {
exact_type: SedonaType,
diff --git a/rust/sedona-functions/src/sd_format.rs
b/rust/sedona-functions/src/sd_format.rs
index b3d7381..28de57d 100644
--- a/rust/sedona-functions/src/sd_format.rs
+++ b/rust/sedona-functions/src/sd_format.rs
@@ -17,11 +17,14 @@
use std::{sync::Arc, vec};
use crate::executor::WkbExecutor;
-use arrow_array::builder::StringBuilder;
-use arrow_schema::DataType;
+use arrow_array::{
+ builder::StringBuilder, cast::AsArray, Array, GenericListArray,
GenericListViewArray,
+ OffsetSizeTrait, StructArray,
+};
+use arrow_schema::{DataType, Field, Fields};
use datafusion_common::{
error::{DataFusionError, Result},
- ScalarValue,
+ internal_err, ScalarValue,
};
use datafusion_expr::{
scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation,
Volatility,
@@ -37,7 +40,7 @@ use sedona_schema::datatypes::SedonaType;
pub fn sd_format_udf() -> SedonaScalarUDF {
SedonaScalarUDF::new(
"sd_format",
- vec![Arc::new(SDFormatDefault {}), Arc::new(SDFormatGeometry {})],
+ vec![Arc::new(SDFormatDefault {})],
Volatility::Immutable,
Some(sd_format_doc()),
)
@@ -73,33 +76,14 @@ struct SDFormatDefault {}
impl SedonaScalarKernel for SDFormatDefault {
+ /// Computes the output type of `sd_format` for the given argument types.
+ ///
+ /// The first argument may be of any type; its formatted counterpart (geospatial
+ /// types, including those nested in structs/lists, rendered as `Utf8`) becomes the
+ /// return type. The second argument is an optional string.
+ ///
+ /// Returns `Ok(None)` when the arguments do not match, including when no
+ /// argument is supplied at all.
fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
- Ok(Some(args[0].clone()))
- }
-
- fn invoke_batch(
- &self,
- _arg_types: &[SedonaType],
- args: &[ColumnarValue],
- ) -> Result<ColumnarValue> {
- Ok(args[0].clone())
- }
-}
-
-/// Implementation format geometry or geography
-///
-/// This is very similar to ST_AsText except it respects the width_hint by
-/// stopping the render for each item when too many characters have been
written.
-#[derive(Debug)]
-struct SDFormatGeometry {}
-
-impl SedonaScalarKernel for SDFormatGeometry {
- fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
+ // Indexing `args[0]` would panic on an empty argument list; treat that case
+ // as "this kernel does not apply", the same outcome the matcher below gives
+ // for any other arity mismatch.
+ let Some(sedona_type) = args.first() else {
+ return Ok(None);
+ };
+ let formatted_type = sedona_type_to_formatted_type(sedona_type)?;
let matcher = ArgMatcher::new(
vec![
- ArgMatcher::is_geometry_or_geography(),
+ ArgMatcher::is_any(),
ArgMatcher::is_optional(ArgMatcher::is_string()),
],
- SedonaType::Arrow(DataType::Utf8),
+ formatted_type,
);
matcher.match_args(args)
}
@@ -129,39 +113,249 @@ impl SedonaScalarKernel for SDFormatGeometry {
}
}
- let executor = WkbExecutor::new(&arg_types[0..1], &args[0..1]);
+ let formatted_type = sedona_type_to_formatted_type(&arg_types[0])?;
+ if formatted_type == arg_types[0] {
+ // No change in type, the input data does not have geospatial
columns that we can format,
+ // so just return the input value
+ return Ok(args[0].clone());
+ }
+
+ columnar_value_to_formatted_value(&arg_types[0], &args[0],
maybe_width_hint)
+ }
+}
+
+/// Computes the type `sd_format` produces for an input of type `sedona_type`.
+///
+/// WKB-backed types (`Wkb` / `WkbView`) become `Utf8`. Arrow `Struct`, `List` and
+/// `ListView` types are rebuilt with each child field converted through
+/// `field_to_formatted_field`; every other Arrow type is returned unchanged.
+fn sedona_type_to_formatted_type(sedona_type: &SedonaType) -> Result<SedonaType> {
+ let arrow_type = match sedona_type {
+ SedonaType::Wkb(_, _) | SedonaType::WkbView(_, _) => {
+ return Ok(SedonaType::Arrow(DataType::Utf8));
+ }
+ SedonaType::Arrow(arrow_type) => arrow_type,
+ };
+
+ // Only nested containers can hold geospatial children; rebuild those and
+ // hand everything else back untouched.
+ let formatted = match arrow_type {
+ DataType::Struct(fields) => {
+ let formatted_fields = fields
+ .iter()
+ .map(|field| field_to_formatted_field(field).map(Arc::new))
+ .collect::<Result<Vec<_>>>()?;
+ DataType::Struct(formatted_fields.into())
+ }
+ DataType::List(field) => DataType::List(Arc::new(field_to_formatted_field(field)?)),
+ DataType::ListView(field) => DataType::ListView(Arc::new(field_to_formatted_field(field)?)),
+ _ => return Ok(sedona_type.clone()),
+ };
+
+ Ok(SedonaType::Arrow(formatted))
+}
+
+/// Returns a clone of `field` whose data type is replaced by its formatted type.
+///
+/// The field's data type is interpreted as a `SedonaType`, converted with
+/// `sedona_type_to_formatted_type`, and the resulting Arrow data type is set on the clone.
+fn field_to_formatted_field(field: &Field) -> Result<Field> {
+ let original_type = SedonaType::from_data_type(field.data_type())?;
+ let formatted_type = sedona_type_to_formatted_type(&original_type)?;
+ Ok(field.clone().with_data_type(formatted_type.data_type()))
+}
+
+/// Formats `columnar_value` (whose type is `sedona_type`) for display.
+///
+/// WKB-backed values are rendered by `geospatial_value_to_formatted_value`.
+/// Arrow structs, lists and list views are rebuilt with their children formatted
+/// recursively; any other Arrow value is returned as a clone of the input.
+///
+/// # Errors
+///
+/// Returns an internal error when a struct, list or list view type is paired with
+/// a columnar value shape this function does not handle (for list views, anything
+/// other than an array).
+fn columnar_value_to_formatted_value(
+ sedona_type: &SedonaType,
+ columnar_value: &ColumnarValue,
+ maybe_width_hint: Option<usize>,
+) -> Result<ColumnarValue> {
+ let arrow_type = match sedona_type {
+ SedonaType::Wkb(_, _) | SedonaType::WkbView(_, _) => {
+ return geospatial_value_to_formatted_value(sedona_type, columnar_value, maybe_width_hint);
+ }
+ SedonaType::Arrow(arrow_type) => arrow_type,
+ };
+
+ // Dispatch on the (type, value shape) pair; each container kind has an explicit
+ // catch-all arm so unsupported shapes keep their dedicated error message.
+ match (arrow_type, columnar_value) {
+ (DataType::Struct(fields), ColumnarValue::Array(array)) => {
+ let formatted =
+ struct_value_to_formatted_value(fields, array.as_struct(), maybe_width_hint)?;
+ Ok(ColumnarValue::Array(Arc::new(formatted)))
+ }
+ (DataType::Struct(fields), ColumnarValue::Scalar(ScalarValue::Struct(struct_array))) => {
+ let formatted =
+ struct_value_to_formatted_value(fields, struct_array, maybe_width_hint)?;
+ Ok(ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(formatted))))
+ }
+ (DataType::Struct(_), _) => internal_err!("Unsupported struct columnar value"),
+ (DataType::List(field), ColumnarValue::Array(array)) => {
+ let formatted =
+ list_value_to_formatted_value(field, array.as_list::<i32>(), maybe_width_hint)?;
+ Ok(ColumnarValue::Array(Arc::new(formatted)))
+ }
+ (DataType::List(field), ColumnarValue::Scalar(ScalarValue::List(list_array))) => {
+ let formatted = list_value_to_formatted_value(field, list_array, maybe_width_hint)?;
+ Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new(formatted))))
+ }
+ (DataType::List(_), _) => internal_err!("Unsupported list columnar value"),
+ (DataType::ListView(field), ColumnarValue::Array(array)) => {
+ let formatted = list_view_value_to_formatted_value(
+ field,
+ array.as_list_view::<i32>(),
+ maybe_width_hint,
+ )?;
+ Ok(ColumnarValue::Array(Arc::new(formatted)))
+ }
+ (DataType::ListView(_), _) => internal_err!("Unsupported list view columnar value"),
+ _ => Ok(columnar_value.clone()),
+ }
+}
+
+/// Formats geometry or geography values as WKT text.
+///
+/// This is very similar to ST_AsText except it respects the width_hint by
+/// stopping the render for each item when too many characters have been written.
+///
+/// `sedona_type` and `geospatial_value` are passed to a `WkbExecutor` as a
+/// single-argument call. Null items produce null output entries. When
+/// `maybe_width_hint` is `None`, the per-item limit is `usize::MAX`.
+fn geospatial_value_to_formatted_value(
+ sedona_type: &SedonaType,
+ geospatial_value: &ColumnarValue,
+ maybe_width_hint: Option<usize>,
+) -> Result<ColumnarValue> {
+ // The executor takes argument slices, so view the single type/value as one-element slices
+ let arg_types: &[SedonaType] = std::slice::from_ref(sedona_type);
+ let args: &[ColumnarValue] = std::slice::from_ref(geospatial_value);
+ let executor = WkbExecutor::new(arg_types, args);
- let min_output_size = match maybe_width_hint {
- Some(width_hint) => executor.num_iterations() * width_hint,
- None => executor.num_iterations() * 25,
+ // Byte capacity requested up front: width_hint bytes per item, or 25 when no hint is given.
+ // NOTE(review): a very large width_hint makes this product (and the allocation) very large;
+ // confirm whether callers ever pass one.
+ let min_output_size = match maybe_width_hint {
+ Some(width_hint) => executor.num_iterations() * width_hint,
+ None => executor.num_iterations() * 25,
+ };
+
+ // Initialize an output builder of the appropriate type
+ let mut builder = StringBuilder::with_capacity(executor.num_iterations(),
min_output_size);
+
+ executor.execute_wkb_void(|maybe_item| {
+ match maybe_item {
+ Some(item) => {
+ let mut builder_wrapper =
+ LimitedSizeOutput::new(&mut builder,
maybe_width_hint.unwrap_or(usize::MAX));
+
+ // We ignore this error on purpose: we raised it on purpose to prevent
+ // the WKT writer from writing too many characters
+ #[allow(unused_must_use)]
+ wkt::to_wkt::write_geometry(&mut builder_wrapper, &item);
+
+ // The WKT text was written through the wrapper above; presumably appending an
+ // empty value here finishes the current entry with that text (confirm against
+ // the StringBuilder `fmt::Write` behavior).
+ builder.append_value("");
+ }
+ None => builder.append_null(),
};
- // Initialize an output builder of the appropriate type
- let mut builder =
StringBuilder::with_capacity(executor.num_iterations(), min_output_size);
+ Ok(())
+ })?;
- executor.execute_wkb_void(|maybe_item| {
- match maybe_item {
- Some(item) => {
- let mut builder_wrapper = LimitedSizeOutput::new(
- &mut builder,
- maybe_width_hint.unwrap_or(usize::MAX),
- );
+ executor.finish(Arc::new(builder.finish()))
+}
- // We ignore this error on purpose: we raised it on
purpose to prevent
- // the WKT writer from writing too many characters
- #[allow(unused_must_use)]
- wkt::to_wkt::write_geometry(&mut builder_wrapper, &item);
+/// Formats every child column of `struct_array` and reassembles the struct.
+///
+/// `fields` describes the input columns (paired with them positionally). Each
+/// column is formatted recursively with `columnar_value_to_formatted_value`, and
+/// the matching output field is produced by `field_to_formatted_field`.
+///
+/// The struct-level validity (null) buffer of the input is carried over to the
+/// result, so rows that were null structs stay null. A struct without any
+/// columns is returned unchanged.
+///
+/// # Errors
+///
+/// Returns an error if a child cannot be formatted, if formatting a child does not
+/// yield an array, or if the formatted children cannot be assembled into a struct.
+fn struct_value_to_formatted_value(
+ fields: &Fields,
+ struct_array: &StructArray,
+ maybe_width_hint: Option<usize>,
+) -> Result<StructArray> {
+ let columns = struct_array.columns();
+ if columns.is_empty() {
+ // Nothing to format, and a struct array cannot be rebuilt from zero columns
+ // without an explicit length, so hand the input back as-is.
+ return Ok(struct_array.clone());
+ }
- builder.append_value("");
- }
- None => builder.append_null(),
- };
+ let mut new_fields = Vec::with_capacity(columns.len());
+ let mut new_columns = Vec::with_capacity(columns.len());
+ for (column, field) in columns.iter().zip(fields) {
+ let new_field = field_to_formatted_field(field)?;
+ let sedona_type = SedonaType::from_data_type(field.data_type())?;
+ let unwrapped_column = sedona_type.unwrap_array(column)?;
+ let new_column = columnar_value_to_formatted_value(
+ &sedona_type,
+ &ColumnarValue::Array(unwrapped_column),
+ maybe_width_hint,
+ )?;
- Ok(())
- })?;
+ let ColumnarValue::Array(new_array) = new_column else {
+ return internal_err!(
+ "Expected Array in struct field formatting, got: {:?}",
+ new_column
+ );
+ };
- executor.finish(Arc::new(builder.finish()))
+ new_fields.push(Arc::new(new_field));
+ new_columns.push(new_array);
}
+
+ // Rebuild with the input's validity buffer: constructing the struct from
+ // (field, array) pairs alone would mark every row as valid and lose null structs.
+ let formatted = StructArray::try_new(
+ Fields::from(new_fields),
+ new_columns,
+ struct_array.nulls().cloned(),
+ )?;
+ Ok(formatted)
+}
+
+/// Rebuilds `list_array` with its child values run through the formatter.
+///
+/// `field` is the list's element field. The offsets and validity buffer of the
+/// input are reused as-is; only the values array and the element field change.
+///
+/// # Errors
+///
+/// Returns an error if the element type cannot be handled or if formatting the
+/// values does not yield an array.
+fn list_value_to_formatted_value<OffsetSize: OffsetSizeTrait>(
+ field: &Field,
+ list_array: &GenericListArray<OffsetSize>,
+ maybe_width_hint: Option<usize>,
+) -> Result<GenericListArray<OffsetSize>> {
+ let formatted_field = field_to_formatted_field(field)?;
+ let element_type = SedonaType::from_data_type(field.data_type())?;
+ let child_values = ColumnarValue::Array(element_type.unwrap_array(list_array.values())?);
+
+ match columnar_value_to_formatted_value(&element_type, &child_values, maybe_width_hint)? {
+ ColumnarValue::Array(formatted_values) => Ok(GenericListArray::<OffsetSize>::new(
+ Arc::new(formatted_field),
+ list_array.offsets().clone(),
+ formatted_values,
+ list_array.nulls().cloned(),
+ )),
+ other => internal_err!(
+ "Expected Array when formatting list for field '{}', but got: {:?}",
+ field.name(),
+ other
+ ),
+ }
+}
+
+/// Rebuilds `list_view_array` with its child values run through the formatter.
+///
+/// `field` is the list view's element field. Offsets, sizes and the validity
+/// buffer of the input are reused as-is; only the values array and the element
+/// field change.
+///
+/// # Errors
+///
+/// Returns an error if the element type cannot be handled or if formatting the
+/// values does not yield an array.
+fn list_view_value_to_formatted_value<OffsetSize: OffsetSizeTrait>(
+ field: &Field,
+ list_view_array: &GenericListViewArray<OffsetSize>,
+ maybe_width_hint: Option<usize>,
+) -> Result<GenericListViewArray<OffsetSize>> {
+ let formatted_field = field_to_formatted_field(field)?;
+ let element_type = SedonaType::from_data_type(field.data_type())?;
+ let child_values = ColumnarValue::Array(element_type.unwrap_array(list_view_array.values())?);
+
+ match columnar_value_to_formatted_value(&element_type, &child_values, maybe_width_hint)? {
+ ColumnarValue::Array(formatted_values) => Ok(GenericListViewArray::<OffsetSize>::new(
+ Arc::new(formatted_field),
+ list_view_array.offsets().clone(),
+ list_view_array.sizes().clone(),
+ formatted_values,
+ list_view_array.nulls().cloned(),
+ )),
+ _ => internal_err!(
+ "Expected Array during list view formatting for field '{}' of type '{}'",
+ field.name(),
+ field.data_type()
+ ),
+ }
}
struct LimitedSizeOutput<'a, T> {
@@ -194,13 +388,19 @@ impl<'a, T: std::fmt::Write> std::fmt::Write for
LimitedSizeOutput<'a, T> {
#[cfg(test)]
mod tests {
- use arrow_array::{create_array, StringArray};
+ use arrow_array::{
+ create_array, ArrayRef, Float64Array, Int32Array, ListArray,
ListViewArray, StringArray,
+ StructArray,
+ };
+ use arrow_schema::{DataType, Field};
+ use datafusion::arrow::buffer::{OffsetBuffer, ScalarBuffer};
use datafusion_expr::ScalarUDF;
use rstest::rstest;
use sedona_schema::datatypes::{
WKB_GEOGRAPHY, WKB_GEOMETRY, WKB_VIEW_GEOGRAPHY, WKB_VIEW_GEOMETRY,
};
use sedona_testing::{create::create_array, testers::ScalarUdfTester};
+ use std::sync::Arc;
use super::*;
@@ -266,4 +466,529 @@ mod tests {
&expected_array
);
}
+
+ /// Runs `sd_format` over arrays that contain no geospatial data (a float array, a
+ /// struct, a string array, a list and a list view) and checks that the call succeeds
+ /// and, for every case except the list view, that the output equals the input.
+ #[test]
+ fn sd_format_does_not_format_non_spatial_columns() {
+ let udf = sd_format_udf();
+
+ // Define test cases as (description, array, expected_data_type)
+ let test_cases: Vec<(&str, ArrayRef, DataType)> = vec![
+ // Float64Array
+ (
+ "Float64Array",
+ Arc::new(Float64Array::from(vec![Some(1.5), None,
Some(3.16)])),
+ DataType::Float64,
+ ),
+ // StructArray with mixed types
+ (
+ "StructArray",
+ {
+ let struct_fields = vec![
+ Arc::new(Field::new("float_field", DataType::Float64,
true)),
+ Arc::new(Field::new("int_field", DataType::Int32,
false)),
+ ];
+ let float_col: ArrayRef =
+ Arc::new(Float64Array::from(vec![Some(1.1), Some(2.2),
None]));
+ let int_col: ArrayRef = Arc::new(Int32Array::from(vec![10,
20, 30]));
+ Arc::new(StructArray::new(
+ struct_fields.clone().into(),
+ vec![float_col, int_col],
+ None,
+ ))
+ },
+ DataType::Struct(
+ vec![
+ Arc::new(Field::new("float_field", DataType::Float64,
true)),
+ Arc::new(Field::new("int_field", DataType::Int32,
false)),
+ ]
+ .into(),
+ ),
+ ),
+ // String array using create_array! macro
+ (
+ "String array",
+ create_array!(Utf8, [Some("hello"), None, Some("world")]),
+ DataType::Utf8,
+ ),
+ // List array with Int32 elements
+ (
+ "List array",
+ {
+ let int_values = Int32Array::from(vec![Some(42), None,
Some(100), Some(200)]);
+ let field = Arc::new(Field::new("item", DataType::Int32,
true));
+ let offsets = OffsetBuffer::new(vec![0, 2, 2, 4].into()); // [0,2), [2,2), [2,4)
+ Arc::new(ListArray::new(
+ field.clone(),
+ offsets,
+ Arc::new(int_values),
+ None,
+ ))
+ },
+ DataType::List(Arc::new(Field::new("item", DataType::Int32,
true))),
+ ),
+ // List view array with Int32 elements
+ (
+ "List view array",
+ {
+ let int_values = Int32Array::from(vec![Some(10), Some(20),
Some(30)]);
+ let field = Arc::new(Field::new("item", DataType::Int32,
true));
+ let offsets = ScalarBuffer::from(vec![0i32, 1i32, 2i32]); // Start offsets
+ let sizes = ScalarBuffer::from(vec![1i32, 1i32, 1i32]); // Sizes
+ Arc::new(ListViewArray::new(
+ field.clone(),
+ offsets,
+ sizes,
+ Arc::new(int_values),
+ None,
+ ))
+ },
+ DataType::ListView(Arc::new(Field::new("item",
DataType::Int32, true))),
+ ),
+ ];
+
+ // Each case is invoked with its own data type as the declared argument type
+ for (description, test_array, expected_data_type) in test_cases {
+ let tester = ScalarUdfTester::new(
+ udf.clone().into(),
+ vec![SedonaType::Arrow(expected_data_type.clone())],
+ );
+ let result = tester.invoke_array(test_array.clone()).unwrap();
+ // NOTE(review): list view results are only checked for a successful call, not for
+ // equality; the reason is not visible here - confirm whether comparing list view
+ // arrays is unsupported.
+ if !matches!(expected_data_type, DataType::ListView(_)) {
+ assert_eq!(
+ &result, &test_array,
+ "Failed for test case: {}",
+ description
+ );
+ }
+ }
+ }
+
+ /// Checks that a geospatial field inside a struct is rendered as WKT text while the
+ /// non-spatial sibling field passes through unchanged. Runs once per WKB-backed type.
+ #[rstest]
+ fn sd_format_should_format_spatial_columns(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) {
+ let udf = sd_format_udf();
+
+ // Create geometry storage array (without wrapping)
+ let geometry_values = vec![Some("POINT(1 2)"), None, Some("LINESTRING(0 0, 1 1)")];
+ let geometry_array = create_array(&geometry_values, &sedona_type);
+
+ // Create non-spatial array
+ let int_array: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30]));
+ let struct_fields = vec![
+ Arc::new(Field::new("geom", sedona_type.data_type(), true)),
+ Arc::new(Field::new("id", DataType::Int32, false)),
+ ];
+ let struct_array = StructArray::new(
+ struct_fields.clone().into(),
+ vec![geometry_array, int_array.clone()],
+ None,
+ );
+
+ // Create tester
+ let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into()));
+ let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]);
+
+ // Test the function; the input struct is not needed afterwards, so it is moved
+ let result = tester.invoke_array(Arc::new(struct_array)).unwrap();
+
+ // Verify the result structure
+ let result_struct = result.as_struct();
+ assert_eq!(result_struct.num_columns(), 2);
+
+ // First column should be formatted to UTF8 (geometry -> string). The assertion
+ // itself reports the actual type on failure, so no separate fallback branch is needed.
+ let geometry_column = result_struct.column(0);
+ assert_eq!(
+ geometry_column.data_type(),
+ &DataType::Utf8,
+ "Geometry column was not formatted to UTF8"
+ );
+
+ // Second column should remain Int32 (unchanged)
+ let id_column = result_struct.column(1);
+ assert_eq!(id_column.data_type(), &DataType::Int32);
+ assert_eq!(id_column, &int_array);
+
+ // Verify the geometry was actually formatted to WKT strings
+ let string_array = geometry_column.as_string::<i32>();
+ assert_wkt_values_match(string_array, &geometry_values);
+ }
+
+ /// Formats a struct with one geospatial field and two plain fields, then checks that
+ /// only the geospatial field changes (to WKT strings). Runs once per WKB-backed type.
+ #[rstest]
+ fn sd_format_should_handle_both_spatial_and_non_spatial_columns(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) {
+ // Input columns: one spatial, one string, one boolean
+ let wkt_inputs = vec![Some("POLYGON((0 0,1 0,1 1,0 1,0 0))"), Some("POINT(1 2)")];
+ let spatial_column = create_array(&wkt_inputs, &sedona_type);
+ let names: ArrayRef = Arc::new(StringArray::from(vec![Some("feature1"), Some("feature2")]));
+ let flags: ArrayRef = Arc::new(arrow_array::BooleanArray::from(vec![Some(true), Some(false)]));
+
+ // The same field list describes both the input array and the declared argument type
+ let fields: Fields = vec![
+ Arc::new(Field::new("geom", sedona_type.data_type(), true)),
+ Arc::new(Field::new("name", DataType::Utf8, true)),
+ Arc::new(Field::new("active", DataType::Boolean, false)),
+ ]
+ .into();
+ let input_struct = StructArray::new(
+ fields.clone(),
+ vec![spatial_column, names.clone(), flags.clone()],
+ None,
+ );
+
+ let tester = ScalarUdfTester::new(
+ sd_format_udf().into(),
+ vec![SedonaType::Arrow(DataType::Struct(fields))],
+ );
+ let result = tester.invoke_array(Arc::new(input_struct)).unwrap();
+
+ let formatted = result.as_struct();
+ assert_eq!(formatted.num_columns(), 3);
+
+ // Spatial column: now Utf8
+ let formatted_spatial = formatted.column(0);
+ assert_eq!(formatted_spatial.data_type(), &DataType::Utf8);
+
+ // String column: untouched
+ assert_eq!(formatted.column(1).data_type(), &DataType::Utf8);
+ assert_eq!(formatted.column(1), &names);
+
+ // Boolean column: untouched
+ assert_eq!(formatted.column(2).data_type(), &DataType::Boolean);
+ assert_eq!(formatted.column(2), &flags);
+
+ // The spatial column must hold the WKT rendering of the inputs
+ assert_wkt_values_match(formatted_spatial.as_string::<i32>(), &wkt_inputs);
+ }
+
+ /// Formats a list array whose elements are geospatial values and checks that the
+ /// element type becomes Utf8 and the elements hold WKT text. Runs once per WKB-backed type.
+ #[rstest]
+ fn sd_format_should_format_spatial_lists(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) -> Result<()> {
+ // Four storage-format items, one of them null
+ let wkt_inputs = vec![
+ Some("POINT(1 2)"),
+ Some("LINESTRING(0 0,1 1)"),
+ None,
+ Some("POLYGON((0 0,1 1,1 0,0 0))"),
+ ];
+ let items = create_array(&wkt_inputs, &sedona_type);
+
+ // Offsets 0, 2, 4 split the items into two lists of two
+ let item_field = Arc::new(Field::new("geom", sedona_type.data_type(), true));
+ let input_list = ListArray::new(
+ item_field,
+ OffsetBuffer::new(vec![0, 2, 4].into()),
+ items,
+ None,
+ );
+
+ let tester = ScalarUdfTester::new(
+ sd_format_udf().into(),
+ vec![SedonaType::Arrow(input_list.data_type().clone())],
+ );
+ let output_array = tester.invoke_array(Arc::new(input_list)).unwrap();
+ let formatted_list = output_array.as_any().downcast_ref::<ListArray>().unwrap();
+
+ // The element field must now be Utf8 (formatted from WKB)
+ let DataType::List(inner_field) = formatted_list.data_type() else {
+ panic!("Expected List data type, got: {:?}", formatted_list.data_type());
+ };
+ assert_eq!(inner_field.data_type(), &DataType::Utf8);
+
+ // The flattened values must hold the WKT rendering of the inputs
+ let values_array = formatted_list.values();
+ let Some(utf8_array) = values_array.as_any().downcast_ref::<StringArray>() else {
+ panic!(
+ "Expected list elements to be formatted as UTF8 strings, got: {:?}",
+ values_array.data_type()
+ );
+ };
+ assert_wkt_values_match(utf8_array, &wkt_inputs);
+
+ Ok(())
+ }
+
+ #[rstest]
+ fn sd_format_should_format_spatial_list_views(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY,
WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) -> Result<()> {
+ let udf = sd_format_udf();
+
+ // Create an array of WKB geometries using storage format
+ let geom_values = vec![
+ Some("POINT(1 2)"),
+ Some("LINESTRING(0 0,1 1)"),
+ None,
+ Some("POLYGON((0 0,1 1,1 0,0 0))"),
+ ];
+ let geom_array = create_array(&geom_values, &sedona_type);
+
+ // Create a ListView containing the geometry array
+ let field = Arc::new(Field::new("geom", sedona_type.data_type(),
true));
+ let offsets = ScalarBuffer::from(vec![0i32, 2i32]); // Two list views:
[0,2) and [2,4)
+ let sizes = ScalarBuffer::from(vec![2i32, 2i32]); // Each list view
has 2 elements
+ let list_view_array = ListViewArray::new(field, offsets, sizes,
geom_array, None);
+
+ // Create tester
+ let input_sedona_type =
SedonaType::Arrow(list_view_array.data_type().clone());
+ let tester = ScalarUdfTester::new(udf.clone().into(),
vec![input_sedona_type]);
+
+ // Execute the UDF
+ let result = tester.invoke_array(Arc::new(list_view_array));
+ let output_array = result.unwrap();
+ let formatted_list_view = output_array
+ .as_any()
+ .downcast_ref::<ListViewArray>()
+ .unwrap();
+
+ // Check that the list view field type is now UTF8 (formatted from WKB)
+ let list_field = formatted_list_view.data_type();
+ if let DataType::ListView(inner_field) = list_field {
+ assert_eq!(inner_field.data_type(), &DataType::Utf8);
+ } else {
+ panic!("Expected ListView data type, got: {:?}", list_field);
+ }
+
+ // Check the actual formatted values in the list view
+ let values_array = formatted_list_view.values();
+ if let Some(utf8_array) =
values_array.as_any().downcast_ref::<StringArray>() {
+ assert_wkt_values_match(utf8_array, &geom_values);
+ } else {
+ panic!(
+ "Expected list view elements to be formatted as UTF8 strings,
got: {:?}",
+ values_array.data_type()
+ );
+ }
+
+ Ok(())
+ }
+
+ /// Formats a struct that holds a list-of-geospatial field between two plain fields and
+ /// checks that only the list elements change (to WKT strings). Runs once per WKB-backed type.
+ #[rstest]
+ fn sd_format_should_format_struct_containing_list_of_geometries(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) -> Result<()> {
+ // Four storage-format items (one null), all placed in a single list
+ let wkt_inputs = vec![
+ Some("POINT(1 2)"),
+ Some("LINESTRING(0 0,1 1)"),
+ None,
+ Some("POLYGON((0 0,1 1,1 0,0 0))"),
+ ];
+ let item_field = Arc::new(Field::new("geom", sedona_type.data_type(), true));
+ let geometries: ArrayRef = Arc::new(ListArray::new(
+ item_field.clone(),
+ OffsetBuffer::new(vec![0, 4].into()),
+ create_array(&wkt_inputs, &sedona_type),
+ None,
+ ));
+
+ // Plain sibling columns (one row each)
+ let names: ArrayRef = Arc::new(StringArray::from(vec![Some("feature_collection")]));
+ let counts: ArrayRef = Arc::new(Int32Array::from(vec![4]));
+
+ // The same field list describes both the input array and the declared argument type
+ let fields: Fields = vec![
+ Arc::new(Field::new("name", DataType::Utf8, true)),
+ Arc::new(Field::new("geometries", DataType::List(item_field), true)),
+ Arc::new(Field::new("count", DataType::Int32, false)),
+ ]
+ .into();
+ let input_struct = StructArray::new(
+ fields.clone(),
+ vec![names.clone(), geometries, counts.clone()],
+ None,
+ );
+
+ let tester = ScalarUdfTester::new(
+ sd_format_udf().into(),
+ vec![SedonaType::Arrow(DataType::Struct(fields))],
+ );
+ let result = tester.invoke_array(Arc::new(input_struct)).unwrap();
+
+ let formatted = result.as_struct();
+ assert_eq!(formatted.num_columns(), 3);
+
+ // Name column: untouched
+ assert_eq!(formatted.column(0).data_type(), &DataType::Utf8);
+ assert_eq!(formatted.column(0), &names);
+
+ // Geometries column: a list whose element type is now Utf8
+ let geometries_column = formatted.column(1);
+ let DataType::List(inner_field) = geometries_column.data_type() else {
+ panic!(
+ "Expected List data type, got: {:?}",
+ geometries_column.data_type()
+ );
+ };
+ assert_eq!(inner_field.data_type(), &DataType::Utf8);
+
+ // Count column: untouched
+ assert_eq!(formatted.column(2).data_type(), &DataType::Int32);
+ assert_eq!(formatted.column(2), &counts);
+
+ // The list elements must hold the WKT rendering of the inputs
+ let formatted_list = geometries_column
+ .as_any()
+ .downcast_ref::<ListArray>()
+ .unwrap();
+ let values_array = formatted_list.values();
+ let Some(utf8_array) = values_array.as_any().downcast_ref::<StringArray>() else {
+ panic!(
+ "Expected list elements to be formatted as UTF8 strings, got: {:?}",
+ values_array.data_type()
+ );
+ };
+ assert_wkt_values_match(utf8_array, &wkt_inputs);
+
+ Ok(())
+ }
+
+ /// Formats a list whose elements are structs with a geospatial field and checks that
+ /// only that nested field changes (to WKT strings). Runs once per WKB-backed type.
+ #[rstest]
+ fn sd_format_should_format_list_of_structs_containing_geometry(
+ #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)]
+ sedona_type: SedonaType,
+ ) -> Result<()> {
+ // Columns of the struct elements: id, geometry, name (two rows)
+ let wkt_inputs = vec![Some("POINT(1 2)"), Some("LINESTRING(0 0,1 1)")];
+ let spatial_column = create_array(&wkt_inputs, &sedona_type);
+ let ids: ArrayRef = Arc::new(Int32Array::from(vec![101, 102]));
+ let names: ArrayRef = Arc::new(StringArray::from(vec![
+ Some("point_feature"),
+ Some("line_feature"),
+ ]));
+
+ let fields: Fields = vec![
+ Arc::new(Field::new("id", DataType::Int32, false)),
+ Arc::new(Field::new("geom", sedona_type.data_type(), true)),
+ Arc::new(Field::new("name", DataType::Utf8, true)),
+ ]
+ .into();
+ let elements = StructArray::new(
+ fields.clone(),
+ vec![ids.clone(), spatial_column, names.clone()],
+ None,
+ );
+
+ // A single list holding both structs
+ let element_field = Arc::new(Field::new("feature", DataType::Struct(fields), true));
+ let input_list = ListArray::new(
+ element_field,
+ OffsetBuffer::new(vec![0, 2].into()),
+ Arc::new(elements),
+ None,
+ );
+
+ let tester = ScalarUdfTester::new(
+ sd_format_udf().into(),
+ vec![SedonaType::Arrow(input_list.data_type().clone())],
+ );
+ let result = tester.invoke_array(Arc::new(input_list)).unwrap();
+ let result_list = result.as_any().downcast_ref::<ListArray>().unwrap();
+
+ // The list element type must be a struct whose "geom" field is now Utf8
+ let DataType::List(inner_field) = result_list.data_type() else {
+ panic!("Expected List data type, got: {:?}", result_list.data_type());
+ };
+ let DataType::Struct(formatted_fields) = inner_field.data_type() else {
+ panic!(
+ "Expected Struct data type inside List, got: {:?}",
+ inner_field.data_type()
+ );
+ };
+ let geom_field = formatted_fields.iter().find(|f| f.name() == "geom").unwrap();
+ assert_eq!(geom_field.data_type(), &DataType::Utf8);
+
+ // Inspect the struct values behind the list
+ let struct_values = result_list.values().as_struct();
+ assert_eq!(struct_values.num_columns(), 3);
+
+ // ID column: untouched
+ assert_eq!(struct_values.column(0).data_type(), &DataType::Int32);
+ assert_eq!(struct_values.column(0), &ids);
+
+ // Geometry column: now Utf8
+ let formatted_spatial = struct_values.column(1);
+ assert_eq!(formatted_spatial.data_type(), &DataType::Utf8);
+
+ // Name column: untouched
+ assert_eq!(struct_values.column(2).data_type(), &DataType::Utf8);
+ assert_eq!(struct_values.column(2), &names);
+
+ // The geometry column must hold the WKT rendering of the inputs
+ assert_wkt_values_match(formatted_spatial.as_string::<i32>(), &wkt_inputs);
+
+ Ok(())
+ }
+
+ /// Helper function to verify that actual WKT values match expected values,
+ /// handling the normalization of comma spacing in WKT output.
+ ///
+ /// Panics if the two sides differ in length, if an expected value is null in
+ /// `actual_array` (or vice versa), or if any text differs after removing the
+ /// space that may follow commas in the expected string.
+ fn assert_wkt_values_match(actual_array: &StringArray, expected_values: &[Option<&str>]) {
+ // Without this, extra rows in the output would go unnoticed
+ assert_eq!(
+ actual_array.len(),
+ expected_values.len(),
+ "row count mismatch between formatted output and expected values"
+ );
+
+ for (i, expected) in expected_values.iter().enumerate() {
+ match expected {
+ Some(expected_value) => {
+ // Reading the value of a null slot would not fail by itself, so check explicitly
+ assert!(
+ !actual_array.is_null(i),
+ "expected a value at row {i} but found null"
+ );
+ let actual_value = actual_array.value(i);
+ // Note: WKT output may not have spaces after commas
+ let normalized_expected = expected_value.replace(", ", ",");
+ assert_eq!(actual_value, normalized_expected, "mismatch at row {i}");
+ }
+ None => assert!(actual_array.is_null(i), "expected null at row {i}"),
+ }
+ }
+ }
}