paleolimbot commented on code in PR #1: URL: https://github.com/apache/sedona-db/pull/1#discussion_r2310576660
########## rust/sedona-functions/src/sd_format.rs: ########## @@ -250,4 +450,529 @@ mod tests { &expected_array ); } + + #[test] + fn sd_format_does_not_format_non_spatial_columns() { + let udf = sd_format_udf(); + + // Define test cases as (description, array, expected_data_type) + let test_cases: Vec<(&str, ArrayRef, DataType)> = vec![ + // Float64Array + ( + "Float64Array", + Arc::new(Float64Array::from(vec![Some(1.5), None, Some(3.16)])), + DataType::Float64, + ), + // StructArray with mixed types + ( + "StructArray", + { + let struct_fields = vec![ + Arc::new(Field::new("float_field", DataType::Float64, true)), + Arc::new(Field::new("int_field", DataType::Int32, false)), + ]; + let float_col: ArrayRef = + Arc::new(Float64Array::from(vec![Some(1.1), Some(2.2), None])); + let int_col: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30])); + Arc::new(StructArray::new( + struct_fields.clone().into(), + vec![float_col, int_col], + None, + )) + }, + DataType::Struct( + vec![ + Arc::new(Field::new("float_field", DataType::Float64, true)), + Arc::new(Field::new("int_field", DataType::Int32, false)), + ] + .into(), + ), + ), + // String array using create_array! macro + ( + "String array", + create_array!(Utf8, [Some("hello"), None, Some("world")]), + DataType::Utf8, + ), + // List array with Int32 elements + ( + "List array", + { + let int_values = Int32Array::from(vec![Some(42), None, Some(100), Some(200)]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let offsets = OffsetBuffer::new(vec![0, 2, 2, 4].into()); // [0,2), [2,2), [2,4) + Arc::new(ListArray::new( + field.clone(), + offsets, + Arc::new(int_values), + None, + )) + }, + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + ), + // List view array with Int32 elements + ( + "List view array", + { + let int_values = Int32Array::from(vec![Some(10), Some(20), Some(30)]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let offsets = ScalarBuffer::from(vec![0i32, 1i32, 2i32]); // Start offsets + let sizes = ScalarBuffer::from(vec![1i32, 1i32, 1i32]); // Sizes + Arc::new(ListViewArray::new( + field.clone(), + offsets, + sizes, + Arc::new(int_values), + None, + )) + }, + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + ), + ]; + + for (description, test_array, expected_data_type) in test_cases { + let tester = ScalarUdfTester::new( + udf.clone().into(), + vec![SedonaType::Arrow(expected_data_type.clone())], + ); + let result = tester.invoke_array(test_array.clone()).unwrap(); + if !matches!(expected_data_type, DataType::ListView(_)) { + assert_eq!( + &result, &test_array, + "Failed for test case: {}", + description + ); + } + } + } + + #[rstest] + fn sd_format_should_format_spatial_columns( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) { + let udf = sd_format_udf(); + + // Create geometry storage array (without wrapping) + let geometry_values = vec![Some("POINT(1 2)"), None, Some("LINESTRING(0 0, 1 1)")]; + let geometry_array = create_array(&geometry_values, &sedona_type); + + // Create non-spatial array + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30])); + let struct_fields = vec![ + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("id", DataType::Int32, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![geometry_array, int_array.clone()], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array.clone())).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 2); + + // First column should be formatted to UTF8 (geometry -> string) + let geometry_column = result_struct.column(0); + assert_eq!(geometry_column.data_type(), &DataType::Utf8); + + // Second column should remain Int32 (unchanged) + let id_column = result_struct.column(1); + assert_eq!(id_column.data_type(), &DataType::Int32); + assert_eq!(id_column, &int_array); + + // Check if it's actually formatted + if geometry_column.data_type() == &DataType::Utf8 { + // Verify the geometry was actually formatted to WKT strings + let string_array = geometry_column.as_string::<i32>(); + assert_wkt_values_match(string_array, &geometry_values); + } else { + // If not UTF8, this test should fail but let's see what we got + panic!( + "Geometry column was not formatted to UTF8. Got: {:?}", + geometry_column.data_type() + ); + } + } + + #[rstest] + fn sd_format_should_handle_both_spatial_and_non_spatial_columns( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) { + let udf = sd_format_udf(); + + // Create geometry array + let geog_values = vec![Some("POLYGON((0 0,1 0,1 1,0 1,0 0))"), Some("POINT(1 2)")]; + let geog_array = create_array(&geog_values, &sedona_type); + + // Create string array + let name_array: ArrayRef = + Arc::new(StringArray::from(vec![Some("feature1"), Some("feature2")])); + + // Create boolean array + let active_array: ArrayRef = Arc::new(arrow_array::BooleanArray::from(vec![ + Some(true), + Some(false), + ])); + + // Create struct array with proper extension metadata + let struct_fields = vec![ + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new("active", DataType::Boolean, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![geog_array, name_array.clone(), active_array.clone()], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array)).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 3); + + // Geography column should be formatted to UTF8 + let geog_column = result_struct.column(0); + assert_eq!(geog_column.data_type(), &DataType::Utf8); + + // Name column should remain UTF8 (unchanged) + let name_column = result_struct.column(1); + assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); + + // Active column should remain Boolean (unchanged) + let active_column = result_struct.column(2); + assert_eq!(active_column.data_type(), &DataType::Boolean); + assert_eq!(active_column, &active_array); + + // Verify the geography was actually formatted to WKT strings + let string_array = geog_column.as_string::<i32>(); + assert_wkt_values_match(string_array, &geog_values); + } + + #[rstest] + fn sd_format_should_format_spatial_lists( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create an array of WKB geometries using storage format + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a simple list containing the geometry array + let field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let offsets = OffsetBuffer::new(vec![0, 2, 4].into()); + let list_array = ListArray::new(field, offsets, geom_array, None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Execute the UDF + let result = tester.invoke_array(Arc::new(list_array)); + let output_array = result.unwrap(); + let formatted_list = output_array.as_any().downcast_ref::<ListArray>().unwrap(); + + // Check that the list field type is now UTF8 (formatted from WKB) + let list_field = formatted_list.data_type(); + if let DataType::List(inner_field) = list_field { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!("Expected List data type, got: {:?}", list_field); + } + + // Check the actual formatted values in the list + let values_array = formatted_list.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::<StringArray>() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + + #[rstest] + fn sd_format_should_format_spatial_list_views( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create an array of WKB geometries using storage format + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a ListView containing the geometry array + let field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let offsets = ScalarBuffer::from(vec![0i32, 2i32]); // Two list views: [0,2) and [2,4) + let sizes = ScalarBuffer::from(vec![2i32, 2i32]); // Each list view has 2 elements + let list_view_array = ListViewArray::new(field, offsets, sizes, geom_array, None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_view_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Execute the UDF + let result = tester.invoke_array(Arc::new(list_view_array)); + let output_array = result.unwrap(); + let formatted_list_view = output_array + .as_any() + .downcast_ref::<ListViewArray>() + .unwrap(); + + // Check that the list view field type is now UTF8 (formatted from WKB) + let list_field = formatted_list_view.data_type(); + if let DataType::ListView(inner_field) = list_field { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!("Expected ListView data type, got: {:?}", list_field); + } + + // Check the actual formatted values in the list view + let values_array = formatted_list_view.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::<StringArray>() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list view elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + + #[rstest] + fn sd_format_should_format_struct_containing_list_of_geometries( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create an array of WKB geometries + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a list containing the geometry array + let geom_list_field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let geom_offsets = OffsetBuffer::new(vec![0, 4].into()); // One list containing all 4 geometries + let geom_list_array = ListArray::new(geom_list_field, geom_offsets, geom_array, None); + + // Create other fields for the struct + let name_array: ArrayRef = Arc::new(StringArray::from(vec![Some("feature_collection")])); + let count_array: ArrayRef = Arc::new(Int32Array::from(vec![4])); + + // Create struct containing the list of geometries + let struct_fields = vec![ + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new( + "geometries", + DataType::List(Arc::new(Field::new("geom", sedona_type.data_type(), true))), + true, + )), + Arc::new(Field::new("count", DataType::Int32, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![ + name_array.clone(), + Arc::new(geom_list_array), + count_array.clone(), + ], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array)).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 3); + + // Name column should remain UTF8 (unchanged) + let name_column = result_struct.column(0); + assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); + + // Geometries column should be a list of UTF8 (formatted) + let geometries_column = result_struct.column(1); + if let DataType::List(inner_field) = geometries_column.data_type() { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!( + "Expected List data type, got: {:?}", + geometries_column.data_type() + ); + } + + // Count column should remain Int32 (unchanged) + let count_column = result_struct.column(2); + assert_eq!(count_column.data_type(), &DataType::Int32); + assert_eq!(count_column, &count_array); + + // Verify the geometries were actually formatted to WKT strings + let formatted_list = geometries_column + .as_any() + .downcast_ref::<ListArray>() + .unwrap(); + let values_array = formatted_list.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::<StringArray>() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + + #[rstest] + fn sd_format_should_format_list_of_structs_containing_geometry( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create geometry arrays for each struct element + let geom_values = vec![Some("POINT(1 2)"), Some("LINESTRING(0 0,1 1)")]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create other field arrays for the struct elements + let name_array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("point_feature"), + Some("line_feature"), + ])); + let id_array: ArrayRef = Arc::new(Int32Array::from(vec![101, 102])); + + // Create struct array containing geometry field + let struct_fields = vec![ + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("name", DataType::Utf8, true)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![id_array.clone(), geom_array, name_array.clone()], + None, + ); + + // Create a list containing the struct array + let list_field = Arc::new(Field::new( + "feature", + DataType::Struct(struct_fields.into()), + true, + )); + let list_offsets = OffsetBuffer::new(vec![0, 2].into()); // One list containing 2 structs + let list_array = ListArray::new(list_field, list_offsets, Arc::new(struct_array), None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(list_array)).unwrap(); + + // Verify the result structure + let result_list = result.as_any().downcast_ref::<ListArray>().unwrap(); + + // Check that the list field type is a struct with formatted geometry field + let list_field = result_list.data_type(); + if let DataType::List(inner_field) = list_field { + if let DataType::Struct(struct_fields) = inner_field.data_type() { + // Find the geometry field and verify it's been formatted to UTF8 + let geom_field = struct_fields.iter().find(|f| f.name() == "geom").unwrap(); + assert_eq!(geom_field.data_type(), &DataType::Utf8); + } else { + panic!( + "Expected Struct data type inside List, got: {:?}", + inner_field.data_type() + ); + } + } else { + panic!("Expected List data type, got: {:?}", list_field); + } + + // Verify the actual struct values and their geometry formatting + let struct_values = result_list.values().as_struct(); + assert_eq!(struct_values.num_columns(), 3); + + // ID column should remain Int32 (unchanged) + let id_column = struct_values.column(0); + assert_eq!(id_column.data_type(), &DataType::Int32); + assert_eq!(id_column, &id_array); + + // Geometry column should be formatted to UTF8 + let geometry_column = struct_values.column(1); + assert_eq!(geometry_column.data_type(), &DataType::Utf8); + + // Name column should remain UTF8 (unchanged) + let name_column = struct_values.column(2); + assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); + + // Verify the geometries were actually formatted to WKT strings + let string_array = geometry_column.as_string::<i32>(); + assert_wkt_values_match(string_array, &geom_values); + + Ok(()) + } + + /// Helper function to verify that actual WKT values match expected values, + /// handling the normalization of comma spacing in WKT output + fn assert_wkt_values_match(actual_array: &StringArray, expected_values: &[Option<&str>]) { + for (i, expected) in expected_values.iter().enumerate() { + match expected { + Some(expected_value) => { + let actual_value = actual_array.value(i); + // Note: WKT output may not have spaces after commas + let normalized_expected = expected_value.replace(", ", ","); + assert_eq!(actual_value, normalized_expected); + } + None => assert!(actual_array.is_null(i)), + } + } + } Review Comment: Are there cases where we're generating WKT whose result we can't guess in advance? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@sedona.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org