goldmedal commented on code in PR #11712: URL: https://github.com/apache/datafusion/pull/11712#discussion_r1702612042
########## datafusion/functions-nested/src/map.rs: ########## @@ -161,7 +174,19 @@ impl ScalarUDFImpl for MapFunc { &self.signature } - fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result<DataType> { + fn return_type( + &self, + _arg_types: &[DataType], + ) -> datafusion_common::Result<DataType> { + internal_err!("map: return_type called instead of return_type_from_exprs") + } + + fn return_type_from_exprs( + &self, + _args: &[Expr], + _schema: &dyn ExprSchema, + arg_types: &[DataType], + ) -> datafusion_common::Result<DataType> { Review Comment: Curiously, why are you using `return_type_from_exprs` instead? It seems that neither `args` nor `schema` will be used. I guess we can just use `return_type`. ########## datafusion/functions-nested/src/map.rs: ########## @@ -202,3 +226,128 @@ fn get_element_type(data_type: &DataType) -> datafusion_common::Result<&DataType ), } } + +/// Helper function to create MapArray from array of values to support arrays for Map scalar function +/// +/// ``` text +/// Format of input KEYS and VALUES column +/// keys values +/// +---------------------+ +---------------------+ +/// | +-----------------+ | | +-----------------+ | +/// | | [k11, k12, k13] | | | | [v11, v12, v13] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | | [k21, k22, k23] | | | | [v21, v22, v23] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | |[k31, k32, k33] | | | |[v31, v32, v33] | | +/// | +-----------------+ | | +-----------------+ | +/// +---------------------+ +---------------------+ +/// ``` +/// Flattened keys and values array to user create `StructArray`, +/// which serves as inner child for `MapArray` +/// +/// ``` text +/// Flattened Flattened +/// Keys Values +/// +-----------+ +-----------+ +/// | +-------+ | | +-------+ | +/// | | k11 | | | | v11 | | +/// | +-------+ | | 
+-------+ | +/// | +-------+ | | +-------+ | +/// | | k12 | | | | v12 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k13 | | | | v13 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k21 | | | | v21 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k22 | | | | v22 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k23 | | | | v23 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k31 | | | | v31 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k32 | | | | v32 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k33 | | | | v33 | | +/// | +-------+ | | +-------+ | +/// +-----------+ +-----------+ +/// ```text + +fn make_map_array_internal<O: OffsetSizeTrait>( + keys: ArrayRef, + values: ArrayRef, +) -> datafusion_common::Result<ColumnarValue> { + let mut offset_buffer = vec![O::usize_as(0)]; + let mut running_offset = O::usize_as(0); + + let keys = collect_array_ref::<O>(keys); + let values = collect_array_ref::<O>(values); + + let mut key_array_vec = vec![]; + let mut value_array_vec = vec![]; + for (k, v) in keys.iter().zip(values.iter()) { + running_offset = running_offset.add(O::usize_as(k.len())); + offset_buffer.push(running_offset); + key_array_vec.push(k.as_ref()); + value_array_vec.push(v.as_ref()); + } + + // concatenate all the arrays + let flattened_keys = arrow::compute::concat(key_array_vec.as_ref()).unwrap(); + let flattened_values = arrow::compute::concat(value_array_vec.as_ref()).unwrap(); + + let fields = vec![ + Arc::new(Field::new("key", flattened_keys.data_type().clone(), false)), + Arc::new(Field::new( + "value", + flattened_values.data_type().clone(), + true, + )), + ]; + + let struct_data = ArrayData::builder(DataType::Struct(fields.into())) + .len(flattened_keys.len()) + .add_child_data(flattened_keys.to_data()) + 
.add_child_data(flattened_values.to_data()) + .build() + .unwrap(); + + let map_data = ArrayData::builder(DataType::Map( + Arc::new(Field::new( + "entries", + struct_data.data_type().clone(), + false, + )), + false, + )) + .len(keys.len()) + .add_child_data(struct_data) + .add_buffer(Buffer::from_slice_ref(offset_buffer.as_slice())) + .build() + .unwrap(); Review Comment: ```suggestion .build()?; ``` Instead of panic here, we can use `?` to return the error. ########## datafusion/functions-nested/src/map.rs: ########## @@ -202,3 +226,128 @@ fn get_element_type(data_type: &DataType) -> datafusion_common::Result<&DataType ), } } + +/// Helper function to create MapArray from array of values to support arrays for Map scalar function +/// +/// ``` text +/// Format of input KEYS and VALUES column +/// keys values +/// +---------------------+ +---------------------+ +/// | +-----------------+ | | +-----------------+ | +/// | | [k11, k12, k13] | | | | [v11, v12, v13] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | | [k21, k22, k23] | | | | [v21, v22, v23] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | |[k31, k32, k33] | | | |[v31, v32, v33] | | +/// | +-----------------+ | | +-----------------+ | +/// +---------------------+ +---------------------+ +/// ``` +/// Flattened keys and values array to user create `StructArray`, +/// which serves as inner child for `MapArray` +/// +/// ``` text +/// Flattened Flattened +/// Keys Values +/// +-----------+ +-----------+ +/// | +-------+ | | +-------+ | +/// | | k11 | | | | v11 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k12 | | | | v12 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k13 | | | | v13 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k21 | | | | v21 | | +/// | 
+-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k22 | | | | v22 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k23 | | | | v23 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k31 | | | | v31 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k32 | | | | v32 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k33 | | | | v33 | | +/// | +-------+ | | +-------+ | +/// +-----------+ +-----------+ +/// ```text + +fn make_map_array_internal<O: OffsetSizeTrait>( + keys: ArrayRef, + values: ArrayRef, +) -> datafusion_common::Result<ColumnarValue> { + let mut offset_buffer = vec![O::usize_as(0)]; + let mut running_offset = O::usize_as(0); + + let keys = collect_array_ref::<O>(keys); + let values = collect_array_ref::<O>(values); + + let mut key_array_vec = vec![]; + let mut value_array_vec = vec![]; + for (k, v) in keys.iter().zip(values.iter()) { + running_offset = running_offset.add(O::usize_as(k.len())); + offset_buffer.push(running_offset); + key_array_vec.push(k.as_ref()); + value_array_vec.push(v.as_ref()); + } + + // concatenate all the arrays + let flattened_keys = arrow::compute::concat(key_array_vec.as_ref()).unwrap(); + let flattened_values = arrow::compute::concat(value_array_vec.as_ref()).unwrap(); + + let fields = vec![ + Arc::new(Field::new("key", flattened_keys.data_type().clone(), false)), + Arc::new(Field::new( + "value", + flattened_values.data_type().clone(), + true, + )), + ]; + + let struct_data = ArrayData::builder(DataType::Struct(fields.into())) + .len(flattened_keys.len()) + .add_child_data(flattened_keys.to_data()) + .add_child_data(flattened_values.to_data()) + .build() + .unwrap(); Review Comment: ```suggestion .build()?; ``` Same as above. 
########## datafusion/functions-nested/src/map.rs: ########## @@ -202,3 +226,128 @@ fn get_element_type(data_type: &DataType) -> datafusion_common::Result<&DataType ), } } + +/// Helper function to create MapArray from array of values to support arrays for Map scalar function +/// +/// ``` text +/// Format of input KEYS and VALUES column +/// keys values +/// +---------------------+ +---------------------+ +/// | +-----------------+ | | +-----------------+ | +/// | | [k11, k12, k13] | | | | [v11, v12, v13] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | | [k21, k22, k23] | | | | [v21, v22, v23] | | +/// | +-----------------+ | | +-----------------+ | +/// | | | | +/// | +-----------------+ | | +-----------------+ | +/// | |[k31, k32, k33] | | | |[v31, v32, v33] | | +/// | +-----------------+ | | +-----------------+ | +/// +---------------------+ +---------------------+ +/// ``` +/// Flattened keys and values array to user create `StructArray`, +/// which serves as inner child for `MapArray` +/// +/// ``` text +/// Flattened Flattened +/// Keys Values +/// +-----------+ +-----------+ +/// | +-------+ | | +-------+ | +/// | | k11 | | | | v11 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k12 | | | | v12 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k13 | | | | v13 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k21 | | | | v21 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k22 | | | | v22 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k23 | | | | v23 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k31 | | | | v31 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k32 | | | | v32 | | +/// | +-------+ | | +-------+ | +/// | +-------+ | | +-------+ | +/// | | k33 | | | | v33 
| | +/// | +-------+ | | +-------+ | +/// +-----------+ +-----------+ +/// ```text + +fn make_map_array_internal<O: OffsetSizeTrait>( + keys: ArrayRef, + values: ArrayRef, +) -> datafusion_common::Result<ColumnarValue> { + let mut offset_buffer = vec![O::usize_as(0)]; + let mut running_offset = O::usize_as(0); + + let keys = collect_array_ref::<O>(keys); + let values = collect_array_ref::<O>(values); + + let mut key_array_vec = vec![]; + let mut value_array_vec = vec![]; + for (k, v) in keys.iter().zip(values.iter()) { + running_offset = running_offset.add(O::usize_as(k.len())); + offset_buffer.push(running_offset); + key_array_vec.push(k.as_ref()); + value_array_vec.push(v.as_ref()); + } + + // concatenate all the arrays + let flattened_keys = arrow::compute::concat(key_array_vec.as_ref()).unwrap(); + let flattened_values = arrow::compute::concat(value_array_vec.as_ref()).unwrap(); Review Comment: ```suggestion let flattened_keys = arrow::compute::concat(key_array_vec.as_ref())?; let flattened_values = arrow::compute::concat(value_array_vec.as_ref())?; ``` Same as above. ########## datafusion/sqllogictest/test_files/map.slt: ########## Review Comment: I think we can add more complex type tests or negative tests for it. I tried a case `Utf8 Array as keys` as below but it failed. ``` statement ok create table t as values ('a', 1, 'k1', 10, ['k1', 'k2'], [1, 2], [['a'], ['b']]), ('b', 2, 'k3', 30, ['k3'], [3], [['c']]), ('d', 4, 'k5', 50, ['k5'], [5], [['d']]); query ? 
SELECT map(column7, column6) FROM t; ---- {[a]: 1, [b]: 2} {[c]: 3} {[d]: 5} ``` The error message: ``` External error: query failed: DataFusion error: Arrow error: Invalid argument error: column types must match schema types, expected Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) but found Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) at column index 0 ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org