alamb commented on code in PR #7627: URL: https://github.com/apache/arrow-rs/pull/7627#discussion_r2283155956
########## arrow-row/src/lib.rs: ########## @@ -1668,8 +1668,21 @@ unsafe fn decode_column( rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = children.iter().map(|c| c.to_data()).collect(); - let builder = ArrayDataBuilder::new(field.data_type.clone()) + let child_data: Vec<ArrayData> = children.iter().map(|c| c.to_data()).collect(); + // Since RowConverter flattens certian data types (i.e. Dictionary), + // we need to use updated data type instead of original field + let corrected_fields: Vec<Field> = match &field.data_type { + DataType::Struct(struct_fields) => struct_fields + .iter() + .zip(child_data.iter()) + .map(|(orig_field, child_array)| { + rebuild_field_with_child_type(orig_field, child_array) Review Comment: I think this is the same as using `Field::with_data_type` which I think is more explicit ```suggestion orig_field.clone().with_data_type(child_array.data_type().clone()) ``` ########## arrow-row/src/lib.rs: ########## @@ -2148,6 +2172,166 @@ mod tests { back[0].to_data().validate_full().unwrap(); } + #[test] + fn test_dictionary_in_struct() { + let ty = DataType::Struct( + vec![Field::new_dictionary( + "foo", + DataType::Int32, + DataType::Int32, + false, + )] + .into(), + ); + // Test Case 1. empty array + let s = arrow_array::new_empty_array(&ty); + + let sort_fields = vec![SortField::new(s.data_type().clone())]; + let converter = RowConverter::new(sort_fields).unwrap(); + let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap(); + + let back = converter.convert_rows(&r).unwrap(); + let [s2] = back.try_into().unwrap(); + + // RowConverter flattens Dictionary + // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32) + assert_ne!(&s.data_type(), &s2.data_type()); + s2.to_data().validate_full().unwrap(); + assert_eq!(s.len(), s2.len()); + + // Test Case 2. None empty array + let builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new(); + let mut struct_builder = StructBuilder::new( + vec![Field::new_dictionary( + "foo", + DataType::Int32, + DataType::Int32, + false, + )], + vec![Box::new(builder)], + ); + let dict_builder = struct_builder + .field_builder::<PrimitiveDictionaryBuilder<Int32Type, Int32Type>>(0) Review Comment: Can we please change this so it uses different types for the keys and values -- otherwise it is hard to keep track of the key and values in the code below ########## arrow-row/src/lib.rs: ########## @@ -1668,8 +1668,21 @@ unsafe fn decode_column( rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = children.iter().map(|c| c.to_data()).collect(); - let builder = ArrayDataBuilder::new(field.data_type.clone()) + let child_data: Vec<ArrayData> = children.iter().map(|c| c.to_data()).collect(); + // Since RowConverter flattens certian data types (i.e. Dictionary), Review Comment: ```suggestion // Since RowConverter flattens certain data types (i.e. Dictionary), ``` ########## arrow-row/src/list.rs: ########## @@ -179,7 +182,20 @@ pub unsafe fn decode<O: OffsetSizeTrait>( let child_data = child[0].to_data(); - let builder = ArrayDataBuilder::new(field.data_type.clone()) + // Since RowConverter flattens certian data types (i.e. Dictionary), Review Comment: ```suggestion // Since RowConverter flattens certain data types (i.e. Dictionary), ``` ########## arrow-row/src/lib.rs: ########## @@ -2148,6 +2172,166 @@ mod tests { back[0].to_data().validate_full().unwrap(); } + #[test] + fn test_dictionary_in_struct() { + let ty = DataType::Struct( + vec![Field::new_dictionary( + "foo", + DataType::Int32, + DataType::Int32, + false, + )] + .into(), + ); + // Test Case 1. empty array + let s = arrow_array::new_empty_array(&ty); + + let sort_fields = vec![SortField::new(s.data_type().clone())]; + let converter = RowConverter::new(sort_fields).unwrap(); + let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap(); + + let back = converter.convert_rows(&r).unwrap(); + let [s2] = back.try_into().unwrap(); + + // RowConverter flattens Dictionary + // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32) + assert_ne!(&s.data_type(), &s2.data_type()); + s2.to_data().validate_full().unwrap(); + assert_eq!(s.len(), s2.len()); + + // Test Case 2. None empty array Review Comment: Can you please break test case 2 into its own `#test` function so it is easier to see? ########## arrow-row/src/lib.rs: ########## @@ -1863,6 +1887,64 @@ mod tests { back[0].to_data().validate_full().unwrap(); } + #[test] + fn test_dictionary_in_struct() { + let ty = DataType::Struct( + vec![Field::new_dictionary( + "foo", + DataType::Int32, + DataType::Int32, + false, + )] + .into(), + ); + let s = arrow_array::new_empty_array(&ty); + + let sort_fields = vec![SortField::new(s.data_type().clone())]; + let converter = RowConverter::new(sort_fields).unwrap(); + let r = converter.convert_columns(&[Arc::clone(&s)]).unwrap(); + + let back = converter.convert_rows(&r).unwrap(); + let [s2] = back.try_into().unwrap(); + + // RowConverter flattens Dictionary + // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32) + assert_ne!(&s.data_type(), &s2.data_type()); + + s2.to_data().validate_full().unwrap(); Review Comment: I don't see an assert that the values came through correctly -- it is only checking the length) ########## arrow-row/src/lib.rs: ########## @@ -1668,8 +1668,21 @@ unsafe fn decode_column( rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = children.iter().map(|c| c.to_data()).collect(); - let builder = ArrayDataBuilder::new(field.data_type.clone()) + let child_data: Vec<ArrayData> = children.iter().map(|c| c.to_data()).collect(); + // Since RowConverter flattens certian data types (i.e. Dictionary), + // we need to use updated data type instead of original field + let corrected_fields: Vec<Field> = match &field.data_type { + DataType::Struct(struct_fields) => struct_fields + .iter() + .zip(child_data.iter()) + .map(|(orig_field, child_array)| { + rebuild_field_with_child_type(orig_field, child_array) Review Comment: The same comment applies below as well -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org