alamb commented on code in PR #11577: URL: https://github.com/apache/datafusion/pull/11577#discussion_r1771937668
########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -1184,6 +1186,26 @@ impl LogicalPlanBuilder { self, columns: Vec<Column>, options: UnnestOptions, + ) -> Result<Self> { + unnest_with_options( + Arc::unwrap_or_clone(self.plan), + columns + .into_iter() + .map(|c| (c, ColumnUnnestType::Inferred)) + .collect(), + options, + ) + .map(Self::new) + } + + /// Unnest the given columns with the given [`UnnestOptions`] + /// if one column is a list type, it can be recursively and simultaneously + /// unnested into the desired recursion levels + /// e.g select unnest(list_col,depth=1), unnest(list_col,depth=2) + pub fn unnest_columns_recursive_with_options( Review Comment: An alternate API would be to add the `ColumnUnnestType` as a field on `UnnestOptions` (for that matter, maybe we should just add `columns: Vec<(Column, ColumnUnnestType)>,` as a field 🤔 ########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -1561,7 +1583,61 @@ impl TableSource for LogicalTableSource { /// Create a [`LogicalPlan::Unnest`] plan pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> { - unnest_with_options(input, columns, UnnestOptions::default()) + let unnestings = columns + .into_iter() + .map(|c| (c, ColumnUnnestType::Inferred)) + .collect(); + unnest_with_options(input, unnestings, UnnestOptions::default()) +} + +pub fn get_unnested_list_datatype_recursive( Review Comment: If this needs to be `pub` i think it should have some doc strings ########## datafusion/sql/src/utils.rs: ########## @@ -553,17 +901,25 @@ mod tests { // Only the inner most/ bottom most unnest is transformed assert_eq!( transformed_exprs, - vec![unnest(col("UNNEST(struct_col[matrix])"))] + vec![col("unnest_placeholder(struct_col[matrix],depth=2)") + .alias("UNNEST(UNNEST(struct_col[matrix]))")] ); - assert_eq!( - unnest_placeholder_columns, - vec!["UNNEST(struct_col[matrix])"] + // TODO: add a test case where Review Comment: Is this still something you mean to do? ########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -1561,7 +1583,61 @@ impl TableSource for LogicalTableSource { /// Create a [`LogicalPlan::Unnest`] plan pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> { - unnest_with_options(input, columns, UnnestOptions::default()) + let unnestings = columns + .into_iter() + .map(|c| (c, ColumnUnnestType::Inferred)) + .collect(); + unnest_with_options(input, unnestings, UnnestOptions::default()) +} + +pub fn get_unnested_list_datatype_recursive( + data_type: &DataType, + depth: usize, +) -> Result<DataType> { + match data_type { + DataType::List(field) + | DataType::FixedSizeList(field, _) + | DataType::LargeList(field) => { + if depth == 1 { + return Ok(field.data_type().clone()); + } + return get_unnested_list_datatype_recursive(field.data_type(), depth - 1); + } + _ => {} + }; + + internal_err!("trying to unnest on invalid data type {:?}", data_type) +} + +/// Infer the unnest type based on the data type: +/// - list type: infer to unnest(list(col, depth=1)) +/// - struct type: infer to unnest(struct) +fn infer_unnest_type( + col_name: &String, + data_type: &DataType, +) -> Result<ColumnUnnestType> { + match data_type { + DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => { + Ok(ColumnUnnestType::List(vec![ColumnUnnestList { + output_column: Column::from_name(col_name), + depth: 1, + }])) + } + DataType::Struct(_) => Ok(ColumnUnnestType::Struct), + _ => { + internal_err!("trying to unnest on invalid data type {:?}", data_type) + } + } +} + +pub fn get_struct_unnested_columns( Review Comment: likewise here ########## datafusion/physical-plan/src/unnest.rs: ########## @@ -765,6 +994,139 @@ mod tests { Ok(()) } + #[test] + fn test_build_batch_list_arr_recursive() -> datafusion_common::Result<()> { + // col1 | col2 + // [[1,2,3],null,[4,5]] | ['a','b'] + // [[7,8,9,10], null, [11,12,13]] | ['c','d'] + // null | ['e'] + let list_arr1 = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + Some(vec![Some(7), Some(8), Some(9), Some(10)]), + None, + Some(vec![Some(11), Some(12), Some(13)]), + ]); + + let list_arr1_ref = Arc::new(list_arr1) as ArrayRef; + let offsets = OffsetBuffer::from_lengths([3, 3, 0]); + let mut nulls = BooleanBufferBuilder::new(3); + nulls.append(true); + nulls.append(true); + nulls.append(false); + // list<list<int32>> + let col1_field = Field::new_list_field( + DataType::List(Arc::new(Field::new_list_field( + list_arr1_ref.data_type().to_owned(), + true, + ))), + true, + ); + let col1 = ListArray::new( + Arc::new(Field::new_list_field( + list_arr1_ref.data_type().to_owned(), + true, + )), + offsets, + list_arr1_ref, + Some(NullBuffer::new(nulls.finish())), + ); + + let list_arr2 = StringArray::from(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ]); + + let offsets = OffsetBuffer::from_lengths([2, 2, 1]); + let mut nulls = BooleanBufferBuilder::new(3); + nulls.append_n(3, true); + let col2_field = Field::new( + "col2", + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ); + let col2 = GenericListArray::<i32>::new( + Arc::new(Field::new_list_field(DataType::Utf8, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(list_arr2), + Some(NullBuffer::new(nulls.finish())), + ); + // convert col1 and col2 to a record batch + let schema = Arc::new(Schema::new(vec![col1_field, col2_field])); + let out_schema = Arc::new(Schema::new(vec![ + Field::new( + "col1_unnest_placeholder_depth_1", + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new("col1_unnest_placeholder_depth_2", DataType::Int32, true), + Field::new("col2_unnest_placeholder_depth_1", DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(col1) as ArrayRef, Arc::new(col2) as ArrayRef], + ) + .unwrap(); + let list_type_columns = vec![ + ListUnnest { + index_in_input_schema: 0, + depth: 1, + }, + ListUnnest { + index_in_input_schema: 0, + depth: 2, + }, + ListUnnest { + index_in_input_schema: 1, + depth: 1, + }, + ]; + let ret = build_batch( + &batch, + &out_schema, + list_type_columns.as_ref(), + &HashSet::default(), + &UnnestOptions { + preserve_nulls: true, + }, + )?; + let actual = Review Comment: for the future, you an use `assert_batches_eq` for this kind of comparison ########## datafusion/sqllogictest/test_files/unnest.slt: ########## @@ -165,6 +165,34 @@ select unnest(column1), column1 from unnest_table; 6 [6] 12 [12] Review Comment: Are there any `.slt` tests for unnesting structures? these tests seem to be on arrays (Lists) rather than Structs -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org