alamb commented on code in PR #12836: URL: https://github.com/apache/datafusion/pull/12836#discussion_r1806957606
########## datafusion/common/src/unnest.rs: ########## @@ -64,13 +66,24 @@ pub struct UnnestOptions { /// Should nulls in the input be preserved? Defaults to true pub preserve_nulls: bool, + /// Without explicit recursions, all + /// Datafusion will infer the unesting type with depth = 1 + pub recursions: Option<Vec<RecursionUnnestOption>>, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct RecursionUnnestOption { Review Comment: Can we please add documentation to this structure as it is public? ########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -2263,24 +2238,20 @@ mod tests { // Simultaneously unnesting a list (with different depth) and a struct column let plan = nested_table_scan("test_table")? - .unnest_columns_recursive_with_options( - vec![ - ( - "stringss".into(), - ColumnUnnestType::List(vec![ - ColumnUnnestList { - output_column: Column::from_name("stringss_depth_1"), - depth: 1, - }, - ColumnUnnestList { - output_column: Column::from_name("stringss_depth_2"), - depth: 2, - }, - ]), - ), - ("struct_singular".into(), ColumnUnnestType::Inferred), - ], - UnnestOptions::default(), + .unnest_columns_with_options( + vec!["stringss".into(), "struct_singular".into()], + UnnestOptions::default().with_recursions(vec![ + RecursionUnnestOption { + input_column: "stringss".into(), + output_column: "stringss_depth_1".into(), + depth: 1, + }, + RecursionUnnestOption { + input_column: "stringss".into(), + output_column: "stringss_depth_2".into(), + depth: 2, + }, + ]), Review Comment: Here is another potential way for this API to look would be to take individual options rather than a single Vec I don't feel strongly about this ```suggestion UnnestOptions::new() .with_recursion( RecursionUnnestOption { input_column: "stringss".into(), output_column: "stringss_depth_1".into(), depth: 1, }) .with_recusion( RecursionUnnestOption { input_column: "stringss".into(), output_column: "stringss_depth_2".into(), depth: 2, }, ), ``` ########## datafusion/sql/src/select.rs: ########## @@ -440,11 +460,31 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }; projection_exprs.extend(inner_projection_exprs); + let mut list_recursions = vec![]; + let unnest_col_vec = unnest_columns + .into_iter() + .map(|(col, maybe_list_unnest)| { + match maybe_list_unnest { + None => {} + Some(list_unnest) => { + list_recursions.extend(list_unnest.into_iter().map( + |unnest_list| RecursionUnnestOption { + input_column: col.clone(), + output_column: unnest_list.output_column, + depth: unnest_list.depth, + }, + )); + } + } Review Comment: I think you could simplify this code (reduce recursion level) with `if let` like ```suggestion if let Some(list_unnest) = maybe_list_unnest { list_recursions.extend(list_unnest.into_iter().map( |unnest_list| RecursionUnnestOption { input_column: col.clone(), output_column: unnest_list.output_column, depth: unnest_list.depth, }, )); } } ``` Also I wonder if it would be possible / easier for `rewrite_recursive_unnests_bottom_up` to simply return a `HashSet<Column, RecursionUnnestOption>` directly rather than having to convert the results ########## datafusion/sql/src/select.rs: ########## @@ -301,7 +301,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // The transformation happen bottom up, one at a time for each iteration // Only exhaust the loop if no more unnest transformation is found for i in 0.. { - let mut unnest_columns = vec![]; + let mut unnest_columns = HashMap::new(); Review Comment: It might be better to use an IndexMap here (to retain the same order as was specified in the query) -- a HashMap can mix up the order ########## datafusion/physical-plan/src/unnest.rs: ########## @@ -1092,38 +1090,37 @@ mod tests { &HashSet::default(), &UnnestOptions { preserve_nulls: true, + recursions: None, }, )?; - let actual = - format!("{}", pretty_format_batches(vec![ret].as_ref())?).to_lowercase(); - let expected = r#" -+---------------------------------+---------------------------------+---------------------------------+ -| col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 | -+---------------------------------+---------------------------------+---------------------------------+ -| [1, 2, 3] | 1 | a | -| | 2 | b | -| [4, 5] | 3 | | -| [1, 2, 3] | | a | -| | | b | -| [4, 5] | | | -| [1, 2, 3] | 4 | a | -| | 5 | b | -| [4, 5] | | | -| [7, 8, 9, 10] | 7 | c | -| | 8 | d | -| [11, 12, 13] | 9 | | -| | 10 | | -| [7, 8, 9, 10] | | c | -| | | d | -| [11, 12, 13] | | | -| [7, 8, 9, 10] | 11 | c | -| | 12 | d | -| [11, 12, 13] | 13 | | -| | | e | -+---------------------------------+---------------------------------+---------------------------------+ - "# - .trim(); - assert_eq!(actual, expected); + + let expected = &[ +"+---------------------------------+---------------------------------+---------------------------------+", +"| col1_unnest_placeholder_depth_1 | col1_unnest_placeholder_depth_2 | col2_unnest_placeholder_depth_1 |", +"+---------------------------------+---------------------------------+---------------------------------+", +"| [1, 2, 3] | 1 | a |", +"| | 2 | b |", +"| [4, 5] | 3 | |", +"| [1, 2, 3] | | a |", +"| | | b |", +"| [4, 5] | | |", +"| [1, 2, 3] | 4 | a |", +"| | 5 | b |", +"| [4, 5] | | |", +"| [7, 8, 9, 10] | 7 | c |", +"| | 8 | d |", +"| [11, 12, 13] | 9 | |", +"| | 10 | |", +"| [7, 8, 9, 10] | | c |", +"| | | d |", +"| [11, 12, 13] | | |", +"| [7, 8, 9, 10] | 11 | c |", +"| | 12 | d |", +"| [11, 12, 13] | 13 | |", +"| | | e |", +"+---------------------------------+---------------------------------+---------------------------------+", + ]; + assert_batches_eq!(expected, &[ret]); Review Comment: 👍 ########## datafusion/common/src/unnest.rs: ########## @@ -64,13 +66,24 @@ pub struct UnnestOptions { /// Should nulls in the input be preserved? Defaults to true pub preserve_nulls: bool, + /// Without explicit recursions, all Review Comment: Can you please document what the `Vec` means in this context? Is the option for each of the columns in the schema? If so can you say it is expected to have the same number of elements? If possible it would also be great to update the documentation on this struct to explain what is going on with recursive options ########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -1192,26 +1191,6 @@ impl LogicalPlanBuilder { self, columns: Vec<Column>, options: UnnestOptions, - ) -> Result<Self> { - unnest_with_options( - Arc::unwrap_or_clone(self.plan), - columns - .into_iter() - .map(|c| (c, ColumnUnnestType::Inferred)) - .collect(), - options, - ) - .map(Self::new) - } - - /// Unnest the given columns with the given [`UnnestOptions`] - /// if one column is a list type, it can be recursively and simultaneously - /// unnested into the desired recursion levels - /// e.g select unnest(list_col,depth=1), unnest(list_col,depth=2) - pub fn unnest_columns_recursive_with_options( Review Comment: 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org