alamb commented on code in PR #6796:
URL: https://github.com/apache/arrow-datafusion/pull/6796#discussion_r1286376608


##########
datafusion/expr/src/expr.rs:
##########
@@ -970,6 +987,43 @@ impl Expr {
     pub fn contains_outer(&self) -> bool {
         !find_out_reference_exprs(self).is_empty()
     }
+
+    /// Flatten the nested array expressions until the base array is reached.
+    /// For example:
+    ///  unnest([[1, 2, 3], [4, 5, 6]]) => unnest([1, 2, 3, 4, 5, 6])
+    pub fn flatten(&self) -> Self {
+        self.try_flatten().unwrap_or_else(|_| self.clone())
+    }
+
+    pub fn try_flatten(&self) -> Result<Self> {
+        match self {
+            Self::ScalarFunction(ScalarFunction {
+                fun: built_in_function::BuiltinScalarFunction::MakeArray,
+                args,
+            }) => {
+                let flatten_args: Vec<Expr> =
+                    args.iter().flat_map(Self::flatten_internal).collect();
+                Ok(Self::ScalarFunction(ScalarFunction {
+                    fun: built_in_function::BuiltinScalarFunction::MakeArray,
+                    args: flatten_args,
+                }))
+            }
+            _ => Err(DataFusionError::Internal(format!(

Review Comment:
   ```suggestion
               _ => Err(DataFusionError::NotYetImplemented(format!(
   ```



##########
datafusion/expr/src/expr.rs:
##########
@@ -970,6 +987,43 @@ impl Expr {
     pub fn contains_outer(&self) -> bool {
         !find_out_reference_exprs(self).is_empty()
     }
+
+    /// Flatten the nested array expressions until the base array is reached.
+    /// For example:
+    ///  unnest([[1, 2, 3], [4, 5, 6]]) => unnest([1, 2, 3, 4, 5, 6])

Review Comment:
   this example is in terms of `unnest` but the function is for flatten.



##########
datafusion/expr/src/expr.rs:
##########
@@ -970,6 +987,43 @@ impl Expr {
     pub fn contains_outer(&self) -> bool {
         !find_out_reference_exprs(self).is_empty()
     }
+
+    /// Flatten the nested array expressions until the base array is reached.
+    /// For example:
+    ///  unnest([[1, 2, 3], [4, 5, 6]]) => unnest([1, 2, 3, 4, 5, 6])
+    pub fn flatten(&self) -> Self {
+        self.try_flatten().unwrap_or_else(|_| self.clone())
+    }
+
+    pub fn try_flatten(&self) -> Result<Self> {

Review Comment:
   I don't understand what this is doing -- why is it checking for make_array 
during `Expr` creation? I would expect the Expr to always be created and then 
perhaps an error thrown during analyze or expression simplifcation



##########
datafusion/expr/src/logical_plan/builder.rs:
##########
@@ -1038,6 +1042,114 @@ impl LogicalPlanBuilder {
     pub fn unnest_column(self, column: impl Into<Column>) -> Result<Self> {
         Ok(Self::from(unnest(self.plan, column.into())?))
     }
+
+    pub fn join_unnest_plans(

Review Comment:
   Is it possible to use 
https://docs.rs/datafusion/latest/datafusion/physical_plan/unnest/struct.UnnestExec.html
 directly rather than building up a special LogicalPlan? It seems like 
`UnnestExec` already does this



##########
datafusion/expr/src/expr_schema.rs:
##########
@@ -86,6 +87,32 @@ impl ExprSchemable for Expr {
                     .collect::<Result<Vec<_>>>()?;
                 Ok((fun.return_type)(&data_types)?.as_ref().clone())
             }
+            Expr::Unnest(Unnest { array_exprs }) => {
+                let data_types = array_exprs
+                    .iter()
+                    .map(|e| e.get_type(schema))
+                    .collect::<Result<Vec<_>>>()?;
+
+                if data_types.is_empty() {
+                    return Err(DataFusionError::NotImplemented(
+                        "Unnest does not support empty data types".to_string(),
+                    ));
+                }
+
+                // Use a HashSet to efficiently check for unique data types
+                let unique_data_types: HashSet<_> = 
data_types.iter().collect();
+
+                // If there is more than one unique data type, return an error
+                if unique_data_types.len() > 1 {
+                    return Err(DataFusionError::NotImplemented(
+                        "Unnest does not support inconsistent data 
types".to_string(),

Review Comment:
   It would be nice to include the types in this error message as well (so it 
is clear if for example the mismatch is `Int32` and `Int64`



##########
datafusion/expr/src/logical_plan/builder.rs:
##########
@@ -1426,6 +1542,141 @@ pub fn unnest(input: LogicalPlan, column: Column) -> 
Result<LogicalPlan> {
     }))
 }
 
+// Create function name for unnest
+// Different from create_function_physical_name, so we create a customize one.
+// We need fun(arg1,arg2,arg3) format, but not fun(arg1, arg2, arg3).
+// TODO: We dont need this if we can customize format in impl::Display for Expr
+fn create_unnest_arguments_name(array_exprs: Vec<Expr>) -> Result<Vec<String>> 
{
+    array_exprs
+        .iter()
+        .map(|e| match e {
+            Expr::ScalarFunction(ScalarFunction { fun, args, .. }) => {
+                let arg_str = args
+                    .iter()
+                    .map(|e| e.to_string())
+                    .collect::<Vec<String>>()
+                    .join(",");
+
+                Ok(format!("{fun}({arg_str})"))
+            }
+            Expr::Literal(sv) => {
+                let name = format!("{sv:?}");
+                Ok(name)
+            }
+            e => Err(DataFusionError::NotImplemented(format!(
+                "Expr {e} is not supported in UNNEST"
+            ))),
+        })
+        .collect::<Result<Vec<String>>>()
+}
+
+/// Create unnest plan from arrays.
+fn build_unnest_plans(

Review Comment:
   I am surprised this can't use 
https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html#variant.Unnest
 



##########
datafusion/expr/src/expr.rs:
##########
@@ -970,6 +987,43 @@ impl Expr {
     pub fn contains_outer(&self) -> bool {
         !find_out_reference_exprs(self).is_empty()
     }
+
+    /// Flatten the nested array expressions until the base array is reached.
+    /// For example:
+    ///  unnest([[1, 2, 3], [4, 5, 6]]) => unnest([1, 2, 3, 4, 5, 6])
+    pub fn flatten(&self) -> Self {
+        self.try_flatten().unwrap_or_else(|_| self.clone())
+    }
+
+    pub fn try_flatten(&self) -> Result<Self> {

Review Comment:
   Can we please add documentation about what `try_flatten` does and how it is 
different than `flatten`?



##########
datafusion/core/tests/sqllogictests/test_files/array.slt:
##########
@@ -1794,9 +1802,150 @@ select make_array(f0) from fixed_size_list_array
 ----
 [[1, 2], [3, 4]]
 
+## Unnest
 
-### Delete tables
+# Set target partitions to 1 for deterministic results
+statement ok
+set datafusion.execution.target_partitions = 1;
+
+query ??
+select unnest(make_array(1,2,3)),
+       unnest(make_array(4,5))
+;
+----
+1 4
+2 5
+3 NULL
+
+query ???
+select unnest(make_array(1,2,3)),
+       unnest(make_array(4,5)),
+       unnest(make_array(6,7,8,9))
+;
+----
+1 4 6
+2 5 7
+3 NULL 8
+NULL NULL 9
+
+query ???
+select unnest(make_array(1,2,3,4,5)),
+       unnest(make_array(6,7)),
+       unnest(make_array(8,9,10,11,22,33))
+;
+----
+1 6 8
+2 7 9
+3 NULL 10
+4 NULL 11
+5 NULL 22
+NULL NULL 33
+
+# Select From
+
+query IIIII
+select * from unnest(
+  make_array(1), 
+  make_array(2,3), 
+  make_array(4,5,6),
+  make_array(7,8),
+  make_array(9)
+);
+----
+1 2 4 7 9
+NULL 3 5 8 NULL
+NULL NULL 6 NULL NULL
+
+query I
+select * from unnest(make_array(1,2,3)) as data
+----
+1
+2
+3
+
+query II
+select * from unnest(make_array(1,2,3),make_array(7,6,5,4)) as data(a,b) order 
by b
+----
+NULL 4
+3 5
+2 6
+1 7
+
+query ?T?I
+select * from arrays_unnest;
+----
+[1, 2] A [1, 2] 3
+NULL B [4] 5
+[3] C [6, 7, 8] 9
+
+# TODO: Unnest columns fails
+query error DataFusion error: SQL error: TokenizerError\("Unterminated string 
literal at Line: 2, Column 95"\)
+caused by
+Internal error: UNNEST only supports list type\. This was likely caused by a 
bug in DataFusion's code and we would welcome that you file an bug report in 
our issue tracker

Review Comment:
   this appears still to be NotImplemented



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to