This is an automated email from the ASF dual-hosted git repository.
jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8b4a8e6b15 support unnest as subexpression (#9592)
8b4a8e6b15 is described below
commit 8b4a8e6b157c007e7988f715cb4b693578438f8b
Author: YjyJeff <[email protected]>
AuthorDate: Thu Mar 14 14:22:42 2024 +0800
support unnest as subexpression (#9592)
* support unnest as subexpression and multiple unnests in projection
* update alias unnest
* remove unnest multiple columns
* rename mulitple unnest columns to multiple unnest functions in selection
* fix: move column name to if branch
---------
Co-authored-by: jefffffyang <[email protected]>
---
datafusion/expr/src/expr_schema.rs | 20 +++++++-
datafusion/sql/src/select.rs | 52 ++++++++++++++++-----
datafusion/sqllogictest/test_files/unnest.slt | 67 ++++++++++++++++++++++++++-
3 files changed, 123 insertions(+), 16 deletions(-)
diff --git a/datafusion/expr/src/expr_schema.rs
b/datafusion/expr/src/expr_schema.rs
index 70ffa5064a..1d83fbe8c0 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -28,7 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery};
use arrow::compute::can_cast_types;
use arrow::datatypes::{DataType, Field};
use datafusion_common::{
- internal_err, plan_datafusion_err, plan_err, Column, DFField, ExprSchema,
Result,
+ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFField,
+ ExprSchema, Result,
};
use std::collections::HashMap;
use std::sync::Arc;
@@ -113,7 +114,22 @@ impl ExprSchemable for Expr {
.iter()
.map(|e| e.get_type(schema))
.collect::<Result<Vec<_>>>()?;
- Ok(arg_data_types[0].clone())
+ let arg_data_type = arg_data_types[0].clone();
+ // Unnest's output type is the inner type of the list
+ match arg_data_type{
+ DataType::List(field) | DataType::LargeList(field) |
DataType::FixedSizeList(field, _) =>{
+ Ok(field.data_type().clone())
+ }
+ DataType::Struct(_) => {
+ not_impl_err!("unnest() does not support struct yet")
+ }
+ DataType::Null => {
+ not_impl_err!("unnest() does not support null yet")
+ }
+ _ => {
+ plan_err!("unnest() can only be applied to array,
struct and null")
+ }
+ }
}
Expr::ScalarFunction(ScalarFunction { func_def, args }) => {
let arg_data_types = args
diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
index c5c80bd6ac..1bfd60a8ce 100644
--- a/datafusion/sql/src/select.rs
+++ b/datafusion/sql/src/select.rs
@@ -24,6 +24,7 @@ use crate::utils::{
resolve_columns, resolve_positions_to_exprs,
};
+use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result};
use datafusion_common::{Column, UnnestOptions};
use datafusion_expr::expr::{Alias, Unnest};
@@ -276,25 +277,51 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
Ok(plan)
}
- // Try converting Expr::Unnest to LogicalPlan::Unnest if possible,
otherwise do the final projection
+ /// Try converting Expr(Unnest(Expr)) to Projection/Unnest/Projection
pub(super) fn try_process_unnest(
&self,
input: LogicalPlan,
select_exprs: Vec<Expr>,
) -> Result<LogicalPlan> {
let mut unnest_columns = vec![];
- // Map unnest expressions to their argument
- let projection_exprs = select_exprs
+ let mut inner_projection_exprs = vec![];
+
+ let outer_projection_exprs = select_exprs
.into_iter()
.map(|expr| {
- if let Expr::Unnest(Unnest { ref exprs }) = expr {
- let column_name = expr.display_name()?;
- unnest_columns.push(column_name.clone());
- // Add alias for the argument expression, to avoid naming
conflicts with other expressions
- // in the select list. For example: `select unnest(col1),
col1 from t`.
- Ok(exprs[0].clone().alias(column_name))
+ let Transformed {
+ data: transformed_expr,
+ transformed,
+ tnr: _,
+ } = expr.transform_up_mut(&mut |expr: Expr| {
+ if let Expr::Unnest(Unnest { ref exprs }) = expr {
+ let column_name = expr.display_name()?;
+ unnest_columns.push(column_name.clone());
+ // Add alias for the argument expression, to avoid
naming conflicts with other expressions
+ // in the select list. For example: `select
unnest(col1), col1 from t`.
+ inner_projection_exprs
+ .push(exprs[0].clone().alias(column_name.clone()));
+ Ok(Transformed::yes(Expr::Column(Column::from_name(
+ column_name,
+ ))))
+ } else {
+ Ok(Transformed::no(expr))
+ }
+ })?;
+
+ if !transformed {
+ if matches!(&transformed_expr, Expr::Column(_)) {
+ inner_projection_exprs.push(transformed_expr.clone());
+ Ok(transformed_expr)
+ } else {
+ // We need to evaluate the expr in the inner
projection,
+ // outer projection just select its name
+ let column_name = transformed_expr.display_name()?;
+ inner_projection_exprs.push(transformed_expr);
+ Ok(Expr::Column(Column::from_name(column_name)))
+ }
} else {
- Ok(expr)
+ Ok(transformed_expr)
}
})
.collect::<Result<Vec<_>>>()?;
@@ -302,7 +329,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// Do the final projection
if unnest_columns.is_empty() {
LogicalPlanBuilder::from(input)
- .project(projection_exprs)?
+ .project(inner_projection_exprs)?
.build()
} else {
if unnest_columns.len() > 1 {
@@ -312,8 +339,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// Set preserve_nulls to false to ensure compatibility with DuckDB
and PostgreSQL
let unnest_options =
UnnestOptions::new().with_preserve_nulls(false);
LogicalPlanBuilder::from(input)
- .project(projection_exprs)?
+ .project(inner_projection_exprs)?
.unnest_column_with_options(unnest_column, unnest_options)?
+ .project(outer_projection_exprs)?
.build()
}
}
diff --git a/datafusion/sqllogictest/test_files/unnest.slt
b/datafusion/sqllogictest/test_files/unnest.slt
index f60f715242..5c178bb392 100644
--- a/datafusion/sqllogictest/test_files/unnest.slt
+++ b/datafusion/sqllogictest/test_files/unnest.slt
@@ -135,11 +135,10 @@ select array_remove(column1, 4), unnest(column2), column3
* 10 from unnest_table
query error DataFusion error: Error during planning: unnest\(\) can only be
applied to array, struct and null
select unnest(column3) from unnest_table;
-## Unnest multiple columns
+## Multiple unnest functions in selection
query error DataFusion error: This feature is not implemented: Only support
single unnest expression for now
select unnest(column1), unnest(column2) from unnest_table;
-
## Unnest scalar in select list
query error DataFusion error: Error during planning: unnest\(\) can only be
applied to array, struct and null
select unnest(1);
@@ -254,5 +253,69 @@ select * from unnest([1,2,(select sum(column3) from
unnest_table)]);
2
10
+## Unnest is the sub-expression of other expression
+query II
+select unnest(column1) as a, column3 from unnest_table;
+----
+1 1
+2 1
+3 1
+4 2
+5 2
+6 3
+12 NULL
+
+query BI
+select unnest(column1) is not null, column3 from unnest_table;
+----
+true 1
+true 1
+true 1
+true 2
+true 2
+true 3
+true NULL
+
+query II
+select -unnest(column1) as a, column3 from unnest_table;
+----
+-1 1
+-2 1
+-3 1
+-4 2
+-5 2
+-6 3
+-12 NULL
+
+query II
+select unnest(array_remove(column1, 3)) as a, column3 from unnest_table;
+----
+1 1
+2 1
+4 2
+5 2
+6 3
+12 NULL
+
+query II
+select unnest(array_remove(column1, 3)) as c1, column3 from unnest_table order
by c1 desc, column3;
+----
+12 NULL
+6 3
+5 2
+4 2
+2 1
+1 1
+
+query II
+select unnest(array_remove(column1, 3)) - 1 as c1, column3 from unnest_table;
+----
+0 1
+1 1
+3 2
+4 2
+5 3
+11 NULL
+
statement ok
drop table unnest_table;