This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new dbd0186080 Fix values with different data types caused failure (#10445)
dbd0186080 is described below
commit dbd0186080ec526c22920b1bc039af65b3d8be23
Author: baishen <[email protected]>
AuthorDate: Tue May 14 19:29:42 2024 +0800
Fix values with different data types caused failure (#10445)
* Fix values with different data types caused failure
* fix tests
* fix tests
* fix tests
* fix tests
* fix tests
* fix tests
* add `list_coercion`
* fix review suggestions
---
datafusion/expr/src/logical_plan/builder.rs | 67 +++++++++++++++------------
datafusion/expr/src/type_coercion/binary.rs | 43 +++++++++++++----
datafusion/sqllogictest/test_files/select.slt | 12 ++++-
3 files changed, 82 insertions(+), 40 deletions(-)
diff --git a/datafusion/expr/src/logical_plan/builder.rs
b/datafusion/expr/src/logical_plan/builder.rs
index 677d5a5da9..de832f6e38 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -36,7 +36,7 @@ use crate::logical_plan::{
Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest,
Values,
Window,
};
-use crate::type_coercion::binary::comparison_coercion;
+use crate::type_coercion::binary::{comparison_coercion, values_coercion};
use crate::utils::{
can_hash, columnize_expr, compare_sort_expr, expand_qualified_wildcard,
expand_wildcard, find_valid_equijoin_key_pair,
group_window_expr_by_sort_keys,
@@ -173,13 +173,6 @@ impl LogicalPlanBuilder {
if n_cols == 0 {
return plan_err!("Values list cannot be zero length");
}
- let empty_schema = DFSchema::empty();
- let mut field_types: Vec<Option<DataType>> =
Vec::with_capacity(n_cols);
- for _ in 0..n_cols {
- field_types.push(None);
- }
- // hold all the null holes so that we can correct their data types
later
- let mut nulls: Vec<(usize, usize)> = Vec::new();
for (i, row) in values.iter().enumerate() {
if row.len() != n_cols {
return plan_err!(
@@ -189,24 +182,40 @@ impl LogicalPlanBuilder {
n_cols
);
}
- field_types = row
- .iter()
- .enumerate()
- .map(|(j, expr)| {
- if let Expr::Literal(ScalarValue::Null) = expr {
- nulls.push((i, j));
- Ok(field_types[j].clone())
- } else {
- let data_type = expr.get_type(&empty_schema)?;
- if let Some(prev_data_type) = &field_types[j] {
- if prev_data_type != &data_type {
- return plan_err!("Inconsistent data type
across values list at row {i} column {j}. Was {prev_data_type} but found
{data_type}")
- }
- }
- Ok(Some(data_type))
- }
- })
- .collect::<Result<Vec<Option<DataType>>>>()?;
+ }
+
+ let empty_schema = DFSchema::empty();
+ let mut field_types: Vec<DataType> = Vec::with_capacity(n_cols);
+ for j in 0..n_cols {
+ let mut common_type: Option<DataType> = None;
+ for (i, row) in values.iter().enumerate() {
+ let value = &row[j];
+ let data_type = value.get_type(&empty_schema)?;
+ if data_type == DataType::Null {
+ continue;
+ }
+ if let Some(prev_type) = common_type {
+ // get common type of each column values.
+ let Some(new_type) = values_coercion(&data_type,
&prev_type) else {
+ return plan_err!("Inconsistent data type across values
list at row {i} column {j}. Was {prev_type} but found {data_type}");
+ };
+ common_type = Some(new_type);
+ } else {
+ common_type = Some(data_type.clone());
+ }
+ }
+ field_types.push(common_type.unwrap_or(DataType::Utf8));
+ }
+ // wrap cast if data type is not same as common type.
+ for row in &mut values {
+ for (j, field_type) in field_types.iter().enumerate() {
+ if let Expr::Literal(ScalarValue::Null) = row[j] {
+ row[j] =
Expr::Literal(ScalarValue::try_from(field_type.clone())?);
+ } else {
+ row[j] =
+ std::mem::take(&mut row[j]).cast_to(field_type,
&empty_schema)?;
+ }
+ }
}
let fields = field_types
.iter()
@@ -214,12 +223,9 @@ impl LogicalPlanBuilder {
.map(|(j, data_type)| {
// naming is following convention
https://www.postgresql.org/docs/current/queries-values.html
let name = &format!("column{}", j + 1);
- Field::new(name, data_type.clone().unwrap_or(DataType::Utf8),
true)
+ Field::new(name, data_type.clone(), true)
})
.collect::<Vec<_>>();
- for (i, j) in nulls {
- values[i][j] =
Expr::Literal(ScalarValue::try_from(fields[j].data_type())?);
- }
let dfschema = DFSchema::from_unqualifed_fields(fields.into(),
HashMap::new())?;
let schema = DFSchemaRef::new(dfschema);
Ok(Self::from(LogicalPlan::Values(Values { schema, values })))
@@ -2146,6 +2152,7 @@ mod tests {
Ok(())
}
+
#[test]
fn test_change_redundant_column() -> Result<()> {
let t1_field_1 = Field::new("a", DataType::Int32, false);
diff --git a/datafusion/expr/src/type_coercion/binary.rs
b/datafusion/expr/src/type_coercion/binary.rs
index 7eec606658..10d8306d76 100644
--- a/datafusion/expr/src/type_coercion/binary.rs
+++ b/datafusion/expr/src/type_coercion/binary.rs
@@ -299,12 +299,25 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<D
.or_else(|| dictionary_coercion(lhs_type, rhs_type, true))
.or_else(|| temporal_coercion(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
+ .or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| null_coercion(lhs_type, rhs_type))
.or_else(|| string_numeric_coercion(lhs_type, rhs_type))
.or_else(|| string_temporal_coercion(lhs_type, rhs_type))
.or_else(|| binary_coercion(lhs_type, rhs_type))
}
+/// Coerce `lhs_type` and `rhs_type` to a common type for value exprs
+pub fn values_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
+ if lhs_type == rhs_type {
+ // same type => equality is possible
+ return Some(lhs_type.clone());
+ }
+ comparison_binary_numeric_coercion(lhs_type, rhs_type)
+ .or_else(|| temporal_coercion(lhs_type, rhs_type))
+ .or_else(|| string_coercion(lhs_type, rhs_type))
+ .or_else(|| binary_coercion(lhs_type, rhs_type))
+}
+
/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a
comparison operation
/// where one is numeric and one is `Utf8`/`LargeUtf8`.
fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
@@ -665,15 +678,17 @@ fn dictionary_coercion(
/// 2. Data type of the other side should be able to cast to string type
fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
use arrow::datatypes::DataType::*;
- string_coercion(lhs_type, rhs_type).or(match (lhs_type, rhs_type) {
- (Utf8, from_type) | (from_type, Utf8) => {
- string_concat_internal_coercion(from_type, &Utf8)
- }
- (LargeUtf8, from_type) | (from_type, LargeUtf8) => {
- string_concat_internal_coercion(from_type, &LargeUtf8)
- }
- _ => None,
- })
+ string_coercion(lhs_type, rhs_type)
+ .or_else(|| list_coercion(lhs_type, rhs_type))
+ .or(match (lhs_type, rhs_type) {
+ (Utf8, from_type) | (from_type, Utf8) => {
+ string_concat_internal_coercion(from_type, &Utf8)
+ }
+ (LargeUtf8, from_type) | (from_type, LargeUtf8) => {
+ string_concat_internal_coercion(from_type, &LargeUtf8)
+ }
+ _ => None,
+ })
}
fn array_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
@@ -706,6 +721,14 @@ fn string_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataType>
(LargeUtf8, Utf8) => Some(LargeUtf8),
(Utf8, LargeUtf8) => Some(LargeUtf8),
(LargeUtf8, LargeUtf8) => Some(LargeUtf8),
+ _ => None,
+ }
+}
+
+/// Coercion rules for list types.
+fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
{
+ use arrow::datatypes::DataType::*;
+ match (lhs_type, rhs_type) {
// TODO: cast between array elements (#6558)
(List(_), List(_)) => Some(lhs_type.clone()),
(List(_), _) => Some(lhs_type.clone()),
@@ -752,6 +775,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataType>
/// This is a union of string coercion rules and dictionary coercion rules
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
string_coercion(lhs_type, rhs_type)
+ .or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
.or_else(|| null_coercion(lhs_type, rhs_type))
@@ -761,6 +785,7 @@ pub fn like_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataTyp
/// This is a union of string coercion rules and dictionary coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
string_coercion(lhs_type, rhs_type)
+ .or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
}
diff --git a/datafusion/sqllogictest/test_files/select.slt
b/datafusion/sqllogictest/test_files/select.slt
index 6b74156b52..aae35c1ce7 100644
--- a/datafusion/sqllogictest/test_files/select.slt
+++ b/datafusion/sqllogictest/test_files/select.slt
@@ -351,8 +351,11 @@ VALUES (1),(1,2)
statement error DataFusion error: Error during planning: Inconsistent data
type across values list at row 1 column 0
VALUES (1),('2')
-statement error DataFusion error: Error during planning: Inconsistent data
type across values list at row 1 column 0
+query R
VALUES (1),(2.0)
+----
+1
+2
statement error DataFusion error: Error during planning: Inconsistent data
type across values list at row 1 column 1
VALUES (1,2), (1,'2')
@@ -473,6 +476,13 @@ CREATE TABLE foo AS VALUES
(3, 4),
(5, 6);
+# multiple rows and columns need type coercion
+statement ok
+CREATE TABLE foo2(c1 double, c2 double) AS VALUES
+(1.1, 4.1),
+(2, 5),
+(3, 6);
+
# foo distinct
query T
select distinct '1' from foo;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]