alamb commented on code in PR #10268:
URL: https://github.com/apache/datafusion/pull/10268#discussion_r1586698773
##########
datafusion/expr/src/type_coercion/functions.rs:
##########
@@ -53,20 +54,31 @@ pub fn data_types(
}
}
- let valid_types = get_valid_types(&signature.type_signature,
current_types)?;
-
+ let mut valid_types = get_valid_types(&signature.type_signature,
current_types)?;
if valid_types
.iter()
.any(|data_type| data_type == current_types)
{
return Ok(current_types.to_vec());
}
- // Try and coerce the argument types to match the signature, returning the
- // coerced types from the first matching signature.
- for valid_types in valid_types {
- if let Some(types) = maybe_data_types(&valid_types, current_types) {
- return Ok(types);
+ // Well-supported signature that returns exact valid types.
+ if !valid_types.is_empty()
+ && matches!(signature.type_signature,
TypeSignature::VariadicEqualOrNull)
+ {
+ // exact valid types
+ assert_eq!(valid_types.len(), 1);
+ let valid_types = valid_types.swap_remove(0);
Review Comment:
is it a problem that this may change the type ordering? So that the last
type is now the first in the new Vec?
##########
datafusion/functions/src/core/coalesce.rs:
##########
@@ -60,9 +59,8 @@ impl ScalarUDFImpl for CoalesceFunc {
}
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
- // COALESCE has multiple args and they might get coerced, get a
preview of this
Review Comment:
👍 ❤️
##########
datafusion/sqllogictest/test_files/functions.slt:
##########
@@ -1158,3 +1158,347 @@ drop table uuid_table
statement ok
drop table t
+
+
+# Test coalesce function
+query I
+select coalesce(1, 2, 3);
+----
+1
+
+# test with first null
+query ?T
+select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1));
+----
+3 Int64
+
+# test with null values
+query ?
+select coalesce(null, null);
+----
+NULL
+
+# cast to float
+query IT
+select
+ coalesce(1, 2.0),
+ arrow_typeof(coalesce(1, 2.0))
+;
+----
+1 Float64
+
+query RT
+select
+ coalesce(2.0, 1),
+ arrow_typeof(coalesce(2.0, 1))
+;
+----
+2 Float64
+
+query IT
+select
+ coalesce(1, arrow_cast(2.0, 'Float32')),
+ arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32')))
+;
+----
+1 Float32
+
+# test with empty args
+query error
+select coalesce();
+
+# test with different types
+query I
+select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64'));
+----
+1
+
+# test with nulls
+query ?T
+select coalesce(null, null, null), arrow_typeof(coalesce(null, null));
+----
+NULL Null
+
+# i32 and u32, cast to wider type i64
+query IT
+select
+ coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')));
+----
+2 Int64
+
+query IT
+select
+ coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')));
+----
+2 Int32
+
+query IT
+select
+ coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')));
+----
+2 Int16
+
+# i64 and u32, cast to wider type i64
+query IT
+select
+ coalesce(2, arrow_cast(3, 'UInt32')),
+ arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32')));
+----
+2 Int64
+
+# TODO: Got types (i64, u64), casting to decimal or double or even i128 if
supported
+query IT
+select
+ coalesce(2, arrow_cast(3, 'UInt64')),
+ arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64')));
+----
+2 Int64
+
+statement ok
+create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2,
null);
+
+# Follow Postgres and DuckDB behavior, since a is bigint, although the value
is null, all args are coerced to bigint
+query IT
+select
+ coalesce(a, b, c),
+ arrow_typeof(coalesce(a, b, c))
+from t1;
+----
+1 Int64
+2 Int64
+
+# b, c has the same type int, so the result is int
+query IT
+select
+ coalesce(b, c),
+ arrow_typeof(coalesce(b, c))
+from t1;
+----
+1 Int32
+2 Int32
+
+statement ok
+drop table t1;
+
+# test multi rows
+statement ok
+CREATE TABLE t1(
+ c1 int,
+ c2 int
+) as VALUES
+(1, 2),
+(NULL, 2),
+(1, NULL),
+(NULL, NULL);
+
+query I
+SELECT COALESCE(c1, c2) FROM t1
+----
+1
+2
+1
+NULL
+
+statement ok
+drop table t1;
+
+# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2)
+query RT
+select
+ coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0),
+ arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0))
+----
+2 Decimal128(22, 2)
+
+query RT
+select
+ coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0),
+ arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0));
+----
+2 Decimal256(22, 2)
+
+# coalesce string
+query T?
+select
+ coalesce('', 'test'),
+ coalesce(null, 'test');
+----
+(empty) test
+
+# coalesce utf8 and large utf8
+query TT
+select
+ coalesce('a', arrow_cast('b', 'LargeUtf8')),
+ arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8')))
+;
+----
+a LargeUtf8
+
+# coalesce array
+query ?T
+select
+ coalesce(array[1, 2], array[3, 4]),
+ arrow_typeof(coalesce(array[1, 2], array[3, 4]));
+----
+[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })
+
+query ?T
+select
+ coalesce(null, array[3, 4]),
+ arrow_typeof(coalesce(array[1, 2], array[3, 4]));
+----
+[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })
+
+# TODO: after switch signature of array to the same with coalesce, this query
should be fixed
+query error
+select
+ coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]),
+ arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'),
arrow_cast(4, 'Int32')]));
+
+statement ok
+create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
(null);
+
+# Dictionary and String are not coercible
Review Comment:
This query actually works on main so making it error seems like a regression
to me
```sql
DataFusion CLI v37.1.0
> create table test1 as values (arrow_cast('foo', 'Dictionary(Int32,
Utf8)')), (null);
0 row(s) fetched.
Elapsed 0.011 seconds.
> select coalesce(column1, 'none_set') from test1;
+------------------------------------------+
| coalesce(test1.column1,Utf8("none_set")) |
+------------------------------------------+
| foo |
| none_set |
+------------------------------------------+
2 row(s) fetched.
Elapsed 0.003 seconds.
> select coalesce(null, column1, 'none_set') from test1;
+-----------------------------------------------+
| coalesce(NULL,test1.column1,Utf8("none_set")) |
+-----------------------------------------------+
| foo |
| none_set |
+-----------------------------------------------+
2 row(s) fetched.
Elapsed 0.002 seconds.
```
##########
datafusion/expr/src/signature.rs:
##########
@@ -92,14 +92,22 @@ pub enum TypeSignature {
/// A function such as `concat` is `Variadic(vec![DataType::Utf8,
DataType::LargeUtf8])`
Variadic(Vec<DataType>),
/// One or more arguments of an arbitrary but equal type.
- /// DataFusion attempts to coerce all argument types to match the first
argument's type
+ /// DataFusion attempts to coerce all argument types to match to the
common type with comparision coercion.
Review Comment:
Can we please link to
https://docs.rs/datafusion/latest/datafusion/logical_expr/type_coercion/binary/fn.comparison_coercion.html
that explains (however limited) what comparison coercion is?
##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -289,15 +290,169 @@ fn bitwise_coercion(left_type: &DataType, right_type:
&DataType) -> Option<DataT
}
}
+#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+enum TypeCategory {
+ Array,
+ Boolean,
+ Numeric,
+ // String, well-defined type, but are considered as unknown type.
+ DateTime,
+ Composite,
+ Unknown,
+ NotSupported,
+}
+
+fn data_type_category(data_type: &DataType) -> TypeCategory {
Review Comment:
This could also be something like
```rust
impl From<&DataType> for TypeCategory {
...
```
And then you create a `TypeCategory` like
```rust
let category = TypeCategory::from(&type)
```
##########
datafusion/expr/src/signature.rs:
##########
@@ -92,14 +92,22 @@ pub enum TypeSignature {
/// A function such as `concat` is `Variadic(vec![DataType::Utf8,
DataType::LargeUtf8])`
Variadic(Vec<DataType>),
/// One or more arguments of an arbitrary but equal type.
- /// DataFusion attempts to coerce all argument types to match the first
argument's type
+ /// DataFusion attempts to coerce all argument types to match to the
common type with comparision coercion.
///
/// # Examples
/// Given types in signature should be coercible to the same final type.
/// A function such as `make_array` is `VariadicEqual`.
///
/// `make_array(i32, i64) -> make_array(i64, i64)`
VariadicEqual,
+ /// One or more arguments of an arbitrary but equal type or Null.
+ /// Non-comparison coercion is attempted to match the signatures.
+ ///
+ /// Functions like `coalesce` is `VariadicEqual`.
+ // TODO: Temporary Signature, to differentiate existing VariadicEqual.
+ // After we swtich `make_array` to VariadicEqualOrNull,
+ // we can reuse VariadicEqual.
+ VariadicEqualOrNull,
Review Comment:
Maybe we could do something like this (basically to flavor the type
signature) 🤔
```rust
pub enum TypeSignature {
...
/// Rather than the usual coercion rules, special type union rules are
applied
Union(Box<TypeSignature>)
}
```
##########
datafusion/expr/src/type_coercion/functions.rs:
##########
@@ -293,6 +314,32 @@ fn maybe_data_types(
Some(new_type)
}
+/// Check if the current argument types can be coerced to match the given
`valid_types`
+/// unlike `maybe_data_types`, this function does not coerce the types.
Review Comment:
this comment says "function does not coerce the types" but then it permits
types in `can_cast_types`. I may be misunderstanding what coerce means in this
case
##########
datafusion/expr/src/type_coercion/functions.rs:
##########
@@ -450,19 +473,7 @@ fn coerced_from<'a>(
{
Some(type_into.clone())
}
- // More coerce rules.
- // Note that not all rules in `comparison_coercion` can be reused here.
- // For example, all numeric types can be coerced into Utf8 for
comparison,
- // but not for function arguments.
- _ => comparison_binary_numeric_coercion(type_into, type_from).and_then(
Review Comment:
cc @viirya
##########
datafusion/expr/src/signature.rs:
##########
@@ -92,14 +92,22 @@ pub enum TypeSignature {
/// A function such as `concat` is `Variadic(vec![DataType::Utf8,
DataType::LargeUtf8])`
Variadic(Vec<DataType>),
/// One or more arguments of an arbitrary but equal type.
- /// DataFusion attempts to coerce all argument types to match the first
argument's type
+ /// DataFusion attempts to coerce all argument types to match to the
common type with comparision coercion.
///
/// # Examples
/// Given types in signature should be coercible to the same final type.
/// A function such as `make_array` is `VariadicEqual`.
///
/// `make_array(i32, i64) -> make_array(i64, i64)`
VariadicEqual,
+ /// One or more arguments of an arbitrary but equal type or Null.
+ /// Non-comparison coercion is attempted to match the signatures.
+ ///
+ /// Functions like `coalesce` is `VariadicEqual`.
Review Comment:
I am a little confused about what "Non-comparison coercion" means in this
situation. Specifically how comparison coercion and non comparision coercion
differ 🤔
Does non comparison coercion mean "type union resolution" (aka
`type_union_resolution`)?
Also, I think `coalesce` is now `VariadicEqualOrNull` (this says the
opposite)
```suggestion
/// One or more arguments of an arbitrary but equal type or Null.
/// Non-comparison coercion is attempted to match the signatures.
///
/// Functions like `coalesce` is `VariadicEqualOrNull`.
```
##########
datafusion/sqllogictest/test_files/functions.slt:
##########
@@ -1158,3 +1158,347 @@ drop table uuid_table
statement ok
drop table t
+
+
+# Test coalesce function
+query I
+select coalesce(1, 2, 3);
+----
+1
+
+# test with first null
+query ?T
+select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1));
+----
+3 Int64
+
+# test with null values
+query ?
+select coalesce(null, null);
+----
+NULL
+
+# cast to float
+query IT
+select
+ coalesce(1, 2.0),
+ arrow_typeof(coalesce(1, 2.0))
+;
+----
+1 Float64
+
+query RT
+select
+ coalesce(2.0, 1),
+ arrow_typeof(coalesce(2.0, 1))
+;
+----
+2 Float64
+
+query IT
+select
+ coalesce(1, arrow_cast(2.0, 'Float32')),
+ arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32')))
+;
+----
+1 Float32
+
+# test with empty args
+query error
+select coalesce();
+
+# test with different types
+query I
+select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64'));
+----
+1
+
+# test with nulls
+query ?T
+select coalesce(null, null, null), arrow_typeof(coalesce(null, null));
+----
+NULL Null
+
+# i32 and u32, cast to wider type i64
+query IT
+select
+ coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')));
+----
+2 Int64
+
+query IT
+select
+ coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')));
+----
+2 Int32
+
+query IT
+select
+ coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')),
+ arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')));
+----
+2 Int16
+
+# i64 and u32, cast to wider type i64
+query IT
+select
+ coalesce(2, arrow_cast(3, 'UInt32')),
+ arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32')));
+----
+2 Int64
+
+# TODO: Got types (i64, u64), casting to decimal or double or even i128 if
supported
+query IT
+select
+ coalesce(2, arrow_cast(3, 'UInt64')),
+ arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64')));
+----
+2 Int64
+
+statement ok
+create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2,
null);
+
+# Follow Postgres and DuckDB behavior, since a is bigint, although the value
is null, all args are coerced to bigint
+query IT
+select
+ coalesce(a, b, c),
+ arrow_typeof(coalesce(a, b, c))
+from t1;
+----
+1 Int64
+2 Int64
+
+# b, c has the same type int, so the result is int
+query IT
+select
+ coalesce(b, c),
+ arrow_typeof(coalesce(b, c))
+from t1;
+----
+1 Int32
+2 Int32
+
+statement ok
+drop table t1;
+
+# test multi rows
+statement ok
+CREATE TABLE t1(
+ c1 int,
+ c2 int
+) as VALUES
+(1, 2),
+(NULL, 2),
+(1, NULL),
+(NULL, NULL);
+
+query I
+SELECT COALESCE(c1, c2) FROM t1
+----
+1
+2
+1
+NULL
+
+statement ok
+drop table t1;
+
+# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2)
+query RT
+select
+ coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0),
+ arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0))
+----
+2 Decimal128(22, 2)
+
+query RT
+select
+ coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0),
+ arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0));
+----
+2 Decimal256(22, 2)
+
+# coalesce string
+query T?
+select
+ coalesce('', 'test'),
+ coalesce(null, 'test');
+----
+(empty) test
+
+# coalesce utf8 and large utf8
+query TT
+select
+ coalesce('a', arrow_cast('b', 'LargeUtf8')),
+ arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8')))
+;
+----
+a LargeUtf8
+
+# coalesce array
+query ?T
+select
+ coalesce(array[1, 2], array[3, 4]),
+ arrow_typeof(coalesce(array[1, 2], array[3, 4]));
+----
+[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })
+
+query ?T
+select
+ coalesce(null, array[3, 4]),
+ arrow_typeof(coalesce(array[1, 2], array[3, 4]));
+----
+[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })
+
+# TODO: after switch signature of array to the same with coalesce, this query
should be fixed
+query error
+select
+ coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]),
+ arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'),
arrow_cast(4, 'Int32')]));
+
+statement ok
+create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
(null);
+
+# Dictionary and String are not coercible
+query error
+select coalesce(column1, 'none_set') from test1;
+
+query error
+select coalesce(null, column1, 'none_set') from test1;
+
+# explicitly cast to Utf8
+query T
+select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1;
+----
+foo
+none_set
+
+statement ok
+drop table test1
+
+# Numeric and Dictionary are not coercible
Review Comment:
Likewise, this also works on main:
```
> select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)'));
+----------------------------------------------------------------------------+
| coalesce(Int64(34),arrow_cast(Int64(123),Utf8("Dictionary(Int32, Int8)")))
|
+----------------------------------------------------------------------------+
| 34
|
+----------------------------------------------------------------------------+
1 row(s) fetched.
Elapsed 0.001 seconds.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]