alamb commented on code in PR #10268: URL: https://github.com/apache/datafusion/pull/10268#discussion_r1586698773
########## datafusion/expr/src/type_coercion/functions.rs: ########## @@ -53,20 +54,31 @@ pub fn data_types( } } - let valid_types = get_valid_types(&signature.type_signature, current_types)?; - + let mut valid_types = get_valid_types(&signature.type_signature, current_types)?; if valid_types .iter() .any(|data_type| data_type == current_types) { return Ok(current_types.to_vec()); } - // Try and coerce the argument types to match the signature, returning the - // coerced types from the first matching signature. - for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, current_types) { - return Ok(types); + // Well-supported signature that returns exact valid types. + if !valid_types.is_empty() + && matches!(signature.type_signature, TypeSignature::VariadicEqualOrNull) + { + // exact valid types + assert_eq!(valid_types.len(), 1); + let valid_types = valid_types.swap_remove(0); Review Comment: is it a problem that this may change the type ordering? So that the last type is now the first in the new Vec? ########## datafusion/functions/src/core/coalesce.rs: ########## @@ -60,9 +59,8 @@ impl ScalarUDFImpl for CoalesceFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { - // COALESCE has multiple args and they might get coerced, get a preview of this Review Comment: 👍 ❤️ ########## datafusion/sqllogictest/test_files/functions.slt: ########## @@ -1158,3 +1158,347 @@ drop table uuid_table statement ok drop table t + + +# Test coalesce function +query I +select coalesce(1, 2, 3); +---- +1 + +# test with first null +query ?T +select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); +---- +3 Int64 + +# test with null values +query ? +select coalesce(null, null); +---- +NULL + +# cast to float +query IT +select + coalesce(1, 2.0), + arrow_typeof(coalesce(1, 2.0)) +; +---- +1 Float64 + +query RT +select + coalesce(2.0, 1), + arrow_typeof(coalesce(2.0, 1)) +; +---- +2 Float64 + +query IT +select + coalesce(1, arrow_cast(2.0, 'Float32')), + arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) +; +---- +1 Float32 + +# test with empty args +query error +select coalesce(); + +# test with different types +query I +select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); +---- +1 + +# test with nulls +query ?T +select coalesce(null, null, null), arrow_typeof(coalesce(null, null)); +---- +NULL Null + +# i32 and u32, cast to wider type i64 +query IT +select + coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +query IT +select + coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')), + arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16'))); +---- +2 Int32 + +query IT +select + coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')), + arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8'))); +---- +2 Int16 + +# i64 and u32, cast to wider type i64 +query IT +select + coalesce(2, arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +# TODO: Got types (i64, u64), casting to decimal or double or even i128 if supported +query IT +select + coalesce(2, arrow_cast(3, 'UInt64')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64'))); +---- +2 Int64 + +statement ok +create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); + +# Follow Postgres and DuckDB behavior, since a is bigint, although the value is null, all args are coerced to bigint +query IT +select + coalesce(a, b, c), + arrow_typeof(coalesce(a, b, c)) +from t1; +---- +1 Int64 +2 Int64 + +# b, c has the same type int, so the result is int +query IT +select + coalesce(b, c), + arrow_typeof(coalesce(b, c)) +from t1; +---- +1 Int32 +2 Int32 + +statement ok +drop table t1; + +# test multi rows +statement ok +CREATE TABLE t1( + c1 int, + c2 int +) as VALUES +(1, 2), +(NULL, 2), +(1, NULL), +(NULL, NULL); + +query I +SELECT COALESCE(c1, c2) FROM t1 +---- +1 +2 +1 +NULL + +statement ok +drop table t1; + +# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2) +query RT +select + coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0)) +---- +2 Decimal128(22, 2) + +query RT +select + coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0)); +---- +2 Decimal256(22, 2) + +# coalesce string +query T? +select + coalesce('', 'test'), + coalesce(null, 'test'); +---- +(empty) test + +# coalesce utf8 and large utf8 +query TT +select + coalesce('a', arrow_cast('b', 'LargeUtf8')), + arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8'))) +; +---- +a LargeUtf8 + +# coalesce array +query ?T +select + coalesce(array[1, 2], array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +query ?T +select + coalesce(null, array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +# TODO: after switch signature of array to the same with coalesce, this query should be fixed +query error +select + coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), + arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); + +statement ok +create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); + +# Dictionary and String are not coercible Review Comment: This query actually works on main so making it error seems like a regression to me ```sql DataFusion CLI v37.1.0 > create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); 0 row(s) fetched. Elapsed 0.011 seconds. > select coalesce(column1, 'none_set') from test1; +------------------------------------------+ | coalesce(test1.column1,Utf8("none_set")) | +------------------------------------------+ | foo | | none_set | +------------------------------------------+ 2 row(s) fetched. Elapsed 0.003 seconds. > select coalesce(null, column1, 'none_set') from test1; +-----------------------------------------------+ | coalesce(NULL,test1.column1,Utf8("none_set")) | +-----------------------------------------------+ | foo | | none_set | +-----------------------------------------------+ 2 row(s) fetched. Elapsed 0.002 seconds. ``` ########## datafusion/expr/src/signature.rs: ########## @@ -92,14 +92,22 @@ pub enum TypeSignature { /// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])` Variadic(Vec<DataType>), /// One or more arguments of an arbitrary but equal type. - /// DataFusion attempts to coerce all argument types to match the first argument's type + /// DataFusion attempts to coerce all argument types to match to the common type with comparision coercion. Review Comment: Can we please link to https://docs.rs/datafusion/latest/datafusion/logical_expr/type_coercion/binary/fn.comparison_coercion.html that explains (however limited) what comparison coercion is? ########## datafusion/expr/src/type_coercion/binary.rs: ########## @@ -289,15 +290,169 @@ fn bitwise_coercion(left_type: &DataType, right_type: &DataType) -> Option<DataT } } +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +enum TypeCategory { + Array, + Boolean, + Numeric, + // String, well-defined type, but are considered as unknown type. + DateTime, + Composite, + Unknown, + NotSupported, +} + +fn data_type_category(data_type: &DataType) -> TypeCategory { Review Comment: This could also be something like ```rust impl From<&DataType> for TypeCategory { ... ``` And then you create a `TypeCategory` like ```rust let category = TypeCategory::from(&type) ``` ########## datafusion/expr/src/signature.rs: ########## @@ -92,14 +92,22 @@ pub enum TypeSignature { /// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])` Variadic(Vec<DataType>), /// One or more arguments of an arbitrary but equal type. - /// DataFusion attempts to coerce all argument types to match the first argument's type + /// DataFusion attempts to coerce all argument types to match to the common type with comparision coercion. /// /// # Examples /// Given types in signature should be coercible to the same final type. /// A function such as `make_array` is `VariadicEqual`. /// /// `make_array(i32, i64) -> make_array(i64, i64)` VariadicEqual, + /// One or more arguments of an arbitrary but equal type or Null. + /// Non-comparison coercion is attempted to match the signatures. + /// + /// Functions like `coalesce` is `VariadicEqual`. + // TODO: Temporary Signature, to differentiate existing VariadicEqual. + // After we swtich `make_array` to VariadicEqualOrNull, + // we can reuse VariadicEqual. + VariadicEqualOrNull, Review Comment: Maybe we could do something like this (basically to flavor the type signature) 🤔 ```rust pub enum TypeSignature { ... /// Rather than the usual coercion rules, special type union rules are applied Union(Box<TypeSignature>) } ``` ########## datafusion/expr/src/type_coercion/functions.rs: ########## @@ -293,6 +314,32 @@ fn maybe_data_types( Some(new_type) } +/// Check if the current argument types can be coerced to match the given `valid_types` +/// unlike `maybe_data_types`, this function does not coerce the types. Review Comment: this comment says "function does not coerce the types" but then it permits types in `can_cast_types`. I may be misunderstanding what coerce means in this case ########## datafusion/expr/src/type_coercion/functions.rs: ########## @@ -450,19 +473,7 @@ fn coerced_from<'a>( { Some(type_into.clone()) } - // More coerce rules. - // Note that not all rules in `comparison_coercion` can be reused here. - // For example, all numeric types can be coerced into Utf8 for comparison, - // but not for function arguments. - _ => comparison_binary_numeric_coercion(type_into, type_from).and_then( Review Comment: cc @viirya ########## datafusion/expr/src/signature.rs: ########## @@ -92,14 +92,22 @@ pub enum TypeSignature { /// A function such as `concat` is `Variadic(vec![DataType::Utf8, DataType::LargeUtf8])` Variadic(Vec<DataType>), /// One or more arguments of an arbitrary but equal type. - /// DataFusion attempts to coerce all argument types to match the first argument's type + /// DataFusion attempts to coerce all argument types to match to the common type with comparision coercion. /// /// # Examples /// Given types in signature should be coercible to the same final type. /// A function such as `make_array` is `VariadicEqual`. /// /// `make_array(i32, i64) -> make_array(i64, i64)` VariadicEqual, + /// One or more arguments of an arbitrary but equal type or Null. + /// Non-comparison coercion is attempted to match the signatures. + /// + /// Functions like `coalesce` is `VariadicEqual`. Review Comment: I am a little confused about what "Non-comparison coercion" means in this situation. Specifically how comparison coercion and non comparision coercion differ 🤔 Does non comparison coercion mean "type union resolution" (aka `type_union_resolution`)? Also, I think `coalesce` is now `VariadicEqualOrNull` (this says the opposite) ```suggestion /// One or more arguments of an arbitrary but equal type or Null. /// Non-comparison coercion is attempted to match the signatures. /// /// Functions like `coalesce` is `VariadicEqualOrNull`. ``` ########## datafusion/sqllogictest/test_files/functions.slt: ########## @@ -1158,3 +1158,347 @@ drop table uuid_table statement ok drop table t + + +# Test coalesce function +query I +select coalesce(1, 2, 3); +---- +1 + +# test with first null +query ?T +select coalesce(null, 3, 2, 1), arrow_typeof(coalesce(null, 3, 2, 1)); +---- +3 Int64 + +# test with null values +query ? +select coalesce(null, null); +---- +NULL + +# cast to float +query IT +select + coalesce(1, 2.0), + arrow_typeof(coalesce(1, 2.0)) +; +---- +1 Float64 + +query RT +select + coalesce(2.0, 1), + arrow_typeof(coalesce(2.0, 1)) +; +---- +2 Float64 + +query IT +select + coalesce(1, arrow_cast(2.0, 'Float32')), + arrow_typeof(coalesce(1, arrow_cast(2.0, 'Float32'))) +; +---- +1 Float32 + +# test with empty args +query error +select coalesce(); + +# test with different types +query I +select coalesce(arrow_cast(1, 'Int32'), arrow_cast(1, 'Int64')); +---- +1 + +# test with nulls +query ?T +select coalesce(null, null, null), arrow_typeof(coalesce(null, null)); +---- +NULL Null + +# i32 and u32, cast to wider type i64 +query IT +select + coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(arrow_cast(2, 'Int32'), arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +query IT +select + coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16')), + arrow_typeof(coalesce(arrow_cast(2, 'Int16'), arrow_cast(3, 'UInt16'))); +---- +2 Int32 + +query IT +select + coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8')), + arrow_typeof(coalesce(arrow_cast(2, 'Int8'), arrow_cast(3, 'UInt8'))); +---- +2 Int16 + +# i64 and u32, cast to wider type i64 +query IT +select + coalesce(2, arrow_cast(3, 'UInt32')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt32'))); +---- +2 Int64 + +# TODO: Got types (i64, u64), casting to decimal or double or even i128 if supported +query IT +select + coalesce(2, arrow_cast(3, 'UInt64')), + arrow_typeof(coalesce(2, arrow_cast(3, 'UInt64'))); +---- +2 Int64 + +statement ok +create table t1 (a bigint, b int, c int) as values (null, null, 1), (null, 2, null); + +# Follow Postgres and DuckDB behavior, since a is bigint, although the value is null, all args are coerced to bigint +query IT +select + coalesce(a, b, c), + arrow_typeof(coalesce(a, b, c)) +from t1; +---- +1 Int64 +2 Int64 + +# b, c has the same type int, so the result is int +query IT +select + coalesce(b, c), + arrow_typeof(coalesce(b, c)) +from t1; +---- +1 Int32 +2 Int32 + +statement ok +drop table t1; + +# test multi rows +statement ok +CREATE TABLE t1( + c1 int, + c2 int +) as VALUES +(1, 2), +(NULL, 2), +(1, NULL), +(NULL, NULL); + +query I +SELECT COALESCE(c1, c2) FROM t1 +---- +1 +2 +1 +NULL + +statement ok +drop table t1; + +# Decimal128(7, 2) and int64 are coerced to common wider type Decimal128(22, 2) +query RT +select + coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal128(7, 2)'), 0)) +---- +2 Decimal128(22, 2) + +query RT +select + coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0), + arrow_typeof(coalesce(arrow_cast(2, 'Decimal256(7, 2)'), 0)); +---- +2 Decimal256(22, 2) + +# coalesce string +query T? +select + coalesce('', 'test'), + coalesce(null, 'test'); +---- +(empty) test + +# coalesce utf8 and large utf8 +query TT +select + coalesce('a', arrow_cast('b', 'LargeUtf8')), + arrow_typeof(coalesce('a', arrow_cast('b', 'LargeUtf8'))) +; +---- +a LargeUtf8 + +# coalesce array +query ?T +select + coalesce(array[1, 2], array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +query ?T +select + coalesce(null, array[3, 4]), + arrow_typeof(coalesce(array[1, 2], array[3, 4])); +---- +[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +# TODO: after switch signature of array to the same with coalesce, this query should be fixed +query error +select + coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), + arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); + +statement ok +create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); + +# Dictionary and String are not coercible +query error +select coalesce(column1, 'none_set') from test1; + +query error +select coalesce(null, column1, 'none_set') from test1; + +# explicitly cast to Utf8 +query T +select coalesce(arrow_cast(column1, 'Utf8'), 'none_set') from test1; +---- +foo +none_set + +statement ok +drop table test1 + +# Numeric and Dictionary are not coercible Review Comment: Likewise, this also works on main: ``` > select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +----------------------------------------------------------------------------+ | coalesce(Int64(34),arrow_cast(Int64(123),Utf8("Dictionary(Int32, Int8)"))) | +----------------------------------------------------------------------------+ | 34 | +----------------------------------------------------------------------------+ 1 row(s) fetched. Elapsed 0.001 seconds. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org