This is an automated email from the ASF dual-hosted git repository. blaginin pushed a commit to branch annarose/dict-coercion in repository https://gitbox.apache.org/repos/asf/datafusion-sandbox.git
commit 5981d62c2860268790986246b705a1963e25f4e7 Author: Jeffrey Vo <[email protected]> AuthorDate: Wed Feb 4 06:26:14 2026 +0900 chore: remove datatype check functions in favour of upstream versions (#20104) We now have [`DataType::is_string`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#method.is_string) and [`DataType::is_decimal`](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#method.is_decimal) so make the most of them --- datafusion/expr-common/src/type_coercion/binary.rs | 22 ++++++-------------- datafusion/expr/src/type_coercion/mod.rs | 24 ---------------------- datafusion/optimizer/src/analyzer/type_coercion.rs | 23 +++++++++++---------- .../physical-expr/src/expressions/negative.rs | 4 ++-- datafusion/pruning/src/pruning_predicate.rs | 11 ++-------- 5 files changed, 22 insertions(+), 62 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 427ebc598..9051f412b 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -351,16 +351,6 @@ impl<'a> BinaryTypeCoercer<'a> { // TODO Move the rest inside of BinaryTypeCoercer -fn is_decimal(data_type: &DataType) -> bool { - matches!( - data_type, - DataType::Decimal32(..) - | DataType::Decimal64(..) - | DataType::Decimal128(..) - | DataType::Decimal256(..) - ) -} - /// Returns true if both operands are Date types (Date32 or Date64) /// Used to detect Date - Date operations which should return Int64 (days difference) fn is_date_minus_date(lhs: &DataType, rhs: &DataType) -> bool { @@ -402,8 +392,8 @@ fn math_decimal_coercion( } // Cross-variant decimal coercion - choose larger variant with appropriate precision/scale (lhs, rhs) - if is_decimal(lhs) - && is_decimal(rhs) + if lhs.is_decimal() + && rhs.is_decimal() && std::mem::discriminant(lhs) != std::mem::discriminant(rhs) => { let coerced_type = get_wider_decimal_type_cross_variant(lhs_type, rhs_type)?; @@ -1018,8 +1008,8 @@ pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Data match (lhs_type, rhs_type) { // Same decimal types (lhs_type, rhs_type) - if is_decimal(lhs_type) - && is_decimal(rhs_type) + if lhs_type.is_decimal() + && rhs_type.is_decimal() && std::mem::discriminant(lhs_type) == std::mem::discriminant(rhs_type) => { @@ -1027,8 +1017,8 @@ pub fn decimal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Data } // Mismatched decimal types (lhs_type, rhs_type) - if is_decimal(lhs_type) - && is_decimal(rhs_type) + if lhs_type.is_decimal() + && rhs_type.is_decimal() && std::mem::discriminant(lhs_type) != std::mem::discriminant(rhs_type) => { diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs index bd1acd3f3..c92d434e3 100644 --- a/datafusion/expr/src/type_coercion/mod.rs +++ b/datafusion/expr/src/type_coercion/mod.rs @@ -58,11 +58,6 @@ pub fn is_signed_numeric(dt: &DataType) -> bool { ) } -/// Determine whether the given data type `dt` is `Null`. -pub fn is_null(dt: &DataType) -> bool { - *dt == DataType::Null -} - /// Determine whether the given data type `dt` is a `Timestamp`. pub fn is_timestamp(dt: &DataType) -> bool { matches!(dt, DataType::Timestamp(_, _)) @@ -80,22 +75,3 @@ pub fn is_datetime(dt: &DataType) -> bool { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) ) } - -/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`. -pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool { - matches!( - dt, - DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 - ) -} - -/// Determine whether the given data type `dt` is a `Decimal`. -pub fn is_decimal(dt: &DataType) -> bool { - matches!( - dt, - DataType::Decimal32(_, _) - | DataType::Decimal64(_, _) - | DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) - ) -} diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 8b57cc798..1e59ceb97 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -43,10 +43,10 @@ use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::Subquery; use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion}; use datafusion_expr::type_coercion::functions::{UDFCoercionExt, fields_with_udf}; +use datafusion_expr::type_coercion::is_datetime; use datafusion_expr::type_coercion::other::{ get_coerce_type_for_case_expression, get_coerce_type_for_list, }; -use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ Cast, Expr, ExprSchemable, Join, Limit, LogicalPlan, Operator, Projection, Union, @@ -513,10 +513,8 @@ impl TreeNodeRewriter for TypeCoercionRewriter<'_> { .data; let expr_type = expr.get_type(self.schema)?; let subquery_type = new_plan.schema().field(0).data_type(); - if (expr_type.is_numeric() - && is_utf8_or_utf8view_or_large_utf8(subquery_type)) - || (subquery_type.is_numeric() - && is_utf8_or_utf8view_or_large_utf8(&expr_type)) + if (expr_type.is_numeric() && subquery_type.is_string()) + || (subquery_type.is_numeric() && expr_type.is_string()) { return plan_err!( "expr type {expr_type} can't cast to {subquery_type} in SetComparison" @@ -890,12 +888,15 @@ fn coerce_frame_bound( fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> { if col_type.is_numeric() - || is_utf8_or_utf8view_or_large_utf8(col_type) - || matches!(col_type, DataType::List(_)) - || matches!(col_type, DataType::LargeList(_)) - || matches!(col_type, DataType::FixedSizeList(_, _)) - || matches!(col_type, DataType::Null) - || matches!(col_type, DataType::Boolean) + || col_type.is_string() + || col_type.is_null() + || matches!( + col_type, + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Boolean + ) { Ok(col_type.clone()) } else if is_datetime(col_type) { diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index 0c9476beb..c727c8fa5 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -37,7 +37,7 @@ use datafusion_expr::statistics::Distribution::{ }; use datafusion_expr::{ ColumnarValue, - type_coercion::{is_interval, is_null, is_signed_numeric, is_timestamp}, + type_coercion::{is_interval, is_signed_numeric, is_timestamp}, }; /// Negative expression @@ -190,7 +190,7 @@ pub fn negative( input_schema: &Schema, ) -> Result<Arc<dyn PhysicalExpr>> { let data_type = arg.data_type(input_schema)?; - if is_null(&data_type) { + if data_type.is_null() { Ok(arg) } else if !is_signed_numeric(&data_type) && !is_interval(&data_type) diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index 5f1b4233b..d0cb06744 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -1205,13 +1205,6 @@ fn is_compare_op(op: Operator) -> bool { ) } -fn is_string_type(data_type: &DataType) -> bool { - matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View - ) -} - // The pruning logic is based on the comparing the min/max bounds. // Must make sure the two type has order. // For example, casts from string to numbers is not correct. @@ -1233,7 +1226,7 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re // If both types are strings or both are not strings (number, timestamp, etc) // then we can compare them. // PruningPredicate does not support casting of strings to numbers and such. - if is_string_type(from_type) == is_string_type(to_type) { + if from_type.is_string() == to_type.is_string() { Ok(()) } else { plan_err!( @@ -4681,7 +4674,7 @@ mod tests { true, // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. (min, max) maybe truncate + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. (min, max) maybe truncate // original (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}") true, ]; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
