pepijnve commented on code in PR #17813: URL: https://github.com/apache/datafusion/pull/17813#discussion_r2515841199
########## datafusion/expr/src/predicate_bounds.rs: ########## @@ -0,0 +1,1045 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{BinaryExpr, Expr, ExprSchemable}; +use arrow::datatypes::DataType; +use bitflags::bitflags; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::{DataFusionError, ExprSchema, Result, ScalarValue}; +use datafusion_expr_common::interval_arithmetic::{Interval, NullableInterval}; +use datafusion_expr_common::operator::Operator; + +bitflags! { + /// A set representing the possible outcomes of a SQL boolean expression + #[derive(PartialEq, Eq, Clone, Debug)] + struct TernarySet: u8 { + const TRUE = 0b1; + const FALSE = 0b10; + const UNKNOWN = 0b100; + } +} + +impl TernarySet { + /// Returns the set of possible values after applying the `is true` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_true(&self) -> Self { + let mut is_true = Self::empty(); + if self.contains(Self::TRUE) { + is_true.toggle(Self::TRUE); + } + if self.intersects(Self::UNKNOWN | Self::FALSE) { + is_true.toggle(Self::FALSE); + } + is_true + } + + /// Returns the set of possible values after applying the `is false` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_false(&self) -> Self { + let mut is_false = Self::empty(); + if self.contains(Self::FALSE) { + is_false.toggle(Self::TRUE); + } + if self.intersects(Self::UNKNOWN | Self::TRUE) { + is_false.toggle(Self::FALSE); + } + is_false + } + + /// Returns the set of possible values after applying the `is unknown` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_unknown(&self) -> Self { + let mut is_unknown = Self::empty(); + if self.contains(Self::UNKNOWN) { + is_unknown.toggle(Self::TRUE); + } + if self.intersects(Self::TRUE | Self::FALSE) { + is_unknown.toggle(Self::FALSE); + } + is_unknown + } + + /// Returns the set of possible values after applying SQL three-valued logical NOT + /// on each value in `value`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A | ¬A + /// ----|---- + /// F | T + /// U | U + /// T | F + /// ``` + fn not(set: &Self) -> Self { + let mut not = Self::empty(); + if set.contains(Self::TRUE) { + not.toggle(Self::FALSE); + } + if set.contains(Self::FALSE) { + not.toggle(Self::TRUE); + } + if set.contains(Self::UNKNOWN) { + not.toggle(Self::UNKNOWN); + } + not + } + + /// Returns the set of possible values after applying SQL three-valued logical AND + /// on each combination of values from `lhs` and `rhs`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A ∧ B │ F U T + /// ──────┼────── + /// F │ F F F + /// U │ F U U + /// T │ F U T + /// ``` + fn and(lhs: &Self, rhs: &Self) -> Self { + if lhs.is_empty() || rhs.is_empty() { + return Self::empty(); + } + + let mut and = Self::empty(); + if lhs.contains(Self::FALSE) || rhs.contains(Self::FALSE) { + and.toggle(Self::FALSE); + } + + if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::TRUE | Self::UNKNOWN)) + || (rhs.contains(Self::UNKNOWN) && lhs.intersects(Self::TRUE | Self::UNKNOWN)) + { + and.toggle(Self::UNKNOWN); + } + + if lhs.contains(Self::TRUE) && rhs.contains(Self::TRUE) { + and.toggle(Self::TRUE); + } + + and + } + + /// Returns the set of possible values after applying SQL three-valued logical OR + /// on each combination of values from `lhs` and `rhs`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A ∨ B │ F U T + /// ──────┼────── + /// F │ F U T + /// U │ U U T + /// T │ T T T + /// ``` + fn or(lhs: &Self, rhs: &Self) -> Self { + let mut or = Self::empty(); + if lhs.contains(Self::TRUE) || rhs.contains(Self::TRUE) { + or.toggle(Self::TRUE); + } + + if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::FALSE | Self::UNKNOWN)) + || (rhs.contains(Self::UNKNOWN) + && lhs.intersects(Self::FALSE | Self::UNKNOWN)) + { + or.toggle(Self::UNKNOWN); + } + + if lhs.contains(Self::FALSE) && rhs.contains(Self::FALSE) { + or.toggle(Self::FALSE); + } + + or + } +} + +impl TryFrom<&ScalarValue> for TernarySet { + type Error = DataFusionError; + + fn try_from(value: &ScalarValue) -> Result<Self> { + Ok(match value { + ScalarValue::Null => TernarySet::UNKNOWN, + ScalarValue::Boolean(b) => match b { + Some(true) => TernarySet::TRUE, + Some(false) => TernarySet::FALSE, + None => TernarySet::UNKNOWN, + }, + _ => { + let b = value.cast_to(&DataType::Boolean)?; + Self::try_from(&b)? + } + }) + } +} + +/// Computes the output interval for the given boolean expression based on statically +/// available information. +/// +/// # Arguments +/// +/// * `predicate` - The boolean expression to analyze +/// * `is_null` - A callback function that provides additional nullability information for +/// expressions. When called with an expression, it should return: +/// - `Some(true)` if the expression is known to evaluate to NULL +/// - `Some(false)` if the expression is known to NOT evaluate to NULL +/// - `None` if the nullability cannot be determined +/// +/// This callback allows the caller to provide context-specific knowledge about expression +/// nullability that cannot be determined from the schema alone. For example, it can be used +/// to indicate that a particular column reference is known to be NULL in a specific context, +/// or that certain expressions will never be NULL based on runtime constraints. +/// +/// * `input_schema` - Schema information for resolving expression types and nullability +/// +/// # Return Value +/// +/// The function returns a [NullableInterval] that describes the possible boolean values the +/// predicate can evaluate to. The return value will be one of the following: +/// +/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_TRUE }` - The predicate will +/// always evaluate to TRUE (never FALSE or NULL) +/// +/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_FALSE }` - The predicate will +/// always evaluate to FALSE (never TRUE or NULL) +/// +/// * `NullableInterval::NotNull { values: Interval::UNCERTAIN }` - The predicate will never +/// evaluate to NULL, but may be either TRUE or FALSE +/// +/// * `NullableInterval::Null { datatype: DataType::Boolean }` - The predicate will always +/// evaluate to NULL (SQL UNKNOWN in three-valued logic) +/// +/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_TRUE }` - The predicate may +/// evaluate to TRUE or NULL, but never FALSE +/// +/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_FALSE }` - The predicate may +/// evaluate to FALSE or NULL, but never TRUE +/// +/// * `NullableInterval::MaybeNull { values: Interval::UNCERTAIN }` - The predicate may +/// evaluate to any of TRUE, FALSE, or NULL +/// +pub(super) fn evaluate_bounds( + predicate: &Expr, + certainly_null_expr: Option<&Expr>, + input_schema: &dyn ExprSchema, +) -> Result<NullableInterval> { + let evaluator = PredicateBoundsEvaluator { Review Comment: My idea was to see if I can remove `TernarySet` and use `NullableInterval` instead indeed. As I was trying to do so I stumbled upon the fact that the logical operators were broken. The bitfield is much less 🤯 if you ask me, but it's silly to have multiple implementations of the same concept and using `NullableInterval` directly will get rid of the ugly translation block code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
