Re: [PR] #17801 Improve nullability reporting of case expressions [datafusion]

via GitHub Tue, 18 Nov 2025 07:41:03 -0800


pepijnve commented on code in PR #17813:
URL: https://github.com/apache/datafusion/pull/17813#discussion_r2538656699



##########
datafusion/expr/src/predicate_bounds.rs:
##########
@@ -0,0 +1,1045 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{BinaryExpr, Expr, ExprSchemable};
+use arrow::datatypes::DataType;
+use bitflags::bitflags;
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_common::{DataFusionError, ExprSchema, Result, ScalarValue};
+use datafusion_expr_common::interval_arithmetic::{Interval, NullableInterval};
+use datafusion_expr_common::operator::Operator;
+
+bitflags! {
+    /// A set representing the possible outcomes of a SQL boolean expression
+    #[derive(PartialEq, Eq, Clone, Debug)]
+    struct TernarySet: u8 {
+        const TRUE = 0b1;
+        const FALSE = 0b10;
+        const UNKNOWN = 0b100;
+    }
+}
+
+impl TernarySet {
+    /// Returns the set of possible values after applying the `is true` test 
on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_true(&self) -> Self {
+        let mut is_true = Self::empty();
+        if self.contains(Self::TRUE) {
+            is_true.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::UNKNOWN | Self::FALSE) {
+            is_true.toggle(Self::FALSE);
+        }
+        is_true
+    }
+
+    /// Returns the set of possible values after applying the `is false` test 
on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_false(&self) -> Self {
+        let mut is_false = Self::empty();
+        if self.contains(Self::FALSE) {
+            is_false.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::UNKNOWN | Self::TRUE) {
+            is_false.toggle(Self::FALSE);
+        }
+        is_false
+    }
+
+    /// Returns the set of possible values after applying the `is unknown` 
test on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_unknown(&self) -> Self {
+        let mut is_unknown = Self::empty();
+        if self.contains(Self::UNKNOWN) {
+            is_unknown.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::TRUE | Self::FALSE) {
+            is_unknown.toggle(Self::FALSE);
+        }
+        is_unknown
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical NOT
+    /// on each value in `value`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    ///  A  | ¬A
+    /// ----|----
+    ///  F  |  T
+    ///  U  |  U
+    ///  T  |  F
+    /// ```
+    fn not(set: &Self) -> Self {
+        let mut not = Self::empty();
+        if set.contains(Self::TRUE) {
+            not.toggle(Self::FALSE);
+        }
+        if set.contains(Self::FALSE) {
+            not.toggle(Self::TRUE);
+        }
+        if set.contains(Self::UNKNOWN) {
+            not.toggle(Self::UNKNOWN);
+        }
+        not
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical AND
+    /// on each combination of values from `lhs` and `rhs`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    /// A ∧ B │ F U T
+    /// ──────┼──────
+    ///     F │ F F F
+    ///     U │ F U U
+    ///     T │ F U T
+    /// ```
+    fn and(lhs: &Self, rhs: &Self) -> Self {
+        if lhs.is_empty() || rhs.is_empty() {
+            return Self::empty();
+        }
+
+        let mut and = Self::empty();
+        if lhs.contains(Self::FALSE) || rhs.contains(Self::FALSE) {
+            and.toggle(Self::FALSE);
+        }
+
+        if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::TRUE | 
Self::UNKNOWN))
+            || (rhs.contains(Self::UNKNOWN) && lhs.intersects(Self::TRUE | 
Self::UNKNOWN))
+        {
+            and.toggle(Self::UNKNOWN);
+        }
+
+        if lhs.contains(Self::TRUE) && rhs.contains(Self::TRUE) {
+            and.toggle(Self::TRUE);
+        }
+
+        and
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical OR
+    /// on each combination of values from `lhs` and `rhs`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    /// A ∨ B │ F U T
+    /// ──────┼──────
+    ///     F │ F U T
+    ///     U │ U U T
+    ///     T │ T T T
+    /// ```
+    fn or(lhs: &Self, rhs: &Self) -> Self {
+        let mut or = Self::empty();
+        if lhs.contains(Self::TRUE) || rhs.contains(Self::TRUE) {
+            or.toggle(Self::TRUE);
+        }
+
+        if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::FALSE | 
Self::UNKNOWN))
+            || (rhs.contains(Self::UNKNOWN)
+                && lhs.intersects(Self::FALSE | Self::UNKNOWN))
+        {
+            or.toggle(Self::UNKNOWN);
+        }
+
+        if lhs.contains(Self::FALSE) && rhs.contains(Self::FALSE) {
+            or.toggle(Self::FALSE);
+        }
+
+        or
+    }
+}
+
+impl TryFrom<&ScalarValue> for TernarySet {
+    type Error = DataFusionError;
+
+    fn try_from(value: &ScalarValue) -> Result<Self> {
+        Ok(match value {
+            ScalarValue::Null => TernarySet::UNKNOWN,
+            ScalarValue::Boolean(b) => match b {
+                Some(true) => TernarySet::TRUE,
+                Some(false) => TernarySet::FALSE,
+                None => TernarySet::UNKNOWN,
+            },
+            _ => {
+                let b = value.cast_to(&DataType::Boolean)?;
+                Self::try_from(&b)?
+            }
+        })
+    }
+}
+
+/// Computes the output interval for the given boolean expression based on 
statically
+/// available information.
+///
+/// # Arguments
+///
+/// * `predicate` - The boolean expression to analyze
+/// * `is_null` - A callback function that provides additional nullability 
information for
+///   expressions. When called with an expression, it should return:
+///   - `Some(true)` if the expression is known to evaluate to NULL
+///   - `Some(false)` if the expression is known to NOT evaluate to NULL
+///   - `None` if the nullability cannot be determined
+///
+///   This callback allows the caller to provide context-specific knowledge 
about expression
+///   nullability that cannot be determined from the schema alone. For 
example, it can be used
+///   to indicate that a particular column reference is known to be NULL in a 
specific context,
+///   or that certain expressions will never be NULL based on runtime 
constraints.
+///
+/// * `input_schema` - Schema information for resolving expression types and 
nullability
+///
+/// # Return Value
+///
+/// The function returns a [NullableInterval] that describes the possible 
boolean values the
+/// predicate can evaluate to. The return value will be one of the following:
+///
+/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_TRUE }` - The 
predicate will
+///   always evaluate to TRUE (never FALSE or NULL)
+///
+/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_FALSE }` - The 
predicate will
+///   always evaluate to FALSE (never TRUE or NULL)
+///
+/// * `NullableInterval::NotNull { values: Interval::UNCERTAIN }` - The 
predicate will never
+///   evaluate to NULL, but may be either TRUE or FALSE
+///
+/// * `NullableInterval::Null { datatype: DataType::Boolean }` - The predicate 
will always
+///   evaluate to NULL (SQL UNKNOWN in three-valued logic)
+///
+/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_TRUE }` - The 
predicate may
+///   evaluate to TRUE or NULL, but never FALSE
+///
+/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_FALSE }` - 
The predicate may
+///   evaluate to FALSE or NULL, but never TRUE
+///
+/// * `NullableInterval::MaybeNull { values: Interval::UNCERTAIN }` - The 
predicate may
+///   evaluate to any of TRUE, FALSE, or NULL
+///
+pub(super) fn evaluate_bounds(
+    predicate: &Expr,
+    certainly_null_expr: Option<&Expr>,
+    input_schema: &dyn ExprSchema,
+) -> Result<NullableInterval> {
+    let evaluator = PredicateBoundsEvaluator {

Review Comment:
   I've just pushed a commit that removes TernarySet. Already a big block of 
code gone.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] #17801 Improve nullability reporting of case expressions [datafusion]

Reply via email to