Re: [PR] feat: extend interval arithmetic to support scalar functions (UDFs) [datafusion]

via GitHub Sun, 17 May 2026 01:31:13 -0700


Copilot commented on code in PR #22248:
URL: https://github.com/apache/datafusion/pull/22248#discussion_r3254286885



##########
datafusion/physical-plan/src/filter.rs:
##########
@@ -1209,6 +1209,66 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_filter_statistics_ceil_scalar_fn() -> Result<()> {
+        // Table: x Float64, min=8.0, max=16.0, 100 rows.
+        // Filter: ceil(x) > 12.0
+        //
+        // The range [8.0, 16.0] lies within a single IEEE-754 binade so
+        // Float64 cardinality is proportional to the value range, making
+        // the selectivity estimate predictable.
+        //
+        // With check_support recognising ScalarFunctionExpr and CeilFunc
+        // implementing evaluate_bounds/propagate_constraints the solver
+        // narrows x to roughly [11.0, 16.0]:
+        //   ceil(x) > 12 → x ∈ (11, 16] → conservative [11, 16]
+        //   selectivity ≈ (16−11)/(16−8) = 5/8 = 0.625 → ~62 rows
+        //
+        // Without the fix the estimate stays at 100 (no interval analysis).
+        let schema = Schema::new(vec![Field::new("x", DataType::Float64, 
false)]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(100),
+                total_byte_size: Precision::Absent,
+                column_statistics: vec![ColumnStatistics {
+                    min_value: 
Precision::Inexact(ScalarValue::Float64(Some(8.0))),
+                    max_value: 
Precision::Inexact(ScalarValue::Float64(Some(16.0))),
+                    ..Default::default()
+                }],
+            },
+            schema.clone(),
+        ));
+
+        let x = col("x", &schema)?;
+        let ceil_udf = datafusion_functions::math::ceil();
+        let config = Arc::new(ConfigOptions::new());
+        let ceil_x: Arc<dyn PhysicalExpr> =
+            Arc::new(datafusion_physical_expr::ScalarFunctionExpr::try_new(
+                Arc::clone(&ceil_udf),
+                vec![x],
+                &schema,
+                config,
+            )?);
+        let predicate = binary(ceil_x, Operator::Gt, lit(12.0f64), &schema)?;
+
+        let filter = Arc::new(FilterExec::try_new(predicate, input)?);
+        let statistics = filter.partition_statistics(None)?;
+
+        let num_rows = statistics.num_rows.get_value().copied().unwrap_or(100);
+        // Interval analysis must narrow the estimate below the full 100-row 
input.
+        assert!(
+            num_rows < 100,
+            "expected interval analysis to narrow row estimate, got {num_rows}"
+        );
+        // The conservative bound is x ∈ [11, 16] out of [8, 16] → ~62 rows.
+        // Allow a generous range to be robust to float-cardinality rounding.
+        assert!(
+            num_rows >= 50,
+            "expected at least 50 rows after ceil(x) > 12.0 on [8,16], got 
{num_rows}"

Review Comment:
   This test’s numeric bounds (`< 100` and `>= 50`) may still be brittle if the 
selectivity/cardinality estimation model changes (even while still “narrowing” 
correctly). Consider asserting a derived/tolerant expectation (e.g., compare 
against an expected range computed from min/max with a wider tolerance), or 
assert only the key regression property (that the estimate decreases) and keep 
the numeric expectation in a more tolerant form.
   



##########
datafusion/functions/src/math/ceil.rs:
##########
@@ -191,11 +191,238 @@ impl ScalarUDFImpl for CeilFunc {
     }
 
     fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
-        let data_type = inputs[0].data_type();
-        Interval::make_unbounded(&data_type)
+        let [input] = inputs else {
+            return exec_err!(
+                "ceil expected 1 argument for bounds evaluation, got {}",
+                inputs.len()
+            );
+        };
+        let data_type = input.data_type();
+        match (ceil_scalar(input.lower()), ceil_scalar(input.upper())) {
+            (Some(lo), Some(hi)) => Interval::try_new(lo, hi)
+                .or_else(|_| Interval::make_unbounded(&data_type)),

Review Comment:
   `Interval::try_new(lo, hi).or_else(|_| ...)` will silently swallow 
unexpected construction errors (e.g., a type/interval invariant violation), 
making it harder to detect real bugs. Prefer propagating the error (using `?`) 
or only falling back to unbounded for specific, expected error conditions.
   



##########
datafusion/functions/src/math/ceil.rs:
##########
@@ -191,11 +191,238 @@ impl ScalarUDFImpl for CeilFunc {
     }
 
     fn evaluate_bounds(&self, inputs: &[&Interval]) -> Result<Interval> {
-        let data_type = inputs[0].data_type();
-        Interval::make_unbounded(&data_type)
+        let [input] = inputs else {
+            return exec_err!(
+                "ceil expected 1 argument for bounds evaluation, got {}",
+                inputs.len()
+            );
+        };
+        let data_type = input.data_type();
+        match (ceil_scalar(input.lower()), ceil_scalar(input.upper())) {
+            (Some(lo), Some(hi)) => Interval::try_new(lo, hi)
+                .or_else(|_| Interval::make_unbounded(&data_type)),
+            _ => Interval::make_unbounded(&data_type),
+        }
+    }
+
+    fn propagate_constraints(
+        &self,
+        interval: &Interval,
+        inputs: &[&Interval],
+    ) -> Result<Option<Vec<Interval>>> {
+        let [input_interval] = inputs else {
+            return exec_err!(
+                "ceil expected 1 argument for constraint propagation, got {}",
+                inputs.len()
+            );
+        };
+        // ceil(x) ∈ [N, M] → x ∈ (ceil(N)−1, floor(M)]
+        // Normalize bounds to integers ceil can actually take before mapping 
back.
+        let lo = match interval.lower() {
+            ScalarValue::Float64(Some(n)) if n.is_finite() => {
+                Some(ScalarValue::Float64(Some(n.ceil() - 1.0)))
+            }
+            ScalarValue::Float32(Some(n)) if n.is_finite() => {
+                Some(ScalarValue::Float32(Some(n.ceil() - 1.0)))
+            }
+            _ => None,
+        };
+        let hi = match interval.upper() {
+            ScalarValue::Float64(Some(n)) if n.is_finite() => {
+                Some(ScalarValue::Float64(Some(n.floor())))
+            }
+            ScalarValue::Float32(Some(n)) if n.is_finite() => {
+                Some(ScalarValue::Float32(Some(n.floor())))
+            }
+            _ => None,
+        };
+        match (lo, hi) {
+            (Some(lo), Some(hi)) => {
+                let constraint = Interval::try_new(lo, hi)?;
+                Ok(input_interval.intersect(constraint)?.map(|r| vec![r]))
+            }
+            _ => Ok(Some(vec![(*input_interval).clone()])),

Review Comment:
   When only one side of the output interval is finite (e.g., just an upper 
bound), this implementation returns the input interval unchanged, even though 
it could still narrow using the known bound (e.g., `x <= floor(M)` or `x > 
ceil(N)-1`). Extending this to apply one-sided narrowing improves solver 
effectiveness without sacrificing correctness.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: extend interval arithmetic to support scalar functions (UDFs) [datafusion]

Reply via email to