Re: [PR] test: interval analysis unit tests [datafusion]

via GitHub Mon, 20 Jan 2025 17:05:56 -0800


hiltontj commented on code in PR #14189:
URL: https://github.com/apache/datafusion/pull/14189#discussion_r1922946755



##########
datafusion/physical-expr/src/analysis.rs:
##########
@@ -246,3 +246,124 @@ fn calculate_selectivity(
             acc * cardinality_ratio(&initial.interval, &target.interval)
         })
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion_common::{assert_contains, DFSchema};
+    use datafusion_expr::{
+        col, execution_props::ExecutionProps, interval_arithmetic::Interval, 
lit, Expr,
+    };
+
+    use crate::{create_physical_expr, AnalysisContext};
+
+    use super::{analyze, ExprBoundaries};
+
+    fn make_field(name: &str, data_type: DataType) -> Field {
+        let nullable = false;
+        Field::new(name, data_type, nullable)
+    }
+
+    #[test]
+    fn test_analyze_boundary_exprs() {
+        let schema = Arc::new(Schema::new(vec![make_field("a", 
DataType::Int32)]));
+
+        /// Test case containing (expression tree, lower bound, upper bound)
+        type TestCase = (Expr, Option<i32>, Option<i32>);
+
+        let test_cases: Vec<TestCase> = vec![
+            // a > 10
+            (col("a").gt(lit(10)), Some(11), None),
+            // a < 20
+            (col("a").lt(lit(20)), None, Some(19)),
+            // a > 10 AND a < 20
+            (
+                col("a").gt(lit(10)).and(col("a").lt(lit(20))),
+                Some(11),
+                Some(19),
+            ),
+            // a >= 10
+            (col("a").gt_eq(lit(10)), Some(10), None),
+            // a <= 20
+            (col("a").lt_eq(lit(20)), None, Some(20)),
+            // a >= 10 AND a <= 20
+            (
+                col("a").gt_eq(lit(10)).and(col("a").lt_eq(lit(20))),
+                Some(10),
+                Some(20),
+            ),
+            // a > 10 AND a < 20 AND a < 15
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").lt(lit(15))),
+                Some(11),
+                Some(14),
+            ),
+            // (a > 10 AND a < 20) AND (a > 15 AND a < 25)
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").gt(lit(15)))
+                    .and(col("a").lt(lit(25))),
+                Some(16),
+                Some(19),
+            ),
+            // (a > 10 AND a < 20) AND (a > 20 AND a < 30)
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").gt(lit(20)))
+                    .and(col("a").lt(lit(30))),
+                None,
+                None,
+            ),
+        ];
+        for (expr, lower, upper) in test_cases {
+            let boundaries = 
ExprBoundaries::try_new_unbounded(&schema).unwrap();
+            let df_schema = DFSchema::try_from(Arc::clone(&schema)).unwrap();
+            let physical_expr =
+                create_physical_expr(&expr, &df_schema, 
&ExecutionProps::new()).unwrap();
+            let analysis_result = analyze(
+                &physical_expr,
+                AnalysisContext::new(boundaries),
+                df_schema.as_ref(),
+            )
+            .unwrap();
+            let actual = &analysis_result.boundaries[0].interval;
+            let expected = Interval::make(lower, upper).unwrap();
+            assert_eq!(
+                &expected, actual,
+                "did not get correct interval for SQL expression: {expr:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_analyze_invalid_boundary_exprs() {
+        let schema = Arc::new(Schema::new(vec![make_field("a", 
DataType::Int32)]));
+        type TestCase = (Expr, &'static str);
+        let test_cases: Vec<TestCase> = vec![(

Review Comment:
   > 3. The analysis results will be [None,None], just like in this example, 
which is an empty-set actually. If you evaluate a<0 AND a>0 in interval 
arithmetic tools, you will get the result of [false, false] (CERTAINLY_FALSE). 
So, there is nothing wrong there. However, in the scope of AnalysisContext, 
these are mixed up. The solution would be an another Option<> wrapper, 
representing the evaluation comes up with a valid range, or the result is an 
empty-set.
   
   Perhaps it would be best to log an issue for this. I don't know if this is 
something I can focus on at the moment, but perhaps another contributor will 
like to pick it up.
   
   ---
   
   Otherwise, I can simplify that test case to remove the vector wrapping  the 
single element as that is confusing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] test: interval analysis unit tests [datafusion]

Reply via email to