Re: [PR] test: interval analysis unit tests [datafusion]

via GitHub Mon, 20 Jan 2025 07:00:18 -0800


hiltontj commented on code in PR #14189:
URL: https://github.com/apache/datafusion/pull/14189#discussion_r1922530938



##########
datafusion/physical-expr/src/analysis.rs:
##########
@@ -246,3 +246,124 @@ fn calculate_selectivity(
             acc * cardinality_ratio(&initial.interval, &target.interval)
         })
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow_schema::{DataType, Field, Schema};
+    use datafusion_common::{assert_contains, DFSchema};
+    use datafusion_expr::{
+        col, execution_props::ExecutionProps, interval_arithmetic::Interval, 
lit, Expr,
+    };
+
+    use crate::{create_physical_expr, AnalysisContext};
+
+    use super::{analyze, ExprBoundaries};
+
+    fn make_field(name: &str, data_type: DataType) -> Field {
+        let nullable = false;
+        Field::new(name, data_type, nullable)
+    }
+
+    #[test]
+    fn test_analyze_boundary_exprs() {
+        let schema = Arc::new(Schema::new(vec![make_field("a", 
DataType::Int32)]));
+
+        /// Test case containing (expression tree, lower bound, upper bound)
+        type TestCase = (Expr, Option<i32>, Option<i32>);
+
+        let test_cases: Vec<TestCase> = vec![
+            // a > 10
+            (col("a").gt(lit(10)), Some(11), None),
+            // a < 20
+            (col("a").lt(lit(20)), None, Some(19)),
+            // a > 10 AND a < 20
+            (
+                col("a").gt(lit(10)).and(col("a").lt(lit(20))),
+                Some(11),
+                Some(19),
+            ),
+            // a >= 10
+            (col("a").gt_eq(lit(10)), Some(10), None),
+            // a <= 20
+            (col("a").lt_eq(lit(20)), None, Some(20)),
+            // a >= 10 AND a <= 20
+            (
+                col("a").gt_eq(lit(10)).and(col("a").lt_eq(lit(20))),
+                Some(10),
+                Some(20),
+            ),
+            // a > 10 AND a < 20 AND a < 15
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").lt(lit(15))),
+                Some(11),
+                Some(14),
+            ),
+            // (a > 10 AND a < 20) AND (a > 15 AND a < 25)
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").gt(lit(15)))
+                    .and(col("a").lt(lit(25))),
+                Some(16),
+                Some(19),
+            ),
+            // (a > 10 AND a < 20) AND (a > 20 AND a < 30)
+            (
+                col("a")
+                    .gt(lit(10))
+                    .and(col("a").lt(lit(20)))
+                    .and(col("a").gt(lit(20)))
+                    .and(col("a").lt(lit(30))),
+                None,
+                None,
+            ),
+        ];
+        for (expr, lower, upper) in test_cases {
+            let boundaries = 
ExprBoundaries::try_new_unbounded(&schema).unwrap();
+            let df_schema = DFSchema::try_from(Arc::clone(&schema)).unwrap();
+            let physical_expr =
+                create_physical_expr(&expr, &df_schema, 
&ExecutionProps::new()).unwrap();
+            let analysis_result = analyze(
+                &physical_expr,
+                AnalysisContext::new(boundaries),
+                df_schema.as_ref(),
+            )
+            .unwrap();
+            let actual = &analysis_result.boundaries[0].interval;
+            let expected = Interval::make(lower, upper).unwrap();
+            assert_eq!(
+                &expected, actual,
+                "did not get correct interval for SQL expression: {expr:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_analyze_invalid_boundary_exprs() {
+        let schema = Arc::new(Schema::new(vec![make_field("a", 
DataType::Int32)]));
+        type TestCase = (Expr, &'static str);
+        let test_cases: Vec<TestCase> = vec![(

Review Comment:
   Originally I set it up this way because I thought the empty-set result would
be an error. However, the test case in the other test in this PR for:
   ```
   (a > 10 AND a < 20) AND (a > 20 AND a < 30)
   ```
   results in `None`/`None` instead of an error. (I believe that is what you
are referring to by _empty-set_?)
   
   There are other things I have noticed while using this `analyze` method in 
my work at Influx (see https://github.com/influxdata/influxdb/pull/25866, and 
specifically 
[here](https://github.com/influxdata/influxdb/blob/daa3fe700aa26b9fcdae00aa77f8c01968b21ee4/influxdb3_write/src/lib.rs#L538)).
 The function produces an error for various expression types, for example
`IsNotNull`.
   
   So, I could extend this test to check a wider range of expression types if
that would be helpful - either in this PR or in a follow-up PR. If not, I can
trim this test down to that single case.
   
   I'm open to your recommendation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] test: interval analysis unit tests [datafusion]

Reply via email to