[GitHub] [arrow-datafusion] berkaysynnada commented on a diff in pull request #6982: Refactor AnalysisContext and statistics() of FilterExec

via GitHub Wed, 19 Jul 2023 06:15:17 -0700


berkaysynnada commented on code in PR #6982:
URL: https://github.com/apache/arrow-datafusion/pull/6982#discussion_r1268058585



##########
datafusion/physical-expr/src/intervals/interval_aritmetic.rs:
##########
@@ -1172,4 +1321,70 @@ mod tests {
         let upper = 1.5;
         capture_mode_change_f32((lower, upper), true, true);
     }
+
+    #[test]
+    fn test_cardinality_of_intervals() -> Result<()> {
+        // In IEEE 754 standard for floating-point arithmetic, if we keep the 
sign and exponent fields same,
+        // we can represent 4503599627370496 different numbers by changing the 
mantissa
+        // (4503599627370496 = 2^52, since there are 52 bits in mantissa, and 
2^23 = 8388608 for f32).
+        let distinct_f64 = 4503599627370496;
+        let distinct_f32 = 8388608;
+        let intervals = [
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.25), false),
+                IntervalBound::new(ScalarValue::from(0.50), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.5), false),
+                IntervalBound::new(ScalarValue::from(1.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(1.0), false),
+                IntervalBound::new(ScalarValue::from(2.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(32.0), false),
+                IntervalBound::new(ScalarValue::from(64.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-0.50), false),
+                IntervalBound::new(ScalarValue::from(-0.25), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-32.0), false),
+                IntervalBound::new(ScalarValue::from(-16.0), true),
+            ),
+        ];
+        for interval in intervals {
+            assert_eq!(interval.cardinality()?, distinct_f64);
+        }
+
+        let intervals = [
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.25_f32), false),
+                IntervalBound::new(ScalarValue::from(0.50_f32), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-1_f32), false),
+                IntervalBound::new(ScalarValue::from(-0.5_f32), true),
+            ),
+        ];
+        for interval in intervals {
+            assert_eq!(interval.cardinality()?, distinct_f32);
+        }
+
+        let interval = Interval::new(
+            IntervalBound::new(ScalarValue::from(-0.0625), false),
+            IntervalBound::new(ScalarValue::from(0.0625), true),
+        );
+        assert_eq!(interval.cardinality()?, distinct_f64 * 2_048);

Review Comment:
   
   
   |   | Interval | Distinct Count | Number of Floating Values |
   | -- | -- | -- | -- |
   | before Filter | [-1, 1] | 100 | 1 B |
   | after Filter | [0, 1] | 50 or 100? | 0.5 B |
   
   The selectivity actually decreases by 50% in this example. However, the 
distinct count parameter (stored in ExprBoundaries together with the interval 
of the column) is not updated with approximate information (i.e., it is not 
set to 50). In my opinion, selectivity can be calculated and used 
approximately, but unless we are sure, we should not update the interval and 
distinct count parameters.



##########
datafusion/physical-expr/src/intervals/interval_aritmetic.rs:
##########
@@ -1172,4 +1321,70 @@ mod tests {
         let upper = 1.5;
         capture_mode_change_f32((lower, upper), true, true);
     }
+
+    #[test]
+    fn test_cardinality_of_intervals() -> Result<()> {
+        // In IEEE 754 standard for floating-point arithmetic, if we keep the 
sign and exponent fields same,
+        // we can represent 4503599627370496 different numbers by changing the 
mantissa
+        // (4503599627370496 = 2^52, since there are 52 bits in mantissa, and 
2^23 = 8388608 for f32).
+        let distinct_f64 = 4503599627370496;
+        let distinct_f32 = 8388608;
+        let intervals = [
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.25), false),
+                IntervalBound::new(ScalarValue::from(0.50), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.5), false),
+                IntervalBound::new(ScalarValue::from(1.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(1.0), false),
+                IntervalBound::new(ScalarValue::from(2.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(32.0), false),
+                IntervalBound::new(ScalarValue::from(64.0), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-0.50), false),
+                IntervalBound::new(ScalarValue::from(-0.25), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-32.0), false),
+                IntervalBound::new(ScalarValue::from(-16.0), true),
+            ),
+        ];
+        for interval in intervals {
+            assert_eq!(interval.cardinality()?, distinct_f64);
+        }
+
+        let intervals = [
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(0.25_f32), false),
+                IntervalBound::new(ScalarValue::from(0.50_f32), true),
+            ),
+            Interval::new(
+                IntervalBound::new(ScalarValue::from(-1_f32), false),
+                IntervalBound::new(ScalarValue::from(-0.5_f32), true),
+            ),
+        ];
+        for interval in intervals {
+            assert_eq!(interval.cardinality()?, distinct_f32);
+        }
+
+        let interval = Interval::new(
+            IntervalBound::new(ScalarValue::from(-0.0625), false),
+            IntervalBound::new(ScalarValue::from(0.0625), true),
+        );
+        assert_eq!(interval.cardinality()?, distinct_f64 * 2_048);

Review Comment:
   
   
   |   | Interval | Distinct Count | Number of Floating Values |
   | -- | -- | -- | -- |
   | before Filter | [-1, 1] | 100 | 1 B |
   | after Filter | [0, 1] | 50 or 100? | 0.5 B |
   
   The selectivity actually decreases by 50% in this example. However, the 
distinct count parameter (stored in ExprBoundaries together with the interval 
of the column) is not updated with approximate information (i.e., it is not 
set to 50). In my opinion, selectivity can be calculated and used 
approximately, but unless we are sure, we should not update the interval and 
distinct count parameters.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-datafusion] berkaysynnada commented on a diff in pull request #6982: Refactor AnalysisContext and statistics() of FilterExec

Reply via email to