berkaysynnada commented on code in PR #6982:
URL: https://github.com/apache/arrow-datafusion/pull/6982#discussion_r1268058585
##########
datafusion/physical-expr/src/intervals/interval_aritmetic.rs:
##########
@@ -1172,4 +1321,70 @@ mod tests {
let upper = 1.5;
capture_mode_change_f32((lower, upper), true, true);
}
+
+ #[test]
+ fn test_cardinality_of_intervals() -> Result<()> {
+ // In IEEE 754 standard for floating-point arithmetic, if we keep the
sign and exponent fields same,
+ // we can represent 4503599627370496 different numbers by changing the
mantissa
+ // (4503599627370496 = 2^52, since there are 52 bits in mantissa, and
2^23 = 8388608 for f32).
+ let distinct_f64 = 4503599627370496;
+ let distinct_f32 = 8388608;
+ let intervals = [
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.25), false),
+ IntervalBound::new(ScalarValue::from(0.50), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.5), false),
+ IntervalBound::new(ScalarValue::from(1.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(1.0), false),
+ IntervalBound::new(ScalarValue::from(2.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(32.0), false),
+ IntervalBound::new(ScalarValue::from(64.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-0.50), false),
+ IntervalBound::new(ScalarValue::from(-0.25), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-32.0), false),
+ IntervalBound::new(ScalarValue::from(-16.0), true),
+ ),
+ ];
+ for interval in intervals {
+ assert_eq!(interval.cardinality()?, distinct_f64);
+ }
+
+ let intervals = [
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.25_f32), false),
+ IntervalBound::new(ScalarValue::from(0.50_f32), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-1_f32), false),
+ IntervalBound::new(ScalarValue::from(-0.5_f32), true),
+ ),
+ ];
+ for interval in intervals {
+ assert_eq!(interval.cardinality()?, distinct_f32);
+ }
+
+ let interval = Interval::new(
+ IntervalBound::new(ScalarValue::from(-0.0625), false),
+ IntervalBound::new(ScalarValue::from(0.0625), true),
+ );
+ assert_eq!(interval.cardinality()?, distinct_f64 * 2_048);
Review Comment:
| | Interval | Distinct Count | Number of Floating Values |
| -- | -- | -- | -- |
| before Filter | [-1, 1] | 100 | 1 B |
| after Filter | [0, 1] | 50 or 100? | 0.5 B |
The selectivity actually decreases by 50% in this example. However, the
distinct count parameter (stored in ExprBoundaries along with the column's
interval) is not updated with approximate information (i.e., it is not set
to 50). I think selectivity can be calculated and used approximately, but
unless we are sure, we should not update the interval and distinct count
parameters.
##########
datafusion/physical-expr/src/intervals/interval_aritmetic.rs:
##########
@@ -1172,4 +1321,70 @@ mod tests {
let upper = 1.5;
capture_mode_change_f32((lower, upper), true, true);
}
+
+ #[test]
+ fn test_cardinality_of_intervals() -> Result<()> {
+ // In IEEE 754 standard for floating-point arithmetic, if we keep the
sign and exponent fields same,
+ // we can represent 4503599627370496 different numbers by changing the
mantissa
+ // (4503599627370496 = 2^52, since there are 52 bits in mantissa, and
2^23 = 8388608 for f32).
+ let distinct_f64 = 4503599627370496;
+ let distinct_f32 = 8388608;
+ let intervals = [
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.25), false),
+ IntervalBound::new(ScalarValue::from(0.50), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.5), false),
+ IntervalBound::new(ScalarValue::from(1.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(1.0), false),
+ IntervalBound::new(ScalarValue::from(2.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(32.0), false),
+ IntervalBound::new(ScalarValue::from(64.0), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-0.50), false),
+ IntervalBound::new(ScalarValue::from(-0.25), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-32.0), false),
+ IntervalBound::new(ScalarValue::from(-16.0), true),
+ ),
+ ];
+ for interval in intervals {
+ assert_eq!(interval.cardinality()?, distinct_f64);
+ }
+
+ let intervals = [
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(0.25_f32), false),
+ IntervalBound::new(ScalarValue::from(0.50_f32), true),
+ ),
+ Interval::new(
+ IntervalBound::new(ScalarValue::from(-1_f32), false),
+ IntervalBound::new(ScalarValue::from(-0.5_f32), true),
+ ),
+ ];
+ for interval in intervals {
+ assert_eq!(interval.cardinality()?, distinct_f32);
+ }
+
+ let interval = Interval::new(
+ IntervalBound::new(ScalarValue::from(-0.0625), false),
+ IntervalBound::new(ScalarValue::from(0.0625), true),
+ );
+ assert_eq!(interval.cardinality()?, distinct_f64 * 2_048);
Review Comment:
| | Interval | Distinct Count | Number of Floating Values |
| -- | -- | -- | -- |
| before Filter | [-1, 1] | 100 | 1 B |
| after Filter | [0, 1] | 50 or 100? | 0.5 B |
The selectivity actually decreases by 50% in this example. However, the
distinct count parameter (stored in ExprBoundaries along with the column's
interval) is not updated with approximate information (i.e., it is not set
to 50). I think selectivity can be calculated and used approximately, but
unless we are sure, we should not update the interval and distinct count
parameters.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]