xudong963 commented on code in PR #21430:
URL: https://github.com/apache/datafusion/pull/21430#discussion_r3043365672
##########
datafusion/physical-plan/src/union.rs:
##########
@@ -959,294 +903,113 @@ mod tests {
Ok(())
}
- #[tokio::test]
- async fn test_stats_union() {
- let left = Statistics {
- num_rows: Precision::Exact(5),
- total_byte_size: Precision::Exact(23),
- column_statistics: vec![
- ColumnStatistics {
- distinct_count: Precision::Exact(5),
- max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
- min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
- sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
- null_count: Precision::Exact(0),
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Exact(1),
- max_value: Precision::Exact(ScalarValue::from("x")),
- min_value: Precision::Exact(ScalarValue::from("a")),
- sum_value: Precision::Absent,
- null_count: Precision::Exact(3),
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Absent,
- max_value:
Precision::Exact(ScalarValue::Float32(Some(1.1))),
- min_value:
Precision::Exact(ScalarValue::Float32(Some(0.1))),
- sum_value:
Precision::Exact(ScalarValue::Float32(Some(42.0))),
- null_count: Precision::Absent,
- byte_size: Precision::Absent,
- },
- ],
- };
+ fn stats_merge_inputs() -> (SchemaRef, Statistics, Statistics, Statistics)
{
+ let schema = Arc::new(Schema::new(vec![Field::new("a",
DataType::UInt32, true)]));
+
+ let left = Statistics::default()
+ .with_num_rows(Precision::Exact(5))
+ .with_total_byte_size(Precision::Exact(23))
+ .add_column_statistics(
+ ColumnStatistics::new_unknown()
+ .with_distinct_count(Precision::Exact(5))
+
.with_min_value(Precision::Exact(ScalarValue::UInt32(Some(1))))
+
.with_max_value(Precision::Exact(ScalarValue::UInt32(Some(21))))
+
.with_sum_value(Precision::Exact(ScalarValue::UInt32(Some(42))))
+ .with_null_count(Precision::Exact(0))
+ .with_byte_size(Precision::Exact(40)),
+ );
- let right = Statistics {
- num_rows: Precision::Exact(7),
- total_byte_size: Precision::Exact(29),
- column_statistics: vec![
- ColumnStatistics {
- distinct_count: Precision::Exact(3),
- max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
- min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
- sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
- null_count: Precision::Exact(1),
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Absent,
- max_value: Precision::Exact(ScalarValue::from("c")),
- min_value: Precision::Exact(ScalarValue::from("b")),
- sum_value: Precision::Absent,
- null_count: Precision::Absent,
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Absent,
- max_value: Precision::Absent,
- min_value: Precision::Absent,
- sum_value: Precision::Absent,
- null_count: Precision::Absent,
- byte_size: Precision::Absent,
- },
- ],
- };
+ let right = Statistics::default()
+ .with_num_rows(Precision::Exact(7))
+ .with_total_byte_size(Precision::Exact(29))
+ .add_column_statistics(
+ ColumnStatistics::new_unknown()
+ .with_distinct_count(Precision::Exact(3))
+
.with_min_value(Precision::Exact(ScalarValue::UInt32(Some(22))))
+
.with_max_value(Precision::Exact(ScalarValue::UInt32(Some(34))))
+
.with_sum_value(Precision::Exact(ScalarValue::UInt32(Some(8))))
+ .with_null_count(Precision::Exact(1))
+ .with_byte_size(Precision::Exact(60)),
+ );
- let result = stats_union(left, right);
- let expected = Statistics {
- num_rows: Precision::Exact(12),
- total_byte_size: Precision::Exact(52),
- column_statistics: vec![
- ColumnStatistics {
- distinct_count: Precision::Inexact(6),
- max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
- min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
- sum_value: Precision::Exact(ScalarValue::Int64(Some(84))),
- null_count: Precision::Exact(1),
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Absent,
- max_value: Precision::Exact(ScalarValue::from("x")),
- min_value: Precision::Exact(ScalarValue::from("a")),
- sum_value: Precision::Absent,
- null_count: Precision::Absent,
- byte_size: Precision::Absent,
- },
- ColumnStatistics {
- distinct_count: Precision::Absent,
- max_value: Precision::Absent,
- min_value: Precision::Absent,
- sum_value: Precision::Absent,
- null_count: Precision::Absent,
- byte_size: Precision::Absent,
- },
- ],
- };
+ let expected = Statistics::default()
+ .with_num_rows(Precision::Exact(12))
+ .with_total_byte_size(Precision::Exact(52))
+ .add_column_statistics(
+ ColumnStatistics::new_unknown()
+ .with_distinct_count(Precision::Inexact(8))
+
.with_min_value(Precision::Exact(ScalarValue::UInt32(Some(1))))
+
.with_max_value(Precision::Exact(ScalarValue::UInt32(Some(34))))
+
.with_sum_value(Precision::Exact(ScalarValue::UInt64(Some(50))))
+ .with_null_count(Precision::Exact(1))
+ .with_byte_size(Precision::Exact(100)),
+ );
- assert_eq!(result, expected);
+ (schema, left, right, expected)
}
#[test]
- fn test_union_distinct_count() {
Review Comment:
There is a significant concern with the test coverage reduction.
The old `test_union_distinct_count` contained 15 carefully constructed edge
cases for NDV estimation: disjoint ranges, identical ranges, partial overlap,
containment, constant values, absent values, and mixed precision types.
These tests validated the correctness of `estimate_ndv_with_overlap` as used
in the union context. The new tests use a single scenario (non-overlapping
ranges, single column) and rely on the assumption that `try_merge_iter` is
already well-tested elsewhere.
##########
testing:
##########
Review Comment:
is it expected?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]