This is an automated email from the ASF dual-hosted git repository.
xudong963 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 43ee992566 Remove redundant code in favor of min/max/add (#15659)
43ee992566 is described below
commit 43ee992566066a5eb2f0f79b6551a1552857a1a5
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Apr 10 08:47:27 2025 -0400
Remove redundant code in favor of min/max/add (#15659)
---
datafusion/core/src/datasource/listing/table.rs | 3 +-
datafusion/datasource/src/mod.rs | 2 +
datafusion/datasource/src/statistics.rs | 96 ++++---------------------
3 files changed, 17 insertions(+), 84 deletions(-)
diff --git a/datafusion/core/src/datasource/listing/table.rs
b/datafusion/core/src/datasource/listing/table.rs
index 5848506da2..f32a32355c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -54,7 +54,6 @@ use datafusion_physical_expr::{
use async_trait::async_trait;
use datafusion_catalog::Session;
use datafusion_common::stats::Precision;
-use datafusion_datasource::add_row_stats;
use datafusion_datasource::compute_all_files_statistics;
use datafusion_datasource::file_groups::FileGroup;
use datafusion_physical_expr_common::sort_expr::LexRequirement;
@@ -1230,7 +1229,7 @@ async fn get_files_with_limit(
file_stats.num_rows
} else {
// For subsequent files, accumulate the counts
- add_row_stats(num_rows, file_stats.num_rows)
+ num_rows.add(&file_stats.num_rows)
};
}
}
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
index c02f84c74d..b93e917c94 100644
--- a/datafusion/datasource/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -58,6 +58,8 @@ use file_meta::FileMeta;
use futures::{Stream, StreamExt};
use object_store::{path::Path, ObjectMeta};
use object_store::{GetOptions, GetRange, ObjectStore};
+// Remove when add_row_stats is remove
+#[allow(deprecated)]
pub use statistics::add_row_stats;
pub use statistics::compute_all_files_statistics;
use std::ops::Range;
diff --git a/datafusion/datasource/src/statistics.rs
b/datafusion/datasource/src/statistics.rs
index e1a91c0533..1c3d1111e5 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -21,7 +21,6 @@
//! respect to the required sort order. See [`MinMaxStatistics`]
use futures::{Stream, StreamExt};
-use std::mem;
use std::sync::Arc;
use crate::file_groups::FileGroup;
@@ -34,7 +33,6 @@ use arrow::{
row::{Row, Rows},
};
use datafusion_common::stats::Precision;
-use datafusion_common::ScalarValue;
use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError,
Result};
use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr};
use datafusion_physical_expr_common::sort_expr::LexOrdering;
@@ -357,10 +355,9 @@ pub async fn get_statistics_with_limit(
// counts across all the files in question. If any file does
not
// provide any information or provides an inexact value, we
demote
// the statistic precision to inexact.
- num_rows = add_row_stats(file_stats.num_rows, num_rows);
+ num_rows = num_rows.add(&file_stats.num_rows);
- total_byte_size =
- add_row_stats(file_stats.total_byte_size, total_byte_size);
+ total_byte_size =
total_byte_size.add(&file_stats.total_byte_size);
for (file_col_stats, col_stats) in file_stats
.column_statistics
@@ -375,10 +372,10 @@ pub async fn get_statistics_with_limit(
distinct_count: _,
} = file_col_stats;
- col_stats.null_count = add_row_stats(*file_nc,
col_stats.null_count);
- set_max_if_greater(file_max, &mut col_stats.max_value);
- set_min_if_lesser(file_min, &mut col_stats.min_value);
- col_stats.sum_value = file_sum.add(&col_stats.sum_value);
+ col_stats.null_count = col_stats.null_count.add(file_nc);
+ col_stats.max_value = col_stats.max_value.max(file_max);
+ col_stats.min_value = col_stats.min_value.min(file_min);
+ col_stats.sum_value = col_stats.sum_value.add(file_sum);
}
// If the number of rows exceeds the limit, we can stop
processing
@@ -441,8 +438,8 @@ where
}
// Accumulate statistics for subsequent items
- num_rows = add_row_stats(item_stats.num_rows, num_rows);
- total_byte_size = add_row_stats(item_stats.total_byte_size,
total_byte_size);
+ num_rows = num_rows.add(&item_stats.num_rows);
+ total_byte_size = total_byte_size.add(&item_stats.total_byte_size);
for (item_col_stats, col_stats) in item_stats
.column_statistics
@@ -450,10 +447,10 @@ where
.zip(col_stats_set.iter_mut())
{
col_stats.null_count =
- add_row_stats(item_col_stats.null_count,
col_stats.null_count);
- set_max_if_greater(&item_col_stats.max_value, &mut
col_stats.max_value);
- set_min_if_lesser(&item_col_stats.min_value, &mut
col_stats.min_value);
- col_stats.sum_value =
item_col_stats.sum_value.add(&col_stats.sum_value);
+ col_stats.null_count.add(&item_col_stats.null_count);
+ col_stats.max_value =
col_stats.max_value.max(&item_col_stats.max_value);
+ col_stats.min_value =
col_stats.min_value.min(&item_col_stats.min_value);
+ col_stats.sum_value =
col_stats.sum_value.add(&item_col_stats.sum_value);
}
}
}
@@ -545,77 +542,12 @@ pub fn compute_all_files_statistics(
Ok((file_groups_with_stats, statistics))
}
+#[deprecated(since = "47.0.0", note = "Use Statistics::add")]
pub fn add_row_stats(
file_num_rows: Precision<usize>,
num_rows: Precision<usize>,
) -> Precision<usize> {
- match (file_num_rows, &num_rows) {
- (Precision::Absent, _) => num_rows.to_inexact(),
- (lhs, Precision::Absent) => lhs.to_inexact(),
- (lhs, rhs) => lhs.add(rhs),
- }
-}
-
-/// If the given value is numerically greater than the original maximum value,
-/// return the new maximum value with appropriate exactness information.
-fn set_max_if_greater(
- max_nominee: &Precision<ScalarValue>,
- max_value: &mut Precision<ScalarValue>,
-) {
- match (&max_value, max_nominee) {
- (Precision::Exact(val1), Precision::Exact(val2)) if val1 < val2 => {
- *max_value = max_nominee.clone();
- }
- (Precision::Exact(val1), Precision::Inexact(val2))
- | (Precision::Inexact(val1), Precision::Inexact(val2))
- | (Precision::Inexact(val1), Precision::Exact(val2))
- if val1 < val2 =>
- {
- *max_value = max_nominee.clone().to_inexact();
- }
- (Precision::Exact(_), Precision::Absent) => {
- let exact_max = mem::take(max_value);
- *max_value = exact_max.to_inexact();
- }
- (Precision::Absent, Precision::Exact(_)) => {
- *max_value = max_nominee.clone().to_inexact();
- }
- (Precision::Absent, Precision::Inexact(_)) => {
- *max_value = max_nominee.clone();
- }
- _ => {}
- }
-}
-
-/// If the given value is numerically lesser than the original minimum value,
-/// return the new minimum value with appropriate exactness information.
-fn set_min_if_lesser(
- min_nominee: &Precision<ScalarValue>,
- min_value: &mut Precision<ScalarValue>,
-) {
- match (&min_value, min_nominee) {
- (Precision::Exact(val1), Precision::Exact(val2)) if val1 > val2 => {
- *min_value = min_nominee.clone();
- }
- (Precision::Exact(val1), Precision::Inexact(val2))
- | (Precision::Inexact(val1), Precision::Inexact(val2))
- | (Precision::Inexact(val1), Precision::Exact(val2))
- if val1 > val2 =>
- {
- *min_value = min_nominee.clone().to_inexact();
- }
- (Precision::Exact(_), Precision::Absent) => {
- let exact_min = mem::take(min_value);
- *min_value = exact_min.to_inexact();
- }
- (Precision::Absent, Precision::Exact(_)) => {
- *min_value = min_nominee.clone().to_inexact();
- }
- (Precision::Absent, Precision::Inexact(_)) => {
- *min_value = min_nominee.clone();
- }
- _ => {}
- }
+ file_num_rows.add(&num_rows)
}
#[cfg(test)]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]