This is an automated email from the ASF dual-hosted git repository.

xudong963 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 43ee992566 Remove redundant code in favor of min/max/add (#15659)
43ee992566 is described below

commit 43ee992566066a5eb2f0f79b6551a1552857a1a5
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Apr 10 08:47:27 2025 -0400

    Remove redundant code in favor of min/max/add (#15659)
---
 datafusion/core/src/datasource/listing/table.rs |  3 +-
 datafusion/datasource/src/mod.rs                |  2 +
 datafusion/datasource/src/statistics.rs         | 96 ++++---------------------
 3 files changed, 17 insertions(+), 84 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 5848506da2..f32a32355c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -54,7 +54,6 @@ use datafusion_physical_expr::{
 use async_trait::async_trait;
 use datafusion_catalog::Session;
 use datafusion_common::stats::Precision;
-use datafusion_datasource::add_row_stats;
 use datafusion_datasource::compute_all_files_statistics;
 use datafusion_datasource::file_groups::FileGroup;
 use datafusion_physical_expr_common::sort_expr::LexRequirement;
@@ -1230,7 +1229,7 @@ async fn get_files_with_limit(
                     file_stats.num_rows
                 } else {
                     // For subsequent files, accumulate the counts
-                    add_row_stats(num_rows, file_stats.num_rows)
+                    num_rows.add(&file_stats.num_rows)
                 };
             }
         }
diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs
index c02f84c74d..b93e917c94 100644
--- a/datafusion/datasource/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -58,6 +58,8 @@ use file_meta::FileMeta;
 use futures::{Stream, StreamExt};
 use object_store::{path::Path, ObjectMeta};
 use object_store::{GetOptions, GetRange, ObjectStore};
+// Remove when add_row_stats is removed
+#[allow(deprecated)]
 pub use statistics::add_row_stats;
 pub use statistics::compute_all_files_statistics;
 use std::ops::Range;
diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs
index e1a91c0533..1c3d1111e5 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -21,7 +21,6 @@
 //! respect to the required sort order. See [`MinMaxStatistics`]
 
 use futures::{Stream, StreamExt};
-use std::mem;
 use std::sync::Arc;
 
 use crate::file_groups::FileGroup;
@@ -34,7 +33,6 @@ use arrow::{
     row::{Row, Rows},
 };
 use datafusion_common::stats::Precision;
-use datafusion_common::ScalarValue;
 use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError, Result};
 use datafusion_physical_expr::{expressions::Column, PhysicalSortExpr};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
@@ -357,10 +355,9 @@ pub async fn get_statistics_with_limit(
                 // counts across all the files in question. If any file does not
                 // provide any information or provides an inexact value, we demote
                 // the statistic precision to inexact.
-                num_rows = add_row_stats(file_stats.num_rows, num_rows);
+                num_rows = num_rows.add(&file_stats.num_rows);
 
-                total_byte_size =
-                    add_row_stats(file_stats.total_byte_size, total_byte_size);
+                total_byte_size = total_byte_size.add(&file_stats.total_byte_size);
 
                 for (file_col_stats, col_stats) in file_stats
                     .column_statistics
@@ -375,10 +372,10 @@ pub async fn get_statistics_with_limit(
                         distinct_count: _,
                     } = file_col_stats;
 
-                    col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count);
-                    set_max_if_greater(file_max, &mut col_stats.max_value);
-                    set_min_if_lesser(file_min, &mut col_stats.min_value);
-                    col_stats.sum_value = file_sum.add(&col_stats.sum_value);
+                    col_stats.null_count = col_stats.null_count.add(file_nc);
+                    col_stats.max_value = col_stats.max_value.max(file_max);
+                    col_stats.min_value = col_stats.min_value.min(file_min);
+                    col_stats.sum_value = col_stats.sum_value.add(file_sum);
                 }
 
                 // If the number of rows exceeds the limit, we can stop processing
@@ -441,8 +438,8 @@ where
             }
 
             // Accumulate statistics for subsequent items
-            num_rows = add_row_stats(item_stats.num_rows, num_rows);
-            total_byte_size = add_row_stats(item_stats.total_byte_size, total_byte_size);
+            num_rows = num_rows.add(&item_stats.num_rows);
+            total_byte_size = total_byte_size.add(&item_stats.total_byte_size);
 
             for (item_col_stats, col_stats) in item_stats
                 .column_statistics
@@ -450,10 +447,10 @@ where
                 .zip(col_stats_set.iter_mut())
             {
                 col_stats.null_count =
-                    add_row_stats(item_col_stats.null_count, col_stats.null_count);
-                set_max_if_greater(&item_col_stats.max_value, &mut col_stats.max_value);
-                set_min_if_lesser(&item_col_stats.min_value, &mut col_stats.min_value);
-                col_stats.sum_value = item_col_stats.sum_value.add(&col_stats.sum_value);
+                    col_stats.null_count.add(&item_col_stats.null_count);
+                col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
+                col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
+                col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
             }
         }
     }
@@ -545,77 +542,12 @@ pub fn compute_all_files_statistics(
     Ok((file_groups_with_stats, statistics))
 }
 
+#[deprecated(since = "47.0.0", note = "Use Statistics::add")]
 pub fn add_row_stats(
     file_num_rows: Precision<usize>,
     num_rows: Precision<usize>,
 ) -> Precision<usize> {
-    match (file_num_rows, &num_rows) {
-        (Precision::Absent, _) => num_rows.to_inexact(),
-        (lhs, Precision::Absent) => lhs.to_inexact(),
-        (lhs, rhs) => lhs.add(rhs),
-    }
-}
-
-/// If the given value is numerically greater than the original maximum value,
-/// return the new maximum value with appropriate exactness information.
-fn set_max_if_greater(
-    max_nominee: &Precision<ScalarValue>,
-    max_value: &mut Precision<ScalarValue>,
-) {
-    match (&max_value, max_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 < val2 => {
-            *max_value = max_nominee.clone();
-        }
-        (Precision::Exact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Exact(val2))
-            if val1 < val2 =>
-        {
-            *max_value = max_nominee.clone().to_inexact();
-        }
-        (Precision::Exact(_), Precision::Absent) => {
-            let exact_max = mem::take(max_value);
-            *max_value = exact_max.to_inexact();
-        }
-        (Precision::Absent, Precision::Exact(_)) => {
-            *max_value = max_nominee.clone().to_inexact();
-        }
-        (Precision::Absent, Precision::Inexact(_)) => {
-            *max_value = max_nominee.clone();
-        }
-        _ => {}
-    }
-}
-
-/// If the given value is numerically lesser than the original minimum value,
-/// return the new minimum value with appropriate exactness information.
-fn set_min_if_lesser(
-    min_nominee: &Precision<ScalarValue>,
-    min_value: &mut Precision<ScalarValue>,
-) {
-    match (&min_value, min_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 > val2 => {
-            *min_value = min_nominee.clone();
-        }
-        (Precision::Exact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Exact(val2))
-            if val1 > val2 =>
-        {
-            *min_value = min_nominee.clone().to_inexact();
-        }
-        (Precision::Exact(_), Precision::Absent) => {
-            let exact_min = mem::take(min_value);
-            *min_value = exact_min.to_inexact();
-        }
-        (Precision::Absent, Precision::Exact(_)) => {
-            *min_value = min_nominee.clone().to_inexact();
-        }
-        (Precision::Absent, Precision::Inexact(_)) => {
-            *min_value = min_nominee.clone();
-        }
-        _ => {}
-    }
+    file_num_rows.add(&num_rows)
 }
 
 #[cfg(test)]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to