This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 50a3e138e3 Speedup statistics_from_parquet_metadata (#20004)
50a3e138e3 is described below

commit 50a3e138e34afd0a52ac835eaf2d574bcc7e4ad2
Author: Daniël Heres <[email protected]>
AuthorDate: Mon Jan 26 14:03:51 2026 +0100

    Speedup statistics_from_parquet_metadata (#20004)
    
    ## Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax. For example
    `Closes #123` indicates that this PR will close issue #123.
    -->
    
    - Closes #20005
    
    PR:
    ```
    SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
    
    Query 1 iteration 0 took 30.5 ms and returned 1 rows
    Query 1 avg time: 30.48 ms
    ```
    
    Main:
    
    ```
    SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
    
    Query 1 iteration 0 took 39.6 ms and returned 1 rows
    Query 1 avg time: 39.61 ms
    ```
    
    ## Rationale for this change
    
    Improving cold starts.
    
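    ## What changes are included in this PR?
    
    Skips cloning and evaluating the min/max accumulators when the row-group
    exactness flags already decide the result (all exact, or none exact), and
    adds early returns to `has_any_exact_match` for a null value and for a
    single row group.
    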
    ## Are these changes tested?
    
    Existing tests
    
    ## Are there any user-facing changes?
    
    No
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/datasource-parquet/src/metadata.rs | 53 ++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs
index b763f817a0..5a4c0bcdd5 100644
--- a/datafusion/datasource-parquet/src/metadata.rs
+++ b/datafusion/datasource-parquet/src/metadata.rs
@@ -21,7 +21,7 @@
 use crate::{
     ObjectStoreFetch, apply_file_schema_type_coercions, 
coerce_int96_to_resolution,
 };
-use arrow::array::{ArrayRef, BooleanArray};
+use arrow::array::{Array, ArrayRef, BooleanArray};
 use arrow::compute::and;
 use arrow::compute::kernels::cmp::eq;
 use arrow::compute::sum;
@@ -487,22 +487,40 @@ fn summarize_min_max_null_counts(
 
     if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
         max_acc.update_batch(&[Arc::clone(&max_values)])?;
-        let mut cur_max_acc = max_acc.clone();
-        accumulators.is_max_value_exact[logical_schema_index] = 
has_any_exact_match(
-            &cur_max_acc.evaluate()?,
-            &max_values,
-            &is_max_value_exact_stat,
-        );
+
+        // handle the common special case when all row groups have exact 
statistics
+        let exactness = &is_max_value_exact_stat;
+        if !exactness.is_empty()
+            && exactness.null_count() == 0
+            && exactness.true_count() == exactness.len()
+        {
+            accumulators.is_max_value_exact[logical_schema_index] = Some(true);
+        } else if exactness.true_count() == 0 {
+            accumulators.is_max_value_exact[logical_schema_index] = 
Some(false);
+        } else {
+            let val = max_acc.evaluate()?;
+            accumulators.is_max_value_exact[logical_schema_index] =
+                has_any_exact_match(&val, &max_values, exactness);
+        }
     }
 
     if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
         min_acc.update_batch(&[Arc::clone(&min_values)])?;
-        let mut cur_min_acc = min_acc.clone();
-        accumulators.is_min_value_exact[logical_schema_index] = 
has_any_exact_match(
-            &cur_min_acc.evaluate()?,
-            &min_values,
-            &is_min_value_exact_stat,
-        );
+
+        // handle the common special case when all row groups have exact 
statistics
+        let exactness = &is_min_value_exact_stat;
+        if !exactness.is_empty()
+            && exactness.null_count() == 0
+            && exactness.true_count() == exactness.len()
+        {
+            accumulators.is_min_value_exact[logical_schema_index] = Some(true);
+        } else if exactness.true_count() == 0 {
+            accumulators.is_min_value_exact[logical_schema_index] = 
Some(false);
+        } else {
+            let val = min_acc.evaluate()?;
+            accumulators.is_min_value_exact[logical_schema_index] =
+                has_any_exact_match(&val, &min_values, exactness);
+        }
     }
 
     accumulators.null_counts_array[logical_schema_index] = match 
sum(&null_counts) {
@@ -582,6 +600,15 @@ fn has_any_exact_match(
     array: &ArrayRef,
     exactness: &BooleanArray,
 ) -> Option<bool> {
+    if value.is_null() {
+        return Some(false);
+    }
+
+    // Shortcut for single row group
+    if array.len() == 1 {
+        return Some(exactness.is_valid(0) && exactness.value(0));
+    }
+
     let scalar_array = value.to_scalar().ok()?;
     let eq_mask = eq(&scalar_array, &array).ok()?;
     let combined_mask = and(&eq_mask, exactness).ok()?;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to