This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 50a3e138e3 Speedup statistics_from_parquet_metadata (#20004)
50a3e138e3 is described below
commit 50a3e138e34afd0a52ac835eaf2d574bcc7e4ad2
Author: Daniël Heres <[email protected]>
AuthorDate: Mon Jan 26 14:03:51 2026 +0100
Speedup statistics_from_parquet_metadata (#20004)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Closes #20005
PR:
```
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
Query 1 iteration 0 took 30.5 ms and returned 1 rows
Query 1 avg time: 30.48 ms
```
Main:
```
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
Query 1 iteration 0 took 39.6 ms and returned 1 rows
Query 1 avg time: 39.61 ms
```
## Rationale for this change
Improving cold starts.
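## What changes are included in this PR?
- Stop cloning the min/max accumulators before calling `evaluate()`.
- Skip `has_any_exact_match` when every row group's statistic is exact, or when none is.
- Return early from `has_any_exact_match` for a null value or a single row group.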
## Are these changes tested?
Existing tests
## Are there any user-facing changes?
No
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/datasource-parquet/src/metadata.rs | 53 ++++++++++++++++++++-------
1 file changed, 40 insertions(+), 13 deletions(-)
diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs
index b763f817a0..5a4c0bcdd5 100644
--- a/datafusion/datasource-parquet/src/metadata.rs
+++ b/datafusion/datasource-parquet/src/metadata.rs
@@ -21,7 +21,7 @@
use crate::{
ObjectStoreFetch, apply_file_schema_type_coercions,
coerce_int96_to_resolution,
};
-use arrow::array::{ArrayRef, BooleanArray};
+use arrow::array::{Array, ArrayRef, BooleanArray};
use arrow::compute::and;
use arrow::compute::kernels::cmp::eq;
use arrow::compute::sum;
@@ -487,22 +487,40 @@ fn summarize_min_max_null_counts(
if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
max_acc.update_batch(&[Arc::clone(&max_values)])?;
- let mut cur_max_acc = max_acc.clone();
- accumulators.is_max_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_max_acc.evaluate()?,
- &max_values,
- &is_max_value_exact_stat,
- );
+
+ // handle the common special case when all row groups have exact statistics
+ let exactness = &is_max_value_exact_stat;
+ if !exactness.is_empty()
+ && exactness.null_count() == 0
+ && exactness.true_count() == exactness.len()
+ {
+ accumulators.is_max_value_exact[logical_schema_index] = Some(true);
+ } else if exactness.true_count() == 0 {
+ accumulators.is_max_value_exact[logical_schema_index] =
Some(false);
+ } else {
+ let val = max_acc.evaluate()?;
+ accumulators.is_max_value_exact[logical_schema_index] =
+ has_any_exact_match(&val, &max_values, exactness);
+ }
}
if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
min_acc.update_batch(&[Arc::clone(&min_values)])?;
- let mut cur_min_acc = min_acc.clone();
- accumulators.is_min_value_exact[logical_schema_index] =
has_any_exact_match(
- &cur_min_acc.evaluate()?,
- &min_values,
- &is_min_value_exact_stat,
- );
+
+ // handle the common special case when all row groups have exact statistics
+ let exactness = &is_min_value_exact_stat;
+ if !exactness.is_empty()
+ && exactness.null_count() == 0
+ && exactness.true_count() == exactness.len()
+ {
+ accumulators.is_min_value_exact[logical_schema_index] = Some(true);
+ } else if exactness.true_count() == 0 {
+ accumulators.is_min_value_exact[logical_schema_index] =
Some(false);
+ } else {
+ let val = min_acc.evaluate()?;
+ accumulators.is_min_value_exact[logical_schema_index] =
+ has_any_exact_match(&val, &min_values, exactness);
+ }
}
accumulators.null_counts_array[logical_schema_index] = match
sum(&null_counts) {
@@ -582,6 +600,15 @@ fn has_any_exact_match(
array: &ArrayRef,
exactness: &BooleanArray,
) -> Option<bool> {
+ if value.is_null() {
+ return Some(false);
+ }
+
+ // Shortcut for single row group
+ if array.len() == 1 {
+ return Some(exactness.is_valid(0) && exactness.value(0));
+ }
+
let scalar_array = value.to_scalar().ok()?;
let eq_mask = eq(&scalar_array, &array).ok()?;
let combined_mask = and(&eq_mask, exactness).ok()?;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]