AdamGS commented on code in PR #22462:
URL: https://github.com/apache/datafusion/pull/22462#discussion_r3289500469


##########
datafusion/datasource-parquet/src/metadata.rs:
##########
@@ -506,121 +505,215 @@ impl StatisticsAccumulators<'_> {
 }
 
 fn summarize_column_statistics(
-    parquet_schema: &SchemaDescriptor,
     logical_file_schema: &Schema,
-    physical_file_schema: &Schema,
     accumulators: &mut StatisticsAccumulators,
     logical_schema_index: usize,
     stats_converter: &StatisticsConverter,
     row_groups_metadata: &[RowGroupMetaData],
+    num_rows: usize,
 ) -> Result<()> {
-    let max_values = stats_converter.row_group_maxes(row_groups_metadata)?;
-    let min_values = stats_converter.row_group_mins(row_groups_metadata)?;
-    let null_counts = 
stats_converter.row_group_null_counts(row_groups_metadata)?;
-    let is_max_value_exact_stat =
-        stats_converter.row_group_is_max_value_exact(row_groups_metadata)?;
-    let is_min_value_exact_stat =
-        stats_converter.row_group_is_min_value_exact(row_groups_metadata)?;
+    let parquet_index = stats_converter.parquet_column_index();
 
     if let Some(max_acc) = &mut accumulators.max_accs[logical_schema_index] {
-        max_acc.update_batch(&[Arc::clone(&max_values)])?;
-
-        // handle the common special case when all row groups have exact 
statistics
-        let exactness = &is_max_value_exact_stat;
-        if !exactness.is_empty() && exactness.null_count() == 0 && 
!exactness.has_false()
-        {
-            accumulators.is_max_value_exact[logical_schema_index] = Some(true);
-        } else if !exactness.has_true() {
-            accumulators.is_max_value_exact[logical_schema_index] = 
Some(false);
-        } else {
-            let val = max_acc.evaluate()?;
-            accumulators.is_max_value_exact[logical_schema_index] =
-                has_any_exact_match(&val, &max_values, exactness);
-        }
+        accumulators.is_max_value_exact[logical_schema_index] = 
summarize_bound(
+            max_acc,
+            &stats_converter.row_group_maxes(row_groups_metadata)?,
+            parquet_index,
+            row_groups_metadata,
+            ParquetStatistics::max_is_exact,
+            || 
Ok(stats_converter.row_group_is_max_value_exact(row_groups_metadata)?),
+        )?;
     }
 
     if let Some(min_acc) = &mut accumulators.min_accs[logical_schema_index] {
-        min_acc.update_batch(&[Arc::clone(&min_values)])?;
-
-        // handle the common special case when all row groups have exact 
statistics
-        let exactness = &is_min_value_exact_stat;
-        if !exactness.is_empty() && exactness.null_count() == 0 && 
!exactness.has_false()
-        {
-            accumulators.is_min_value_exact[logical_schema_index] = Some(true);
-        } else if !exactness.has_true() {
-            accumulators.is_min_value_exact[logical_schema_index] = 
Some(false);
-        } else {
-            let val = min_acc.evaluate()?;
-            accumulators.is_min_value_exact[logical_schema_index] =
-                has_any_exact_match(&val, &min_values, exactness);
-        }
+        accumulators.is_min_value_exact[logical_schema_index] = 
summarize_bound(
+            min_acc,
+            &stats_converter.row_group_mins(row_groups_metadata)?,
+            parquet_index,
+            row_groups_metadata,
+            ParquetStatistics::min_is_exact,
+            || 
Ok(stats_converter.row_group_is_min_value_exact(row_groups_metadata)?),
+        )?;
     }
 
-    accumulators.null_counts_array[logical_schema_index] = match 
sum(&null_counts) {
-        Some(null_count) => Precision::Exact(null_count as usize),
-        None => match null_counts.len() {
-            // If sum() returned None we either have no rows or all values are 
null
-            0 => Precision::Exact(0),
-            _ => Precision::Absent,
-        },
-    };
-
-    // This is the same logic as parquet_column but we start from arrow schema 
index

Review Comment:
   This value is already available as `stats_converter.parquet_column_index()`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to