Re: [PR] Update ListingTable to use StatisticsConverter [datafusion]

via GitHub Mon, 17 Jun 2024 03:09:37 -0700


alamb commented on code in PR #10924:
URL: https://github.com/apache/datafusion/pull/10924#discussion_r1640395427



##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -482,73 +404,101 @@ pub async fn statistics_from_parquet_meta(
         file_metadata.key_value_metadata(),
     )?;
 
-    let num_fields = table_schema.fields().len();
-    let fields = table_schema.fields();
-
     let mut num_rows = 0;
     let mut total_byte_size = 0;
-    let mut null_counts = vec![Precision::Exact(0); num_fields];
-    let mut has_statistics = false;
-
-    let schema_adapter =
-        DefaultSchemaAdapterFactory::default().create(table_schema.clone());
-
-    let (mut max_values, mut min_values) = create_max_min_accs(&table_schema);
 
     for row_group_meta in metadata.row_groups() {
         num_rows += row_group_meta.num_rows();
         total_byte_size += row_group_meta.total_byte_size();
+    }
 
-        let mut column_stats: HashMap<usize, (u64, &ParquetStatistics)> = 
HashMap::new();
+    let schema_adapter =
+        DefaultSchemaAdapterFactory::default().create(table_schema.clone());
 
-        for (i, column) in row_group_meta.columns().iter().enumerate() {
-            if let Some(stat) = column.statistics() {
-                has_statistics = true;
-                column_stats.insert(i, (stat.null_count(), stat));
-            }
-        }
+    // statistics for each of the table's columns (may be different from the
+    // file schema)
+    let mut column_statistics = vec![];
+
+    for (table_idx, field) in table_schema.fields().iter().enumerate() {
+        let Some(file_idx) = schema_adapter.map_column_index(table_idx, 
&file_schema)
+        else {
+            // file columns not in table schema are treated as all null
+            let null_count = Precision::Exact(num_rows as usize);
+            let null_value = ScalarValue::try_from(field.data_type())?;
+            let stats = ColumnStatistics::new_unknown()
+                .with_null_count(null_count)
+                .with_max_value(Precision::Exact(null_value.clone()))
+                .with_min_value(Precision::Exact(null_value));
+            column_statistics.push(stats);
+            continue;
+        };
 
-        if has_statistics {
-            for (table_idx, null_cnt) in null_counts.iter_mut().enumerate() {
-                if let Some(file_idx) =
-                    schema_adapter.map_column_index(table_idx, &file_schema)
-                {
-                    if let Some((null_count, stats)) = 
column_stats.get(&file_idx) {
-                        *null_cnt = null_cnt.add(&Precision::Exact(*null_count 
as usize));
-                        summarize_min_max(
-                            &mut max_values,
-                            &mut min_values,
-                            fields,
-                            table_idx,
-                            stats,
-                        )
-                    } else {
-                        // If none statistics of current column exists, set 
the Max/Min Accumulator to None.
-                        max_values[table_idx] = None;
-                        min_values[table_idx] = None;
-                    }
-                } else {
-                    *null_cnt = null_cnt.add(&Precision::Exact(num_rows as 
usize));
-                }
-            }
-        }
-    }
+        let file_field = file_schema.field(file_idx);
+        let Some(converter) = StatisticsConverter::try_new(

Review Comment:
   This code now uses the well-tested `StatisticsConverter` to extract statistics 
from the parquet file, with the correct array type, in a single call.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Update ListingTable to use StatisticsConverter [datafusion]

Reply via email to