alamb commented on code in PR #10924: URL: https://github.com/apache/datafusion/pull/10924#discussion_r1640395427
########## datafusion/core/src/datasource/file_format/parquet.rs: ##########
@@ -482,73 +404,101 @@ pub async fn statistics_from_parquet_meta(
         file_metadata.key_value_metadata(),
     )?;
 
-    let num_fields = table_schema.fields().len();
-    let fields = table_schema.fields();
-
     let mut num_rows = 0;
     let mut total_byte_size = 0;
-    let mut null_counts = vec![Precision::Exact(0); num_fields];
-    let mut has_statistics = false;
-
-    let schema_adapter =
-        DefaultSchemaAdapterFactory::default().create(table_schema.clone());
-
-    let (mut max_values, mut min_values) = create_max_min_accs(&table_schema);
 
     for row_group_meta in metadata.row_groups() {
         num_rows += row_group_meta.num_rows();
         total_byte_size += row_group_meta.total_byte_size();
+    }
 
-        let mut column_stats: HashMap<usize, (u64, &ParquetStatistics)> = HashMap::new();
+    let schema_adapter =
+        DefaultSchemaAdapterFactory::default().create(table_schema.clone());
 
-        for (i, column) in row_group_meta.columns().iter().enumerate() {
-            if let Some(stat) = column.statistics() {
-                has_statistics = true;
-                column_stats.insert(i, (stat.null_count(), stat));
-            }
-        }
+    // statistics for each of the table's columns (may be different from the
+    // file schema)
+    let mut column_statistics = vec![];
+
+    for (table_idx, field) in table_schema.fields().iter().enumerate() {
+        let Some(file_idx) = schema_adapter.map_column_index(table_idx, &file_schema)
+        else {
+            // file columns not in table schema are treated as all null
+            let null_count = Precision::Exact(num_rows as usize);
+            let null_value = ScalarValue::try_from(field.data_type())?;
+            let stats = ColumnStatistics::new_unknown()
+                .with_null_count(null_count)
+                .with_max_value(Precision::Exact(null_value.clone()))
+                .with_min_value(Precision::Exact(null_value));
+            column_statistics.push(stats);
+            continue;
+        };
 
-        if has_statistics {
-            for (table_idx, null_cnt) in null_counts.iter_mut().enumerate() {
-                if let Some(file_idx) =
-                    schema_adapter.map_column_index(table_idx, &file_schema)
-                {
-                    if let Some((null_count, stats)) = column_stats.get(&file_idx) {
-                        *null_cnt = null_cnt.add(&Precision::Exact(*null_count as usize));
-                        summarize_min_max(
-                            &mut max_values,
-                            &mut min_values,
-                            fields,
-                            table_idx,
-                            stats,
-                        )
-                    } else {
-                        // If none statistics of current column exists, set the Max/Min Accumulator to None.
-                        max_values[table_idx] = None;
-                        min_values[table_idx] = None;
-                    }
-                } else {
-                    *null_cnt = null_cnt.add(&Precision::Exact(num_rows as usize));
-                }
-            }
-        }
-    }
 
+        let file_field = file_schema.field(file_idx);
+        let Some(converter) = StatisticsConverter::try_new(

Review Comment: this code now uses the well tested StatisticsConverter to extract statistics from the parquet file with the correct type of array
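For readers unfamiliar with the new API, here is a minimal sketch (not the PR's exact code) of how StatisticsConverter is typically driven: one converter per column, producing per-row-group min/max arrays already in the column's Arrow type. It assumes the StatisticsConverter API as exposed by the `parquet` crate's `arrow_reader::statistics` module (where the converter now lives); `column_min_max`, `table_schema`, `metadata`, and the column name are illustrative placeholders.

```rust
use datafusion::arrow::datatypes::Schema;
use datafusion::common::{Result, ScalarValue};
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::file::metadata::ParquetMetaData;

/// Extract (min, max) for `column` from the first row group's statistics.
/// Sketch only: real code must handle missing statistics and fold values
/// across all row groups.
fn column_min_max(
    column: &str,
    table_schema: &Schema,
    metadata: &ParquetMetaData,
) -> Result<(ScalarValue, ScalarValue)> {
    // One converter per column; it knows which Arrow type the raw parquet
    // statistics should be decoded into.
    let converter = StatisticsConverter::try_new(
        column,
        table_schema,
        metadata.file_metadata().schema_descr(),
    )?;

    // One entry per row group, returned as a typed Arrow array.
    let mins = converter.row_group_mins(metadata.row_groups().iter())?;
    let maxes = converter.row_group_maxes(metadata.row_groups().iter())?;

    // For illustration, take the first row group's values as ScalarValues.
    Ok((
        ScalarValue::try_from_array(&mins, 0)?,
        ScalarValue::try_from_array(&maxes, 0)?,
    ))
}
```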
########## datafusion/core/src/datasource/file_format/parquet.rs: ##########
@@ -295,86 +297,6 @@ impl FileFormat for ParquetFormat {
     }
 }
 
-fn summarize_min_max(
-    max_values: &mut [Option<MaxAccumulator>],
-    min_values: &mut [Option<MinAccumulator>],
-    fields: &Fields,
-    i: usize,
-    stat: &ParquetStatistics,
-) {
-    if !stat.has_min_max_set() {
-        max_values[i] = None;
-        min_values[i] = None;
-        return;
-    }
-    match stat {
-        ParquetStatistics::Boolean(s) if DataType::Boolean == *fields[i].data_type() => {
-            if let Some(max_value) = &mut max_values[i] {
-                max_value
-                    .update_batch(&[Arc::new(BooleanArray::from(vec![*s.max()]))])
-                    .unwrap_or_else(|_| max_values[i] = None);
-            }
-            if let Some(min_value) = &mut min_values[i] {
-                min_value
-                    .update_batch(&[Arc::new(BooleanArray::from(vec![*s.min()]))])
-                    .unwrap_or_else(|_| min_values[i] = None);
-            }
-        }
-        ParquetStatistics::Int32(s) if DataType::Int32 == *fields[i].data_type() => {
-            if let Some(max_value) = &mut max_values[i] {
-                max_value
-                    .update_batch(&[Arc::new(Int32Array::from_value(*s.max(), 1))])
-                    .unwrap_or_else(|_| max_values[i] = None);
-            }
-            if let Some(min_value) = &mut min_values[i] {
-                min_value
-                    .update_batch(&[Arc::new(Int32Array::from_value(*s.min(), 1))])
-                    .unwrap_or_else(|_| min_values[i] = None);
-            }
-        }
-        ParquetStatistics::Int64(s) if DataType::Int64 == *fields[i].data_type() => {
-            if let Some(max_value) = &mut max_values[i] {
-                max_value
-                    .update_batch(&[Arc::new(Int64Array::from_value(*s.max(), 1))])

Review Comment: note how update_batch is called by first creating a single row array
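For anyone reading the removed code, here is a stripped-down sketch of that pattern outside the function: each scalar parquet statistic is wrapped in a one-row Arrow array before being fed to the Min/Max accumulators via update_batch. The import paths are assumptions based on the DataFusion version this PR targets (the Min/Max accumulators have since moved crates), and the example values are made up.

```rust
use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, Int32Array};
use datafusion::arrow::datatypes::DataType;
use datafusion::common::{Result, ScalarValue};
use datafusion::logical_expr::Accumulator;
use datafusion::physical_expr::expressions::{MaxAccumulator, MinAccumulator};

fn example() -> Result<()> {
    let mut min_acc = MinAccumulator::try_new(&DataType::Int32)?;
    let mut max_acc = MaxAccumulator::try_new(&DataType::Int32)?;

    // Pretend two row groups reported (min, max) = (3, 10) and (1, 7).
    for (rg_min, rg_max) in [(3, 10), (1, 7)] {
        // update_batch expects arrays, so each scalar statistic is first
        // wrapped in a single-row Int32Array.
        let min_arr: ArrayRef = Arc::new(Int32Array::from_value(rg_min, 1));
        let max_arr: ArrayRef = Arc::new(Int32Array::from_value(rg_max, 1));
        min_acc.update_batch(&[min_arr])?;
        max_acc.update_batch(&[max_arr])?;
    }

    // The accumulators now hold the file-level min and max.
    assert_eq!(min_acc.evaluate()?, ScalarValue::Int32(Some(1)));
    assert_eq!(max_acc.evaluate()?, ScalarValue::Int32(Some(10)));
    Ok(())
}
```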
########## datafusion/core/src/datasource/file_format/parquet.rs: ##########
@@ -1449,8 +1400,15 @@ mod tests {
         // column c1
         let c1_stats = &stats.column_statistics[0];
         assert_eq!(c1_stats.null_count, Precision::Exact(1));
-        assert_eq!(c1_stats.max_value, Precision::Absent);
-        assert_eq!(c1_stats.min_value, Precision::Absent);
+        // Note in ASCII lower case is greater than upper case
+        assert_eq!(
+            c1_stats.max_value,
+            Precision::Exact(ScalarValue::from("bar"))

Review Comment: strings were previously not handled

########## datafusion/sqllogictest/test_files/explain.slt: ##########
@@ -287,20 +287,20 @@ query TT
 EXPLAIN SELECT * FROM alltypes_plain limit 10;
 ----
 physical_plan
-01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
-02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
+01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[1]: Min=Exact(Boolean(NULL)) Max=Exact(Boolean(NULL)) Null=Exact(0)),(Col[2]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[3]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[4]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[5]: Min=Exact(Int64(NULL)) Max=Exact(Int64(NULL)) Null=Exact(0)),(Col[6]: Min=Exact(Float32(NULL)) Max=Exact(Float32(NULL)) Null=Exact(0)),(Col[7]: Min=Exact(Float64(NULL)) Max=Exact(Float64(NULL)) Null=Exact(0)),(Col[8]: Min=Exact(Binary(NULL)) Max=Exact(Binary(NULL)) Null=Exact(0)),(Col[9]: Min=Exact(Binary(NULL)) Max=Exact(Binary(NULL)) Null=Exact(0)),(Col[10]: Min=Exact(TimestampNanosecond(NULL, None)) Max=Exact(TimestampNanosecond(NULL, None)) Null=Exact(0))]]
+02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[1]: Min=Exact(Boolean(NULL)) Max=Exact(Boolean(NULL)) Null=Exact(0)),(Col[2]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[3]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[4]: Min=Exact(Int32(NULL)) Max=Exact(Int32(NULL)) Null=Exact(0)),(Col[5]: Min=Exact(Int64(NULL)) Max=Exact(Int64(NULL)) Null=Exact(0)),(Col[6]: Min=Exact(Float32(NULL)) Max=Exact(Float32(NULL)) Null=Exact(0)),(Col[7]: Min=Exact(Float64(NULL)) Max=Exact(Float64(NULL)) Null=Exact(0)),(Col[8]: Min=Exact(Binary(NULL)) Max=Exact(Binary(NULL)) Null=Exact(0)),(Col[9]: Min=Exact(Binary(NULL)) Max=Exact(Binary(NULL)) Null=Exact(0)),(Col[10]: Min=Exact(TimestampNanosecond(NULL, None)) Max=Exact(TimestampNanosecond(NULL, None)) Null=Exact(0))]]

Review Comment: I don't fully understand why these statistics are all reported as null / None now -- I need to investigate.
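Returning to the string statistics test in the parquet.rs test hunk above: the assertion that the max is "bar" relies on UTF-8 min/max comparison being plain byte-wise ordering, where lowercase ASCII letters sort after uppercase ones. A tiny illustration (the "Foo" value is made up, not taken from the test file):

```rust
fn main() {
    // Byte-wise comparison: b'b' (0x62) > b'F' (0x46), so "bar" > "Foo".
    assert!("bar" > "Foo");
    assert_eq!(["Foo", "bar"].iter().max(), Some(&"bar"));
}
```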