This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 73381fe357 Minor: Update `min_statistics` and `max_statistics`  to be helpers, update docs (#10866)
73381fe357 is described below

commit 73381fe35738ef2f5a06e9f55626f08855e8a852
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jun 12 11:17:14 2024 -0400

    Minor: Update `min_statistics` and `max_statistics`  to be helpers, update docs (#10866)
---
 .../datasource/physical_plan/parquet/statistics.rs | 50 ++++++++++++----------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs 
b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index a4a919f20d..c0d36f1fc4 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`min_statistics`] and [`max_statistics`] convert statistics in parquet 
format to arrow [`ArrayRef`].
+//! [`StatisticsConverter`] to convert statistics in parquet format to arrow 
[`ArrayRef`].
 
 // TODO: potentially move this to arrow-rs: 
https://github.com/apache/arrow-rs/issues/4328
 
@@ -542,8 +542,11 @@ pub(crate) fn parquet_column<'a>(
     Some((parquet_idx, field))
 }
 
-/// Extracts the min statistics from an iterator of [`ParquetStatistics`] to 
an [`ArrayRef`]
-pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a 
ParquetStatistics>>>(
+/// Extracts the min statistics from an iterator of [`ParquetStatistics`] to an
+/// [`ArrayRef`]
+///
+/// This is an internal helper -- see [`StatisticsConverter`] for public API
+fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
@@ -551,7 +554,9 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = 
Option<&'a ParquetStatistics
 }
 
 /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to 
an [`ArrayRef`]
-pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a 
ParquetStatistics>>>(
+///
+/// This is an internal helper -- see [`StatisticsConverter`] for public API
+fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
@@ -1425,9 +1430,10 @@ mod test {
         assert_eq!(idx, 2);
 
         let row_groups = metadata.row_groups();
-        let iter = row_groups.iter().map(|x| x.column(idx).statistics());
+        let converter =
+            StatisticsConverter::try_new("int_col", &schema, 
parquet_schema).unwrap();
 
-        let min = min_statistics(&DataType::Int32, iter.clone()).unwrap();
+        let min = converter.row_group_mins(row_groups.iter()).unwrap();
         assert_eq!(
             &min,
             &expected_min,
@@ -1435,7 +1441,7 @@ mod test {
             DisplayStats(row_groups)
         );
 
-        let max = max_statistics(&DataType::Int32, iter).unwrap();
+        let max = converter.row_group_maxes(row_groups.iter()).unwrap();
         assert_eq!(
             &max,
             &expected_max,
@@ -1623,22 +1629,23 @@ mod test {
                     continue;
                 }
 
-                let (idx, f) =
-                    parquet_column(parquet_schema, &schema, 
field.name()).unwrap();
-                assert_eq!(f, field);
+                let converter =
+                    StatisticsConverter::try_new(field.name(), &schema, 
parquet_schema)
+                        .unwrap();
 
-                let iter = row_groups.iter().map(|x| 
x.column(idx).statistics());
-                let min = min_statistics(f.data_type(), iter.clone()).unwrap();
+                assert_eq!(converter.arrow_field, field.as_ref());
+
+                let mins = 
converter.row_group_mins(row_groups.iter()).unwrap();
                 assert_eq!(
-                    &min,
+                    &mins,
                     &expected_min,
                     "Min. Statistics\n\n{}\n\n",
                     DisplayStats(row_groups)
                 );
 
-                let max = max_statistics(f.data_type(), iter).unwrap();
+                let maxes = 
converter.row_group_maxes(row_groups.iter()).unwrap();
                 assert_eq!(
-                    &max,
+                    &maxes,
                     &expected_max,
                     "Max. Statistics\n\n{}\n\n",
                     DisplayStats(row_groups)
@@ -1705,7 +1712,7 @@ mod test {
             self
         }
 
-        /// Reads the specified parquet file and validates that the exepcted 
min/max
+        /// Reads the specified parquet file and validates that the expected 
min/max
         /// values for the specified columns are as expected.
         fn run(self) {
             let path = PathBuf::from(parquet_test_data()).join(self.file_name);
@@ -1723,14 +1730,13 @@ mod test {
                     expected_max,
                 } = expected_column;
 
-                let (idx, field) =
-                    parquet_column(parquet_schema, arrow_schema, 
name).unwrap();
-
-                let iter = row_groups.iter().map(|x| 
x.column(idx).statistics());
-                let actual_min = min_statistics(field.data_type(), 
iter.clone()).unwrap();
+                let converter =
+                    StatisticsConverter::try_new(name, arrow_schema, 
parquet_schema)
+                        .unwrap();
+                let actual_min = 
converter.row_group_mins(row_groups.iter()).unwrap();
                 assert_eq!(&expected_min, &actual_min, "column {name}");
 
-                let actual_max = max_statistics(field.data_type(), 
iter).unwrap();
+                let actual_max = 
converter.row_group_maxes(row_groups.iter()).unwrap();
                 assert_eq!(&expected_max, &actual_max, "column {name}");
             }
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to