This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f621d28db5 Parquet: omit min/max for interval columns when writing 
stats (#5147)
f621d28db5 is described below

commit f621d28db590ff6ad3907450f7ff434c7deb9766
Author: Jeffrey <[email protected]>
AuthorDate: Fri Dec 1 04:46:36 2023 +1100

    Parquet: omit min/max for interval columns when writing stats (#5147)
    
    * Parquet: omit min/max for interval columns when writing stats
    
    * Trigger
---
 parquet/src/column/writer/encoder.rs |  7 +++--
 parquet/src/column/writer/mod.rs     | 59 ++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/parquet/src/column/writer/encoder.rs 
b/parquet/src/column/writer/encoder.rs
index d0720dd243..0d5144f61c 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -18,7 +18,7 @@
 use bytes::Bytes;
 use half::f16;
 
-use crate::basic::{Encoding, LogicalType, Type};
+use crate::basic::{ConvertedType, Encoding, LogicalType, Type};
 use crate::bloom_filter::Sbbf;
 use crate::column::writer::{
     compare_greater, fallback_encoding, has_dictionary_support, is_nan, 
update_max, update_min,
@@ -137,7 +137,10 @@ pub struct ColumnValueEncoderImpl<T: DataType> {
 
 impl<T: DataType> ColumnValueEncoderImpl<T> {
     fn write_slice(&mut self, slice: &[T::T]) -> Result<()> {
-        if self.statistics_enabled == EnabledStatistics::Page {
+        if self.statistics_enabled == EnabledStatistics::Page
+            // INTERVAL has undefined sort order, so don't write min/max stats 
for it
+            && self.descr.converted_type() != ConvertedType::INTERVAL
+        {
             if let Some((min, max)) = self.min_max(slice, None) {
                 update_min(&self.descr, &min, &mut self.min_value);
                 update_max(&self.descr, &max, &mut self.max_value);
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 14b8655091..e92a502689 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -332,7 +332,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> 
{
         // If only computing chunk-level statistics compute them here, 
page-level statistics
         // are computed in [`Self::write_mini_batch`] and used to update chunk 
statistics in
         // [`Self::add_data_page`]
-        if self.statistics_enabled == EnabledStatistics::Chunk {
+        if self.statistics_enabled == EnabledStatistics::Chunk
+            // INTERVAL has undefined sort order, so don't write min/max stats 
for it
+            && self.descr.converted_type() != ConvertedType::INTERVAL
+        {
             match (min, max) {
                 (Some(min), Some(max)) => {
                     update_min(&self.descr, min, &mut 
self.column_metrics.min_column_value);
@@ -1093,7 +1096,6 @@ fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, 
val: &T) -> bool {
 ///
 /// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls 
`should_update` with
 /// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true`
-
 fn update_stat<T: ParquetValueType, F>(
     descr: &ColumnDescriptor,
     val: &T,
@@ -3066,6 +3068,30 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_interval_stats_should_not_have_min_max() {
+        let input = [
+            vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+            vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
+        ]
+        .into_iter()
+        .map(|s| ByteArray::from(s).into())
+        .collect::<Vec<_>>();
+
+        let page_writer = get_test_page_writer();
+        let mut writer = get_test_interval_column_writer(page_writer);
+        writer.write_batch(&input, None, None).unwrap();
+
+        let metadata = writer.close().unwrap().metadata;
+        let stats = if let Some(Statistics::FixedLenByteArray(stats)) = 
metadata.statistics() {
+            stats.clone()
+        } else {
+            panic!("metadata missing statistics");
+        };
+        assert!(!stats.has_min_max_set());
+    }
+
     fn write_multiple_pages<T: DataType>(
         column_descr: &Arc<ColumnDescriptor>,
         pages: &[&[Option<T::T>]],
@@ -3395,8 +3421,7 @@ mod tests {
         values: &[FixedLenByteArray],
     ) -> ValueStatistics<FixedLenByteArray> {
         let page_writer = get_test_page_writer();
-        let props = Default::default();
-        let mut writer = get_test_float16_column_writer(page_writer, 0, 0, 
props);
+        let mut writer = get_test_float16_column_writer(page_writer);
         writer.write_batch(values, None, None).unwrap();
 
         let metadata = writer.close().unwrap().metadata;
@@ -3409,12 +3434,9 @@ mod tests {
 
     fn get_test_float16_column_writer(
         page_writer: Box<dyn PageWriter>,
-        max_def_level: i16,
-        max_rep_level: i16,
-        props: WriterPropertiesPtr,
     ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> {
-        let descr = Arc::new(get_test_float16_column_descr(max_def_level, 
max_rep_level));
-        let column_writer = get_column_writer(descr, props, page_writer);
+        let descr = Arc::new(get_test_float16_column_descr(0, 0));
+        let column_writer = get_column_writer(descr, Default::default(), 
page_writer);
         get_typed_column_writer::<FixedLenByteArrayType>(column_writer)
     }
 
@@ -3429,6 +3451,25 @@ mod tests {
         ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, 
path)
     }
 
+    fn get_test_interval_column_writer(
+        page_writer: Box<dyn PageWriter>,
+    ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> {
+        let descr = Arc::new(get_test_interval_column_descr());
+        let column_writer = get_column_writer(descr, Default::default(), 
page_writer);
+        get_typed_column_writer::<FixedLenByteArrayType>(column_writer)
+    }
+
+    fn get_test_interval_column_descr() -> ColumnDescriptor {
+        let path = ColumnPath::from("col");
+        let tpe =
+            SchemaType::primitive_type_builder("col", 
FixedLenByteArrayType::get_physical_type())
+                .with_length(12)
+                .with_converted_type(ConvertedType::INTERVAL)
+                .build()
+                .unwrap();
+        ColumnDescriptor::new(Arc::new(tpe), 0, 0, path)
+    }
+
     /// Returns column writer for UINT32 Column provided as ConvertedType only
     fn get_test_unsigned_int_given_as_converted_column_writer<'a, T: DataType>(
         page_writer: Box<dyn PageWriter + 'a>,

Reply via email to