This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e6bd74b2c3 Add support for `StringView` and `BinaryView` statistics in 
`StatisticsConverter` (#6181)
e6bd74b2c3 is described below

commit e6bd74b2c381e67a2b08e15fc23672f8317436c4
Author: kf zheng <[email protected]>
AuthorDate: Sat Aug 3 19:21:14 2024 +0800

    Add support for `StringView` and `BinaryView` statistics in 
`StatisticsConverter` (#6181)
    
    * Add StringView and BinaryView support for the macro `get_statistics`
    
    * Add StringView and BinaryView support for the macro 
`get_data_page_statistics`
    
    * add tests to cover the support for StringView and BinaryView in the macro 
get_data_page_statistics
    
    * found potential bugs and ignore the tests
    
    * fake alarm! no bugs, fix the code by initiating all batches to have 5 rows
    
    * make the get_stat StringView and BinaryView tests cover bytes greater 
than 12
---
 parquet/src/arrow/arrow_reader/statistics.rs | 173 +++++++++++++++++++++++++--
 parquet/tests/arrow_reader/mod.rs            |  55 +++++++--
 parquet/tests/arrow_reader/statistics.rs     |  62 +++++++++-
 3 files changed, 270 insertions(+), 20 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/statistics.rs 
b/parquet/src/arrow/arrow_reader/statistics.rs
index 6a1434bce9..c42f92838c 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
 use crate::file::statistics::Statistics as ParquetStatistics;
 use crate::schema::types::SchemaDescriptor;
 use arrow_array::builder::{
-    BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
+    BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, 
LargeStringBuilder, StringBuilder,
+    StringViewBuilder,
 };
 use arrow_array::{
     new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, 
Date32Array, Date64Array,
@@ -446,14 +447,43 @@ macro_rules! get_statistics {
             },
             DataType::Dictionary(_, value_type) => {
                 [<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
+            },
+            DataType::Utf8View => {
+                let iterator = [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator);
+                let mut builder = StringViewBuilder::new();
+                for x in iterator {
+                    let Some(x) = x else {
+                        builder.append_null(); // no statistics value
+                        continue;
+                    };
+
+                    let Ok(x) = std::str::from_utf8(x) else {
+                        builder.append_null();
+                        continue;
+                    };
+
+                    builder.append_value(x);
+                }
+                Ok(Arc::new(builder.finish()))
+            },
+            DataType::BinaryView => {
+                let iterator = [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator);
+                let mut builder = BinaryViewBuilder::new();
+                for x in iterator {
+                    let Some(x) = x else {
+                        builder.append_null(); // no statistics value
+                        continue;
+                    };
+
+                    builder.append_value(x);
+                }
+                Ok(Arc::new(builder.finish()))
             }
 
             DataType::Map(_,_) |
             DataType::Duration(_) |
             DataType::Interval(_) |
             DataType::Null |
-            DataType::BinaryView |
-            DataType::Utf8View |
             DataType::List(_) |
             DataType::ListView(_) |
             DataType::FixedSizeList(_, _) |
@@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
                         }
                     })
                 },
-               Some(DataType::FixedSizeBinary(size)) => {
+                Some(DataType::FixedSizeBinary(size)) => {
                     let mut builder = FixedSizeBinaryBuilder::new(*size);
                     let iterator = [<$stat_type_prefix 
FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
                     for x in iterator {
@@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics {
                     }
                     Ok(Arc::new(builder.finish()))
                 },
-                _ => unimplemented!()
+                Some(DataType::Utf8View) => {
+                    let mut builder = StringViewBuilder::new();
+                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
+                    for x in iterator {
+                        for x in x.into_iter() {
+                            let Some(x) = x else {
+                                builder.append_null(); // no statistics value
+                                continue;
+                            };
+
+                            let Ok(x) = std::str::from_utf8(x.data()) else {
+                                builder.append_null();
+                                continue;
+                            };
+
+                            builder.append_value(x);
+                        }
+                    }
+                    Ok(Arc::new(builder.finish()))
+                },
+                Some(DataType::BinaryView) => {
+                    let mut builder = BinaryViewBuilder::new();
+                    let iterator = [<$stat_type_prefix 
ByteArrayDataPageStatsIterator>]::new($iterator);
+                    for x in iterator {
+                        for x in x.into_iter() {
+                            let Some(x) = x else {
+                                builder.append_null(); // no statistics value
+                                continue;
+                            };
+
+                            builder.append_value(x);
+                        }
+                    }
+                    Ok(Arc::new(builder.finish()))
+                },
+                Some(DataType::Null) |
+                Some(DataType::Duration(_)) |
+                Some(DataType::Interval(_)) |
+                Some(DataType::List(_)) |
+                Some(DataType::ListView(_)) |
+                Some(DataType::FixedSizeList(_, _)) |
+                Some(DataType::LargeList(_)) |
+                Some(DataType::LargeListView(_)) |
+                Some(DataType::Struct(_)) |
+                Some(DataType::Union(_, _)) |
+                Some(DataType::Map(_, _)) |
+                Some(DataType::RunEndEncoded(_, _)) => {
+                    let len = $iterator.count();
+                    // don't know how to extract statistics, so return a null 
array
+                    Ok(new_null_array($data_type.unwrap(), len))
+                },
+                None => unimplemented!()  // not sure how to handle this
             }
         }
     }
@@ -1499,10 +1580,10 @@ mod test {
     use arrow::datatypes::{i256, Date32Type, Date64Type};
     use arrow::util::test_util::parquet_test_data;
     use arrow_array::{
-        new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, 
BooleanArray, Date32Array,
-        Date64Array, Decimal128Array, Decimal256Array, Float32Array, 
Float64Array, Int16Array,
-        Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, 
StringArray, StructArray,
-        TimestampNanosecondArray,
+        new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, 
BinaryViewArray,
+        BooleanArray, Date32Array, Date64Array, Decimal128Array, 
Decimal256Array, Float32Array,
+        Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, 
LargeBinaryArray, RecordBatch,
+        StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
     };
     use arrow_schema::{DataType, Field, SchemaRef};
     use bytes::Bytes;
@@ -1916,6 +1997,65 @@ mod test {
         .run()
     }
 
+    #[test]
+    fn roundtrip_string_view() {
+        Test {
+            input: string_view_array([
+                // row group 1
+                Some("A"),
+                None,
+                Some("Q"),
+                // row group 2
+                Some("ZZ"),
+                Some("A_longerthan12"),
+                None,
+                // row group 3
+                Some("A_longerthan12"),
+                None,
+                None,
+            ]),
+            expected_min: string_view_array([
+                Some("A"),
+                Some("A_longerthan12"),
+                Some("A_longerthan12"),
+            ]),
+            expected_max: string_view_array([Some("Q"), Some("ZZ"), 
Some("A_longerthan12")]),
+        }
+        .run()
+    }
+
+    #[test]
+    fn roundtrip_binary_view() {
+        let input: Vec<Option<&[u8]>> = vec![
+            // row group 1
+            Some(b"A"),
+            None,
+            Some(b"Q"),
+            // row group 2
+            Some(b"ZZ"),
+            Some(b"A_longerthan12"),
+            None,
+            // row group 3
+            Some(b"A_longerthan12"),
+            None,
+            None,
+        ];
+
+        let expected_min: Vec<Option<&[u8]>> =
+            vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")];
+        let expected_max: Vec<Option<&[u8]>> =
+            vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")];
+
+        let array = binary_view_array(input);
+
+        Test {
+            input: array,
+            expected_min: binary_view_array(expected_min),
+            expected_max: binary_view_array(expected_max),
+        }
+        .run()
+    }
+
     #[test]
     fn roundtrip_struct() {
         let mut test = Test {
@@ -2539,4 +2679,19 @@ mod test {
 
         Arc::new(array)
     }
+
+    fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) 
-> ArrayRef {
+        let array: StringViewArray = input
+            .into_iter()
+            .map(|s| s.map(|s| s.to_string()))
+            .collect();
+
+        Arc::new(array)
+    }
+
+    fn binary_view_array(input: Vec<Option<&[u8]>>) -> ArrayRef {
+        let array = 
BinaryViewArray::from(input.into_iter().collect::<Vec<Option<&[u8]>>>());
+
+        Arc::new(array)
+    }
 }
diff --git a/parquet/tests/arrow_reader/mod.rs 
b/parquet/tests/arrow_reader/mod.rs
index 4f63a50548..7e979dcf3e 100644
--- a/parquet/tests/arrow_reader/mod.rs
+++ b/parquet/tests/arrow_reader/mod.rs
@@ -17,13 +17,13 @@
 
 use arrow_array::types::{Int32Type, Int8Type};
 use arrow_array::{
-    Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, 
Decimal128Array,
-    Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, 
Float32Array,
-    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, 
LargeBinaryArray,
-    LargeStringArray, RecordBatch, StringArray, StructArray, 
Time32MillisecondArray,
-    Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, 
TimestampMicrosecondArray,
-    TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, 
UInt16Array,
-    UInt32Array, UInt64Array, UInt8Array,
+    Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, 
Date64Array,
+    Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray, 
Float16Array,
+    Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, 
LargeBinaryArray,
+    LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray,
+    Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, 
Time64NanosecondArray,
+    TimestampMicrosecondArray, TimestampMillisecondArray, 
TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_buffer::i256;
 use arrow_schema::{DataType, Field, Schema, TimeUnit};
@@ -88,6 +88,8 @@ enum Scenario {
     PeriodsInColumnNames,
     StructArray,
     UTF8,
+    UTF8View,
+    BinaryView,
 }
 
 fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
@@ -589,6 +591,16 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> 
RecordBatch {
     .unwrap()
 }
 
+fn make_utf8_view_batch(value: Vec<Option<&str>>) -> RecordBatch {
+    let utf8_view = StringViewArray::from(value);
+    RecordBatch::try_from_iter(vec![("utf8_view", Arc::new(utf8_view) as 
_)]).unwrap()
+}
+
+fn make_binary_view_batch(value: Vec<Option<&[u8]>>) -> RecordBatch {
+    let binary_view = BinaryViewArray::from(value);
+    RecordBatch::try_from_iter(vec![("binary_view", Arc::new(binary_view) as 
_)]).unwrap()
+}
+
 fn make_dict_batch() -> RecordBatch {
     let values = [
         Some("abc"),
@@ -972,6 +984,35 @@ fn create_data_batch(scenario: Scenario) -> 
Vec<RecordBatch> {
                 make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), 
Some("h"), Some("i")]),
             ]
         }
+        Scenario::UTF8View => {
+            // Make utf8_view batch including string length <12 and >12 bytes
+            // as the internal representation of StringView differs for 
strings
+            // shorter and longer than that length
+            vec![
+                make_utf8_view_batch(vec![Some("a"), Some("b"), Some("c"), 
Some("d"), None]),
+                make_utf8_view_batch(vec![Some("a"), Some("e_longerthan12"), 
None, None, None]),
+                make_utf8_view_batch(vec![
+                    Some("e_longerthan12"),
+                    Some("f_longerthan12"),
+                    Some("g_longerthan12"),
+                    Some("h_longerthan12"),
+                    Some("i_longerthan12"),
+                ]),
+            ]
+        }
+        Scenario::BinaryView => {
+            vec![
+                make_binary_view_batch(vec![Some(b"a"), Some(b"b"), 
Some(b"c"), Some(b"d"), None]),
+                make_binary_view_batch(vec![Some(b"a"), 
Some(b"e_longerthan12"), None, None, None]),
+                make_binary_view_batch(vec![
+                    Some(b"e_longerthan12"),
+                    Some(b"f_longerthan12"),
+                    Some(b"g_longerthan12"),
+                    Some(b"h_longerthan12"),
+                    Some(b"i_longerthan12"),
+                ]),
+            ]
+        }
     }
 }
 
diff --git a/parquet/tests/arrow_reader/statistics.rs 
b/parquet/tests/arrow_reader/statistics.rs
index 5702967ffd..75a73ac130 100644
--- a/parquet/tests/arrow_reader/statistics.rs
+++ b/parquet/tests/arrow_reader/statistics.rs
@@ -29,11 +29,11 @@ use arrow::datatypes::{
     TimestampNanosecondType, TimestampSecondType,
 };
 use arrow_array::{
-    make_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, 
Date32Array,
-    Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, 
Float16Array,
+    make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, 
BooleanArray,
+    Date32Array, Date64Array, Decimal128Array, Decimal256Array, 
FixedSizeBinaryArray, Float16Array,
     Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, 
LargeBinaryArray,
-    LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray, 
Time32SecondArray,
-    Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
+    LargeStringArray, RecordBatch, StringArray, StringViewArray, 
Time32MillisecondArray,
+    Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, 
TimestampMicrosecondArray,
     TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, 
UInt16Array,
     UInt32Array, UInt64Array, UInt8Array,
 };
@@ -2059,6 +2059,60 @@ async fn test_utf8() {
     .run();
 }
 
+// UTF8View
+#[tokio::test]
+async fn test_utf8_view() {
+    let reader = TestReader {
+        scenario: Scenario::UTF8View,
+        row_per_group: 5,
+    }
+    .build()
+    .await;
+
+    // test for utf8_view
+    Test {
+        reader: &reader,
+        expected_min: Arc::new(StringViewArray::from(vec!["a", "a", 
"e_longerthan12"])),
+        expected_max: Arc::new(StringViewArray::from(vec![
+            "d",
+            "e_longerthan12",
+            "i_longerthan12",
+        ])),
+        expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
+        expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+        column_name: "utf8_view",
+        check: Check::Both,
+    }
+    .run()
+}
+
+// BinaryView
+#[tokio::test]
+async fn test_binary_view() {
+    let reader = TestReader {
+        scenario: Scenario::BinaryView,
+        row_per_group: 5,
+    }
+    .build()
+    .await;
+
+    let expected_min: Vec<Option<&[u8]>> = vec![Some(b"a"), Some(b"a"), 
Some(b"e_longerthan12")];
+    let expected_max: Vec<Option<&[u8]>> =
+        vec![Some(b"d"), Some(b"e_longerthan12"), Some(b"i_longerthan12")];
+
+    // test for binary_view
+    Test {
+        reader: &reader,
+        expected_min: Arc::new(BinaryViewArray::from(expected_min)),
+        expected_max: Arc::new(BinaryViewArray::from(expected_max)),
+        expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
+        expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+        column_name: "binary_view",
+        check: Check::Both,
+    }
+    .run()
+}
+
 ////// Files with missing statistics ///////
 
 #[tokio::test]

Reply via email to