This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e6bd74b2c3 Add support for `StringView` and `BinaryView` statistics in
`StatisticsConverter` (#6181)
e6bd74b2c3 is described below
commit e6bd74b2c381e67a2b08e15fc23672f8317436c4
Author: kf zheng <[email protected]>
AuthorDate: Sat Aug 3 19:21:14 2024 +0800
Add support for `StringView` and `BinaryView` statistics in
`StatisticsConverter` (#6181)
* Add StringView and BinaryView support for the macro `get_statistics`
* Add StringView and BinaryView support for the macro
`get_data_page_statistics`
* add tests to cover the support for StringView and BinaryView in the macro
get_data_page_statistics
* found potential bugs and ignore the tests
* false alarm! no bugs, fix the code by initializing all batches to have 5 rows
* make the get_stat StringView and BinaryView tests cover values longer
than 12 bytes
---
parquet/src/arrow/arrow_reader/statistics.rs | 173 +++++++++++++++++++++++++--
parquet/tests/arrow_reader/mod.rs | 55 +++++++--
parquet/tests/arrow_reader/statistics.rs | 62 +++++++++-
3 files changed, 270 insertions(+), 20 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/statistics.rs
b/parquet/src/arrow/arrow_reader/statistics.rs
index 6a1434bce9..c42f92838c 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
use crate::file::statistics::Statistics as ParquetStatistics;
use crate::schema::types::SchemaDescriptor;
use arrow_array::builder::{
- BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
+ BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder,
LargeStringBuilder, StringBuilder,
+ StringViewBuilder,
};
use arrow_array::{
new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray,
Date32Array, Date64Array,
@@ -446,14 +447,43 @@ macro_rules! get_statistics {
},
DataType::Dictionary(_, value_type) => {
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
+ },
+ DataType::Utf8View => {
+ let iterator = [<$stat_type_prefix
ByteArrayStatsIterator>]::new($iterator);
+ let mut builder = StringViewBuilder::new();
+ for x in iterator {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ let Ok(x) = std::str::from_utf8(x) else {
+ builder.append_null();
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ Ok(Arc::new(builder.finish()))
+ },
+ DataType::BinaryView => {
+ let iterator = [<$stat_type_prefix
ByteArrayStatsIterator>]::new($iterator);
+ let mut builder = BinaryViewBuilder::new();
+ for x in iterator {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ Ok(Arc::new(builder.finish()))
}
DataType::Map(_,_) |
DataType::Duration(_) |
DataType::Interval(_) |
DataType::Null |
- DataType::BinaryView |
- DataType::Utf8View |
DataType::List(_) |
DataType::ListView(_) |
DataType::FixedSizeList(_, _) |
@@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
}
})
},
- Some(DataType::FixedSizeBinary(size)) => {
+ Some(DataType::FixedSizeBinary(size)) => {
let mut builder = FixedSizeBinaryBuilder::new(*size);
let iterator = [<$stat_type_prefix
FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- _ => unimplemented!()
+ Some(DataType::Utf8View) => {
+ let mut builder = StringViewBuilder::new();
+ let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
+ for x in iterator {
+ for x in x.into_iter() {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ let Ok(x) = std::str::from_utf8(x.data()) else {
+ builder.append_null();
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ }
+ Ok(Arc::new(builder.finish()))
+ },
+ Some(DataType::BinaryView) => {
+ let mut builder = BinaryViewBuilder::new();
+ let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
+ for x in iterator {
+ for x in x.into_iter() {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ }
+ Ok(Arc::new(builder.finish()))
+ },
+ Some(DataType::Null) |
+ Some(DataType::Duration(_)) |
+ Some(DataType::Interval(_)) |
+ Some(DataType::List(_)) |
+ Some(DataType::ListView(_)) |
+ Some(DataType::FixedSizeList(_, _)) |
+ Some(DataType::LargeList(_)) |
+ Some(DataType::LargeListView(_)) |
+ Some(DataType::Struct(_)) |
+ Some(DataType::Union(_, _)) |
+ Some(DataType::Map(_, _)) |
+ Some(DataType::RunEndEncoded(_, _)) => {
+ let len = $iterator.count();
+ // don't know how to extract statistics, so return a null
array
+ Ok(new_null_array($data_type.unwrap(), len))
+ },
+ None => unimplemented!() // not sure how to handle this
}
}
}
@@ -1499,10 +1580,10 @@ mod test {
use arrow::datatypes::{i256, Date32Type, Date64Type};
use arrow::util::test_util::parquet_test_data;
use arrow_array::{
- new_empty_array, new_null_array, Array, ArrayRef, BinaryArray,
BooleanArray, Date32Array,
- Date64Array, Decimal128Array, Decimal256Array, Float32Array,
Float64Array, Int16Array,
- Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch,
StringArray, StructArray,
- TimestampNanosecondArray,
+ new_empty_array, new_null_array, Array, ArrayRef, BinaryArray,
BinaryViewArray,
+ BooleanArray, Date32Array, Date64Array, Decimal128Array,
Decimal256Array, Float32Array,
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
LargeBinaryArray, RecordBatch,
+ StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
};
use arrow_schema::{DataType, Field, SchemaRef};
use bytes::Bytes;
@@ -1916,6 +1997,65 @@ mod test {
.run()
}
+ #[test]
+ fn roundtrip_string_view() {
+ Test {
+ input: string_view_array([
+ // row group 1
+ Some("A"),
+ None,
+ Some("Q"),
+ // row group 2
+ Some("ZZ"),
+ Some("A_longerthan12"),
+ None,
+ // row group 3
+ Some("A_longerthan12"),
+ None,
+ None,
+ ]),
+ expected_min: string_view_array([
+ Some("A"),
+ Some("A_longerthan12"),
+ Some("A_longerthan12"),
+ ]),
+ expected_max: string_view_array([Some("Q"), Some("ZZ"),
Some("A_longerthan12")]),
+ }
+ .run()
+ }
+
+ #[test]
+ fn roundtrip_binary_view() {
+ let input: Vec<Option<&[u8]>> = vec![
+ // row group 1
+ Some(b"A"),
+ None,
+ Some(b"Q"),
+ // row group 2
+ Some(b"ZZ"),
+ Some(b"A_longerthan12"),
+ None,
+ // row group 3
+ Some(b"A_longerthan12"),
+ None,
+ None,
+ ];
+
+ let expected_min: Vec<Option<&[u8]>> =
+ vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")];
+ let expected_max: Vec<Option<&[u8]>> =
+ vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")];
+
+ let array = binary_view_array(input);
+
+ Test {
+ input: array,
+ expected_min: binary_view_array(expected_min),
+ expected_max: binary_view_array(expected_max),
+ }
+ .run()
+ }
+
#[test]
fn roundtrip_struct() {
let mut test = Test {
@@ -2539,4 +2679,19 @@ mod test {
Arc::new(array)
}
+
+ fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>)
-> ArrayRef {
+ let array: StringViewArray = input
+ .into_iter()
+ .map(|s| s.map(|s| s.to_string()))
+ .collect();
+
+ Arc::new(array)
+ }
+
+ fn binary_view_array(input: Vec<Option<&[u8]>>) -> ArrayRef {
+ let array =
BinaryViewArray::from(input.into_iter().collect::<Vec<Option<&[u8]>>>());
+
+ Arc::new(array)
+ }
}
diff --git a/parquet/tests/arrow_reader/mod.rs
b/parquet/tests/arrow_reader/mod.rs
index 4f63a50548..7e979dcf3e 100644
--- a/parquet/tests/arrow_reader/mod.rs
+++ b/parquet/tests/arrow_reader/mod.rs
@@ -17,13 +17,13 @@
use arrow_array::types::{Int32Type, Int8Type};
use arrow_array::{
- Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array,
- Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array,
Float32Array,
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
LargeBinaryArray,
- LargeStringArray, RecordBatch, StringArray, StructArray,
Time32MillisecondArray,
- Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
TimestampMicrosecondArray,
- TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
UInt16Array,
- UInt32Array, UInt64Array, UInt8Array,
+ Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
Date64Array,
+ Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray,
Float16Array,
+ Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
LargeBinaryArray,
+ LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray,
+ Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
Time64NanosecondArray,
+ TimestampMicrosecondArray, TimestampMillisecondArray,
TimestampNanosecondArray,
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow_buffer::i256;
use arrow_schema::{DataType, Field, Schema, TimeUnit};
@@ -88,6 +88,8 @@ enum Scenario {
PeriodsInColumnNames,
StructArray,
UTF8,
+ UTF8View,
+ BinaryView,
}
fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
@@ -589,6 +591,16 @@ fn make_utf8_batch(value: Vec<Option<&str>>) ->
RecordBatch {
.unwrap()
}
+fn make_utf8_view_batch(value: Vec<Option<&str>>) -> RecordBatch {
+ let utf8_view = StringViewArray::from(value);
+ RecordBatch::try_from_iter(vec![("utf8_view", Arc::new(utf8_view) as
_)]).unwrap()
+}
+
+fn make_binary_view_batch(value: Vec<Option<&[u8]>>) -> RecordBatch {
+ let binary_view = BinaryViewArray::from(value);
+ RecordBatch::try_from_iter(vec![("binary_view", Arc::new(binary_view) as
_)]).unwrap()
+}
+
fn make_dict_batch() -> RecordBatch {
let values = [
Some("abc"),
@@ -972,6 +984,35 @@ fn create_data_batch(scenario: Scenario) ->
Vec<RecordBatch> {
make_utf8_batch(vec![Some("e"), Some("f"), Some("g"),
Some("h"), Some("i")]),
]
}
+ Scenario::UTF8View => {
+ // Make utf8_view batches with string lengths <= 12 and > 12 bytes,
+ // as the internal representation of StringView differs for
strings
+ // that fit within that length and strings that exceed it
+ vec![
+ make_utf8_view_batch(vec![Some("a"), Some("b"), Some("c"),
Some("d"), None]),
+ make_utf8_view_batch(vec![Some("a"), Some("e_longerthan12"),
None, None, None]),
+ make_utf8_view_batch(vec![
+ Some("e_longerthan12"),
+ Some("f_longerthan12"),
+ Some("g_longerthan12"),
+ Some("h_longerthan12"),
+ Some("i_longerthan12"),
+ ]),
+ ]
+ }
+ Scenario::BinaryView => {
+ vec![
+ make_binary_view_batch(vec![Some(b"a"), Some(b"b"),
Some(b"c"), Some(b"d"), None]),
+ make_binary_view_batch(vec![Some(b"a"),
Some(b"e_longerthan12"), None, None, None]),
+ make_binary_view_batch(vec![
+ Some(b"e_longerthan12"),
+ Some(b"f_longerthan12"),
+ Some(b"g_longerthan12"),
+ Some(b"h_longerthan12"),
+ Some(b"i_longerthan12"),
+ ]),
+ ]
+ }
}
}
diff --git a/parquet/tests/arrow_reader/statistics.rs
b/parquet/tests/arrow_reader/statistics.rs
index 5702967ffd..75a73ac130 100644
--- a/parquet/tests/arrow_reader/statistics.rs
+++ b/parquet/tests/arrow_reader/statistics.rs
@@ -29,11 +29,11 @@ use arrow::datatypes::{
TimestampNanosecondType, TimestampSecondType,
};
use arrow_array::{
- make_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray,
Date32Array,
- Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray,
Float16Array,
+ make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
BooleanArray,
+ Date32Array, Date64Array, Decimal128Array, Decimal256Array,
FixedSizeBinaryArray, Float16Array,
Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
LargeBinaryArray,
- LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray,
Time32SecondArray,
- Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
+ LargeStringArray, RecordBatch, StringArray, StringViewArray,
Time32MillisecondArray,
+ Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
TimestampMicrosecondArray,
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
UInt16Array,
UInt32Array, UInt64Array, UInt8Array,
};
@@ -2059,6 +2059,60 @@ async fn test_utf8() {
.run();
}
+// UTF8View
+#[tokio::test]
+async fn test_utf8_view() {
+ let reader = TestReader {
+ scenario: Scenario::UTF8View,
+ row_per_group: 5,
+ }
+ .build()
+ .await;
+
+ // test for utf8_view
+ Test {
+ reader: &reader,
+ expected_min: Arc::new(StringViewArray::from(vec!["a", "a",
"e_longerthan12"])),
+ expected_max: Arc::new(StringViewArray::from(vec![
+ "d",
+ "e_longerthan12",
+ "i_longerthan12",
+ ])),
+ expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
+ expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+ column_name: "utf8_view",
+ check: Check::Both,
+ }
+ .run()
+}
+
+// BinaryView
+#[tokio::test]
+async fn test_binary_view() {
+ let reader = TestReader {
+ scenario: Scenario::BinaryView,
+ row_per_group: 5,
+ }
+ .build()
+ .await;
+
+ let expected_min: Vec<Option<&[u8]>> = vec![Some(b"a"), Some(b"a"),
Some(b"e_longerthan12")];
+ let expected_max: Vec<Option<&[u8]>> =
+ vec![Some(b"d"), Some(b"e_longerthan12"), Some(b"i_longerthan12")];
+
+ // test for binary_view
+ Test {
+ reader: &reader,
+ expected_min: Arc::new(BinaryViewArray::from(expected_min)),
+ expected_max: Arc::new(BinaryViewArray::from(expected_max)),
+ expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
+ expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+ column_name: "binary_view",
+ check: Check::Both,
+ }
+ .run()
+}
+
////// Files with missing statistics ///////
#[tokio::test]