This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
     new d3fa083acf refactor: handle LargeUtf8 statistics and add tests for UTF8 and LargeUTF8 (#10762)
d3fa083acf is described below
commit d3fa083acfc558b9fff5c0bb539d6dc972bfbf7f
Author: Alex Huang <[email protected]>
AuthorDate: Tue Jun 4 03:55:06 2024 +0800
    refactor: handle LargeUtf8 statistics and add tests for UTF8 and LargeUTF8 (#10762)
Co-authored-by: Andrew Lamb <[email protected]>
---
.../datasource/physical_plan/parquet/statistics.rs | 16 +++++-----
datafusion/core/tests/parquet/arrow_statistics.rs | 34 +++++++++++++++++++++-
datafusion/core/tests/parquet/mod.rs | 28 +++++++++++++++++-
3 files changed, 69 insertions(+), 9 deletions(-)
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 1c20fa7caa..e7e6360c25 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -152,16 +152,18 @@ macro_rules! get_statistic {
Some(DataType::Binary) => {
Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
}
- _ => {
- let s = std::str::from_utf8(s.$bytes_func())
+ Some(DataType::LargeUtf8) | _ => {
+ let utf8_value = std::str::from_utf8(s.$bytes_func())
.map(|s| s.to_string())
.ok();
- if s.is_none() {
-                        log::debug!(
-                            "Utf8 statistics is a non-UTF8 value, ignoring it."
-                        );
+ if utf8_value.is_none() {
+                        log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
+ }
+
+ match $target_arrow_type {
+                        Some(DataType::LargeUtf8) => Some(ScalarValue::LargeUtf8(utf8_value)),
+ _ => Some(ScalarValue::Utf8(utf8_value)),
}
- Some(ScalarValue::Utf8(s))
}
}
}
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 5e0f8b4f5f..aa5fc7c34c 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -29,7 +29,7 @@ use arrow::datatypes::{
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray,
+    Int32Array, Int64Array, Int8Array, LargeStringArray, RecordBatch, StringArray,
     TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
     TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
@@ -1447,6 +1447,38 @@ async fn test_struct() {
}
.run();
}
+
+// UTF8
+#[tokio::test]
+async fn test_utf8() {
+ let reader = TestReader {
+ scenario: Scenario::UTF8,
+ row_per_group: 5,
+ };
+
+ // test for utf8
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(StringArray::from(vec!["a", "e"])),
+ expected_max: Arc::new(StringArray::from(vec!["d", "i"])),
+ expected_null_counts: UInt64Array::from(vec![1, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 5]),
+ column_name: "utf8",
+ }
+ .run();
+
+ // test for large_utf8
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(LargeStringArray::from(vec!["a", "e"])),
+ expected_max: Arc::new(LargeStringArray::from(vec!["d", "i"])),
+ expected_null_counts: UInt64Array::from(vec![1, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 5]),
+ column_name: "large_utf8",
+ }
+ .run();
+}
+
////// Files with missing statistics ///////
#[tokio::test]
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index f45ff53d3f..bfb6e8e555 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -29,7 +29,10 @@ use arrow::{
util::pretty::pretty_format_batches,
};
use arrow_array::types::{Int32Type, Int8Type};
-use arrow_array::{make_array, BooleanArray, DictionaryArray, Float32Array, StructArray};
+use arrow_array::{
+ make_array, BooleanArray, DictionaryArray, Float32Array, LargeStringArray,
+ StructArray,
+};
use chrono::{Datelike, Duration, TimeDelta};
use datafusion::{
     datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
@@ -90,6 +93,7 @@ enum Scenario {
WithNullValues,
WithNullValuesPageLevel,
StructArray,
+ UTF8,
}
enum Unit {
@@ -787,6 +791,16 @@ fn make_numeric_limit_batch() -> RecordBatch {
.unwrap()
}
+fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch {
+ let utf8 = StringArray::from(value.clone());
+ let large_utf8 = LargeStringArray::from(value);
+ RecordBatch::try_from_iter(vec![
+ ("utf8", Arc::new(utf8) as _),
+ ("large_utf8", Arc::new(large_utf8) as _),
+ ])
+ .unwrap()
+}
+
fn make_dict_batch() -> RecordBatch {
let values = [
Some("abc"),
@@ -1044,6 +1058,18 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
)]));
             vec![RecordBatch::try_new(schema, vec![struct_array_data]).unwrap()]
}
+ Scenario::UTF8 => {
+ vec![
+                make_utf8_batch(vec![Some("a"), Some("b"), Some("c"), Some("d"), None]),
+ make_utf8_batch(vec![
+ Some("e"),
+ Some("f"),
+ Some("g"),
+ Some("h"),
+ Some("i"),
+ ]),
+ ]
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]