This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new d3fa083acf refactor: handle LargeUtf8 statistics and add tests for 
UTF8 and LargeUTF8 (#10762)
d3fa083acf is described below

commit d3fa083acfc558b9fff5c0bb539d6dc972bfbf7f
Author: Alex Huang <[email protected]>
AuthorDate: Tue Jun 4 03:55:06 2024 +0800

    refactor: handle LargeUtf8 statistics and add tests for UTF8 and LargeUTF8 
(#10762)
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../datasource/physical_plan/parquet/statistics.rs | 16 +++++-----
 datafusion/core/tests/parquet/arrow_statistics.rs  | 34 +++++++++++++++++++++-
 datafusion/core/tests/parquet/mod.rs               | 28 +++++++++++++++++-
 3 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs 
b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 1c20fa7caa..e7e6360c25 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -152,16 +152,18 @@ macro_rules! get_statistic {
                     Some(DataType::Binary) => {
                         
Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
                     }
-                    _ => {
-                        let s = std::str::from_utf8(s.$bytes_func())
+                    Some(DataType::LargeUtf8) | _ => {
+                        let utf8_value = std::str::from_utf8(s.$bytes_func())
                             .map(|s| s.to_string())
                             .ok();
-                        if s.is_none() {
-                            log::debug!(
-                                "Utf8 statistics is a non-UTF8 value, ignoring 
it."
-                            );
+                        if utf8_value.is_none() {
+                            log::debug!("Utf8 statistics is a non-UTF8 value, 
ignoring it.");
+                        }
+
+                        match $target_arrow_type {
+                            Some(DataType::LargeUtf8) => 
Some(ScalarValue::LargeUtf8(utf8_value)),
+                            _ => Some(ScalarValue::Utf8(utf8_value)),
                         }
-                        Some(ScalarValue::Utf8(s))
                     }
                 }
             }
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs 
b/datafusion/core/tests/parquet/arrow_statistics.rs
index 5e0f8b4f5f..aa5fc7c34c 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -29,7 +29,7 @@ use arrow::datatypes::{
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, 
Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, 
Int16Array,
-    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray,
+    Int32Array, Int64Array, Int8Array, LargeStringArray, RecordBatch, 
StringArray,
     TimestampMicrosecondArray, TimestampMillisecondArray, 
TimestampNanosecondArray,
     TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
@@ -1447,6 +1447,38 @@ async fn test_struct() {
     }
     .run();
 }
+
+// UTF8
+#[tokio::test]
+async fn test_utf8() {
+    let reader = TestReader {
+        scenario: Scenario::UTF8,
+        row_per_group: 5,
+    };
+
+    // test for utf8
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(StringArray::from(vec!["a", "e"])),
+        expected_max: Arc::new(StringArray::from(vec!["d", "i"])),
+        expected_null_counts: UInt64Array::from(vec![1, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 5]),
+        column_name: "utf8",
+    }
+    .run();
+
+    // test for large_utf8
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(LargeStringArray::from(vec!["a", "e"])),
+        expected_max: Arc::new(LargeStringArray::from(vec!["d", "i"])),
+        expected_null_counts: UInt64Array::from(vec![1, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 5]),
+        column_name: "large_utf8",
+    }
+    .run();
+}
+
 ////// Files with missing statistics ///////
 
 #[tokio::test]
diff --git a/datafusion/core/tests/parquet/mod.rs 
b/datafusion/core/tests/parquet/mod.rs
index f45ff53d3f..bfb6e8e555 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -29,7 +29,10 @@ use arrow::{
     util::pretty::pretty_format_batches,
 };
 use arrow_array::types::{Int32Type, Int8Type};
-use arrow_array::{make_array, BooleanArray, DictionaryArray, Float32Array, 
StructArray};
+use arrow_array::{
+    make_array, BooleanArray, DictionaryArray, Float32Array, LargeStringArray,
+    StructArray,
+};
 use chrono::{Datelike, Duration, TimeDelta};
 use datafusion::{
     datasource::{physical_plan::ParquetExec, provider_as_source, 
TableProvider},
@@ -90,6 +93,7 @@ enum Scenario {
     WithNullValues,
     WithNullValuesPageLevel,
     StructArray,
+    UTF8,
 }
 
 enum Unit {
@@ -787,6 +791,16 @@ fn make_numeric_limit_batch() -> RecordBatch {
     .unwrap()
 }
 
+fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch {
+    let utf8 = StringArray::from(value.clone());
+    let large_utf8 = LargeStringArray::from(value);
+    RecordBatch::try_from_iter(vec![
+        ("utf8", Arc::new(utf8) as _),
+        ("large_utf8", Arc::new(large_utf8) as _),
+    ])
+    .unwrap()
+}
+
 fn make_dict_batch() -> RecordBatch {
     let values = [
         Some("abc"),
@@ -1044,6 +1058,18 @@ fn create_data_batch(scenario: Scenario) -> 
Vec<RecordBatch> {
             )]));
             vec![RecordBatch::try_new(schema, 
vec![struct_array_data]).unwrap()]
         }
+        Scenario::UTF8 => {
+            vec![
+                make_utf8_batch(vec![Some("a"), Some("b"), Some("c"), 
Some("d"), None]),
+                make_utf8_batch(vec![
+                    Some("e"),
+                    Some("f"),
+                    Some("g"),
+                    Some("h"),
+                    Some("i"),
+                ]),
+            ]
+        }
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to