This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fbbab6c7ad Minor: Add tests for extracting dictionary parquet statistics (#10729)
fbbab6c7ad is described below

commit fbbab6c7adb0c2c285ff3f9ed25f5bd9796ecb89
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jun 3 11:31:06 2024 -0400

    Minor: Add tests for extracting dictionary parquet statistics (#10729)
---
 datafusion/core/tests/parquet/arrow_statistics.rs | 40 +++++++++++++++++++--
 datafusion/core/tests/parquet/mod.rs              | 44 ++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 2836cd2893..5e0f8b4f5f 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -1231,8 +1231,44 @@ async fn test_decimal() {
     .run();
 }
 
-// BUG: not convert BinaryArray to StringArray
-// https://github.com/apache/datafusion/issues/10605
+#[tokio::test]
+async fn test_dictionary() {
+    let reader = TestReader {
+        scenario: Scenario::Dictionary,
+        row_per_group: 5,
+    };
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
+        expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
+        expected_null_counts: UInt64Array::from(vec![1, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "string_dict_i8",
+    }
+    .run();
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
+        expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
+        expected_null_counts: UInt64Array::from(vec![1, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "string_dict_i32",
+    }
+    .run();
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(Int64Array::from(vec![-100, 0])),
+        expected_max: Arc::new(Int64Array::from(vec![0, 100])),
+        expected_null_counts: UInt64Array::from(vec![1, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "int_dict_i8",
+    }
+    .run();
+}
+
 #[tokio::test]
 async fn test_byte() {
     // This creates a parquet file of 4 columns
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index 41a0a86aa8..f45ff53d3f 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -28,7 +28,8 @@ use arrow::{
     record_batch::RecordBatch,
     util::pretty::pretty_format_batches,
 };
-use arrow_array::{make_array, BooleanArray, Float32Array, StructArray};
+use arrow_array::types::{Int32Type, Int8Type};
+use arrow_array::{make_array, BooleanArray, DictionaryArray, Float32Array, StructArray};
 use chrono::{Datelike, Duration, TimeDelta};
 use datafusion::{
     datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
@@ -81,7 +82,10 @@ enum Scenario {
     DecimalBloomFilterInt64,
     DecimalLargePrecision,
     DecimalLargePrecisionBloomFilter,
+    /// StringArray, BinaryArray, FixedSizeBinaryArray
     ByteArray,
+    /// DictionaryArray
+    Dictionary,
     PeriodsInColumnNames,
     WithNullValues,
     WithNullValuesPageLevel,
@@ -783,6 +787,41 @@ fn make_numeric_limit_batch() -> RecordBatch {
     .unwrap()
 }
 
+fn make_dict_batch() -> RecordBatch {
+    let values = [
+        Some("abc"),
+        Some("def"),
+        None,
+        Some("def"),
+        Some("abc"),
+        Some("fffff"),
+        Some("aaa"),
+    ];
+    let dict_i8_array = DictionaryArray::<Int8Type>::from_iter(values.iter().cloned());
+    let dict_i32_array = DictionaryArray::<Int32Type>::from_iter(values.iter().cloned());
+
+    // Dictionary array of integers
+    let int64_values = Int64Array::from(vec![0, -100, 100]);
+    let keys = Int8Array::from_iter([
+        Some(0),
+        Some(1),
+        None,
+        Some(0),
+        Some(0),
+        Some(2),
+        Some(0),
+    ]);
+    let dict_i8_int_array =
+        DictionaryArray::<Int8Type>::try_new(keys, Arc::new(int64_values)).unwrap();
+
+    RecordBatch::try_from_iter(vec![
+        ("string_dict_i8", Arc::new(dict_i8_array) as _),
+        ("string_dict_i32", Arc::new(dict_i32_array) as _),
+        ("int_dict_i8", Arc::new(dict_i8_int_array) as _),
+    ])
+    .unwrap()
+}
+
 fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
     match scenario {
         Scenario::Boolean => {
@@ -954,6 +993,9 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
                 ),
             ]
         }
+        Scenario::Dictionary => {
+            vec![make_dict_batch()]
+        }
         Scenario::PeriodsInColumnNames => {
             vec![
                 // all frontend


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to