This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fbbab6c7ad Minor: Add tests for extracting dictionary parquet statistics (#10729)
fbbab6c7ad is described below
commit fbbab6c7adb0c2c285ff3f9ed25f5bd9796ecb89
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jun 3 11:31:06 2024 -0400
Minor: Add tests for extracting dictionary parquet statistics (#10729)
---
datafusion/core/tests/parquet/arrow_statistics.rs | 40 +++++++++++++++++++--
datafusion/core/tests/parquet/mod.rs | 44 ++++++++++++++++++++++-
2 files changed, 81 insertions(+), 3 deletions(-)
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 2836cd2893..5e0f8b4f5f 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -1231,8 +1231,44 @@ async fn test_decimal() {
.run();
}
-// BUG: not convert BinaryArray to StringArray
-// https://github.com/apache/datafusion/issues/10605
+#[tokio::test]
+async fn test_dictionary() {
+ let reader = TestReader {
+ scenario: Scenario::Dictionary,
+ row_per_group: 5,
+ };
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
+ expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
+ expected_null_counts: UInt64Array::from(vec![1, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "string_dict_i8",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
+ expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
+ expected_null_counts: UInt64Array::from(vec![1, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "string_dict_i32",
+ }
+ .run();
+
+ Test {
+ reader: reader.build().await,
+ expected_min: Arc::new(Int64Array::from(vec![-100, 0])),
+ expected_max: Arc::new(Int64Array::from(vec![0, 100])),
+ expected_null_counts: UInt64Array::from(vec![1, 0]),
+ expected_row_counts: UInt64Array::from(vec![5, 2]),
+ column_name: "int_dict_i8",
+ }
+ .run();
+}
+
#[tokio::test]
async fn test_byte() {
// This creates a parquet file of 4 columns
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index 41a0a86aa8..f45ff53d3f 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -28,7 +28,8 @@ use arrow::{
record_batch::RecordBatch,
util::pretty::pretty_format_batches,
};
-use arrow_array::{make_array, BooleanArray, Float32Array, StructArray};
+use arrow_array::types::{Int32Type, Int8Type};
+use arrow_array::{make_array, BooleanArray, DictionaryArray, Float32Array, StructArray};
use chrono::{Datelike, Duration, TimeDelta};
use datafusion::{
datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
@@ -81,7 +82,10 @@ enum Scenario {
DecimalBloomFilterInt64,
DecimalLargePrecision,
DecimalLargePrecisionBloomFilter,
+ /// StringArray, BinaryArray, FixedSizeBinaryArray
ByteArray,
+ /// DictionaryArray
+ Dictionary,
PeriodsInColumnNames,
WithNullValues,
WithNullValuesPageLevel,
@@ -783,6 +787,41 @@ fn make_numeric_limit_batch() -> RecordBatch {
.unwrap()
}
+fn make_dict_batch() -> RecordBatch {
+ let values = [
+ Some("abc"),
+ Some("def"),
+ None,
+ Some("def"),
+ Some("abc"),
+ Some("fffff"),
+ Some("aaa"),
+ ];
+    let dict_i8_array = DictionaryArray::<Int8Type>::from_iter(values.iter().cloned());
+    let dict_i32_array = DictionaryArray::<Int32Type>::from_iter(values.iter().cloned());
+
+ // Dictionary array of integers
+ let int64_values = Int64Array::from(vec![0, -100, 100]);
+ let keys = Int8Array::from_iter([
+ Some(0),
+ Some(1),
+ None,
+ Some(0),
+ Some(0),
+ Some(2),
+ Some(0),
+ ]);
+ let dict_i8_int_array =
+        DictionaryArray::<Int8Type>::try_new(keys, Arc::new(int64_values)).unwrap();
+
+ RecordBatch::try_from_iter(vec![
+ ("string_dict_i8", Arc::new(dict_i8_array) as _),
+ ("string_dict_i32", Arc::new(dict_i32_array) as _),
+ ("int_dict_i8", Arc::new(dict_i8_int_array) as _),
+ ])
+ .unwrap()
+}
+
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
@@ -954,6 +993,9 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
),
]
}
+ Scenario::Dictionary => {
+ vec![make_dict_batch()]
+ }
Scenario::PeriodsInColumnNames => {
vec![
// all frontend
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]