This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 8bed541f31 feat: Support round trip reading/writing Arrow type
`Dictionary(_, FixedSizeBinary(_))` to Parquet (#7446)
8bed541f31 is described below
commit 8bed541f314dfc6adfa1f7b46d4a43fd4118e7c7
Author: albertlockett <[email protected]>
AuthorDate: Fri May 9 05:55:50 2025 -0400
feat: Support round trip reading/writing Arrow type `Dictionary(_,
FixedSizeBinary(_))` to Parquet (#7446)
* support FixedSizeBinary in dict encoding
* roundtrip works
* cleanup
* clippy and linter
* support all types of keys in byte_array_dictionary
* back out change included by mistake
* linter
* PR feedback before cleanup
* PR feedback from Weston
---------
Co-authored-by: albertlockett <[email protected]>
---
parquet/src/arrow/array_reader/builder.rs | 9 ++--
.../arrow/array_reader/byte_array_dictionary.rs | 16 ++++----
parquet/src/arrow/arrow_writer/byte_array.rs | 7 +++-
parquet/src/arrow/arrow_writer/mod.rs | 48 ++++++++++++++++++++++
parquet/src/arrow/buffer/dictionary_buffer.rs | 9 ++++
5 files changed, 76 insertions(+), 13 deletions(-)
diff --git a/parquet/src/arrow/array_reader/builder.rs
b/parquet/src/arrow/array_reader/builder.rs
index 945f62526a..5ada61e93d 100644
--- a/parquet/src/arrow/array_reader/builder.rs
+++ b/parquet/src/arrow/array_reader/builder.rs
@@ -289,9 +289,12 @@ fn build_primitive_reader(
}
_ => make_byte_array_reader(page_iterator, column_desc,
arrow_type)?,
},
- PhysicalType::FIXED_LEN_BYTE_ARRAY => {
- make_fixed_len_byte_array_reader(page_iterator, column_desc,
arrow_type)?
- }
+ PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type {
+ Some(DataType::Dictionary(_, _)) => {
+ make_byte_array_dictionary_reader(page_iterator, column_desc,
arrow_type)?
+ }
+ _ => make_fixed_len_byte_array_reader(page_iterator, column_desc,
arrow_type)?,
+ },
};
Ok(Some(reader))
}
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 440db641a2..757d3df8a8 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -90,21 +90,21 @@ pub fn make_byte_array_dictionary_reader(
ArrowType::Dictionary(key_type, value_type) => {
make_reader! {
(pages, column_desc, data_type) => match (key_type.as_ref(),
value_type.as_ref()) {
- (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8) =>
(u8, i32),
+ (ArrowType::UInt8, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (u8, i32),
(ArrowType::UInt8, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (u8, i64),
- (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8) =>
(i8, i32),
+ (ArrowType::Int8, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (i8, i32),
(ArrowType::Int8, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (i8, i64),
- (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8)
=> (u16, i32),
+ (ArrowType::UInt16, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (u16, i32),
(ArrowType::UInt16, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (u16, i64),
- (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8) =>
(i16, i32),
+ (ArrowType::Int16, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (i16, i32),
(ArrowType::Int16, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (i16, i64),
- (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8)
=> (u32, i32),
+ (ArrowType::UInt32, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (u32, i32),
(ArrowType::UInt32, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (u32, i64),
- (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8) =>
(i32, i32),
+ (ArrowType::Int32, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (i32, i32),
(ArrowType::Int32, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (i32, i64),
- (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8)
=> (u64, i32),
+ (ArrowType::UInt64, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (u64, i32),
(ArrowType::UInt64, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (u64, i64),
- (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8) =>
(i64, i32),
+ (ArrowType::Int64, ArrowType::Binary | ArrowType::Utf8 |
ArrowType::FixedSizeBinary(_)) => (i64, i32),
(ArrowType::Int64, ArrowType::LargeBinary |
ArrowType::LargeUtf8) => (i64, i64),
}
}
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs
b/parquet/src/arrow/arrow_writer/byte_array.rs
index 2d23ad8510..9767ec98e6 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -27,8 +27,8 @@ use crate::schema::types::ColumnDescPtr;
use crate::util::bit_util::num_required_bits;
use crate::util::interner::{Interner, Storage};
use arrow_array::{
- Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray,
LargeBinaryArray,
- LargeStringArray, StringArray, StringViewArray,
+ Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray,
FixedSizeBinaryArray,
+ LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
};
use arrow_schema::DataType;
@@ -85,6 +85,9 @@ macro_rules! downcast_op {
DataType::LargeBinary => {
downcast_dict_op!(key, LargeBinaryArray, $array, $op$(,
$arg)*)
}
+ DataType::FixedSizeBinary(_) => {
+ downcast_dict_op!(key, FixedSizeBinaryArray, $array,
$op$(, $arg)*)
+ }
d => unreachable!("cannot downcast {} dictionary value to byte
array", d),
},
d => unreachable!("cannot downcast {} to byte array", d),
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index 1e1054c9a0..66e1b06fa7 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -989,6 +989,9 @@ impl ArrowColumnWriterFactory {
ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
out.push(bytes(leaves.next().unwrap())?)
}
+ ArrowDataType::FixedSizeBinary(_) => {
+ out.push(bytes(leaves.next().unwrap())?)
+ }
_ => {
out.push(col(leaves.next().unwrap())?)
}
@@ -1333,6 +1336,7 @@ mod tests {
use arrow_buffer::{i256, IntervalDayTime, IntervalMonthDayNano,
NullBuffer};
use arrow_schema::Fields;
use half::f16;
+ use num::{FromPrimitive, ToPrimitive};
use crate::basic::Encoding;
use crate::data_type::AsBytes;
@@ -1911,6 +1915,50 @@ mod tests {
roundtrip(batch, Some(SMALL_SIZE / 2));
}
+    /// Checks that a non-nullable `Dictionary(K, FixedSizeBinary(4))` column can be
+    /// passed through `roundtrip`, once for each integer dictionary key type.
+    #[test]
+    fn test_fixed_size_binary_in_dict() {
+        /// Builds a three-row dictionary array (keys `[0, 0, 1]` over two 4-byte
+        /// values) keyed by `K` and hands the resulting batch to `roundtrip`.
+        fn test_fixed_size_binary_in_dict_inner<K>()
+        where
+            K: ArrowDictionaryKeyType,
+            K::Native: FromPrimitive + ToPrimitive + TryFrom<u8>,
+            <<K as arrow_array::ArrowPrimitiveType>::Native as TryFrom<u8>>::Error: std::fmt::Debug,
+        {
+            let field = Field::new(
+                "a",
+                DataType::Dictionary(
+                    Box::new(K::DATA_TYPE),
+                    Box::new(DataType::FixedSizeBinary(4)),
+                ),
+                false,
+            );
+            let schema = Schema::new(vec![field]);
+
+            let keys: Vec<K::Native> = vec![
+                K::Native::try_from(0u8).unwrap(),
+                K::Native::try_from(0u8).unwrap(),
+                K::Native::try_from(1u8).unwrap(),
+            ];
+            let keys = PrimitiveArray::<K>::from_iter_values(keys);
+            let values = FixedSizeBinaryArray::try_from_iter(
+                vec![vec![0, 0, 0, 0], vec![1, 1, 1, 1]].into_iter(),
+            )
+            .unwrap();
+
+            let data = DictionaryArray::<K>::new(keys, Arc::new(values));
+            let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data)]).unwrap();
+            roundtrip(batch, None);
+        }
+
+        // Exercise every unsigned and signed key width exactly once; the fourth
+        // call previously repeated `UInt16Type`, leaving `UInt64Type` untested.
+        test_fixed_size_binary_in_dict_inner::<UInt8Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt16Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt32Type>();
+        test_fixed_size_binary_in_dict_inner::<UInt64Type>();
+        test_fixed_size_binary_in_dict_inner::<Int8Type>();
+        test_fixed_size_binary_in_dict_inner::<Int16Type>();
+        test_fixed_size_binary_in_dict_inner::<Int32Type>();
+        test_fixed_size_binary_in_dict_inner::<Int64Type>();
+    }
+
#[test]
fn test_empty_dict() {
let struct_fields = Fields::from(vec![Field::new(
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs
b/parquet/src/arrow/buffer/dictionary_buffer.rs
index 59f1cfa056..3861776393 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -154,6 +154,15 @@ impl<K: ArrowNativeType + Ord, V: OffsetSizeTrait>
DictionaryBuffer<K, V> {
}
}
+ let ArrowType::Dictionary(_, value_type) = data_type else {
+ unreachable!()
+ };
+ let values = if let ArrowType::FixedSizeBinary(size) =
**value_type {
+ arrow_cast::cast(&values,
&ArrowType::FixedSizeBinary(size)).unwrap()
+ } else {
+ values
+ };
+
let builder = ArrayDataBuilder::new(data_type.clone())
.len(keys.len())
.add_buffer(Buffer::from_vec(keys))