This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 266965beca Remove parquet arrow_cast dependency (#9077)
266965beca is described below
commit 266965becab400d6c7ff05703f2e5fc778555cc3
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Fri Jan 9 20:50:05 2026 +0000
Remove parquet arrow_cast dependency (#9077)
# Which issue does this PR close?
# Rationale for this change
arrow-cast is a fairly heavy dependency, especially now that it bundles in
arrow-ord for RunEndEncodedArrays (#8708). Removing this dependency has been
discussed as far back as 2024
(https://github.com/apache/arrow-rs/issues/4764); let's finally actually do it.
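The replacement pattern used throughout this diff swaps arrow_cast::cast for
the infallible PrimitiveArray::unary (per-value conversion) and the zero-copy
PrimitiveArray::reinterpret_cast (same native type, different logical type).
A minimal sketch of the two patterns, illustrative only and not code from
this PR:

```rust
use arrow_array::{Array, Int32Array, Int64Array, types::Date32Type};

fn main() {
    let ints = Int32Array::from(vec![Some(1), None, Some(3)]);

    // `unary` applies an infallible Rust cast to each value while reusing
    // the existing null buffer.
    let widened: Int64Array = ints.unary(|i| i as i64);
    assert_eq!(widened.value(0), 1);

    // `reinterpret_cast` rewraps the same i32 buffer under another logical
    // type with an identical native representation, copying no values.
    let dates = ints.reinterpret_cast::<Date32Type>();
    assert_eq!(dates.null_count(), 1);
}
```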
# What changes are included in this PR?
# Are these changes tested?
# Are there any user-facing changes?
Yes. Unfortunately https://github.com/apache/arrow-rs/pull/8524 added an
API that allows overriding the inferred schema, which in turn allows the
coercion machinery to traverse somewhat unintended paths. I personally
think this API shouldn't exist, but...
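Concretely, the affected path is the explicit parquet schema override,
exercised by the test removed later in this diff: writing an Int32 column
with INT64 physical storage previously round-tripped through a widening
cast. A sketch adapted from that removed test (with this change the write
is expected to fail with an error like "Cannot coerce Int32 to I64" rather
than silently widening):

```rust
use std::sync::Arc;
use arrow_array::{Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_writer::{ArrowWriter, ArrowWriterOptions};
use parquet::schema::types::{SchemaDescriptor, Type};

fn write_int32_as_int64() -> parquet::errors::Result<Vec<u8>> {
    let batch_schema = Arc::new(Schema::new(vec![Field::new(
        "integers",
        DataType::Int32,
        true,
    )]));
    // Override the inferred parquet schema, requesting INT64 storage.
    let parquet_schema = Type::group_type_builder("root")
        .with_fields(vec![
            Type::primitive_type_builder("integers", parquet::basic::Type::INT64)
                .build()?
                .into(),
        ])
        .build()?;
    let options = ArrowWriterOptions::new()
        .with_parquet_schema(SchemaDescriptor::new(parquet_schema.into()));

    let batch = RecordBatch::try_new(
        batch_schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
    )?;
    let mut buf = Vec::new();
    let mut writer = ArrowWriter::try_new_with_options(&mut buf, batch_schema, options)?;
    writer.write(&batch)?; // now an error rather than an implicit i32 -> i64 cast
    writer.close()?;
    Ok(buf)
}
```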
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/Cargo.toml | 4 +-
parquet/src/arrow/array_reader/primitive_array.rs | 558 +++++++++-------------
parquet/src/arrow/arrow_writer/mod.rs | 287 +++++------
parquet/src/arrow/buffer/dictionary_buffer.rs | 82 +++-
4 files changed, 426 insertions(+), 505 deletions(-)
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 50f69fea54..454f8455ff 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -39,7 +39,6 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"]
[dependencies]
arrow-array = { workspace = true, optional = true }
arrow-buffer = { workspace = true, optional = true }
-arrow-cast = { workspace = true, optional = true }
arrow-csv = { workspace = true, optional = true }
arrow-data = { workspace = true, optional = true }
arrow-schema = { workspace = true, optional = true }
@@ -91,6 +90,7 @@ lz4_flex = { version = "0.12", default-features = false, features = ["std", "fra
zstd = { version = "0.13", default-features = false }
serde_json = { version = "1.0", features = ["std"], default-features = false }
arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] }
+arrow-cast = { workspace = true }
tokio = { version = "1.0", default-features = false, features = ["macros", "rt-multi-thread", "io-util", "fs"] }
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
object_store = { version = "0.12.0", default-features = false, features = ["azure", "fs"] }
@@ -104,7 +104,7 @@ default = ["arrow", "snap", "brotli", "flate2-zlib-rs", "lz4", "zstd", "base64",
# Enable lz4
lz4 = ["lz4_flex"]
# Enable arrow reader/writer APIs
-arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"]
+arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"]
# Enable support for arrow canonical extension types
arrow_canonical_extension_types = ["arrow-schema?/canonical_extension_types"]
# Enable CLI tools
diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs
index 7c5d03e020..362f103661 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -21,17 +21,16 @@ use crate::arrow::schema::parquet_to_arrow_field;
use crate::basic::Type as PhysicalType;
use crate::column::page::PageIterator;
use crate::data_type::{DataType, Int96};
-use crate::errors::{ParquetError, Result};
+use crate::errors::Result;
use crate::schema::types::ColumnDescPtr;
use arrow_array::{
-    ArrayRef, BooleanArray, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
-    Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
+    Array, ArrayRef, Date64Array, Decimal64Array, Decimal128Array, Decimal256Array, Int8Array,
+    Int16Array, Int32Array, Int64Array, PrimitiveArray, UInt8Array, UInt16Array,
+    builder::PrimitiveDictionaryBuilder, cast::AsArray, downcast_integer, make_array, types::*,
+};
+use arrow_array::{
    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
- TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
- builder::{
- TimestampMicrosecondBufferBuilder, TimestampMillisecondBufferBuilder,
- TimestampNanosecondBufferBuilder, TimestampSecondBufferBuilder,
- },
+ TimestampSecondArray, UInt32Array, UInt64Array,
};
use arrow_buffer::{BooleanBuffer, Buffer, i256};
use arrow_data::ArrayDataBuilder;
@@ -63,37 +62,23 @@ impl IntoBuffer for Vec<bool> {
impl IntoBuffer for Vec<Int96> {
fn into_buffer(self, target_type: &ArrowType) -> Buffer {
+ let mut builder = Vec::with_capacity(self.len());
match target_type {
ArrowType::Timestamp(TimeUnit::Second, _) => {
-                let mut builder = TimestampSecondBufferBuilder::new(self.len());
- for v in self {
- builder.append(v.to_seconds())
- }
- builder.finish()
+ builder.extend(self.iter().map(|x| x.to_seconds()));
}
ArrowType::Timestamp(TimeUnit::Millisecond, _) => {
-                let mut builder = TimestampMillisecondBufferBuilder::new(self.len());
- for v in self {
- builder.append(v.to_millis())
- }
- builder.finish()
+ builder.extend(self.iter().map(|x| x.to_millis()));
}
ArrowType::Timestamp(TimeUnit::Microsecond, _) => {
-                let mut builder = TimestampMicrosecondBufferBuilder::new(self.len());
- for v in self {
- builder.append(v.to_micros())
- }
- builder.finish()
+ builder.extend(self.iter().map(|x| x.to_micros()));
}
ArrowType::Timestamp(TimeUnit::Nanosecond, _) => {
-                let mut builder = TimestampNanosecondBufferBuilder::new(self.len());
- for v in self {
- builder.append(v.to_nanos())
- }
- builder.finish()
+ builder.extend(self.iter().map(|x| x.to_nanos()));
}
_ => unreachable!("Invalid target_type for Int96."),
}
+ Buffer::from_vec(builder)
}
}
@@ -168,45 +153,17 @@ where
let target_type = &self.data_type;
let arrow_data_type = match T::get_physical_type() {
PhysicalType::BOOLEAN => ArrowType::Boolean,
- PhysicalType::INT32 => {
- match target_type {
- ArrowType::UInt32 => {
-                        // follow C++ implementation and use overflow/reinterpret cast from i32 to u32 which will map
- // `i32::MIN..0` to `(i32::MAX as u32)..u32::MAX`
- ArrowType::UInt32
- }
- ArrowType::Decimal32(_, _) => target_type.clone(),
- _ => ArrowType::Int32,
- }
- }
- PhysicalType::INT64 => {
- match target_type {
- ArrowType::UInt64 => {
-                        // follow C++ implementation and use overflow/reinterpret cast from i64 to u64 which will map
- // `i64::MIN..0` to `(i64::MAX as u64)..u64::MAX`
- ArrowType::UInt64
- }
- ArrowType::Decimal64(_, _) => target_type.clone(),
- _ => ArrowType::Int64,
- }
- }
+ PhysicalType::INT32 => ArrowType::Int32,
+ PhysicalType::INT64 => ArrowType::Int64,
PhysicalType::FLOAT => ArrowType::Float32,
PhysicalType::DOUBLE => ArrowType::Float64,
- PhysicalType::INT96 => match target_type {
-                ArrowType::Timestamp(TimeUnit::Second, _) => target_type.clone(),
-                ArrowType::Timestamp(TimeUnit::Millisecond, _) => target_type.clone(),
-                ArrowType::Timestamp(TimeUnit::Microsecond, _) => target_type.clone(),
-                ArrowType::Timestamp(TimeUnit::Nanosecond, _) => target_type.clone(),
- _ => unreachable!("INT96 must be a timestamp."),
- },
+ PhysicalType::INT96 => ArrowType::Int64,
PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => {
                unreachable!("PrimitiveArrayReaders don't support complex physical types");
}
};
- // Convert to arrays by using the Parquet physical type.
- // The physical types are then cast to Arrow types if necessary
-
+ // Convert to equivalent arrow type to parquet physical type
let record_data = self
.record_reader
.consume_record_data()
@@ -218,271 +175,10 @@ where
.null_bit_buffer(self.record_reader.consume_bitmap_buffer());
let array_data = unsafe { array_data.build_unchecked() };
- let array: ArrayRef = match T::get_physical_type() {
- PhysicalType::BOOLEAN => Arc::new(BooleanArray::from(array_data)),
- PhysicalType::INT32 => match array_data.data_type() {
- ArrowType::UInt32 => Arc::new(UInt32Array::from(array_data)),
- ArrowType::Int32 => Arc::new(Int32Array::from(array_data)),
-            ArrowType::Decimal32(_, _) => Arc::new(Decimal32Array::from(array_data)),
- _ => unreachable!(),
- },
- PhysicalType::INT64 => match array_data.data_type() {
- ArrowType::UInt64 => Arc::new(UInt64Array::from(array_data)),
- ArrowType::Int64 => Arc::new(Int64Array::from(array_data)),
-            ArrowType::Decimal64(_, _) => Arc::new(Decimal64Array::from(array_data)),
- _ => unreachable!(),
- },
- PhysicalType::FLOAT => Arc::new(Float32Array::from(array_data)),
- PhysicalType::DOUBLE => Arc::new(Float64Array::from(array_data)),
- PhysicalType::INT96 => match target_type {
- ArrowType::Timestamp(TimeUnit::Second, _) => {
- Arc::new(TimestampSecondArray::from(array_data))
- }
- ArrowType::Timestamp(TimeUnit::Millisecond, _) => {
- Arc::new(TimestampMillisecondArray::from(array_data))
- }
- ArrowType::Timestamp(TimeUnit::Microsecond, _) => {
- Arc::new(TimestampMicrosecondArray::from(array_data))
- }
- ArrowType::Timestamp(TimeUnit::Nanosecond, _) => {
- Arc::new(TimestampNanosecondArray::from(array_data))
- }
- _ => unreachable!("INT96 must be a timestamp."),
- },
+ let array: ArrayRef = make_array(array_data);
- PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => {
-                unreachable!("PrimitiveArrayReaders don't support complex physical types");
- }
- };
-
- // cast to Arrow type
-        // We make a strong assumption here that the casts should be infallible.
-        // If the cast fails because of incompatible datatypes, then there might
-        // be a bigger problem with how Arrow schemas are converted to Parquet.
-        //
-        // As there is not always a 1:1 mapping between Arrow and Parquet, there
- // are datatypes which we must convert explicitly.
- // These are:
- // - date64: cast int32 to date32, then date32 to date64.
- // - decimal: cast int32 to decimal, int64 to decimal
- let array = match target_type {
-            // Using `arrow_cast::cast` has been found to be very slow for converting
-            // INT32 physical type to lower bitwidth logical types. Since rust casts
-            // are infallible, instead use `unary` which is much faster (by up to 40%).
-            // One consequence of this approach is that some malformed integer columns
-            // will return (an arguably correct) result rather than null.
-            // See https://github.com/apache/arrow-rs/issues/7040 for a discussion of this
-            // issue.
- ArrowType::UInt8 if *(array.data_type()) == ArrowType::Int32 => {
- let array = array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as u8) as UInt8Array;
- Arc::new(array) as ArrayRef
- }
- ArrowType::Int8 if *(array.data_type()) == ArrowType::Int32 => {
- let array = array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i8) as Int8Array;
- Arc::new(array) as ArrayRef
- }
- ArrowType::UInt16 if *(array.data_type()) == ArrowType::Int32 => {
- let array = array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as u16) as UInt16Array;
- Arc::new(array) as ArrayRef
- }
- ArrowType::Int16 if *(array.data_type()) == ArrowType::Int32 => {
- let array = array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i16) as Int16Array;
- Arc::new(array) as ArrayRef
- }
- ArrowType::Date64 if *(array.data_type()) == ArrowType::Int32 => {
- // this is cheap as it internally reinterprets the data
- let a = arrow_cast::cast(&array, &ArrowType::Date32)?;
- arrow_cast::cast(&a, target_type)?
- }
-            ArrowType::Decimal64(p, s) if *(array.data_type()) == ArrowType::Int32 => {
-                // Apply conversion to all elements regardless of null slots as the conversion
-                // to `i64` is infallible. This improves performance by avoiding a branch in
- // the inner loop (see docs for `PrimitiveArray::unary`).
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i64)
- as Decimal64Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- Arc::new(array) as ArrayRef
- }
- ArrowType::Decimal128(p, s) => {
-                // See above comment. Conversion to `i128` is likewise infallible.
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i128)
- as Decimal128Array,
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .unary(|i| i as i128)
- as Decimal128Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- Arc::new(array) as ArrayRef
- }
- ArrowType::Decimal256(p, s) => {
-                // See above comment. Conversion to `i256` is likewise infallible.
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i256::from_i128(i as i128))
- as Decimal256Array,
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .unary(|i| i256::from_i128(i as i128))
- as Decimal256Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- Arc::new(array) as ArrayRef
- }
- ArrowType::Dictionary(_, value_type) => match value_type.as_ref() {
- ArrowType::Decimal32(p, s) => {
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i)
- as Decimal32Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal dictionary",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- arrow_cast::cast(&array, target_type)?
- }
- ArrowType::Decimal64(p, s) => {
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i64)
- as Decimal64Array,
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .unary(|i| i)
- as Decimal64Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal dictionary",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- arrow_cast::cast(&array, target_type)?
- }
- ArrowType::Decimal128(p, s) => {
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(|i| i as i128)
- as Decimal128Array,
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .unary(|i| i as i128)
- as Decimal128Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal dictionary",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- arrow_cast::cast(&array, target_type)?
- }
- ArrowType::Decimal256(p, s) => {
- let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .unary(i256::from)
- as Decimal256Array,
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .unary(i256::from)
- as Decimal256Array,
- _ => {
- return Err(arrow_err!(
- "Cannot convert {:?} to decimal dictionary",
- array.data_type()
- ));
- }
- }
- .with_precision_and_scale(*p, *s)?;
-
- arrow_cast::cast(&array, target_type)?
- }
- _ => arrow_cast::cast(&array, target_type)?,
- },
- _ => arrow_cast::cast(&array, target_type)?,
- };
+ // Coerce the arrow type to the desired array type
+ let array = coerce_array(array, target_type)?;
// save definition and repetition buffers
self.def_levels_buffer = self.record_reader.consume_def_levels();
@@ -504,6 +200,220 @@ where
}
}
+/// Coerce the parquet physical type array to the target type
+///
+/// This should match the logic in schema::primitive::apply_hint
+fn coerce_array(array: ArrayRef, target_type: &ArrowType) -> Result<ArrayRef> {
+ if let ArrowType::Dictionary(key_type, value_type) = target_type {
+ let dictionary = pack_dictionary(key_type, array.as_ref())?;
+ let any_dictionary = dictionary.as_any_dictionary();
+
+ let coerced_values =
+            coerce_array(Arc::clone(any_dictionary.values()), value_type.as_ref())?;
+
+ return Ok(any_dictionary.with_values(coerced_values));
+ }
+
+ match array.data_type() {
+ ArrowType::Int32 => coerce_i32(array.as_primitive(), target_type),
+ ArrowType::Int64 => coerce_i64(array.as_primitive(), target_type),
+        ArrowType::Boolean | ArrowType::Float32 | ArrowType::Float64 => Ok(array),
+ _ => unreachable!(),
+ }
+}
+
+fn coerce_i32(array: &Int32Array, target_type: &ArrowType) -> Result<ArrayRef> {
+ Ok(match target_type {
+ ArrowType::UInt8 => {
+ let array = array.unary(|i| i as u8) as UInt8Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int8 => {
+ let array = array.unary(|i| i as i8) as Int8Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::UInt16 => {
+ let array = array.unary(|i| i as u16) as UInt16Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int16 => {
+ let array = array.unary(|i| i as i16) as Int16Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int32 => Arc::new(array.clone()),
+        // follow C++ implementation and use overflow/reinterpret cast from i32 to u32 which will map
+ // `i32::MIN..0` to `(i32::MAX as u32)..u32::MAX`
+ ArrowType::UInt32 => Arc::new(UInt32Array::new(
+ array.values().inner().clone().into(),
+ array.nulls().cloned(),
+ )) as ArrayRef,
+        ArrowType::Date32 => Arc::new(array.reinterpret_cast::<Date32Type>()) as _,
+ ArrowType::Date64 => {
+ let array: Date64Array = array.unary(|x| x as i64 * 86_400_000);
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Time32(TimeUnit::Second) => {
+ Arc::new(array.reinterpret_cast::<Time32SecondType>()) as ArrayRef
+ }
+ ArrowType::Time32(TimeUnit::Millisecond) => {
+            Arc::new(array.reinterpret_cast::<Time32MillisecondType>()) as ArrayRef
+ }
+ ArrowType::Timestamp(time_unit, timezone) => match time_unit {
+ TimeUnit::Second => {
+ let array: TimestampSecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Millisecond => {
+ let array: TimestampMillisecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Microsecond => {
+ let array: TimestampMicrosecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Nanosecond => {
+ let array: TimestampNanosecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ },
+ ArrowType::Decimal32(p, s) => {
+ let array = array
+ .reinterpret_cast::<Decimal32Type>()
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal64(p, s) => {
+ let array: Decimal64Array =
+ array.unary(|i| i as i64).with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal128(p, s) => {
+ let array: Decimal128Array = array
+ .unary(|i| i as i128)
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal256(p, s) => {
+ let array: Decimal256Array = array
+ .unary(|i| i256::from_i128(i as i128))
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ _ => unreachable!("Cannot coerce i32 to {target_type}"),
+ })
+}
+
+fn coerce_i64(array: &Int64Array, target_type: &ArrowType) -> Result<ArrayRef> {
+ Ok(match target_type {
+ ArrowType::Int64 => Arc::new(array.clone()) as _,
+        // follow C++ implementation and use overflow/reinterpret cast from i64 to u64 which will map
+ // `i64::MIN..0` to `(i64::MAX as u64)..u64::MAX`
+ ArrowType::UInt64 => Arc::new(UInt64Array::new(
+ array.values().inner().clone().into(),
+ array.nulls().cloned(),
+ )) as ArrayRef,
+        ArrowType::Date64 => Arc::new(array.reinterpret_cast::<Date64Type>()) as _,
+ ArrowType::Time64(TimeUnit::Microsecond) => {
+ Arc::new(array.reinterpret_cast::<Time64MicrosecondType>()) as _
+ }
+ ArrowType::Time64(TimeUnit::Nanosecond) => {
+ Arc::new(array.reinterpret_cast::<Time64NanosecondType>()) as _
+ }
+ ArrowType::Duration(unit) => match unit {
+            TimeUnit::Second => Arc::new(array.reinterpret_cast::<DurationSecondType>()) as _,
+            TimeUnit::Millisecond => {
+                Arc::new(array.reinterpret_cast::<DurationMillisecondType>()) as _
+            }
+            TimeUnit::Microsecond => {
+                Arc::new(array.reinterpret_cast::<DurationMicrosecondType>()) as _
+            }
+            TimeUnit::Nanosecond => {
+                Arc::new(array.reinterpret_cast::<DurationNanosecondType>()) as _
+ }
+ },
+ ArrowType::Timestamp(time_unit, timezone) => match time_unit {
+ TimeUnit::Second => {
+ let array = array
+ .reinterpret_cast::<TimestampSecondType>()
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Millisecond => {
+ let array = array
+ .reinterpret_cast::<TimestampMillisecondType>()
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Microsecond => {
+ let array = array
+ .reinterpret_cast::<TimestampMicrosecondType>()
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Nanosecond => {
+ let array = array
+ .reinterpret_cast::<TimestampNanosecondType>()
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ },
+ ArrowType::Decimal64(p, s) => {
+ let array = array
+ .reinterpret_cast::<Decimal64Type>()
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as _
+ }
+ ArrowType::Decimal128(p, s) => {
+ let array: Decimal128Array = array
+ .unary(|i| i as i128)
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as _
+ }
+ ArrowType::Decimal256(p, s) => {
+ let array: Decimal256Array = array
+ .unary(|i| i256::from_i128(i as i128))
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as _
+ }
+ _ => unreachable!("Cannot coerce i64 to {target_type}"),
+ })
+}
+
+macro_rules! pack_dictionary_helper {
+ ($t:ty, $values:ident) => {
+ match $values.data_type() {
+            ArrowType::Int32 => pack_dictionary_impl::<$t, Int32Type>($values.as_primitive()),
+            ArrowType::Int64 => pack_dictionary_impl::<$t, Int64Type>($values.as_primitive()),
+            ArrowType::Float32 => pack_dictionary_impl::<$t, Float32Type>($values.as_primitive()),
+            ArrowType::Float64 => pack_dictionary_impl::<$t, Float64Type>($values.as_primitive()),
+ _ => unreachable!("Invalid physical type"),
+ }
+ };
+}
+
+fn pack_dictionary(key: &ArrowType, values: &dyn Array) -> Result<ArrayRef> {
+ downcast_integer! {
+ key => (pack_dictionary_helper, values),
+ _ => unreachable!("Invalid key type"),
+ }
+}
+
+fn pack_dictionary_impl<K: ArrowDictionaryKeyType, V: ArrowPrimitiveType>(
+ values: &PrimitiveArray<V>,
+) -> Result<ArrayRef> {
+    let mut builder = PrimitiveDictionaryBuilder::<K, V>::with_capacity(1024, values.len());
+ builder.extend(values);
+ Ok(Arc::new(builder.finish()))
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index a6cd200678..e5c5500d63 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -26,8 +26,10 @@ use std::vec::IntoIter;
use arrow_array::cast::AsArray;
use arrow_array::types::*;
-use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter};
-use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef};
+use arrow_array::{ArrayRef, Int32Array, RecordBatch, RecordBatchWriter};
+use arrow_schema::{
+    ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef, TimeUnit,
+};
use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision};
@@ -819,7 +821,15 @@ impl ArrowColumnWriter {
pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()> {
match &mut self.writer {
ArrowColumnWriterImpl::Column(c) => {
- write_leaf(c, &col.0)?;
+ let leaf = col.0.array();
+ match leaf.as_any_dictionary_opt() {
+ Some(dictionary) => {
+ let materialized =
+                            arrow_select::take::take(dictionary.values(), dictionary.keys(), None)?;
+ write_leaf(c, &materialized, &col.0)?
+ }
+ None => write_leaf(c, leaf, &col.0)?,
+ };
}
ArrowColumnWriterImpl::ByteArray(c) => {
write_primitive(c, col.0.array().as_ref(), &col.0)?;
@@ -1132,26 +1142,65 @@ impl ArrowColumnWriterFactory {
}
}
-fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usize> {
- let column = levels.array().as_ref();
+fn write_leaf(
+ writer: &mut ColumnWriter<'_>,
+ column: &dyn arrow_array::Array,
+ levels: &ArrayLevels,
+) -> Result<usize> {
let indices = levels.non_null_indices();
+
match writer {
+ // Note: this should match the contents of arrow_to_parquet_type
ColumnWriter::Int32ColumnWriter(typed) => {
match column.data_type() {
- ArrowDataType::Date64 => {
-                    // If the column is a Date64, we cast it to a Date32, and then interpret that as Int32
-                    let array = arrow_cast::cast(column, &ArrowDataType::Date32)?;
-                    let array = arrow_cast::cast(&array, &ArrowDataType::Int32)?;
-
- let array = array.as_primitive::<Int32Type>();
+ ArrowDataType::Null => {
+ let array = Int32Array::new_null(column.len());
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Int8 => {
+                    let array: Int32Array = column.as_primitive::<Int8Type>().unary(|x| x as i32);
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Int16 => {
+                    let array: Int32Array = column.as_primitive::<Int16Type>().unary(|x| x as i32);
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Int32 => {
+                    write_primitive(typed, column.as_primitive::<Int32Type>().values(), levels)
+ }
+ ArrowDataType::UInt8 => {
+                    let array: Int32Array = column.as_primitive::<UInt8Type>().unary(|x| x as i32);
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::UInt16 => {
+                    let array: Int32Array = column.as_primitive::<UInt16Type>().unary(|x| x as i32);
write_primitive(typed, array.values(), levels)
}
ArrowDataType::UInt32 => {
- let values = column.as_primitive::<UInt32Type>().values();
                    // follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map
// `(i32::MAX as u32)..u32::MAX` to `i32::MIN..0`
- let array = values.inner().typed_data::<i32>();
- write_primitive(typed, array, levels)
+ let array = column.as_primitive::<UInt32Type>();
+                    write_primitive(typed, array.values().inner().typed_data(), levels)
+ }
+ ArrowDataType::Date32 => {
+ let array = column.as_primitive::<Date32Type>();
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Time32(TimeUnit::Second) => {
+ let array = column.as_primitive::<Time32SecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Time32(TimeUnit::Millisecond) => {
+ let array = column.as_primitive::<Time32MillisecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Date64 => {
+ // If the column is a Date64, we truncate it
+ let array: Int32Array = column
+ .as_primitive::<Date64Type>()
+ .unary(|x| (x / 86_400_000) as _);
+
+ write_primitive(typed, array.values(), levels)
}
ArrowDataType::Decimal32(_, _) => {
let array = column
@@ -1180,46 +1229,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
.unary::<_, Int32Type>(|v| v.as_i128() as i32);
write_primitive(typed, array.values(), levels)
}
-            ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
- ArrowDataType::Decimal32(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal32Type>()
- .unary::<_, Int32Type>(|v| v);
- write_primitive(typed, array.values(), levels)
- }
- ArrowDataType::Decimal64(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal64Type>()
- .unary::<_, Int32Type>(|v| v as i32);
- write_primitive(typed, array.values(), levels)
- }
- ArrowDataType::Decimal128(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal128Type>()
- .unary::<_, Int32Type>(|v| v as i32);
- write_primitive(typed, array.values(), levels)
- }
- ArrowDataType::Decimal256(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal256Type>()
- .unary::<_, Int32Type>(|v| v.as_i128() as i32);
- write_primitive(typed, array.values(), levels)
- }
- _ => {
-                        let array = arrow_cast::cast(column, &ArrowDataType::Int32)?;
- let array = array.as_primitive::<Int32Type>();
- write_primitive(typed, array.values(), levels)
- }
- },
- _ => {
-                    let array = arrow_cast::cast(column, &ArrowDataType::Int32)?;
- let array = array.as_primitive::<Int32Type>();
- write_primitive(typed, array.values(), levels)
- }
+                d => Err(ParquetError::General(format!("Cannot coerce {d} to I32"))),
}
}
ColumnWriter::BoolColumnWriter(typed) => {
@@ -1233,9 +1243,10 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
ColumnWriter::Int64ColumnWriter(typed) => {
match column.data_type() {
ArrowDataType::Date64 => {
-                    let array = arrow_cast::cast(column, &ArrowDataType::Int64)?;
+ let array = column
+ .as_primitive::<Date64Type>()
+ .reinterpret_cast::<Int64Type>();
- let array = array.as_primitive::<Int64Type>();
write_primitive(typed, array.values(), levels)
}
ArrowDataType::Int64 => {
@@ -1249,10 +1260,54 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
let array = values.inner().typed_data::<i64>();
write_primitive(typed, array, levels)
}
+ ArrowDataType::Time64(TimeUnit::Microsecond) => {
+ let array = column.as_primitive::<Time64MicrosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Time64(TimeUnit::Nanosecond) => {
+ let array = column.as_primitive::<Time64NanosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ ArrowDataType::Timestamp(unit, _) => match unit {
+ TimeUnit::Second => {
+                        let array = column.as_primitive::<TimestampSecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Millisecond => {
+                        let array = column.as_primitive::<TimestampMillisecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Microsecond => {
+                        let array = column.as_primitive::<TimestampMicrosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Nanosecond => {
+                        let array = column.as_primitive::<TimestampNanosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ },
+ ArrowDataType::Duration(unit) => match unit {
+ TimeUnit::Second => {
+                        let array = column.as_primitive::<DurationSecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Millisecond => {
+                        let array = column.as_primitive::<DurationMillisecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Microsecond => {
+                        let array = column.as_primitive::<DurationMicrosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ TimeUnit::Nanosecond => {
+                        let array = column.as_primitive::<DurationNanosecondType>();
+ write_primitive(typed, array.values(), levels)
+ }
+ },
ArrowDataType::Decimal64(_, _) => {
let array = column
.as_primitive::<Decimal64Type>()
- .unary::<_, Int64Type>(|v| v);
+ .reinterpret_cast::<Int64Type>();
write_primitive(typed, array.values(), levels)
}
ArrowDataType::Decimal128(_, _) => {
@@ -1269,39 +1324,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
.unary::<_, Int64Type>(|v| v.as_i128() as i64);
write_primitive(typed, array.values(), levels)
}
-            ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
- ArrowDataType::Decimal64(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal64Type>()
- .unary::<_, Int64Type>(|v| v);
- write_primitive(typed, array.values(), levels)
- }
- ArrowDataType::Decimal128(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal128Type>()
- .unary::<_, Int64Type>(|v| v as i64);
- write_primitive(typed, array.values(), levels)
- }
- ArrowDataType::Decimal256(_, _) => {
- let array = arrow_cast::cast(column, value_type)?;
- let array = array
- .as_primitive::<Decimal256Type>()
- .unary::<_, Int64Type>(|v| v.as_i128() as i64);
- write_primitive(typed, array.values(), levels)
- }
- _ => {
-                        let array = arrow_cast::cast(column, &ArrowDataType::Int64)?;
- let array = array.as_primitive::<Int64Type>();
- write_primitive(typed, array.values(), levels)
- }
- },
- _ => {
-                    let array = arrow_cast::cast(column, &ArrowDataType::Int64)?;
- let array = array.as_primitive::<Int64Type>();
- write_primitive(typed, array.values(), levels)
- }
+                d => Err(ParquetError::General(format!("Cannot coerce {d} to I64"))),
}
}
ColumnWriter::Int96ColumnWriter(_typed) => {
@@ -1322,17 +1345,11 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
let bytes = match column.data_type() {
ArrowDataType::Interval(interval_unit) => match interval_unit {
IntervalUnit::YearMonth => {
-                        let array = column
-                            .as_any()
-                            .downcast_ref::<arrow_array::IntervalYearMonthArray>()
-                            .unwrap();
+                        let array = column.as_primitive::<IntervalYearMonthType>();
get_interval_ym_array_slice(array, indices)
}
IntervalUnit::DayTime => {
-                        let array = column
-                            .as_any()
-                            .downcast_ref::<arrow_array::IntervalDayTimeArray>()
-                            .unwrap();
+                        let array = column.as_primitive::<IntervalDayTimeType>();
get_interval_dt_array_slice(array, indices)
}
_ => {
@@ -1342,10 +1359,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
}
},
ArrowDataType::FixedSizeBinary(_) => {
- let array = column
- .as_any()
- .downcast_ref::<arrow_array::FixedSizeBinaryArray>()
- .unwrap();
+ let array = column.as_fixed_size_binary();
get_fsb_array_slice(array, indices)
}
ArrowDataType::Decimal32(_, _) => {
@@ -1361,10 +1375,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
get_decimal_128_array_slice(array, indices)
}
ArrowDataType::Decimal256(_, _) => {
- let array = column
- .as_any()
- .downcast_ref::<arrow_array::Decimal256Array>()
- .unwrap();
+ let array = column.as_primitive::<Decimal256Type>();
get_decimal_256_array_slice(array, indices)
}
ArrowDataType::Float16 => {
@@ -1533,7 +1544,7 @@ mod tests {
use crate::file::page_index::column_index::ColumnIndexMetaData;
use crate::file::reader::SerializedPageReader;
use crate::parquet_thrift::{ReadThrift, ThriftSliceInputProtocol};
- use crate::schema::types::{ColumnPath, Type};
+ use crate::schema::types::ColumnPath;
use arrow::datatypes::ToByteSlice;
use arrow::datatypes::{DataType, Schema};
use arrow::error::Result as ArrowResult;
@@ -4203,68 +4214,6 @@ mod tests {
}
}
- #[test]
- fn test_arrow_writer_explicit_schema() {
- // Write an int32 array using explicit int64 storage
- let batch_schema = Arc::new(Schema::new(vec![Field::new(
- "integers",
- DataType::Int32,
- true,
- )]));
- let parquet_schema = Type::group_type_builder("root")
- .with_fields(vec![
-                Type::primitive_type_builder("integers", crate::basic::Type::INT64)
- .build()
- .unwrap()
- .into(),
- ])
- .build()
- .unwrap();
-        let parquet_schema_descr = SchemaDescriptor::new(parquet_schema.into());
-
- let batch = RecordBatch::try_new(
- batch_schema.clone(),
- vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
- )
- .unwrap();
-
- let explicit_schema_options =
-            ArrowWriterOptions::new().with_parquet_schema(parquet_schema_descr);
- let mut buf = Vec::with_capacity(1024);
- let mut writer = ArrowWriter::try_new_with_options(
- &mut buf,
- batch_schema.clone(),
- explicit_schema_options,
- )
- .unwrap();
- writer.write(&batch).unwrap();
- writer.close().unwrap();
-
- let bytes = Bytes::from(buf);
-        let reader_builder = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
-
- let expected_schema = Arc::new(Schema::new(vec![Field::new(
- "integers",
- DataType::Int64,
- true,
- )]));
- assert_eq!(reader_builder.schema(), &expected_schema);
-
- let batches = reader_builder
- .build()
- .unwrap()
- .collect::<Result<Vec<_>, ArrowError>>()
- .unwrap();
- assert_eq!(batches.len(), 1);
-
- let expected_batch = RecordBatch::try_new(
- expected_schema.clone(),
- vec![Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as _],
- )
- .unwrap();
- assert_eq!(batches[0], expected_batch);
- }
-
#[test]
fn mismatched_schemas() {
        let batch_schema = Schema::new(vec![Field::new("count", DataType::Int32, false)]);
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs
index 49328d8c96..71fb18917d 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -18,7 +18,14 @@
use crate::arrow::buffer::offset_buffer::OffsetBuffer;
use crate::arrow::record_reader::buffer::ValuesBuffer;
use crate::errors::{ParquetError, Result};
-use arrow_array::{Array, ArrayRef, OffsetSizeTrait, make_array};
+use arrow_array::{Array, GenericByteArray, downcast_integer};
+use arrow_array::{
+ ArrayRef, FixedSizeBinaryArray, OffsetSizeTrait,
+ builder::{FixedSizeBinaryDictionaryBuilder, GenericByteDictionaryBuilder},
+ cast::AsArray,
+ make_array,
+ types::{ArrowDictionaryKeyType, ByteArrayType},
+};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayDataBuilder;
use arrow_schema::DataType as ArrowType;
@@ -158,7 +165,12 @@ impl<K: ArrowNativeType + Ord, V: OffsetSizeTrait> DictionaryBuffer<K, V> {
unreachable!()
};
                let values = if let ArrowType::FixedSizeBinary(size) = **value_type {
-                    arrow_cast::cast(&values, &ArrowType::FixedSizeBinary(size)).unwrap()
+ let binary = values.as_binary::<i32>();
+ Arc::new(FixedSizeBinaryArray::new(
+ size,
+ binary.values().clone(),
+ binary.nulls().cloned(),
+ )) as _
} else {
values
};
@@ -177,17 +189,13 @@ impl<K: ArrowNativeType + Ord, V: OffsetSizeTrait> DictionaryBuffer<K, V> {
Ok(make_array(data))
}
Self::Values { values } => {
- let value_type = match data_type {
- ArrowType::Dictionary(_, v) => v.as_ref().clone(),
+ let (key_type, value_type) = match data_type {
+ ArrowType::Dictionary(k, v) => (k, v.as_ref().clone()),
_ => unreachable!(),
};
- // This will compute a new dictionary
- let array =
-                    arrow_cast::cast(&values.into_array(null_buffer, value_type), data_type)
- .expect("cast should be infallible");
-
- Ok(array)
+ let array = values.into_array(null_buffer, value_type);
+ pack_values(key_type, &array)
}
}
}
@@ -213,6 +221,60 @@ impl<K: ArrowNativeType, V: OffsetSizeTrait> ValuesBuffer for DictionaryBuffer<K
}
}
+macro_rules! dict_helper {
+ ($k:ty, $array:ident) => {
+ match $array.data_type() {
+            ArrowType::Utf8 => pack_values_impl::<$k, _>($array.as_string::<i32>()),
+            ArrowType::LargeUtf8 => pack_values_impl::<$k, _>($array.as_string::<i64>()),
+            ArrowType::Binary => pack_values_impl::<$k, _>($array.as_binary::<i32>()),
+            ArrowType::LargeBinary => pack_values_impl::<$k, _>($array.as_binary::<i64>()),
+ ArrowType::FixedSizeBinary(_) => {
+ pack_fixed_values_impl::<$k>($array.as_fixed_size_binary())
+ }
+ _ => unreachable!(),
+ }
+ };
+}
+
+fn pack_values(key_type: &ArrowType, values: &ArrayRef) -> Result<ArrayRef> {
+ downcast_integer! {
+ key_type => (dict_helper, values),
+ _ => unreachable!(),
+ }
+}
+
+fn pack_values_impl<K: ArrowDictionaryKeyType, T: ByteArrayType>(
+ array: &GenericByteArray<T>,
+) -> Result<ArrayRef> {
+    let mut builder = GenericByteDictionaryBuilder::<K, T>::with_capacity(array.len(), 1024, 1024);
+ for x in array {
+ match x {
+ Some(x) => builder.append_value(x),
+ None => builder.append_null(),
+ }
+ }
+ let raw = builder.finish();
+ Ok(Arc::new(raw))
+}
+
+fn pack_fixed_values_impl<K: ArrowDictionaryKeyType>(
+ array: &FixedSizeBinaryArray,
+) -> Result<ArrayRef> {
+ let mut builder = FixedSizeBinaryDictionaryBuilder::<K>::with_capacity(
+ array.len(),
+ 1024,
+ array.value_length(),
+ );
+ for x in array {
+ match x {
+ Some(x) => builder.append_value(x),
+ None => builder.append_null(),
+ }
+ }
+ let raw = builder.finish();
+ Ok(Arc::new(raw))
+}
+
#[cfg(test)]
mod tests {
use super::*;