This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3e5c76f23f Remove unnecessary null buffer construction when converting arrays to a different type (#6244)
3e5c76f23f is described below
commit 3e5c76f23f2f5f6ca074935fe242e57de7d8d8bd
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Aug 14 03:56:11 2024 -0700
Remove unnecessary null buffer construction when converting arrays to a different type (#6244)
* create primitive array from iter and nulls
* clippy
* speed up some more decimals
* add optimizations for byte_stream_split
* decimal256
* Revert "add optimizations for byte_stream_split"
This reverts commit 5d4ae0dc09f95ee9079b46b117fb554f63157564.
* add comments
---
arrow-array/src/array/primitive_array.rs | 14 ++++
.../src/arrow/array_reader/fixed_len_byte_array.rs | 83 ++++++++++++----------
parquet/src/arrow/array_reader/primitive_array.rs | 82 +++++++++++++--------
3 files changed, 115 insertions(+), 64 deletions(-)
diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs
index 70a8ceaef8..db14845b08 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -713,6 +713,20 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
}
}
+ /// Creates a PrimitiveArray based on an iterator of values with provided nulls
+ pub fn from_iter_values_with_nulls<I: IntoIterator<Item = T::Native>>(
+ iter: I,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ let val_buf: Buffer = iter.into_iter().collect();
+ let len = val_buf.len() / std::mem::size_of::<T::Native>();
+ Self {
+ data_type: T::DATA_TYPE,
+ values: ScalarBuffer::new(val_buf, 0, len),
+ nulls,
+ }
+ }
+
/// Creates a PrimitiveArray based on a constant value with `count` elements
pub fn from_value(value: T::Native, count: usize) -> Self {
unsafe {
diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
index 670a97f351..3b2600c547 100644
--- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
+++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
@@ -27,7 +27,7 @@ use crate::column::reader::decoder::ColumnValueDecoder;
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
use arrow_array::{
- ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
+ Array, ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
IntervalDayTimeArray, IntervalYearMonthArray,
};
use arrow_buffer::{i256, Buffer, IntervalDayTime};
@@ -165,57 +165,68 @@ impl ArrayReader for FixedLenByteArrayReader {
// TODO: An improvement might be to do this conversion on read
let array: ArrayRef = match &self.data_type {
ArrowType::Decimal128(p, s) => {
- let decimal = binary
- .iter()
- .map(|opt| Some(i128::from_be_bytes(sign_extend_be(opt?))))
- .collect::<Decimal128Array>()
+ // We can simply reuse the null buffer from `binary` rather than recomputing it
+ // (as was the case when we simply used `collect` to produce the new array).
+ // The same applies to the transformations below.
+ let nulls = binary.nulls().cloned();
+ let decimal = binary.iter().map(|o| match o {
+ Some(b) => i128::from_be_bytes(sign_extend_be(b)),
+ None => i128::default(),
+ });
+ let decimal = Decimal128Array::from_iter_values_with_nulls(decimal, nulls)
.with_precision_and_scale(*p, *s)?;
-
Arc::new(decimal)
}
ArrowType::Decimal256(p, s) => {
- let decimal = binary
- .iter()
- .map(|opt| Some(i256::from_be_bytes(sign_extend_be(opt?))))
- .collect::<Decimal256Array>()
+ let nulls = binary.nulls().cloned();
+ let decimal = binary.iter().map(|o| match o {
+ Some(b) => i256::from_be_bytes(sign_extend_be(b)),
+ None => i256::default(),
+ });
+ let decimal = Decimal256Array::from_iter_values_with_nulls(decimal, nulls)
.with_precision_and_scale(*p, *s)?;
-
Arc::new(decimal)
}
ArrowType::Interval(unit) => {
+ let nulls = binary.nulls().cloned();
// An interval is stored as 3x 32-bit unsigned integers storing months, days,
// and milliseconds
match unit {
- IntervalUnit::YearMonth => Arc::new(
- binary
- .iter()
- .map(|o| o.map(|b| i32::from_le_bytes(b[0..4].try_into().unwrap())))
- .collect::<IntervalYearMonthArray>(),
- ) as ArrayRef,
- IntervalUnit::DayTime => Arc::new(
- binary
- .iter()
- .map(|o| {
- o.map(|b| {
- IntervalDayTime::new(
- i32::from_le_bytes(b[4..8].try_into().unwrap()),
- i32::from_le_bytes(b[8..12].try_into().unwrap()),
- )
- })
- })
- .collect::<IntervalDayTimeArray>(),
- ) as ArrayRef,
+ IntervalUnit::YearMonth => {
+ let iter = binary.iter().map(|o| match o {
+ Some(b) => i32::from_le_bytes(b[0..4].try_into().unwrap()),
+ None => i32::default(),
+ });
+ let interval =
+ IntervalYearMonthArray::from_iter_values_with_nulls(iter, nulls);
+ Arc::new(interval) as ArrayRef
+ }
+ IntervalUnit::DayTime => {
+ let iter = binary.iter().map(|o| match o {
+ Some(b) => IntervalDayTime::new(
+ i32::from_le_bytes(b[4..8].try_into().unwrap()),
+ i32::from_le_bytes(b[8..12].try_into().unwrap()),
+ ),
+ None => IntervalDayTime::default(),
+ });
+ let interval =
+ IntervalDayTimeArray::from_iter_values_with_nulls(iter, nulls);
+ Arc::new(interval) as ArrayRef
+ }
IntervalUnit::MonthDayNano => {
return Err(nyi_err!("MonthDayNano intervals not supported"));
}
}
}
- ArrowType::Float16 => Arc::new(
- binary
- .iter()
- .map(|o| o.map(|b| f16::from_le_bytes(b[..2].try_into().unwrap())))
- .collect::<Float16Array>(),
- ) as ArrayRef,
+ ArrowType::Float16 => {
+ let nulls = binary.nulls().cloned();
+ let f16s = binary.iter().map(|o| match o {
+ Some(b) => f16::from_le_bytes(b[..2].try_into().unwrap()),
+ None => f16::default(),
+ });
+ let f16s = Float16Array::from_iter_values_with_nulls(f16s, nulls);
+ Arc::new(f16s) as ArrayRef
+ }
_ => Arc::new(binary) as ArrayRef,
};
diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs
index 07ecc27d9b..5e0e09212c 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -217,22 +217,35 @@ where
arrow_cast::cast(&a, target_type)?
}
ArrowType::Decimal128(p, s) => {
+ // We can simply reuse the null buffer from `array` rather than recomputing it
+ // (as was the case when we simply used `collect` to produce the new array).
+ let nulls = array.nulls().cloned();
let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .iter()
- .map(|v| v.map(|v| v as i128))
- .collect::<Decimal128Array>(),
+ ArrowType::Int32 => {
+ let decimal = array
+ .as_any()
+ .downcast_ref::<Int32Array>()
+ .unwrap()
+ .iter()
+ .map(|v| match v {
+ Some(i) => i as i128,
+ None => i128::default(),
+ });
+ Decimal128Array::from_iter_values_with_nulls(decimal, nulls)
+ }
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .iter()
- .map(|v| v.map(|v| v as i128))
- .collect::<Decimal128Array>(),
+ ArrowType::Int64 => {
+ let decimal = array
+ .as_any()
+ .downcast_ref::<Int64Array>()
+ .unwrap()
+ .iter()
+ .map(|v| match v {
+ Some(i) => i as i128,
+ None => i128::default(),
+ });
+ Decimal128Array::from_iter_values_with_nulls(decimal, nulls)
+ }
_ => {
return Err(arrow_err!(
"Cannot convert {:?} to decimal",
@@ -245,22 +258,35 @@ where
Arc::new(array) as ArrayRef
}
ArrowType::Decimal256(p, s) => {
+ // We can simply reuse the null buffer from `array` rather than recomputing it
+ // (as was the case when we simply used `collect` to produce the new array).
+ let nulls = array.nulls().cloned();
let array = match array.data_type() {
- ArrowType::Int32 => array
- .as_any()
- .downcast_ref::<Int32Array>()
- .unwrap()
- .iter()
- .map(|v| v.map(|v| i256::from_i128(v as i128)))
- .collect::<Decimal256Array>(),
+ ArrowType::Int32 => {
+ let decimal = array
+ .as_any()
+ .downcast_ref::<Int32Array>()
+ .unwrap()
+ .iter()
+ .map(|v| match v {
+ Some(i) => i256::from_i128(i as i128),
+ None => i256::default(),
+ });
+ Decimal256Array::from_iter_values_with_nulls(decimal, nulls)
+ }
- ArrowType::Int64 => array
- .as_any()
- .downcast_ref::<Int64Array>()
- .unwrap()
- .iter()
- .map(|v| v.map(|v| i256::from_i128(v as i128)))
- .collect::<Decimal256Array>(),
+ ArrowType::Int64 => {
+ let decimal = array
+ .as_any()
+ .downcast_ref::<Int64Array>()
+ .unwrap()
+ .iter()
+ .map(|v| match v {
+ Some(i) => i256::from_i128(i as i128),
+ None => i256::default(),
+ });
+ Decimal256Array::from_iter_values_with_nulls(decimal, nulls)
+ }
_ => {
return Err(arrow_err!(
"Cannot convert {:?} to decimal",