This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 90839dfbf1 [Parquet] perf: Create StructArrays directly rather than
via `ArrayData` (1% improvement) (#9120)
90839dfbf1 is described below
commit 90839dfbf104056062cf6fe68ca2b7c7babacaa5
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Jan 13 08:48:18 2026 -0500
[Parquet] perf: Create StructArrays directly rather than via `ArrayData`
(1% improvement) (#9120)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- part of https://github.com/apache/arrow-rs/issues/9061
- Part of https://github.com/apache/arrow-rs/issues/9128
# Rationale for this change
I noticed on https://github.com/apache/arrow-rs/issues/9061 that there
is non-trivial overhead to create struct arrays. I am trying to improve
`make_array` in parallel, but @tustvold had an even better idea in
https://github.com/apache/arrow-rs/pull/9058#issuecomment-3712272488
> My 2 cents is it would be better to move the codepaths relying on
ArrayData over to using the typed arrays directly, this should not only
cut down on allocations but unnecessary validation and dispatch
overheads.
# What changes are included in this PR?
Update the parquet `StructArray` reader (used for the top level
RecordBatch) to directly construct StructArray rather than using
ArrayData
# Are these changes tested?
By existing CI
Benchmarks show a small repeatable improvement of a few percent. For
example
```
arrow_reader_clickbench/async/Q10 1.00 12.7±0.35ms ? ?/sec
1.02 12.9±0.44ms ? ?/sec
```
I am pretty sure this is because the ClickBench dataset has more than
100 columns. Creating such a struct array requires cloning 100
`ArrayData` instances (one for each child), each of which holds a
`Vec<Buffer>`. So this saves (at least) 100 allocations per batch
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
arrow-buffer/src/builder/boolean.rs | 10 ++++++-
parquet/src/arrow/array_reader/struct_array.rs | 36 +++++++++++++++-----------
2 files changed, 30 insertions(+), 16 deletions(-)
diff --git a/arrow-buffer/src/builder/boolean.rs
b/arrow-buffer/src/builder/boolean.rs
index fbc3bcc68f..7990be1e7c 100644
--- a/arrow-buffer/src/builder/boolean.rs
+++ b/arrow-buffer/src/builder/boolean.rs
@@ -16,7 +16,7 @@
// under the License.
use crate::bit_util::apply_bitwise_binary_op;
-use crate::{BooleanBuffer, Buffer, MutableBuffer, bit_util};
+use crate::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, bit_util};
use std::ops::Range;
/// Builder for [`BooleanBuffer`]
@@ -289,6 +289,14 @@ impl From<BooleanBufferBuilder> for BooleanBuffer {
}
}
+impl From<BooleanBufferBuilder> for NullBuffer {
+ #[inline]
+ fn from(builder: BooleanBufferBuilder) -> Self {
+ let boolean_buffer = BooleanBuffer::from(builder);
+ NullBuffer::new(boolean_buffer)
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/parquet/src/arrow/array_reader/struct_array.rs
b/parquet/src/arrow/array_reader/struct_array.rs
index b4a6a37533..3c5a5f836b 100644
--- a/parquet/src/arrow/array_reader/struct_array.rs
+++ b/parquet/src/arrow/array_reader/struct_array.rs
@@ -18,8 +18,8 @@
use crate::arrow::array_reader::ArrayReader;
use crate::errors::{ParquetError, Result};
use arrow_array::{Array, ArrayRef, StructArray, builder::BooleanBufferBuilder};
-use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::DataType as ArrowType;
+use arrow_buffer::NullBuffer;
+use arrow_schema::{DataType as ArrowType, DataType};
use std::any::Any;
use std::sync::Arc;
@@ -124,16 +124,15 @@ impl ArrayReader for StructArrayReader {
return Err(general_err!("Not all children array length are the
same!"));
}
- // Now we can build array data
- let mut array_data_builder =
ArrayDataBuilder::new(self.data_type.clone())
- .len(children_array_len)
- .child_data(
- children_array
- .into_iter()
- .map(|x| x.into_data())
- .collect::<Vec<ArrayData>>(),
- );
+ let DataType::Struct(fields) = &self.data_type else {
+ return Err(general_err!(
+ "Internal: StructArrayReader must have struct data type, got
{:?}",
+ self.data_type
+ ));
+ };
+ let fields = fields.clone(); // cloning Fields is cheap (Arc
internally)
+ let mut nulls = None;
if self.nullable {
// calculate struct def level data
@@ -168,12 +167,19 @@ impl ArrayReader for StructArrayReader {
if bitmap_builder.len() != children_array_len {
return Err(general_err!("Failed to decode level data for
struct array"));
}
-
- array_data_builder =
array_data_builder.null_bit_buffer(Some(bitmap_builder.into()));
+ nulls = Some(NullBuffer::from(bitmap_builder));
}
- let array_data = unsafe { array_data_builder.build_unchecked() };
- Ok(Arc::new(StructArray::from(array_data)))
+ // Safety: checked above that all children array data have same
+ // length and correct type
+ unsafe {
+ Ok(Arc::new(StructArray::new_unchecked_with_length(
+ fields,
+ children_array,
+ nulls,
+ children_array_len,
+ )))
+ }
}
fn skip_records(&mut self, num_records: usize) -> Result<usize> {