This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 96b30527e5 [Parquet] perf: Create Utf8/BinaryViewArray directly rather
than via `ArrayData` (#9121)
96b30527e5 is described below
commit 96b30527e53ebca73e81b9ffbbd02a99da8fc11f
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Jan 17 11:20:10 2026 -0500
[Parquet] perf: Create Utf8/BinaryViewArray directly rather than via
`ArrayData` (#9121)
# Which issue does this PR close?
- part of https://github.com/apache/arrow-rs/issues/9061
- part of https://github.com/apache/arrow-rs/issues/9128
# Rationale for this change
- similarly to https://github.com/apache/arrow-rs/pull/9120
Creating arrays via `ArrayData` / `make_array` has overhead (at least 2
`Vec` allocations) compared to simply creating the arrays directly.
View arrays also incur an extra `Vec` allocation (to hold their buffers).
# What changes are included in this PR?
Update the parquet reader to create ViewArrays directly
# Are these changes tested?
By CI
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
parquet/src/arrow/buffer/view_buffer.rs | 30 +++++++++++-------------------
1 file changed, 11 insertions(+), 19 deletions(-)
diff --git a/parquet/src/arrow/buffer/view_buffer.rs
b/parquet/src/arrow/buffer/view_buffer.rs
index 2802f97f8f..4ff34bf701 100644
--- a/parquet/src/arrow/buffer/view_buffer.rs
+++ b/parquet/src/arrow/buffer/view_buffer.rs
@@ -16,10 +16,10 @@
// under the License.
use crate::arrow::record_reader::buffer::ValuesBuffer;
-use arrow_array::{ArrayRef, builder::make_view, make_array};
-use arrow_buffer::Buffer;
-use arrow_data::ArrayDataBuilder;
+use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray,
builder::make_view};
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
use arrow_schema::DataType as ArrowType;
+use std::sync::Arc;
/// A buffer of view type byte arrays that can be converted into
/// `GenericByteViewArray`
@@ -70,26 +70,18 @@ impl ViewBuffer {
/// Converts this into an [`ArrayRef`] with the provided `data_type` and
`null_buffer`
pub fn into_array(self, null_buffer: Option<Buffer>, data_type:
&ArrowType) -> ArrayRef {
let len = self.views.len();
- let views = Buffer::from_vec(self.views);
+ let views = ScalarBuffer::from(self.views);
+ let nulls = null_buffer
+ .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len)))
+ .filter(|n| n.null_count() != 0);
match data_type {
ArrowType::Utf8View => {
- let builder = ArrayDataBuilder::new(ArrowType::Utf8View)
- .len(len)
- .add_buffer(views)
- .add_buffers(self.buffers)
- .null_bit_buffer(null_buffer);
- // We have checked that the data is utf8 when building the
buffer, so it is safe
- let array = unsafe { builder.build_unchecked() };
- make_array(array)
+ // Safety: views were created correctly, and checked that the
data is utf8 when building the buffer
+ unsafe { Arc::new(StringViewArray::new_unchecked(views,
self.buffers, nulls)) }
}
ArrowType::BinaryView => {
- let builder = ArrayDataBuilder::new(ArrowType::BinaryView)
- .len(len)
- .add_buffer(views)
- .add_buffers(self.buffers)
- .null_bit_buffer(null_buffer);
- let array = unsafe { builder.build_unchecked() };
- make_array(array)
+ // Safety: views were created correctly
+ unsafe { Arc::new(BinaryViewArray::new_unchecked(views,
self.buffers, nulls)) }
}
_ => panic!("Unsupported data type: {data_type}"),
}