This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 7ecef6e5f0 [Parquet] perf: Create `PrimitiveArray`s directly rather
than via `ArrayData` (#9122)
7ecef6e5f0 is described below
commit 7ecef6e5f01910e2c0f6bbd8c4591115fabca8e8
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Jan 13 07:28:43 2026 -0500
[Parquet] perf: Create `PrimitiveArray`s directly rather than via
`ArrayData` (#9122)
# Which issue does this PR close?
- related to https://github.com/apache/arrow-rs/issues/9061
- Part of https://github.com/apache/arrow-rs/issues/9128
# Rationale for this change
- similarly to https://github.com/apache/arrow-rs/pull/9120
Creating Arrays via ArrayData / `make_array` has overhead (at least 2
Vec allocations) compared to simply creating the arrays directly
# What changes are included in this PR?
Update the parquet reader to create `PrimitiveArray`s directly
# Are these changes tested?
By CI
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
parquet/src/arrow/array_reader/primitive_array.rs | 68 ++++++++++++++---------
1 file changed, 43 insertions(+), 25 deletions(-)
diff --git a/parquet/src/arrow/array_reader/primitive_array.rs
b/parquet/src/arrow/array_reader/primitive_array.rs
index 362f103661..dae42c4c71 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -24,16 +24,16 @@ use crate::data_type::{DataType, Int96};
use crate::errors::Result;
use crate::schema::types::ColumnDescPtr;
use arrow_array::{
- Array, ArrayRef, Date64Array, Decimal64Array, Decimal128Array,
Decimal256Array, Int8Array,
- Int16Array, Int32Array, Int64Array, PrimitiveArray, UInt8Array,
UInt16Array,
- builder::PrimitiveDictionaryBuilder, cast::AsArray, downcast_integer,
make_array, types::*,
+ Array, ArrayRef, BooleanArray, Date64Array, Decimal64Array,
Decimal128Array, Decimal256Array,
+ Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
PrimitiveArray,
+ UInt8Array, UInt16Array, builder::PrimitiveDictionaryBuilder,
cast::AsArray, downcast_integer,
+ types::*,
};
use arrow_array::{
TimestampMicrosecondArray, TimestampMillisecondArray,
TimestampNanosecondArray,
TimestampSecondArray, UInt32Array, UInt64Array,
};
-use arrow_buffer::{BooleanBuffer, Buffer, i256};
-use arrow_data::ArrayDataBuilder;
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, i256};
use arrow_schema::{DataType as ArrowType, TimeUnit};
use std::any::Any;
use std::sync::Arc;
@@ -151,31 +151,49 @@ where
fn consume_batch(&mut self) -> Result<ArrayRef> {
let target_type = &self.data_type;
- let arrow_data_type = match T::get_physical_type() {
- PhysicalType::BOOLEAN => ArrowType::Boolean,
- PhysicalType::INT32 => ArrowType::Int32,
- PhysicalType::INT64 => ArrowType::Int64,
- PhysicalType::FLOAT => ArrowType::Float32,
- PhysicalType::DOUBLE => ArrowType::Float64,
- PhysicalType::INT96 => ArrowType::Int64,
- PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => {
- unreachable!("PrimitiveArrayReaders don't support complex
physical types");
- }
- };
- // Convert to equivalent arrow type to parquet physical type
+ // Convert physical data to equivalent arrow type, and then perform
+ // coercion as needed
let record_data = self
.record_reader
.consume_record_data()
.into_buffer(target_type);
- let array_data = ArrayDataBuilder::new(arrow_data_type)
- .len(self.record_reader.num_values())
- .add_buffer(record_data)
- .null_bit_buffer(self.record_reader.consume_bitmap_buffer());
-
- let array_data = unsafe { array_data.build_unchecked() };
- let array: ArrayRef = make_array(array_data);
+ let len = self.record_reader.num_values();
+ let nulls = self
+ .record_reader
+ .consume_bitmap_buffer()
+ .map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len)));
+
+ let array: ArrayRef = match T::get_physical_type() {
+ PhysicalType::BOOLEAN => Arc::new(BooleanArray::new(
+ BooleanBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::INT32 => Arc::new(Int32Array::new(
+ ScalarBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::INT64 => Arc::new(Int64Array::new(
+ ScalarBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::FLOAT => Arc::new(Float32Array::new(
+ ScalarBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::DOUBLE => Arc::new(Float64Array::new(
+ ScalarBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::INT96 => Arc::new(Int64Array::new(
+ ScalarBuffer::new(record_data, 0, len),
+ nulls,
+ )),
+ PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => {
+ unreachable!("PrimitiveArrayReaders don't support complex
physical types");
+ }
+ };
// Coerce the arrow type to the desired array type
let array = coerce_array(array, target_type)?;
@@ -218,7 +236,7 @@ fn coerce_array(array: ArrayRef, target_type: &ArrowType)
-> Result<ArrayRef> {
ArrowType::Int32 => coerce_i32(array.as_primitive(), target_type),
ArrowType::Int64 => coerce_i64(array.as_primitive(), target_type),
ArrowType::Boolean | ArrowType::Float32 | ArrowType::Float64 =>
Ok(array),
- _ => unreachable!(),
+ _ => unreachable!("Cannot coerce array of type {}", array.data_type()),
}
}