This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d610468d22 Avoid a clone when creating StringArray/BinaryArray from
ArrayData (#9160)
d610468d22 is described below
commit d610468d22406772e1aa01600aeb23b7a0444120
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 14 07:31:06 2026 -0500
Avoid a clone when creating StringArray/BinaryArray from ArrayData (#9160)
# Which issue does this PR close?
- Part of https://github.com/apache/arrow-rs/issues/9061
- broken out of https://github.com/apache/arrow-rs/pull/9058
# Rationale for this change
Let's make arrow-rs as fast as we can; the fewer allocations, the
better.
# What changes are included in this PR?
Apply pattern from https://github.com/apache/arrow-rs/pull/9114
# Are these changes tested?
Existing tests
# Are there any user-facing changes?
No
---
arrow-array/src/array/byte_array.rs | 20 ++++++++++++--------
arrow-array/src/array/mod.rs | 23 ++++++++++++++++++++++-
2 files changed, 34 insertions(+), 9 deletions(-)
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index bd85bffcfe..8e8ad91cea 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::array::{get_offsets, print_long_array};
+use crate::array::{get_offsets_from_buffer, print_long_array};
use crate::builder::GenericByteBuilder;
use crate::iterator::ArrayIter;
use crate::types::ByteArrayType;
@@ -542,30 +542,34 @@ impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
fn from(data: ArrayData) -> Self {
+ let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
assert_eq!(
- data.data_type(),
- &Self::DATA_TYPE,
+ data_type,
+ Self::DATA_TYPE,
"{}{}Array expects DataType::{}",
T::Offset::PREFIX,
T::PREFIX,
Self::DATA_TYPE
);
assert_eq!(
- data.buffers().len(),
+ buffers.len(),
2,
"{}{}Array data should contain 2 buffers only (offsets and values)",
T::Offset::PREFIX,
T::PREFIX,
);
+ // buffers are offset then value, so pop in reverse
+ let value_data = buffers.pop().expect("checked above");
+ let offset_buffer = buffers.pop().expect("checked above");
+
// SAFETY:
// ArrayData is valid, and verified type above
- let value_offsets = unsafe { get_offsets(&data) };
- let value_data = data.buffers()[1].clone();
+ let value_offsets = unsafe { get_offsets_from_buffer(offset_buffer, offset, len) };
Self {
value_offsets,
value_data,
- data_type: T::DATA_TYPE,
- nulls: data.nulls().cloned(),
+ data_type,
+ nulls,
}
}
}
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index aae382ace7..6fcb80c533 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -20,7 +20,7 @@
mod binary_array;
use crate::types::*;
-use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
@@ -939,6 +939,27 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
}
}
+/// Helper function that creates an [`OffsetBuffer`] from a buffer and array offset/ length
+///
+/// # Safety
+///
+/// - buffer must contain valid arrow offsets ( [`OffsetBuffer`] ) for the
+/// given length and offset.
+unsafe fn get_offsets_from_buffer<O: ArrowNativeType>(
+ buffer: Buffer,
+ offset: usize,
+ len: usize,
+) -> OffsetBuffer<O> {
+ if len == 0 && buffer.is_empty() {
+ return OffsetBuffer::new_empty();
+ }
+
+ let scalar_buffer = ScalarBuffer::new(buffer, offset, len + 1);
+ // Safety:
+ // Arguments were valid
+ unsafe { OffsetBuffer::new_unchecked(scalar_buffer) }
+}
+
/// Helper function for printing potentially long arrays.
fn print_long_array<A, F>(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result
where