zhuqi-lucas commented on code in PR #7873:
URL: https://github.com/apache/arrow-rs/pull/7873#discussion_r2190385320
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -473,90 +473,79 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// Note: this function does not attempt to canonicalize / deduplicate
values. For this
/// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
pub fn gc(&self) -> Self {
- // Get the number of elements in this array
- let len = self.len();
- // Get the raw view values (u128 representations) for each element
- let views = self.views();
-
- // 1) Reuse the existing null bitmap from this array, to avoid
rebuilding it,
- // Cloning the underlying buffer is less expensive than iterating
and appending bits.
- let nulls = self.nulls().cloned();
-
- // 2) Pre-scan: compute the total number of bytes needed to store all
non-inlined values
- // so we can reserve that capacity in one go.
- let total_large_bytes: usize = (0..len)
+ // 1) Read basic properties once
+ let len = self.len(); // number of elements
+ let views = self.views(); // raw u128 “view” values per slot
+ let nulls = self.nulls().cloned(); // reuse & clone existing null
bitmap
+
+ // 2) Pre-scan to determine how many out‑of‑line bytes we must store
+ let total_large: usize = (0..len)
.filter_map(|i| {
Review Comment:
Interesting. I just tried it, and using `set_indices` does not improve
performance in the benchmark.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]