Dandandan commented on code in PR #7513: URL: https://github.com/apache/arrow-rs/pull/7513#discussion_r2122235843
########## arrow-array/src/builder/generic_bytes_view_builder.rs: ########## @@ -406,6 +459,122 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { }; buffer_size + in_progress + tracker + views + null } + + /// Append all views from the given array into the inprogress builder + /// + /// Will copy the underlying views based on the value of target_buffer_load_factor + pub fn append_array(&mut self, array: &GenericByteViewArray<T>) { + let num_rows = array.len(); + if num_rows == 0 { + return; // nothing to do + } + + let null_buffer_builder = &mut self.null_buffer_builder; + let views = &mut self.views_builder; + + // Copy nulls + if let Some(nulls) = array.nulls() { + null_buffer_builder.append_buffer(nulls); + } else { + null_buffer_builder.append_n_non_nulls(array.len()); + } + + // Copy views from the source array + let starting_view = views.len(); + views.append_slice(array.views()); + + // Safety we only appended views from array + unsafe { + self.finalize_copied_views(starting_view, array); + } + } + + /// Finalizes the views and buffers of the array + /// + /// This must be called after appending views from `array` to the builder. + /// + /// The views from `array` will point to the old buffers. This function + /// updates all views starting at `starting_view` to point to the new + /// buffers or copies the values into a new buffer if the array is sparse. + /// + /// # Safety + /// + /// * self.views[starting_view..] must be valid views from `array`. 
+ pub unsafe fn finalize_copied_views( + &mut self, + starting_view: usize, + array: &GenericByteViewArray<T>, + ) { + // Flush the in-progress buffer + self.flush_in_progress(); + + let buffers = &mut self.completed; + let views = &mut self.views_builder; + + let mut used_buffer_size = 0; + let use_exising_buffers = match self.target_buffer_load_factor { + None => true, + Some(load_factor) => { + used_buffer_size = array.minimum_buffer_size(); + let actual_buffer_size = array.get_buffer_memory_size(); + // If the total size of the buffers is less than the load factor, copy them existing buffers + used_buffer_size >= (actual_buffer_size as f32 * load_factor) as usize + } + }; + + if use_exising_buffers { + let num_buffers_before: u32 = buffers.len().try_into().expect("buffer count overflow"); + buffers.extend_from_slice(array.data_buffers()); // + + // If there were no existing buffers, the views do not need to be updated + // as the buffers of `array` are the same + if num_buffers_before == 0 { + return; + } + + // Update any views that point to the old buffers + for v in views.as_slice_mut()[starting_view..].iter_mut() { Review Comment: It would be faster to `extend` the views builder with the already-remapped views in a single pass, rather than first copying them with `append_slice` and then fixing them up afterwards with `iter_mut`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org