alamb commented on code in PR #7463: URL: https://github.com/apache/arrow-rs/pull/7463#discussion_r2073584836
########## arrow-select/src/filter.rs: ########## @@ -656,29 +653,44 @@ where (start, end, len) } - /// Extends the in-progress array by the indexes in the provided iterator - fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) { + fn extend_offsets_idx(&mut self, iter: impl Iterator<Item = usize>) { self.dst_offsets.extend(iter.map(|idx| { let start = self.src_offsets[idx].as_usize(); let end = self.src_offsets[idx + 1].as_usize(); let len = OffsetSize::from_usize(end - start).expect("illegal offset range"); self.cur_offset += len; - self.dst_values - .extend_from_slice(&self.src_values[start..end]); + self.cur_offset })); + self.dst_values.reserve_exact(self.cur_offset.as_usize()); } - /// Extends the in-progress array by the ranges in the provided iterator - fn extend_slices(&mut self, iter: impl Iterator<Item = (usize, usize)>) { + /// Extends the in-progress array by the indexes in the provided iterator + fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) { + for idx in iter { + let start = self.src_offsets[idx].as_usize(); + let end = self.src_offsets[idx + 1].as_usize(); + self.dst_values + .extend_from_slice(&self.src_values[start..end]); + } + } + + fn extend_offsets_slices(&mut self, iter: impl Iterator<Item = (usize, usize)>, count: usize) { + self.dst_offsets.reserve_exact(count); for (start, end) in iter { // These can only fail if `array` contains invalid data for idx in start..end { let (_, _, len) = self.get_value_range(idx); self.cur_offset += len; - self.dst_offsets.push(self.cur_offset); // push_unchecked? + self.dst_offsets.push(self.cur_offset); } + } + self.dst_values.reserve_exact(self.cur_offset.as_usize()); Review Comment: I think it would make sense for this call to reserve_exact to be a part of `extend_idx` (which is what actually extends the values). Otherwise I think it is quite tricky to understand that the performance of `extend_idx` is relying on calling `extend_offsts_slices` first Since they are now always called in a pair `extend_offsets_slices` and `extend_slices` I don't think this would change anything functionally but it would make the code easier for me to understand (even though you would also have to pass `count` to the extend_idx was well Another alternative might be to make a function `fn reserve_exact` that to make the allocations explicit -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org