This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/active_release by this push:
new 5d13a4d Optimize array::transform::utils::set_bits (#716) (#761)
5d13a4d is described below
commit 5d13a4d9eddf43d2fbec4d1cb50228f63e42d61f
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Sep 10 05:38:54 2021 -0400
Optimize array::transform::utils::set_bits (#716) (#761)
* Added tests
* Updated tests and improved implementation
* Cleanup
* Stopped collecting bytes before writing to write_data
* Added tests
* Cleanup and comments
* Fixed clippy warning
* Fixed an endianess issue
* Fixed comments and naming
* Made tests less prone to off-by-n errors
Co-authored-by: mathiaspeters-sig
<[email protected]>
---
arrow/src/array/transform/utils.rs | 175 +++++++++++++++++++++++++++++++++++--
1 file changed, 166 insertions(+), 9 deletions(-)
diff --git a/arrow/src/array/transform/utils.rs
b/arrow/src/array/transform/utils.rs
index 8c718c7..ed7f455 100644
--- a/arrow/src/array/transform/utils.rs
+++ b/arrow/src/array/transform/utils.rs
@@ -15,7 +15,14 @@
// specific language governing permissions and limitations
// under the License.
-use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util};
+use crate::{
+ array::OffsetSizeTrait,
+ buffer::MutableBuffer,
+ util::{
+ bit_chunk_iterator::BitChunks,
+ bit_util::{self, ceil},
+ },
+};
/// extends the `buffer` to be able to hold `len` bits, setting all bits of
the new size to zero.
#[inline]
@@ -28,6 +35,7 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer,
len: usize) {
/// sets all bits on `write_data` on the range
`[offset_write..offset_write+len]` to be equal to the
/// bits on `data` on the range `[offset_read..offset_read+len]`
+/// returns the number of `0` bits `data[offset_read..offset_read+len]`
pub(super) fn set_bits(
write_data: &mut [u8],
data: &[u8],
@@ -35,15 +43,37 @@ pub(super) fn set_bits(
offset_read: usize,
len: usize,
) -> usize {
- let mut count = 0;
- (0..len).for_each(|i| {
- if bit_util::get_bit(data, offset_read + i) {
- bit_util::set_bit(write_data, offset_write + i);
- } else {
- count += 1;
- }
+ let mut null_count = 0;
+
+ let mut bits_to_align = offset_write % 8;
+ if bits_to_align > 0 {
+ bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+ }
+ let mut write_byte_index = ceil(offset_write + bits_to_align, 8);
+
+ // Set full bytes provided by bit chunk iterator (which iterates in 64
bits at a time)
+ let chunks = BitChunks::new(data, offset_read + bits_to_align, len -
bits_to_align);
+ chunks.iter().for_each(|chunk| {
+ null_count += chunk.count_zeros();
+ chunk.to_le_bytes().iter().for_each(|b| {
+ write_data[write_byte_index] = *b;
+ write_byte_index += 1;
+ })
});
- count
+
+ // Set individual bits both to align write_data to a byte offset and the
remainder bits not covered by the bit chunk iterator
+ let remainder_offset = len - chunks.remainder_len();
+ (0..bits_to_align)
+ .chain(remainder_offset..len)
+ .for_each(|i| {
+ if bit_util::get_bit(data, offset_read + i) {
+ bit_util::set_bit(write_data, offset_write + i);
+ } else {
+ null_count += 1;
+ }
+ });
+
+ null_count as usize
}
pub(super) fn extend_offsets<T: OffsetSizeTrait>(
@@ -74,3 +104,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
debug_assert!(prefix.is_empty() && suffix.is_empty());
*offsets.get_unchecked(offsets.len() - 1)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_set_bits_aligned() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101,
+ ];
+
+ let destination_offset = 8;
+ let source_offset = 0;
+
+ let len = 64;
+
+ let expected_data: &[u8] = &[
+ 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101, 0,
+ ];
+ let expected_null_count = 24;
+ let result = set_bits(
+ destination.as_mut_slice(),
+ source,
+ destination_offset,
+ source_offset,
+ len,
+ );
+
+ assert_eq!(destination, expected_data);
+ assert_eq!(result, expected_null_count);
+ }
+
+ #[test]
+ fn test_set_bits_unaligned_destination_start() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101,
+ ];
+
+ let destination_offset = 3;
+ let source_offset = 0;
+
+ let len = 64;
+
+ let expected_data: &[u8] = &[
+ 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110,
0b00011111,
+ 0b00111110, 0b00101111, 0b00000101, 0b00000000,
+ ];
+ let expected_null_count = 24;
+ let result = set_bits(
+ destination.as_mut_slice(),
+ source,
+ destination_offset,
+ source_offset,
+ len,
+ );
+
+ assert_eq!(destination, expected_data);
+ assert_eq!(result, expected_null_count);
+ }
+
+ #[test]
+ fn test_set_bits_unaligned_destination_end() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101,
+ ];
+
+ let destination_offset = 8;
+ let source_offset = 0;
+
+ let len = 62;
+
+ let expected_data: &[u8] = &[
+ 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b00100101, 0,
+ ];
+ let expected_null_count = 23;
+ let result = set_bits(
+ destination.as_mut_slice(),
+ source,
+ destination_offset,
+ source_offset,
+ len,
+ );
+
+ assert_eq!(destination, expected_data);
+ assert_eq!(result, expected_null_count);
+ }
+
+ #[test]
+ fn test_set_bits_unaligned() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011,
0b11000011,
+ ];
+
+ let destination_offset = 3;
+ let source_offset = 5;
+
+ let len = 95;
+
+ let expected_data: &[u8] = &[
+ 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010,
0b11110000,
+ 0b01111001, 0b01101001, 0b11100110, 0b11110110, 0b11111010,
0b11110000,
+ 0b00000001,
+ ];
+ let expected_null_count = 35;
+ let result = set_bits(
+ destination.as_mut_slice(),
+ source,
+ destination_offset,
+ source_offset,
+ len,
+ );
+
+ assert_eq!(destination, expected_data);
+ assert_eq!(result, expected_null_count);
+ }
+}