alamb commented on a change in pull request #716:
URL: https://github.com/apache/arrow-rs/pull/716#discussion_r697690544
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
offset_read: usize,
len: usize,
) -> usize {
- let mut count = 0;
- (0..len).for_each(|i| {
- if bit_util::get_bit(data, offset_read + i) {
- bit_util::set_bit(write_data, offset_write + i);
- } else {
- count += 1;
- }
+ let mut null_count = 0;
+
+ let mut bits_to_align = offset_write % 8;
+ if bits_to_align > 0 {
+ bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+ }
+ let mut byte_index = ceil(offset_write + bits_to_align, 8);
Review comment:
Maybe callig this `write_byte_index` would help make it clearer what it
represented
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
offset_read: usize,
len: usize,
) -> usize {
- let mut count = 0;
- (0..len).for_each(|i| {
- if bit_util::get_bit(data, offset_read + i) {
- bit_util::set_bit(write_data, offset_write + i);
- } else {
- count += 1;
- }
+ let mut null_count = 0;
+
+ let mut bits_to_align = offset_write % 8;
+ if bits_to_align > 0 {
+ bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+ }
+ let mut byte_index = ceil(offset_write + bits_to_align, 8);
+
+ // Set full bytes provided by bit chunk iterator
+ let chunks = BitChunks::new(data, offset_read + bits_to_align, len -
bits_to_align);
+ chunks.iter().for_each(|chunk| {
+ null_count += chunk.count_zeros();
+ chunk.to_le_bytes().iter().for_each(|b| {
Review comment:
👍 bit chunk iterator seems to use little endian as well:
https://github.com/mathiaspeters-sig/arrow-rs/blob/improve-set-bits/arrow/src/util/bit_chunk_iterator.rs#L138
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
debug_assert!(prefix.is_empty() && suffix.is_empty());
*offsets.get_unchecked(offsets.len() - 1)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_set_bits_aligned() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111,
0b11100111,
+ 0b11100111, 0b11100111,
Review comment:
I think one danger of using the same values for all of the source data
is that it may mask offset calculation errors
What would you think of using a different pattern for each byte? Perhaps
something like
```suggestion
0b10101010, 0b11001100, 0b11100011, 0b1111000, 0b11111000,
0b11111100,
0b11111110, 0b1111111,
```
Or some other pattern where having "off by 8 errors" is not as likely to be
masked?
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
debug_assert!(prefix.is_empty() && suffix.is_empty());
*offsets.get_unchecked(offsets.len() - 1)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_set_bits_aligned() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111,
0b11100111,
+ 0b11100111, 0b11100111,
+ ];
+
+ let destination_offset = 8;
+ let source_offset = 0;
+
+ let len = 64;
+
+ let expected_data: &[u8] = &[
+ 0, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111,
0b11100111,
+ 0b11100111, 0b11100111, 0,
+ ];
+ let expected_null_count = 16;
+ let result = set_bits(
+ destination.as_mut_slice(),
+ source,
+ destination_offset,
+ source_offset,
+ len,
+ );
+
+ assert_eq!(destination, expected_data);
+ assert_eq!(result, expected_null_count);
+ }
+
+ #[test]
+ fn test_set_bits_unaligned_destination_start() {
+ let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+ let source: &[u8] = &[
+ 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111,
0b11100111,
+ 0b11100111, 0b11100111,
+ ];
+
+ let destination_offset = 3;
+ let source_offset = 0;
+
+ let len = 64;
+
+ let expected_data: &[u8] = &[
+ 0b00111000, 0b00111111, 0b00111111, 0b00111111, 0b00111111,
0b00111111,
+ 0b00111111, 0b00111111, 0b00000111, 0b00000000,
Review comment:
i think the 3rd byte looks suspicious -- should it be?
```suggestion
0b00111111, 0b00111111, 0b00111100, 0b00000000,
```
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
offset_read: usize,
len: usize,
) -> usize {
Review comment:
It would be cool to update the doc strings here too to explain what the
return value of `set_bits` is supposed to be (seems like it is supposed to be
the total number of `0` bits in `data[offset_read..offset_read+len]`
##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
offset_read: usize,
len: usize,
) -> usize {
- let mut count = 0;
- (0..len).for_each(|i| {
- if bit_util::get_bit(data, offset_read + i) {
- bit_util::set_bit(write_data, offset_write + i);
- } else {
- count += 1;
- }
+ let mut null_count = 0;
+
+ let mut bits_to_align = offset_write % 8;
+ if bits_to_align > 0 {
+ bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+ }
+ let mut byte_index = ceil(offset_write + bits_to_align, 8);
+
+ // Set full bytes provided by bit chunk iterator
Review comment:
```suggestion
// Set full bytes provided by bit chunk iterator (which iterates in 64
bits at a time)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]