This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 39c8814f04 feat: extract `has_false` and `has_true` from BooleanArray
to `BooleanBuffer` and reuse for no nulls (#9987)
39c8814f04 is described below
commit 39c8814f04150f4b8f1c3b30d7914022e7c2a292
Author: Raz Luvaton <[email protected]>
AuthorDate: Wed May 20 21:44:42 2026 +0300
feat: extract `has_false` and `has_true` from BooleanArray to
`BooleanBuffer` and reuse for no nulls (#9987)
# Which issue does this PR close?
N/A
# Rationale for this change
So we can use the useful helpers without creating temp `BooleanArray`
# What changes are included in this PR?
Extract non null variants for computing `has_false` and `has_true` from
`BooleanArray` to `BooleanBuffer` and call them instead, and copied the
tests for not nullable
# Are these changes tested?
Yes
# Are there any user-facing changes?
2 new functions
-----
Cc @alamb as talked in:
- https://github.com/apache/datafusion/pull/22158#discussion_r3241321436
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/boolean_array.rs | 52 +----------
arrow-buffer/src/buffer/boolean.rs | 165 ++++++++++++++++++++++++++++++++-
2 files changed, 166 insertions(+), 51 deletions(-)
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index 22a1ba7653..0fa5b5d912 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -19,7 +19,6 @@ use crate::array::print_long_array;
use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
-use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk;
use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, bit_util};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
@@ -157,16 +156,6 @@ impl BooleanArray {
&self.values
}
- /// Block size for chunked fold operations in [`Self::has_true`] and
[`Self::has_false`].
- /// Using `chunks_exact` with this size lets the compiler fully unroll the
inner
- /// fold (no inner branch/loop), enabling short-circuit exits every N
chunks.
- const CHUNK_FOLD_BLOCK_SIZE: usize = 16;
-
- /// Returns an [`UnalignedBitChunk`] over this array's values.
- fn unaligned_bit_chunks(&self) -> UnalignedBitChunk<'_> {
- UnalignedBitChunk::new(self.values().values(), self.values().offset(),
self.len())
- }
-
/// Returns the number of non null, true values within this array.
/// If you only need to check if there is at least one true value,
consider using `has_true()` which can short-circuit and be more efficient.
pub fn true_count(&self) -> usize {
@@ -202,16 +191,7 @@ impl BooleanArray {
let value_chunks = self.values().bit_chunks().iter_padded();
null_chunks.zip(value_chunks).any(|(n, v)| (n & v) != 0)
}
- None => {
- let bit_chunks = self.unaligned_bit_chunks();
- let chunks = bit_chunks.chunks();
- let mut exact =
chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
- let found = bit_chunks.prefix().unwrap_or(0) != 0
- || exact.any(|block| block.iter().fold(0u64, |acc, &c| acc
| c) != 0);
- found
- || exact.remainder().iter().any(|&c| c != 0)
- || bit_chunks.suffix().unwrap_or(0) != 0
- }
+ None => self.values().has_true(),
}
}
@@ -228,35 +208,7 @@ impl BooleanArray {
let value_chunks = self.values().bit_chunks().iter_padded();
null_chunks.zip(value_chunks).any(|(n, v)| (n & !v) != 0)
}
- None => {
- let bit_chunks = self.unaligned_bit_chunks();
- // UnalignedBitChunk zeros padding bits; fill them with 1s so
- // they don't appear as false values.
- let lead_mask = !((1u64 << bit_chunks.lead_padding()) - 1);
- let trail_mask = if bit_chunks.trailing_padding() == 0 {
- u64::MAX
- } else {
- (1u64 << (64 - bit_chunks.trailing_padding())) - 1
- };
- let (prefix_fill, suffix_fill) = match (bit_chunks.prefix(),
bit_chunks.suffix()) {
- (Some(_), Some(_)) => (!lead_mask, !trail_mask),
- (Some(_), None) => (!lead_mask | !trail_mask, 0),
- (None, Some(_)) => (0, !trail_mask),
- (None, None) => (0, 0),
- };
- let chunks = bit_chunks.chunks();
- let mut exact =
chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
- let found = bit_chunks
- .prefix()
- .is_some_and(|v| (v | prefix_fill) != u64::MAX)
- || exact
- .any(|block| block.iter().fold(u64::MAX, |acc, &c| acc
& c) != u64::MAX);
- found
- || exact.remainder().iter().any(|&c| c != u64::MAX)
- || bit_chunks
- .suffix()
- .is_some_and(|v| (v | suffix_fill) != u64::MAX)
- }
+ None => self.values().has_false(),
}
}
diff --git a/arrow-buffer/src/buffer/boolean.rs
b/arrow-buffer/src/buffer/boolean.rs
index 420bbf59f3..52df67080a 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::bit_chunk_iterator::BitChunks;
+use crate::bit_chunk_iterator::{BitChunks, UnalignedBitChunk};
use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator,
BitSliceIterator};
use crate::bit_util::read_u64;
use crate::{
@@ -611,6 +611,11 @@ impl BooleanBuffer {
self.into_iter()
}
+ /// Returns an [`UnalignedBitChunk`] over this buffer's values.
+ fn unaligned_bit_chunks(&self) -> UnalignedBitChunk<'_> {
+ UnalignedBitChunk::new(self.values(), self.offset(), self.len())
+ }
+
/// Returns an iterator over the set bit positions in this
[`BooleanBuffer`]
pub fn set_indices(&self) -> BitIndexIterator<'_> {
BitIndexIterator::new(self.values(), self.bit_offset, self.bit_len)
@@ -625,6 +630,61 @@ impl BooleanBuffer {
pub fn set_slices(&self) -> BitSliceIterator<'_> {
BitSliceIterator::new(self.values(), self.bit_offset, self.bit_len)
}
+
+ /// Block size for chunked fold operations in [`Self::has_true`] and
[`Self::has_false`].
+ /// Using `chunks_exact` with this size lets the compiler fully unroll the
inner
+ /// fold (no inner branch/loop), enabling short-circuit exits every N
chunks.
+ const CHUNK_FOLD_BLOCK_SIZE: usize = 16;
+
+ /// Returns whether there is at least one `true` value in this buffer.
+ ///
+ /// This is more efficient than `count_set_bits() > 0` because it can
short-circuit
+ /// as soon as a `true` value is found, without counting all set bits.
+ ///
+ /// Returns `false` for empty buffer.
+ pub fn has_true(&self) -> bool {
+ let bit_chunks = self.unaligned_bit_chunks();
+ let chunks = bit_chunks.chunks();
+ let mut exact = chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
+ let found = bit_chunks.prefix().unwrap_or(0) != 0
+ || exact.any(|block| block.iter().fold(0u64, |acc, &c| acc | c) !=
0);
+ found || exact.remainder().iter().any(|&c| c != 0) ||
bit_chunks.suffix().unwrap_or(0) != 0
+ }
+
+ /// Returns whether there is at least one `false` value in this buffer.
+ ///
+ /// This is more efficient than `len() > count_set_bits()` because it can
short-circuit
+ /// as soon as a `false` value is found, without counting all set bits.
+ ///
+ /// Returns `false` for empty buffer.
+ pub fn has_false(&self) -> bool {
+ let bit_chunks = self.unaligned_bit_chunks();
+ // UnalignedBitChunk zeros padding bits; fill them with 1s so
+ // they don't appear as false values.
+ let lead_mask = !((1u64 << bit_chunks.lead_padding()) - 1);
+ let trail_mask = if bit_chunks.trailing_padding() == 0 {
+ u64::MAX
+ } else {
+ (1u64 << (64 - bit_chunks.trailing_padding())) - 1
+ };
+ let (prefix_fill, suffix_fill) = match (bit_chunks.prefix(),
bit_chunks.suffix()) {
+ (Some(_), Some(_)) => (!lead_mask, !trail_mask),
+ (Some(_), None) => (!lead_mask | !trail_mask, 0),
+ (None, Some(_)) => (0, !trail_mask),
+ (None, None) => (0, 0),
+ };
+ let chunks = bit_chunks.chunks();
+ let mut exact = chunks.chunks_exact(Self::CHUNK_FOLD_BLOCK_SIZE);
+ let found = bit_chunks
+ .prefix()
+ .is_some_and(|v| (v | prefix_fill) != u64::MAX)
+ || exact.any(|block| block.iter().fold(u64::MAX, |acc, &c| acc &
c) != u64::MAX);
+ found
+ || exact.remainder().iter().any(|&c| c != u64::MAX)
+ || bit_chunks
+ .suffix()
+ .is_some_and(|v| (v | suffix_fill) != u64::MAX)
+ }
}
impl Not for &BooleanBuffer {
@@ -1245,4 +1305,107 @@ mod tests {
let buffer = BooleanBuffer::new_unset(100);
assert_eq!(buffer.clone().find_nth_set_bit_position(0, 1), 100);
}
+
+ #[test]
+ fn test_has_true_has_false_all_true() {
+ let arr = BooleanBuffer::from(vec![true, true, true]);
+ assert!(arr.has_true());
+ assert!(!arr.has_false());
+ }
+
+ #[test]
+ fn test_has_true_has_false_all_false() {
+ let arr = BooleanBuffer::from(vec![false, false, false]);
+ assert!(!arr.has_true());
+ assert!(arr.has_false());
+ }
+
+ #[test]
+ fn test_has_true_has_false_mixed() {
+ let arr = BooleanBuffer::from(vec![true, false, true]);
+ assert!(arr.has_true());
+ assert!(arr.has_false());
+ }
+
+ #[test]
+ fn test_has_true_has_false_empty() {
+ let arr = BooleanBuffer::from(Vec::<bool>::new());
+ assert!(!arr.has_true());
+ assert!(!arr.has_false());
+ }
+
+ #[test]
+ fn test_has_false_aligned_suffix_all_true() {
+ let arr = BooleanBuffer::from(vec![true; 129]);
+ assert!(arr.has_true());
+ assert!(!arr.has_false());
+ }
+
+ #[test]
+ fn test_has_false_non_aligned_all_true() {
+ // 65 elements: exercises the remainder path in has_false
+ let arr = BooleanBuffer::from(vec![true; 65]);
+ assert!(arr.has_true());
+ assert!(!arr.has_false());
+ }
+
+ #[test]
+ fn test_has_false_non_aligned_last_false() {
+ // 64 trues + 1 false: remainder path should find the false
+ let mut values = vec![true; 64];
+ values.push(false);
+ let arr = BooleanBuffer::from(values);
+ assert!(arr.has_true());
+ assert!(arr.has_false());
+ }
+
+ #[test]
+ fn test_has_false_exact_64_all_true() {
+ // Exactly 64 elements, no remainder
+ let arr = BooleanBuffer::from(vec![true; 64]);
+ assert!(arr.has_true());
+ assert!(!arr.has_false());
+ }
+
+ #[test]
+ fn test_has_true_has_false_unaligned_slices() {
+ let cases = [
+ (1, 129, true, false),
+ (3, 130, true, false),
+ (5, 65, true, false),
+ (7, 64, true, false),
+ ];
+
+ let base = BooleanBuffer::from(vec![true; 300]);
+
+ for (offset, len, expected_has_true, expected_has_false) in cases {
+ let arr = base.slice(offset, len);
+ assert_eq!(
+ arr.has_true(),
+ expected_has_true,
+ "offset={offset} len={len}"
+ );
+ assert_eq!(
+ arr.has_false(),
+ expected_has_false,
+ "offset={offset} len={len}"
+ );
+ }
+ }
+
+ #[test]
+ fn test_has_true_has_false_exact_multiples_of_64() {
+ let cases = [
+ (64, true, false),
+ (128, true, false),
+ (192, true, false),
+ (256, true, false),
+ ];
+
+ for (len, expected_has_true, expected_has_false) in cases {
+ let arr = BooleanBuffer::from(vec![true; len]);
+ assert_eq!(arr.has_true(), expected_has_true, "len={len}");
+ assert_eq!(arr.has_false(), expected_has_false, "len={len}");
+ }
+ }
}