alamb commented on code in PR #23035:
URL: https://github.com/apache/datafusion/pull/23035#discussion_r3482819481
##########
datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs:
##########
@@ -29,113 +29,106 @@ use std::hash::{Hash, Hasher};
use super::result::build_in_list_result;
use super::static_filter::{StaticFilter, handle_dictionary};
-/// Bitmap filter for O(1) set membership via single bit test.
+/// Storage for the bits used by [`BitmapFilter`].
///
-/// `UInt8` has only 256 possible values, so the filter stores membership in a
-/// 256-bit bitmap instead of using a hash table.
-pub(super) struct UInt8BitmapFilter {
- null_count: usize,
- bits: [u64; 4],
+/// `BitmapFilter` represents an `IN` list with one bit for each possible
+/// value, so membership checks become direct bit tests. This trait lets the
+/// same filter code use different storage sizes for different integer widths.
+pub(super) trait BitmapStorage: Send + Sync {
+ fn new_zeroed() -> Self;
+ fn set_bit(&mut self, index: usize);
+ fn get_bit(&self, index: usize) -> bool;
}
-impl UInt8BitmapFilter {
- pub(super) fn try_new(in_array: &ArrayRef) -> Result<Self> {
- let prim_array =
in_array.as_primitive_opt::<UInt8Type>().ok_or_else(|| {
- exec_datafusion_err!("UInt8BitmapFilter: expected UInt8 array")
- })?;
- let mut bits = [0u64; 4];
- let mut set_bit = |v: u8| {
- let index = usize::from(v);
- bits[index / 64] |= 1u64 << (index % 64);
- };
-
- let values = prim_array.values();
- match prim_array.nulls() {
- None => {
- for &v in values {
- set_bit(v);
- }
- }
- Some(nulls) => {
- for i in
- BitIndexIterator::new(nulls.validity(), nulls.offset(),
nulls.len())
- {
- set_bit(values[i]);
- }
- }
- }
- Ok(Self {
- null_count: prim_array.null_count(),
- bits,
- })
+// `UInt8` has 256 possible values, 0 through 255. One bit per value takes
+// 256 bits, which fits in four `u64` words.
+impl BitmapStorage for [u64; 4] {
+ #[inline]
+ fn new_zeroed() -> Self {
+ [0u64; 4]
+ }
+ #[inline]
+ fn set_bit(&mut self, index: usize) {
+ self[index / 64] |= 1u64 << (index % 64);
}
-
#[inline(always)]
- fn check(&self, needle: u8) -> bool {
- let index = needle as usize;
- (self.bits[index / 64] >> (index % 64)) & 1 != 0
+ fn get_bit(&self, index: usize) -> bool {
+ (self[index / 64] >> (index % 64)) & 1 != 0
}
}
-impl StaticFilter for UInt8BitmapFilter {
- fn null_count(&self) -> usize {
- self.null_count
+// `UInt16` has 65,536 possible values. One bit per value takes 65,536 bits,
+// which is 1,024 `u64` words, or 8 KiB. Box the array so the filter stores a
+// pointer instead of carrying an 8 KiB array inline.
+impl BitmapStorage for Box<[u64; 1024]> {
+ #[inline]
+ fn new_zeroed() -> Self {
+ Box::new([0u64; 1024])
}
-
- fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
- handle_dictionary!(self, v, negated);
- let v = v.as_primitive_opt::<UInt8Type>().ok_or_else(|| {
- exec_datafusion_err!("UInt8BitmapFilter: expected UInt8 array")
- })?;
- let input_values = v.values();
- Ok(build_in_list_result(
- v.len(),
- v.nulls(),
- self.null_count > 0,
- negated,
- #[inline(always)]
- |i| {
- // SAFETY: `build_in_list_result` invokes this closure for
- // indices in `0..v.len()`, which matches `input_values.len()`.
- let needle = unsafe { *input_values.get_unchecked(i) };
- self.check(needle)
- },
- ))
+ #[inline]
+ fn set_bit(&mut self, index: usize) {
+ self[index / 64] |= 1u64 << (index % 64);
+ }
+ #[inline(always)]
+ fn get_bit(&self, index: usize) -> bool {
+ (self[index / 64] >> (index % 64)) & 1 != 0
}
}
-/// Bitmap filter for O(1) `UInt16` set membership via single bit test.
+/// Arrow primitive types supported by [`BitmapFilter`].
///
-/// `UInt16` has 65,536 possible values, so the filter stores membership in an
-/// 8 KiB heap-allocated bitmap instead of using a hash table.
-pub(super) struct UInt16BitmapFilter {
+/// Arrow already defines the Rust value type as `T::Native`. This trait only
+/// supplies the bitmap storage size for the two integer domains that are small
+/// enough to represent with one bit per possible value.
+pub(super) trait BitmapFilterType:
Review Comment:
😍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]