alamb commented on code in PR #21456:
URL: https://github.com/apache/datafusion/pull/21456#discussion_r3066422683


##########
datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs:
##########
@@ -165,3 +165,354 @@ impl<T: ArrowPrimitiveType + Debug> Accumulator for FloatDistinctCountAccumulato
         size_of_val(self) + self.values.size()
     }
 }
+
+/// Optimized COUNT DISTINCT accumulator for u8 using a bool array.
+/// Uses 256 bytes to track all possible u8 values.
+#[derive(Debug)]
+pub struct BoolArray256DistinctCountAccumulator {
+    seen: Box<[bool; 256]>,
+}
+
+impl BoolArray256DistinctCountAccumulator {
+    pub fn new() -> Self {
+        Self {
+            seen: Box::new([false; 256]),
+        }
+    }
+
+    #[inline]
+    fn count(&self) -> i64 {
+        self.seen.iter().filter(|&&b| b).count() as i64
+    }
+}
+
+impl Default for BoolArray256DistinctCountAccumulator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Accumulator for BoolArray256DistinctCountAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> 
datafusion_common::Result<()> {
+        if values.is_empty() {
+            return Ok(());
+        }
+
+        let arr = 
as_primitive_array::<arrow::datatypes::UInt8Type>(&values[0])?;
+        for value in arr.iter().flatten() {
+            self.seen[value as usize] = true;
+        }
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> 
datafusion_common::Result<()> {
+        if states.is_empty() {
+            return Ok(());
+        }
+
+        let arr = as_list_array(&states[0])?;
+        arr.iter().try_for_each(|maybe_list| {
+            if let Some(list) = maybe_list {
+                let list = 
as_primitive_array::<arrow::datatypes::UInt8Type>(&list)?;
+                for value in list.values().iter() {
+                    self.seen[*value as usize] = true;
+                }
+            };
+            Ok(())
+        })
+    }
+
+    fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
+        let values: Vec<u8> = self
+            .seen
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, &seen)| if seen { Some(idx as u8) } else { None 
})
+            .collect();
+
+        let arr = Arc::new(
+            
PrimitiveArray::<arrow::datatypes::UInt8Type>::from_iter_values(values),
+        );
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
+    }
+
+    fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
+        Ok(ScalarValue::Int64(Some(self.count())))
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self) + 256
+    }
+}
+
+/// Optimized COUNT DISTINCT accumulator for i8 using a bool array.
+/// Uses 256 bytes to track all possible i8 values (mapped to 0..255).
+#[derive(Debug)]
+pub struct BoolArray256DistinctCountAccumulatorI8 {
+    seen: Box<[bool; 256]>,

Review Comment:
   BTW, why is this boxed (rather than just inlined)?
   
   ```rust
       seen: [bool; 256]
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to