coderfender commented on code in PR #22707:
URL: https://github.com/apache/datafusion/pull/22707#discussion_r3339576273
##########
datafusion/functions-aggregate/src/approx_distinct.rs:
##########
@@ -336,10 +337,13 @@ impl ApproxDistinct {
}
#[cold]
-fn get_small_int_approx_accumulator(
+fn get_fixed_domain_approx_accumulator(
Review Comment:
Curious why the name `get_fixed_domain_state_field` was chosen here?
##########
datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs:
##########
@@ -518,3 +519,101 @@ impl Accumulator for
Bitmap65536DistinctCountAccumulatorI16 {
size_of_val(self) + 8192
}
}
+
+/// Optimized COUNT DISTINCT accumulator for `Boolean` using two flags.
+///
+/// Tracks whether `false` and `true` have been observed; nulls are skipped.
+/// Result is always 0, 1, or 2.
+#[derive(Debug)]
+pub struct BooleanDistinctCountAccumulator {
+ has_seen_false: bool,
+ has_seen_true: bool,
+}
+
+impl BooleanDistinctCountAccumulator {
+ pub fn new() -> Self {
+ Self {
+ has_seen_false: false,
+ has_seen_true: false,
+ }
+ }
+
+ #[inline]
+ fn seen_both(&self) -> bool {
+ self.has_seen_false && self.has_seen_true
+ }
+
+ #[inline]
+ fn count(&self) -> i64 {
+ self.has_seen_false as i64 + self.has_seen_true as i64
+ }
+}
+
+impl Default for BooleanDistinctCountAccumulator {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl Accumulator for BooleanDistinctCountAccumulator {
+ fn update_batch(&mut self, values: &[ArrayRef]) ->
datafusion_common::Result<()> {
+ if values.is_empty() || self.seen_both() {
+ return Ok(());
+ }
+
+ let arr = as_boolean_array(&values[0])?;
+ if !self.has_seen_false && arr.has_false() {
+ self.has_seen_false = true;
+ }
+ if !self.has_seen_true && arr.has_true() {
+ self.has_seen_true = true;
+ }
+ Ok(())
+ }
+
+ fn merge_batch(&mut self, states: &[ArrayRef]) ->
datafusion_common::Result<()> {
+ if states.is_empty() || self.seen_both() {
+ return Ok(());
+ }
+
+ let arr = as_list_array(&states[0])?;
+ arr.iter().try_for_each(|maybe_list| {
+ if self.seen_both() {
+ return Ok(());
+ }
+ if let Some(list) = maybe_list {
+ let list = as_boolean_array(&list)?;
+ if !self.has_seen_false && list.has_false() {
+ self.has_seen_false = true;
+ }
+ if !self.has_seen_true && list.has_true() {
+ self.has_seen_true = true;
+ }
+ };
Review Comment:
There seems to be a lot of logic duplicated between `update_batch` and
`merge_batch`. Perhaps we could DRY this up by extracting the shared logic
into a single helper function?
##########
datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs:
##########
@@ -518,3 +519,101 @@ impl Accumulator for
Bitmap65536DistinctCountAccumulatorI16 {
size_of_val(self) + 8192
}
}
+
+/// Optimized COUNT DISTINCT accumulator for `Boolean` using two flags.
+///
+/// Tracks whether `false` and `true` have been observed; nulls are skipped.
+/// Result is always 0, 1, or 2.
+#[derive(Debug)]
+pub struct BooleanDistinctCountAccumulator {
+ has_seen_false: bool,
+ has_seen_true: bool,
+}
+
+impl BooleanDistinctCountAccumulator {
+ pub fn new() -> Self {
+ Self {
+ has_seen_false: false,
+ has_seen_true: false,
+ }
+ }
+
+ #[inline]
+ fn seen_both(&self) -> bool {
+ self.has_seen_false && self.has_seen_true
+ }
+
+ #[inline]
+ fn count(&self) -> i64 {
+ self.has_seen_false as i64 + self.has_seen_true as i64
Review Comment:
This could probably be done with a single cast to `i64`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]