This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-22707-f1bbcf5c5bf8ae8515613f223a1bf381090782dc
in repository https://gitbox.apache.org/repos/asf/datafusion.git

commit f447b40a975265875b327b9469a1afb4fd1ca4ec
Author: Zeel Rajodiya <[email protected]>
AuthorDate: Thu Jun 4 15:10:34 2026 +0530

    feat: support Boolean in approx_distinct (#22707)
    
    **Which issue does this PR close?**
    
    No issue is open for Boolean specifically. Related: #21453 (which closed
    #1109) introduced the bitmap pattern this PR extends. The pre-#21453
    code carried a `// TODO support for boolean (trivial case)` comment that
    was silently
    [dropped](https://github.com/apache/datafusion/pull/21453/changes#diff-1b528be54cb05b65e70e59af3016c21deb39cdeedfe5bfd554cdc3e47694f81eL345)
    without implementation.
    
    **Rationale for this change**
    
    Today `approx_distinct(bool_col)` errors with *"Support for
    'approx_distinct' for data type Boolean is not implemented"*. Boolean
    has at most **2** distinct non-null values, so HLL is overkill — a tiny
    pair of flags gives an **exact** answer at a fraction of the memory
    cost, matching the small-int bitmap strategy already in the codebase.
    
    **What changes are included in this PR?**
    
    Adds `BooleanDistinctCountAccumulator` (two named flags:
    `has_seen_false` / `has_seen_true`) in `functions-aggregate-common` and
    wires `DataType::Boolean` through the existing
    `ApproxDistinctBitmapWrapper` in `approx_distinct.rs` alongside the
    small-int arms. State serializes as `List<Boolean>`.
    
    **Are these changes tested?**
    
    Yes — SLT regression coverage in `aggregate.slt` for all-true,
    all-false, mixed, all-null, and `GROUP BY` cases.
    
    **Are there any user-facing changes?**
    
    Yes — `approx_distinct(<boolean>)` now works instead of erroring. No API
    or behavioral changes for existing types.
---
 .../src/aggregate/count_distinct.rs                |   1 +
 .../src/aggregate/count_distinct/native.rs         | 101 ++++++++++++++++++++-
 .../functions-aggregate/src/approx_distinct.rs     |  27 ++++--
 datafusion/sqllogictest/test_files/aggregate.slt   |  44 +++++++++
 4 files changed, 165 insertions(+), 8 deletions(-)

diff --git 
a/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs 
b/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
index 83cc5cded8..bb706aa614 100644
--- a/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
+++ b/datafusion/functions-aggregate-common/src/aggregate/count_distinct.rs
@@ -28,5 +28,6 @@ pub use native::Bitmap65536DistinctCountAccumulator;
 pub use native::Bitmap65536DistinctCountAccumulatorI16;
 pub use native::BoolArray256DistinctCountAccumulator;
 pub use native::BoolArray256DistinctCountAccumulatorI8;
+pub use native::BooleanDistinctCountAccumulator;
 pub use native::FloatDistinctCountAccumulator;
 pub use native::PrimitiveDistinctCountAccumulator;
diff --git 
a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs 
b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
index fb9cfb379a..c7b466d4f0 100644
--- 
a/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
+++ 
b/datafusion/functions-aggregate-common/src/aggregate/count_distinct/native.rs
@@ -27,13 +27,14 @@ use std::mem::size_of_val;
 use std::sync::Arc;
 
 use arrow::array::ArrayRef;
+use arrow::array::BooleanArray;
 use arrow::array::PrimitiveArray;
 use arrow::array::types::ArrowPrimitiveType;
 use arrow::datatypes::DataType;
 use datafusion_common::hash_utils::RandomState;
 
 use datafusion_common::ScalarValue;
-use datafusion_common::cast::{as_list_array, as_primitive_array};
+use datafusion_common::cast::{as_boolean_array, as_list_array, 
as_primitive_array};
 use datafusion_common::utils::SingleRowListArrayBuilder;
 use datafusion_common::utils::memory::estimate_memory_size;
 use datafusion_expr_common::accumulator::Accumulator;
@@ -518,3 +519,101 @@ impl Accumulator for 
Bitmap65536DistinctCountAccumulatorI16 {
         size_of_val(self) + 8192
     }
 }
+
+/// Optimized COUNT DISTINCT accumulator for `Boolean` using two flags.
+///
+/// Tracks whether `false` and `true` have been observed; nulls are skipped.
+/// Result is always 0, 1, or 2.
+#[derive(Debug)]
+pub struct BooleanDistinctCountAccumulator {
+    has_seen_false: bool,
+    has_seen_true: bool,
+}
+
+impl BooleanDistinctCountAccumulator {
+    pub fn new() -> Self {
+        Self {
+            has_seen_false: false,
+            has_seen_true: false,
+        }
+    }
+
+    #[inline]
+    fn seen_both(&self) -> bool {
+        self.has_seen_false && self.has_seen_true
+    }
+
+    #[inline]
+    fn count(&self) -> i64 {
+        (self.has_seen_false as u8 + self.has_seen_true as u8) as i64
+    }
+
+    /// Update flags from a `BooleanArray`, short-circuiting per-flag once set.
+    #[inline]
+    fn observe(&mut self, arr: &BooleanArray) {
+        if !self.has_seen_false && arr.has_false() {
+            self.has_seen_false = true;
+        }
+        if !self.has_seen_true && arr.has_true() {
+            self.has_seen_true = true;
+        }
+    }
+}
+
+impl Default for BooleanDistinctCountAccumulator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Accumulator for BooleanDistinctCountAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> 
datafusion_common::Result<()> {
+        if values.is_empty() || self.seen_both() {
+            return Ok(());
+        }
+
+        let arr = as_boolean_array(&values[0])?;
+        self.observe(arr);
+        Ok(())
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> 
datafusion_common::Result<()> {
+        if states.is_empty() || self.seen_both() {
+            return Ok(());
+        }
+
+        let arr = as_list_array(&states[0])?;
+        arr.iter().try_for_each(|maybe_list| {
+            if self.seen_both() {
+                return Ok(());
+            }
+            if let Some(list) = maybe_list {
+                self.observe(as_boolean_array(&list)?);
+            };
+            Ok(())
+        })
+    }
+
+    fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
+        let mut values: Vec<bool> = Vec::with_capacity(2);
+        if self.has_seen_false {
+            values.push(false);
+        }
+        if self.has_seen_true {
+            values.push(true);
+        }
+
+        let arr = Arc::new(BooleanArray::from(values));
+        Ok(vec![
+            SingleRowListArrayBuilder::new(arr).build_list_scalar(),
+        ])
+    }
+
+    fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
+        Ok(ScalarValue::Int64(Some(self.count())))
+    }
+
+    fn size(&self) -> usize {
+        size_of_val(self)
+    }
+}
diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs 
b/datafusion/functions-aggregate/src/approx_distinct.rs
index 306ec074d4..ee12d9050e 100644
--- a/datafusion/functions-aggregate/src/approx_distinct.rs
+++ b/datafusion/functions-aggregate/src/approx_distinct.rs
@@ -42,6 +42,7 @@ use datafusion_expr::{
 use datafusion_functions_aggregate_common::aggregate::count_distinct::{
     Bitmap65536DistinctCountAccumulator, 
Bitmap65536DistinctCountAccumulatorI16,
     BoolArray256DistinctCountAccumulator, 
BoolArray256DistinctCountAccumulatorI8,
+    BooleanDistinctCountAccumulator,
 };
 use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator;
 use datafusion_macros::user_doc;
@@ -336,10 +337,13 @@ impl ApproxDistinct {
 }
 
 #[cold]
-fn get_small_int_approx_accumulator(
+fn get_fixed_domain_approx_accumulator(
     data_type: &DataType,
 ) -> Result<Box<dyn Accumulator>> {
     match data_type {
+        DataType::Boolean => Ok(Box::new(ApproxDistinctBitmapWrapper {
+            inner: BooleanDistinctCountAccumulator::new(),
+        })),
         DataType::UInt8 => Ok(Box::new(ApproxDistinctBitmapWrapper {
             inner: BoolArray256DistinctCountAccumulator::new(),
         })),
@@ -357,7 +361,10 @@ fn get_small_int_approx_accumulator(
 }
 
 #[cold]
-fn get_small_int_state_field(name: &str, data_type: &DataType) -> 
Result<Vec<FieldRef>> {
+fn get_fixed_domain_state_field(
+    name: &str,
+    data_type: &DataType,
+) -> Result<Vec<FieldRef>> {
     Ok(vec![
         Field::new_list(
             format_state_name(name, "approx_distinct"),
@@ -400,9 +407,11 @@ impl AggregateUDFImpl for ApproxDistinct {
                 )
                 .into(),
             ]),
-            DataType::UInt8 | DataType::Int8 | DataType::UInt16 | 
DataType::Int16 => {
-                get_small_int_state_field(args.name, data_type)
-            }
+            DataType::Boolean
+            | DataType::UInt8
+            | DataType::Int8
+            | DataType::UInt16
+            | DataType::Int16 => get_fixed_domain_state_field(args.name, 
data_type),
             _ => Ok(vec![
                 Field::new(
                     format_state_name(args.name, "hll_registers"),
@@ -418,8 +427,12 @@ impl AggregateUDFImpl for ApproxDistinct {
         let data_type = acc_args.expr_fields[0].data_type();
 
         let accumulator: Box<dyn Accumulator> = match data_type {
-            DataType::UInt8 | DataType::Int8 | DataType::UInt16 | 
DataType::Int16 => {
-                return get_small_int_approx_accumulator(data_type);
+            DataType::Boolean
+            | DataType::UInt8
+            | DataType::Int8
+            | DataType::UInt16
+            | DataType::Int16 => {
+                return get_fixed_domain_approx_accumulator(data_type);
             }
             DataType::UInt32 => 
Box::new(NumericHLLAccumulator::<UInt32Type>::new()),
             DataType::UInt64 => 
Box::new(NumericHLLAccumulator::<UInt64Type>::new()),
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt 
b/datafusion/sqllogictest/test_files/aggregate.slt
index e9e61ec541..2861b50580 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -1861,6 +1861,50 @@ SELECT approx_distinct(c14) AS a, approx_distinct(c15) 
AS b, approx_distinct(arr
 ----
 18 60 60 60 60
 
+# approx_distinct over Boolean: exact count via flag-pair accumulator (0..=2).
+statement ok
+CREATE TABLE approx_distinct_bool_test (g INT, b BOOLEAN) AS VALUES
+  (1, true), (1, true), (1, NULL),
+  (2, false), (2, false),
+  (3, true), (3, false), (3, NULL), (3, true),
+  (4, NULL), (4, NULL);
+
+query I
+SELECT approx_distinct(b) FROM approx_distinct_bool_test WHERE g = 1;
+----
+1
+
+query I
+SELECT approx_distinct(b) FROM approx_distinct_bool_test WHERE g = 2;
+----
+1
+
+query I
+SELECT approx_distinct(b) FROM approx_distinct_bool_test WHERE g = 3;
+----
+2
+
+query I
+SELECT approx_distinct(b) FROM approx_distinct_bool_test WHERE g = 4;
+----
+0
+
+query II
+SELECT g, approx_distinct(b) FROM approx_distinct_bool_test GROUP BY g ORDER 
BY g;
+----
+1 1
+2 1
+3 2
+4 0
+
+query I
+SELECT approx_distinct(b) FROM approx_distinct_bool_test;
+----
+2
+
+statement ok
+DROP TABLE approx_distinct_bool_test;
+
 ## This test executes the APPROX_PERCENTILE_CONT aggregation against the test
 ## data, asserting the estimated quantiles are ±5% their actual values.
 ##


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to