This is an automated email from the ASF dual-hosted git repository.

weijun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8061485be3 feat: support ApproxDistinct with utf8view (#15200)
8061485be3 is described below

commit 8061485be3b197d40bb35be09f9cf0a282c99bcd
Author: Qi Zhu <[email protected]>
AuthorDate: Fri Mar 14 10:17:44 2025 +0800

    feat: support ApproxDistinct with utf8view (#15200)
    
    * feat: support ApproxDistinct with utf8view
    
    * Address comment
---
 .../functions-aggregate/src/approx_distinct.rs     | 39 +++++++++++++++++++++-
 .../test_files/aggregate_skip_partial.slt          | 21 ++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs
index 1d378fff17..c97dba1925 100644
--- a/datafusion/functions-aggregate/src/approx_distinct.rs
+++ b/datafusion/functions-aggregate/src/approx_distinct.rs
@@ -18,7 +18,7 @@
 //! Defines physical expressions that can evaluated at runtime during query execution
 
 use crate::hyperloglog::HyperLogLog;
-use arrow::array::BinaryArray;
+use arrow::array::{BinaryArray, StringViewArray};
 use arrow::array::{
     GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
 };
@@ -126,6 +126,27 @@ where
     }
 }
 
+#[derive(Debug)]
+struct StringViewHLLAccumulator<T>
+where
+    T: OffsetSizeTrait,
+{
+    hll: HyperLogLog<String>,
+    phantom_data: PhantomData<T>,
+}
+
+impl<T> StringViewHLLAccumulator<T>
+where
+    T: OffsetSizeTrait,
+{
+    pub fn new() -> Self {
+        Self {
+            hll: HyperLogLog::new(),
+            phantom_data: PhantomData,
+        }
+    }
+}
+
 #[derive(Debug)]
 struct BinaryHLLAccumulator<T>
 where
@@ -197,6 +218,21 @@ where
     default_accumulator_impl!();
 }
 
+impl<T> Accumulator for StringViewHLLAccumulator<T>
+where
+    T: OffsetSizeTrait,
+{
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
+        // flatten because we would skip nulls
+        self.hll
+            .extend(array.iter().flatten().map(|s| s.to_string()));
+        Ok(())
+    }
+
+    default_accumulator_impl!();
+}
+
 impl<T> Accumulator for StringHLLAccumulator<T>
 where
     T: OffsetSizeTrait,
@@ -311,6 +347,7 @@ impl AggregateUDFImpl for ApproxDistinct {
             DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
             DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
             DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
+            DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
             DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
             DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
             other => {
diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
index 3a4d641abf..8755918cd1 100644
--- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
+++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
@@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU
 4 5 23
 5 5 14
 
+# Test approx_distinct for varchar(with Utf8View) / int
+statement ok
+CREATE TABLE aggregate_test_100_utf8view AS SELECT
+      arrow_cast(c1, 'Utf8View') as c1,
+      c2,
+      c5
+FROM aggregate_test_100;
+
+# Test approx_distinct for varchar(with Utf8View) / int
+query III
+SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2;
+----
+1 5 22
+2 5 22
+3 5 19
+4 5 23
+5 5 14
+
+statement ok
+DROP TABLE aggregate_test_100_utf8view;
+
 # Test count with nullable fields
 query III
 SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to