This is an automated email from the ASF dual-hosted git repository.
weijun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8061485be3 feat: support ApproxDistinct with utf8view (#15200)
8061485be3 is described below
commit 8061485be3b197d40bb35be09f9cf0a282c99bcd
Author: Qi Zhu <[email protected]>
AuthorDate: Fri Mar 14 10:17:44 2025 +0800
feat: support ApproxDistinct with utf8view (#15200)
* feat: support ApproxDistinct with utf8view
* Address comment
---
.../functions-aggregate/src/approx_distinct.rs | 39 +++++++++++++++++++++-
.../test_files/aggregate_skip_partial.slt | 21 ++++++++++++
2 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs
index 1d378fff17..c97dba1925 100644
--- a/datafusion/functions-aggregate/src/approx_distinct.rs
+++ b/datafusion/functions-aggregate/src/approx_distinct.rs
@@ -18,7 +18,7 @@
//! Defines physical expressions that can evaluated at runtime during query execution
use crate::hyperloglog::HyperLogLog;
-use arrow::array::BinaryArray;
+use arrow::array::{BinaryArray, StringViewArray};
use arrow::array::{
GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
};
@@ -126,6 +126,27 @@ where
}
}
+#[derive(Debug)]
+struct StringViewHLLAccumulator<T>
+where
+ T: OffsetSizeTrait,
+{
+ hll: HyperLogLog<String>,
+ phantom_data: PhantomData<T>,
+}
+
+impl<T> StringViewHLLAccumulator<T>
+where
+ T: OffsetSizeTrait,
+{
+ pub fn new() -> Self {
+ Self {
+ hll: HyperLogLog::new(),
+ phantom_data: PhantomData,
+ }
+ }
+}
+
#[derive(Debug)]
struct BinaryHLLAccumulator<T>
where
@@ -197,6 +218,21 @@ where
default_accumulator_impl!();
}
+impl<T> Accumulator for StringViewHLLAccumulator<T>
+where
+ T: OffsetSizeTrait,
+{
+ fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+ let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
+ // flatten because we would skip nulls
+ self.hll
+ .extend(array.iter().flatten().map(|s| s.to_string()));
+ Ok(())
+ }
+
+ default_accumulator_impl!();
+}
+
impl<T> Accumulator for StringHLLAccumulator<T>
where
T: OffsetSizeTrait,
@@ -311,6 +347,7 @@ impl AggregateUDFImpl for ApproxDistinct {
DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
+ DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
other => {
diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
index 3a4d641abf..8755918cd1 100644
--- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
+++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
@@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU
4 5 23
5 5 14
+# Test approx_distinct for varchar(with Utf8View) / int
+statement ok
+CREATE TABLE aggregate_test_100_utf8view AS SELECT
+ arrow_cast(c1, 'Utf8View') as c1,
+ c2,
+ c5
+FROM aggregate_test_100;
+
+# Test approx_distinct for varchar(with Utf8View) / int
+query III
+SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2;
+----
+1 5 22
+2 5 22
+3 5 19
+4 5 23
+5 5 14
+
+statement ok
+DROP TABLE aggregate_test_100_utf8view;
+
# Test count with nullable fields
query III
SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]