This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new f870dcd878 fix: Support Dictionary[Int32, Binary] for bitmap count
spark function (#18273)
f870dcd878 is described below
commit f870dcd87852d77c16e49c8b7c38337becf787ef
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Mon Oct 27 00:06:54 2025 -0700
fix: Support Dictionary[Int32, Binary] for bitmap count spark function
(#18273)
## Which issue does this PR close?
Closes https://github.com/apache/datafusion/issues/18058
## Rationale for this change
When adding the bitmap_count function to Comet, we get the following
error: org.apache.comet.CometNativeException: Error from DataFusion:
bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as
argument, got Dictionary(Int32, Binary).
## Are these changes tested?
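Yes. Added a new unit test for dictionary-encoded input and new sqllogictest
cases covering Dictionary(Int8/Int16/Int32/Int64, Binary).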
---------
Co-authored-by: Kazantsev Maksim <[email protected]>
---
.../spark/src/function/bitmap/bitmap_count.rs | 65 ++++++++++++++++++++--
.../test_files/spark/bitmap/bitmap_count.slt | 32 +++++++++++
2 files changed, 91 insertions(+), 6 deletions(-)
diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs
b/datafusion/spark/src/function/bitmap/bitmap_count.rs
index 15bd33229a..56a9c5edb8 100644
--- a/datafusion/spark/src/function/bitmap/bitmap_count.rs
+++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs
@@ -19,13 +19,13 @@ use std::any::Any;
use std::sync::Arc;
use arrow::array::{
- Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray,
Int64Array,
- LargeBinaryArray,
+ as_dictionary_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
+ FixedSizeBinaryArray, Int64Array, LargeBinaryArray,
};
-use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{
- Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary,
+ Binary, BinaryView, Dictionary, FixedSizeBinary, LargeBinary,
};
+use arrow::datatypes::{DataType, Int16Type, Int32Type, Int64Type, Int8Type};
use datafusion_common::utils::take_function_args;
use datafusion_common::{internal_err, Result};
use datafusion_expr::{
@@ -71,7 +71,7 @@ impl ScalarUDFImpl for BitmapCount {
}
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
- Ok(Int64)
+ Ok(DataType::Int64)
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
@@ -90,6 +90,17 @@ macro_rules! downcast_and_count_ones {
}};
}
+macro_rules! downcast_dict_and_count_ones {
+ ($input_dict:expr, $key_array_type:ident) => {{
+ let dict_array = as_dictionary_array::<$key_array_type>($input_dict);
+ let array = dict_array.downcast_dict::<BinaryArray>().unwrap();
+ Ok(array
+ .into_iter()
+ .map(binary_count_ones)
+ .collect::<Int64Array>())
+ }};
+}
+
pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
let [input_array] = take_function_args("bitmap_count", arg)?;
@@ -100,6 +111,17 @@ pub fn bitmap_count_inner(arg: &[ArrayRef]) ->
Result<ArrayRef> {
FixedSizeBinary(_size) => {
downcast_and_count_ones!(input_array, FixedSizeBinaryArray)
}
+ Dictionary(k, v) if v.as_ref() == &Binary => match k.as_ref() {
+ DataType::Int8 => downcast_dict_and_count_ones!(input_array,
Int8Type),
+ DataType::Int16 => downcast_dict_and_count_ones!(input_array,
Int16Type),
+ DataType::Int32 => downcast_dict_and_count_ones!(input_array,
Int32Type),
+ DataType::Int64 => downcast_dict_and_count_ones!(input_array,
Int64Type),
+ data_type => {
+ internal_err!(
+ "bitmap_count does not support Dictionary({data_type},
Binary)"
+ )
+ }
+ },
data_type => {
internal_err!("bitmap_count does not support {data_type}")
}
@@ -114,8 +136,12 @@ mod tests {
use crate::function::utils::test::test_scalar_function;
use arrow::array::{Array, Int64Array};
use arrow::datatypes::DataType::Int64;
+ use arrow::datatypes::{DataType, Field};
+ use datafusion_common::config::ConfigOptions;
use datafusion_common::{Result, ScalarValue};
- use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+ use datafusion_expr::ColumnarValue::Scalar;
+ use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
+ use std::sync::Arc;
macro_rules! test_bitmap_count_binary_invoke {
($INPUT:expr, $EXPECTED:expr) => {
@@ -171,4 +197,31 @@ mod tests {
);
Ok(())
}
+
+ #[test]
+ fn test_dictionary_encoded_bitmap_count_invoke() -> Result<()> {
+ let dict = Scalar(ScalarValue::Dictionary(
+ Box::new(DataType::Int32),
+ Box::new(ScalarValue::Binary(Some(vec![0xFFu8, 0xFFu8]))),
+ ));
+
+ let arg_fields = vec![Field::new(
+ "a",
+ DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Binary)),
+ true,
+ )
+ .into()];
+ let args = ScalarFunctionArgs {
+ args: vec![dict.clone()],
+ arg_fields,
+ number_rows: 1,
+ return_field: Field::new("f", Int64, true).into(),
+ config_options: Arc::new(ConfigOptions::default()),
+ };
+ let udf = BitmapCount::new();
+ let actual = udf.invoke_with_args(args)?;
+ let expect = Scalar(ScalarValue::Int64(Some(16)));
+ assert_eq!(*actual.into_array(1)?, *expect.into_array(1)?);
+ Ok(())
+ }
}
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
index 2789efef7b..39dca51222 100644
--- a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
@@ -59,3 +59,35 @@ SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)'))
FROM (VALUES (X'1010'),
5
16
NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int32, Binary)')) FROM (VALUES
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int8, Binary)')) FROM (VALUES
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int16, Binary)')) FROM (VALUES
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int64, Binary)')) FROM (VALUES
(X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]