This is an automated email from the ASF dual-hosted git repository.

jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fd7df66724 feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179)
fd7df66724 is described below

commit fd7df66724f958a2d44ba1fda1b11dc6833f0296
Author: Evgenii Glotov <glot.un...@gmail.com>
AuthorDate: Sun Aug 24 17:49:28 2025 +0300

    feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179)
    
    * feat(spark): implement Spark `misc` function `bitmap_count`
    
    * chore: add ASF license text
    
    * chore: move bitmap_count to spark/bitmap module, improve error handling, add sqllogictests for different types, remove hint
    
    * fix: BitmapCount derive PartialEq, Eq, Hash
    
    * chore: reminder to implement TypeSignature for BitmapCount when possible
---
 .../spark/src/function/bitmap/bitmap_count.rs      | 178 +++++++++++++++++++++
 datafusion/spark/src/function/{ => bitmap}/mod.rs  |  46 +++---
 datafusion/spark/src/function/mod.rs               |   1 +
 datafusion/spark/src/lib.rs                        |   2 +
 .../test_files/spark/bitmap/bitmap_count.slt       |  61 +++++++
 5 files changed, 263 insertions(+), 25 deletions(-)

diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs
new file mode 100644
index 0000000000..966b0930f0
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{
+    Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array,
+    LargeBinaryArray,
+};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::{
+    Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary,
+};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use datafusion_functions::{downcast_arg, downcast_named_arg};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapCount {
+    signature: Signature,
+}
+
+impl Default for BitmapCount {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BitmapCount {
+    pub fn new() -> Self {
+        Self {
+            // TODO: add definitive TypeSignature after https://github.com/apache/datafusion/issues/17291 is done
+            signature: Signature::any(1, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for BitmapCount {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "bitmap_count"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match arg_types.first() {
+            Some(Binary | BinaryView | FixedSizeBinary(_) | LargeBinary) => Ok(Int64),
+            Some(data_type) => plan_err!(
+                "bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as argument, got {:?}",
+                data_type
+            ),
+            None => internal_err!("bitmap_count does not support zero arguments"),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        make_scalar_function(bitmap_count_inner, vec![])(&args.args)
+    }
+}
+
+fn binary_count_ones(opt: Option<&[u8]>) -> Option<i64> {
+    opt.map(|value| value.iter().map(|b| b.count_ones() as i64).sum())
+}
+
+macro_rules! downcast_and_count_ones {
+    ($input_array:expr, $array_type:ident) => {{
+        let arr = downcast_arg!($input_array, $array_type);
+        Ok(arr.iter().map(binary_count_ones).collect::<Int64Array>())
+    }};
+}
+
+pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input_array] = take_function_args("bitmap_count", arg)?;
+
+    let res: Result<Int64Array> = match &input_array.data_type() {
+        Binary => downcast_and_count_ones!(input_array, BinaryArray),
+        BinaryView => downcast_and_count_ones!(input_array, BinaryViewArray),
+        LargeBinary => downcast_and_count_ones!(input_array, LargeBinaryArray),
+        FixedSizeBinary(_size) => {
+            downcast_and_count_ones!(input_array, FixedSizeBinaryArray)
+        }
+        data_type => {
+            internal_err!("bitmap_count does not support {:?}", data_type)
+        }
+    };
+
+    Ok(Arc::new(res?))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::function::bitmap::bitmap_count::BitmapCount;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Int64Array};
+    use arrow::datatypes::DataType::Int64;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    macro_rules! test_bitmap_count_binary_invoke {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Binary($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::LargeBinary($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::BinaryView($INPUT))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+
+            test_scalar_function!(
+                BitmapCount::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::FixedSizeBinary(
+                    $INPUT.map(|a| a.len()).unwrap_or(0) as i32,
+                    $INPUT
+                ))],
+                $EXPECTED,
+                i64,
+                Int64,
+                Int64Array
+            );
+        };
+    }
+
+    #[test]
+    fn test_bitmap_count_invoke() -> Result<()> {
+        test_bitmap_count_binary_invoke!(None::<Vec<u8>>, Ok(None));
+        test_bitmap_count_binary_invoke!(Some(vec![0x0Au8]), Ok(Some(2)));
+        test_bitmap_count_binary_invoke!(Some(vec![0xFFu8, 0xFFu8]), Ok(Some(16)));
+        test_bitmap_count_binary_invoke!(
+            Some(vec![0x0Au8, 0xB0u8, 0xCDu8]),
+            Ok(Some(10))
+        );
+        Ok(())
+    }
+}
diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/bitmap/mod.rs
similarity index 63%
copy from datafusion/spark/src/function/mod.rs
copy to datafusion/spark/src/function/bitmap/mod.rs
index cac8741a89..8532c32ac9 100644
--- a/datafusion/spark/src/function/mod.rs
+++ b/datafusion/spark/src/function/bitmap/mod.rs
@@ -15,28 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-pub mod aggregate;
-pub mod array;
-pub mod bitwise;
-pub mod collection;
-pub mod conditional;
-pub mod conversion;
-pub mod csv;
-pub mod datetime;
-pub mod error_utils;
-pub mod functions_nested_utils;
-pub mod generator;
-pub mod hash;
-pub mod json;
-pub mod lambda;
-pub mod map;
-pub mod math;
-pub mod misc;
-pub mod predicate;
-pub mod string;
-pub mod r#struct;
-pub mod table;
-pub mod url;
-pub mod utils;
-pub mod window;
-pub mod xml;
+pub mod bitmap_count;
+
+use datafusion_expr::ScalarUDF;
+use datafusion_functions::make_udf_function;
+use std::sync::Arc;
+
+make_udf_function!(bitmap_count::BitmapCount, bitmap_count);
+
+pub mod expr_fn {
+    use datafusion_functions::export_functions;
+
+    export_functions!((
+        bitmap_count,
+        "Returns the number of set bits in the input bitmap.",
+        arg
+    ));
+}
+
+pub fn functions() -> Vec<Arc<ScalarUDF>> {
+    vec![bitmap_count()]
+}
diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/mod.rs
index cac8741a89..3f4f94cfaa 100644
--- a/datafusion/spark/src/function/mod.rs
+++ b/datafusion/spark/src/function/mod.rs
@@ -17,6 +17,7 @@
 
 pub mod aggregate;
 pub mod array;
+pub mod bitmap;
 pub mod bitwise;
 pub mod collection;
 pub mod conditional;
diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs
index 4ce9be1263..531883a6c4 100644
--- a/datafusion/spark/src/lib.rs
+++ b/datafusion/spark/src/lib.rs
@@ -104,6 +104,7 @@ use std::sync::Arc;
 pub mod expr_fn {
     pub use super::function::aggregate::expr_fn::*;
     pub use super::function::array::expr_fn::*;
+    pub use super::function::bitmap::expr_fn::*;
     pub use super::function::bitwise::expr_fn::*;
     pub use super::function::collection::expr_fn::*;
     pub use super::function::conditional::expr_fn::*;
@@ -130,6 +131,7 @@ pub mod expr_fn {
 pub fn all_default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
     function::array::functions()
         .into_iter()
+        .chain(function::bitmap::functions())
         .chain(function::bitwise::functions())
         .chain(function::collection::functions())
         .chain(function::conditional::functions())
diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
new file mode 100644
index 0000000000..2789efef7b
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT bitmap_count(X'1010');
+----
+2
+
+query I
+SELECT bitmap_count(X'FFFF');
+----
+16
+
+query I
+SELECT bitmap_count(X'0');
+----
+0
+
+query I
+SELECT bitmap_count(a) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+# Tests with different binary types
+query I
+SELECT bitmap_count(arrow_cast(a, 'LargeBinary')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'BinaryView')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a);
+----
+5
+10
+NULL
+
+query I
+SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a);
+----
+2
+5
+16
+NULL


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

Reply via email to