This is an automated email from the ASF dual-hosted git repository. jeffreyvo pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new fd7df66724 feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179) fd7df66724 is described below commit fd7df66724f958a2d44ba1fda1b11dc6833f0296 Author: Evgenii Glotov <glot.un...@gmail.com> AuthorDate: Sun Aug 24 17:49:28 2025 +0300 feat(spark): implement Spark `bitmap` function `bitmap_count` (#17179) * feat(spark): implement Spark `misc` function `bitmap_count` * chore: add ASF license text * chore: move bitmap_count to spark/bitmap module, improve error handling, add sqllogictests for different types, remove hint * fix: BitmapCount derive PartialEq, Eq, Hash * chore: reminder to implement TypeSignature for BitmapCount when possible --- .../spark/src/function/bitmap/bitmap_count.rs | 178 +++++++++++++++++++++ datafusion/spark/src/function/{ => bitmap}/mod.rs | 46 +++--- datafusion/spark/src/function/mod.rs | 1 + datafusion/spark/src/lib.rs | 2 + .../test_files/spark/bitmap/bitmap_count.slt | 61 +++++++ 5 files changed, 263 insertions(+), 25 deletions(-) diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs new file mode 100644 index 0000000000..966b0930f0 --- /dev/null +++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array, + LargeBinaryArray, +}; +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::{ + Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary, +}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion_functions::utils::make_scalar_function; +use datafusion_functions::{downcast_arg, downcast_named_arg}; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct BitmapCount { + signature: Signature, +} + +impl Default for BitmapCount { + fn default() -> Self { + Self::new() + } +} + +impl BitmapCount { + pub fn new() -> Self { + Self { + // TODO: add definitive TypeSignature after https://github.com/apache/datafusion/issues/17291 is done + signature: Signature::any(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for BitmapCount { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "bitmap_count" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { + match arg_types.first() { + Some(Binary | BinaryView | FixedSizeBinary(_) | LargeBinary) => Ok(Int64), + Some(data_type) => plan_err!( + "bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as argument, got {:?}", + data_type + ), + None => internal_err!("bitmap_count does not support zero arguments"), + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + make_scalar_function(bitmap_count_inner, vec![])(&args.args) + } +} + +fn binary_count_ones(opt: Option<&[u8]>) -> Option<i64> { + opt.map(|value| value.iter().map(|b| b.count_ones() as i64).sum()) +} + +macro_rules! downcast_and_count_ones { + ($input_array:expr, $array_type:ident) => {{ + let arr = downcast_arg!($input_array, $array_type); + Ok(arr.iter().map(binary_count_ones).collect::<Int64Array>()) + }}; +} + +pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result<ArrayRef> { + let [input_array] = take_function_args("bitmap_count", arg)?; + + let res: Result<Int64Array> = match &input_array.data_type() { + Binary => downcast_and_count_ones!(input_array, BinaryArray), + BinaryView => downcast_and_count_ones!(input_array, BinaryViewArray), + LargeBinary => downcast_and_count_ones!(input_array, LargeBinaryArray), + FixedSizeBinary(_size) => { + downcast_and_count_ones!(input_array, FixedSizeBinaryArray) + } + data_type => { + internal_err!("bitmap_count does not support {:?}", data_type) + } + }; + + Ok(Arc::new(res?)) +} + +#[cfg(test)] +mod tests { + use crate::function::bitmap::bitmap_count::BitmapCount; + use crate::function::utils::test::test_scalar_function; + use arrow::array::{Array, Int64Array}; + use arrow::datatypes::DataType::Int64; + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + macro_rules! test_bitmap_count_binary_invoke { + ($INPUT:expr, $EXPECTED:expr) => { + test_scalar_function!( + BitmapCount::new(), + vec![ColumnarValue::Scalar(ScalarValue::Binary($INPUT))], + $EXPECTED, + i64, + Int64, + Int64Array + ); + + test_scalar_function!( + BitmapCount::new(), + vec![ColumnarValue::Scalar(ScalarValue::LargeBinary($INPUT))], + $EXPECTED, + i64, + Int64, + Int64Array + ); + + test_scalar_function!( + BitmapCount::new(), + vec![ColumnarValue::Scalar(ScalarValue::BinaryView($INPUT))], + $EXPECTED, + i64, + Int64, + Int64Array + ); + + test_scalar_function!( + BitmapCount::new(), + vec![ColumnarValue::Scalar(ScalarValue::FixedSizeBinary( + $INPUT.map(|a| a.len()).unwrap_or(0) as i32, + $INPUT + ))], + $EXPECTED, + i64, + Int64, + Int64Array + ); + }; + } + + #[test] + fn test_bitmap_count_invoke() -> Result<()> { + test_bitmap_count_binary_invoke!(None::<Vec<u8>>, Ok(None)); + test_bitmap_count_binary_invoke!(Some(vec![0x0Au8]), Ok(Some(2))); + test_bitmap_count_binary_invoke!(Some(vec![0xFFu8, 0xFFu8]), Ok(Some(16))); + test_bitmap_count_binary_invoke!( + Some(vec![0x0Au8, 0xB0u8, 0xCDu8]), + Ok(Some(10)) + ); + Ok(()) + } +} diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/bitmap/mod.rs similarity index 63% copy from datafusion/spark/src/function/mod.rs copy to datafusion/spark/src/function/bitmap/mod.rs index cac8741a89..8532c32ac9 100644 --- a/datafusion/spark/src/function/mod.rs +++ b/datafusion/spark/src/function/bitmap/mod.rs @@ -15,28 +15,24 @@ // specific language governing permissions and limitations // under the License. -pub mod aggregate; -pub mod array; -pub mod bitwise; -pub mod collection; -pub mod conditional; -pub mod conversion; -pub mod csv; -pub mod datetime; -pub mod error_utils; -pub mod functions_nested_utils; -pub mod generator; -pub mod hash; -pub mod json; -pub mod lambda; -pub mod map; -pub mod math; -pub mod misc; -pub mod predicate; -pub mod string; -pub mod r#struct; -pub mod table; -pub mod url; -pub mod utils; -pub mod window; -pub mod xml; +pub mod bitmap_count; + +use datafusion_expr::ScalarUDF; +use datafusion_functions::make_udf_function; +use std::sync::Arc; + +make_udf_function!(bitmap_count::BitmapCount, bitmap_count); + +pub mod expr_fn { + use datafusion_functions::export_functions; + + export_functions!(( + bitmap_count, + "Returns the number of set bits in the input bitmap.", + arg + )); +} + +pub fn functions() -> Vec<Arc<ScalarUDF>> { + vec![bitmap_count()] +} diff --git a/datafusion/spark/src/function/mod.rs b/datafusion/spark/src/function/mod.rs index cac8741a89..3f4f94cfaa 100644 --- a/datafusion/spark/src/function/mod.rs +++ b/datafusion/spark/src/function/mod.rs @@ -17,6 +17,7 @@ pub mod aggregate; pub mod array; +pub mod bitmap; pub mod bitwise; pub mod collection; pub mod conditional; diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index 4ce9be1263..531883a6c4 100644 --- a/datafusion/spark/src/lib.rs +++ b/datafusion/spark/src/lib.rs @@ -104,6 +104,7 @@ use std::sync::Arc; pub mod expr_fn { pub use super::function::aggregate::expr_fn::*; pub use super::function::array::expr_fn::*; + pub use super::function::bitmap::expr_fn::*; pub use super::function::bitwise::expr_fn::*; pub use super::function::collection::expr_fn::*; pub use super::function::conditional::expr_fn::*; @@ -130,6 +131,7 @@ pub mod expr_fn { pub fn all_default_scalar_functions() -> Vec<Arc<ScalarUDF>> { function::array::functions() .into_iter() + .chain(function::bitmap::functions()) .chain(function::bitwise::functions()) .chain(function::collection::functions()) .chain(function::conditional::functions()) diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt new file mode 100644 index 0000000000..2789efef7b --- /dev/null +++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +query I +SELECT bitmap_count(X'1010'); +---- +2 + +query I +SELECT bitmap_count(X'FFFF'); +---- +16 + +query I +SELECT bitmap_count(X'0'); +---- +0 + +query I +SELECT bitmap_count(a) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a); +---- +5 +10 +NULL + +# Tests with different binary types +query I +SELECT bitmap_count(arrow_cast(a, 'LargeBinary')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a); +---- +5 +10 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'BinaryView')) FROM (VALUES (X'0AB0'), (X'0AB0CD'), (NULL)) AS t(a); +---- +5 +10 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org