mbutrovich commented on code in PR #2236: URL: https://github.com/apache/datafusion-comet/pull/2236#discussion_r2301183180
########## native/spark-expr/src/map_funcs/map_filter.rs: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; +use arrow::array::{ + Array, ArrayRef, BooleanArray, MapArray, StructArray, Int32Array, Int64Array, + Float32Array, Float64Array, StringArray, +}; +use arrow::datatypes::{DataType}; +use datafusion::common::{Result as DataFusionResult, ScalarValue, DataFusionError, internal_datafusion_err}; +use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Clone, Copy, Debug)] +enum CompareOp { + Greater, + Less, + GreaterEqual, + LessEqual, + Equal, + NotEqual, +} + +#[derive(Debug)] +pub struct SparkMapFilter { + signature: Signature, +} + +impl Default for SparkMapFilter { + fn default() -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkMapFilter { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "map_filter" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult<DataType> { + if arg_types.len() != 2 { + return 
Err(DataFusionError::Internal( + "map_filter expects exactly 2 arguments".to_string(), + )); + } + + match &arg_types[0] { + DataType::Map(_, _) => Ok(arg_types[0].clone()), + _ => Err(DataFusionError::Internal( + "First argument to map_filter must be a map".to_string(), + )), + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue, DataFusionError> { + let args: [ColumnarValue; 2] = args + .args + .try_into() + .map_err(|_| internal_datafusion_err!("map_filter expects exactly two arguments"))?; + + spark_map_filter(&args[0], &args[1]) + } +} + + +pub fn spark_map_filter(map_arg: &ColumnarValue, lambda_arg: &ColumnarValue) -> DataFusionResult<ColumnarValue> { + match map_arg { + ColumnarValue::Array(map_array) => { + let map_array = map_array.as_any().downcast_ref::<MapArray>() + .ok_or_else(|| DataFusionError::Internal("Expected MapArray".to_string()))?; + let filtered = filter_map_with_lambda(map_array, lambda_arg)?; + Ok(ColumnarValue::Array(Arc::new(filtered))) + } + ColumnarValue::Scalar(scalar_value) => { + match scalar_value { + ScalarValue::Map(map_array) => { + let filtered = filter_map_with_lambda(map_array, lambda_arg)?; + + // Convert filtered result back to scalar + if filtered.len() == 1 { + let scalar_map = ScalarValue::try_from_array(&filtered, 0)?; + Ok(ColumnarValue::Scalar(scalar_map)) + } else { + // If filtering produces multiple maps, return as array + Ok(ColumnarValue::Array(Arc::new(filtered))) + } + } + ScalarValue::Null => { + // Handle null scalar map + Ok(ColumnarValue::Scalar(ScalarValue::Null)) + } + _ => Err(DataFusionError::Internal( + "Invalid map data received".to_string() + )), + } + } + } +} + +fn filter_map_with_lambda(map_array: &MapArray, lambda_arg: &ColumnarValue) -> DataFusionResult<MapArray> { + let lambda_expr = extract_lambda_expression(lambda_arg)?; + let entries = map_array.entries(); + let entries_str = entries.as_any().downcast_ref::<StructArray>().ok_or_else(|| + 
DataFusionError::Internal("Expected StructArray for map entries".to_string()))?; + + let keys = entries_str.column(0); + let values = entries_str.column(1); + let filter_mask = evaluate_lambda_on_pairs(&lambda_expr, keys, values)?; + + let filtered_entries = filter_struct_array(entries_str, &filter_mask)?; + let offsets = compute_filtered_offsets(map_array, &filter_mask)?; + let filtered_map = match map_array.data_type() { + DataType::Map(field, _) => field.clone(), + _ => return Err(DataFusionError::Internal( + "Invalid map datatype".to_string())), + }; + + MapArray::try_new(filtered_map, offsets, filtered_entries, map_array.nulls() + .cloned(), false,).map_err(|e| DataFusionError::Internal(format!("Arrow error: {}", e))) +} + +fn extract_lambda_expression(lambda_arg: &ColumnarValue) -> DataFusionResult<String> { + // Handle string based lambda expressions + match lambda_arg { + ColumnarValue::Scalar(scalar_value) => { + match scalar_value { + ScalarValue::Utf8(Some(expr)) => Ok(expr.clone()), + _ => Err(DataFusionError::Internal("Lambda expression must be a + string".to_string())), + } + } + _ => Err(DataFusionError::Internal("Lambda expression must be a scalar + string".to_string())), + } +} + +fn evaluate_lambda_on_pairs( + lambda_expr: &str, + keys: &ArrayRef, + values: &ArrayRef +) -> DataFusionResult<BooleanArray> { + let mut results = Vec::new(); + let len = keys.len(); + + for i in 0..len { + let result = evaluate_lambda_comparison(lambda_expr, keys, values, i)?; + results.push(result); + } + + Ok(BooleanArray::from(results)) +} + +fn evaluate_lambda_comparison( + lambda_expr: &str, + keys: &ArrayRef, + values: &ArrayRef, + index: usize +) -> DataFusionResult<Option<bool>> { + // Handle null values + if keys.is_null(index) || values.is_null(index) { + return Ok(Some(false)); // Spark behavior: nulls are filtered out + } + + // Parse the lambda expression + if lambda_expr.contains(" >= ") { + let parts: Vec<&str> = lambda_expr.split(" >= ").collect(); + if 
parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::GreaterEqual); + } + } else if lambda_expr.contains(" <= ") { + let parts: Vec<&str> = lambda_expr.split(" <= ").collect(); + if parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::LessEqual); + } + } else if lambda_expr.contains(" > ") { + let parts: Vec<&str> = lambda_expr.split(" > ").collect(); + if parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::Greater); + } + } else if lambda_expr.contains(" < ") { + let parts: Vec<&str> = lambda_expr.split(" < ").collect(); + if parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::Less); + } + } else if lambda_expr.contains(" == ") { + let parts: Vec<&str> = lambda_expr.split(" == ").collect(); + if parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::Equal); + } + } else if lambda_expr.contains(" != ") { + let parts: Vec<&str> = lambda_expr.split(" != ").collect(); + if parts.len() == 2 { + return compare_with_constant(keys, values, index, parts[0], parts[1], CompareOp::NotEqual); + } + } + + // Default: keep all entries for unsupported expressions + Ok(Some(true)) +} + +fn compare_with_constant( + keys: &ArrayRef, + values: &ArrayRef, + index: usize, + left_var: &str, + right_constant: &str, + op: CompareOp +) -> DataFusionResult<Option<bool>> { + let left_var = left_var.trim(); + let right_constant = right_constant.trim(); + + let (array_to_compare, data_type) = if left_var == "k" { + (keys, keys.data_type()) + } else if left_var == "v" { + (values, values.data_type()) + } else { + // Unsupported variable, keep entry + return Ok(Some(true)); + }; + + match data_type { + DataType::Int32 => { + let arr = array_to_compare.as_any().downcast_ref::<Int32Array>() + .ok_or_else(|| DataFusionError::Internal("Expected 
Int32Array".to_string()))?; + let left_val = arr.value(index) as i64; + if let Ok(right_val) = right_constant.parse::<i64>() { + Ok(Some(compare_integers(left_val, right_val, op))) + } else { + Ok(Some(true)) // Can't parse constant, keep entry + } + } + DataType::Int64 => { + let arr = array_to_compare.as_any().downcast_ref::<Int64Array>() + .ok_or_else(|| DataFusionError::Internal("Expected Int64Array".to_string()))?; + let left_val = arr.value(index); + if let Ok(right_val) = right_constant.parse::<i64>() { + Ok(Some(compare_integers(left_val, right_val, op))) + } else { + Ok(Some(true)) + } + } + DataType::Float32 => { + let arr = array_to_compare.as_any().downcast_ref::<Float32Array>() + .ok_or_else(|| DataFusionError::Internal("Expected Float32Array".to_string()))?; + let left_val = arr.value(index) as f64; + if let Ok(right_val) = right_constant.parse::<f64>() { + Ok(Some(compare_floats(left_val, right_val, op))) + } else { + Ok(Some(true)) + } + } + DataType::Float64 => { + let arr = array_to_compare.as_any().downcast_ref::<Float64Array>() + .ok_or_else(|| DataFusionError::Internal("Expected Float64Array".to_string()))?; + let left_val = arr.value(index); + if let Ok(right_val) = right_constant.parse::<f64>() { + Ok(Some(compare_floats(left_val, right_val, op))) + } else { + Ok(Some(true)) + } + } + DataType::Utf8 => { + let arr = array_to_compare.as_any().downcast_ref::<StringArray>() + .ok_or_else(|| DataFusionError::Internal("Expected StringArray".to_string()))?; + let left_val = arr.value(index); + Ok(Some(compare_strings(left_val, right_constant, op))) + } + _ => { + // Unsupported type, keep entry + Ok(Some(true)) + } + } +} + + +// Separate comparison functions for different types +fn compare_integers(left: i64, right: i64, op: CompareOp) -> bool { + match op { + CompareOp::Greater => left > right, + CompareOp::Less => left < right, + CompareOp::GreaterEqual => left >= right, + CompareOp::LessEqual => left <= right, + CompareOp::Equal => left == 
right, + CompareOp::NotEqual => left != right, + } +} + +fn compare_floats(left: f64, right: f64, op: CompareOp) -> bool { + match op { + CompareOp::Greater => left > right, + CompareOp::Less => left < right, + CompareOp::GreaterEqual => left >= right, + CompareOp::LessEqual => left <= right, + CompareOp::Equal => (left - right).abs() < f64::EPSILON, + CompareOp::NotEqual => (left - right).abs() >= f64::EPSILON, + } +} + +fn compare_strings(left: &str, right: &str, op: CompareOp) -> bool { Review Comment: Why do we need custom comparator logic? This seems like something Arrow kernels could handle. ########## native/spark-expr/src/map_funcs/map_filter.rs: ########## @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; +use arrow::array::{ + Array, ArrayRef, BooleanArray, MapArray, StructArray, Int32Array, Int64Array, + Float32Array, Float64Array, StringArray, +}; +use arrow::datatypes::{DataType}; +use datafusion::common::{Result as DataFusionResult, ScalarValue, DataFusionError, internal_datafusion_err}; +use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Clone, Copy, Debug)] +enum CompareOp { + Greater, + Less, + GreaterEqual, + LessEqual, + Equal, + NotEqual, +} + +#[derive(Debug)] +pub struct SparkMapFilter { + signature: Signature, +} + +impl Default for SparkMapFilter { + fn default() -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkMapFilter { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "map_filter" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult<DataType> { + if arg_types.len() != 2 { + return Err(DataFusionError::Internal( + "map_filter expects exactly 2 arguments".to_string(), + )); + } + + match &arg_types[0] { + DataType::Map(_, _) => Ok(arg_types[0].clone()), + _ => Err(DataFusionError::Internal( + "First argument to map_filter must be a map".to_string(), + )), + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue, DataFusionError> { + let args: [ColumnarValue; 2] = args + .args + .try_into() + .map_err(|_| internal_datafusion_err!("map_filter expects exactly two arguments"))?; + + spark_map_filter(&args[0], &args[1]) + } +} + + +pub fn spark_map_filter(map_arg: &ColumnarValue, lambda_arg: &ColumnarValue) -> DataFusionResult<ColumnarValue> { + match map_arg { + ColumnarValue::Array(map_array) => { + let map_array = map_array.as_any().downcast_ref::<MapArray>() + .ok_or_else(|| DataFusionError::Internal("Expected MapArray".to_string()))?; + 
let filtered = filter_map_with_lambda(map_array, lambda_arg)?; + Ok(ColumnarValue::Array(Arc::new(filtered))) + } + ColumnarValue::Scalar(scalar_value) => { + match scalar_value { + ScalarValue::Map(map_array) => { + let filtered = filter_map_with_lambda(map_array, lambda_arg)?; + + // Convert filtered result back to scalar + if filtered.len() == 1 { + let scalar_map = ScalarValue::try_from_array(&filtered, 0)?; + Ok(ColumnarValue::Scalar(scalar_map)) + } else { + // If filtering produces multiple maps, return as array + Ok(ColumnarValue::Array(Arc::new(filtered))) + } + } + ScalarValue::Null => { + // Handle null scalar map + Ok(ColumnarValue::Scalar(ScalarValue::Null)) + } + _ => Err(DataFusionError::Internal( + "Invalid map data received".to_string() + )), + } + } + } +} + +fn filter_map_with_lambda(map_array: &MapArray, lambda_arg: &ColumnarValue) -> DataFusionResult<MapArray> { + let lambda_expr = extract_lambda_expression(lambda_arg)?; + let entries = map_array.entries(); + let entries_str = entries.as_any().downcast_ref::<StructArray>().ok_or_else(|| + DataFusionError::Internal("Expected StructArray for map entries".to_string()))?; + + let keys = entries_str.column(0); + let values = entries_str.column(1); + let filter_mask = evaluate_lambda_on_pairs(&lambda_expr, keys, values)?; + + let filtered_entries = filter_struct_array(entries_str, &filter_mask)?; + let offsets = compute_filtered_offsets(map_array, &filter_mask)?; + let filtered_map = match map_array.data_type() { + DataType::Map(field, _) => field.clone(), + _ => return Err(DataFusionError::Internal( + "Invalid map datatype".to_string())), + }; + + MapArray::try_new(filtered_map, offsets, filtered_entries, map_array.nulls() + .cloned(), false,).map_err(|e| DataFusionError::Internal(format!("Arrow error: {}", e))) +} + +fn extract_lambda_expression(lambda_arg: &ColumnarValue) -> DataFusionResult<String> { + // Handle string based lambda expressions + match lambda_arg { + 
ColumnarValue::Scalar(scalar_value) => { + match scalar_value { + ScalarValue::Utf8(Some(expr)) => Ok(expr.clone()), + _ => Err(DataFusionError::Internal("Lambda expression must be a + string".to_string())), + } + } + _ => Err(DataFusionError::Internal("Lambda expression must be a scalar + string".to_string())), + } +} + +fn evaluate_lambda_on_pairs( + lambda_expr: &str, + keys: &ArrayRef, + values: &ArrayRef +) -> DataFusionResult<BooleanArray> { + let mut results = Vec::new(); + let len = keys.len(); + + for i in 0..len { + let result = evaluate_lambda_comparison(lambda_expr, keys, values, i)?; + results.push(result); + } + + Ok(BooleanArray::from(results)) +} + +fn evaluate_lambda_comparison( + lambda_expr: &str, + keys: &ArrayRef, + values: &ArrayRef, + index: usize +) -> DataFusionResult<Option<bool>> { + // Handle null values + if keys.is_null(index) || values.is_null(index) { + return Ok(Some(false)); // Spark behavior: nulls are filtered out + } + + // Parse the lambda expression + if lambda_expr.contains(" >= ") { Review Comment: Is this parsing sufficient? How complex can these lambda expressions be? I'm surprised they're coming out of Spark as strings and not already parsed into some sort of expression. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
