Re: [PR] Feat : Bringing in support for map_filter expression. [datafusion-comet]

via GitHub Tue, 02 Sep 2025 14:47:28 -0700


parthchandra commented on code in PR #2236:
URL: https://github.com/apache/datafusion-comet/pull/2236#discussion_r2317232622



##########
native/spark-expr/src/map_funcs/map_filter.rs:
##########
@@ -0,0 +1,539 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+use arrow::array::{
+    Array, ArrayRef, BooleanArray, MapArray, StructArray, Int32Array, 
Int64Array,
+    Float32Array, Float64Array, StringArray,
+};
+use arrow::datatypes::{DataType};
+use datafusion::common::{Result as DataFusionResult, ScalarValue, 
DataFusionError, internal_datafusion_err};
+use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, 
ScalarUDFImpl, Signature, Volatility};
+
+#[derive(Clone, Copy, Debug)]
+enum CompareOp {
+    Greater,
+    Less,
+    GreaterEqual,
+    LessEqual,
+    Equal,
+    NotEqual,
+}
+
+#[derive(Debug)]
+pub struct SparkMapFilter {
+    signature: Signature,
+}
+
+impl Default for SparkMapFilter {
+    fn default() -> Self {
+        Self {
+            signature: Signature::any(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkMapFilter {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "map_filter"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> 
DataFusionResult<DataType> {
+        if arg_types.len() != 2 {
+            return Err(DataFusionError::Internal(
+                "map_filter expects exactly 2 arguments".to_string(),
+            ));
+        }
+
+        match &arg_types[0] {
+            DataType::Map(_, _) => Ok(arg_types[0].clone()),
+            _ => Err(DataFusionError::Internal(
+                "First argument to map_filter must be a map".to_string(),
+            )),
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue, DataFusionError> {
+        let args: [ColumnarValue; 2] = args
+            .args
+            .try_into()
+            .map_err(|_| internal_datafusion_err!("map_filter expects exactly 
two arguments"))?;
+        
+        spark_map_filter(&args[0], &args[1])
+    }
+}
+
+
+pub fn spark_map_filter(map_arg: &ColumnarValue, lambda_arg: &ColumnarValue) 
-> DataFusionResult<ColumnarValue> {
+    match map_arg {
+        ColumnarValue::Array(map_array) => {
+            let map_array = map_array.as_any().downcast_ref::<MapArray>()
+                .ok_or_else(|| DataFusionError::Internal("Expected 
MapArray".to_string()))?;
+            let filtered = filter_map_with_lambda(map_array, lambda_arg)?;
+            Ok(ColumnarValue::Array(Arc::new(filtered)))
+        }
+        ColumnarValue::Scalar(scalar_value) => {
+            match scalar_value {
+                ScalarValue::Map(map_array) => {
+                    let filtered = filter_map_with_lambda(map_array, 
lambda_arg)?;
+                    
+                    // Convert filtered result back to scalar
+                    if filtered.len() == 1 {
+                        let scalar_map = 
ScalarValue::try_from_array(&filtered, 0)?;
+                        Ok(ColumnarValue::Scalar(scalar_map))
+                    } else {
+                        // If filtering produces multiple maps, return as array
+                        Ok(ColumnarValue::Array(Arc::new(filtered)))
+                    }
+                }
+                ScalarValue::Null => {
+                    // Handle null scalar map
+                    Ok(ColumnarValue::Scalar(ScalarValue::Null))
+                }
+                _ => Err(DataFusionError::Internal(
+                    "Invalid map data received".to_string()
+                )),
+            }
+        }
+    }
+}
+
+fn filter_map_with_lambda(map_array: &MapArray, lambda_arg: &ColumnarValue) -> 
DataFusionResult<MapArray> {
+    let lambda_expr = extract_lambda_expression(lambda_arg)?;
+    let entries = map_array.entries();
+    let entries_str = 
entries.as_any().downcast_ref::<StructArray>().ok_or_else(||
+        DataFusionError::Internal("Expected StructArray for map 
entries".to_string()))?;
+
+    let keys = entries_str.column(0);
+    let values = entries_str.column(1);
+    let filter_mask = evaluate_lambda_on_pairs(&lambda_expr, keys, values)?;
+
+    let filtered_entries = filter_struct_array(entries_str, &filter_mask)?;
+    let offsets = compute_filtered_offsets(map_array, &filter_mask)?;
+    let filtered_map = match map_array.data_type() {
+        DataType::Map(field, _) => field.clone(),
+        _ => return Err(DataFusionError::Internal(
+            "Invalid map datatype".to_string())),
+    };
+
+    MapArray::try_new(filtered_map, offsets, filtered_entries, 
map_array.nulls()
+        .cloned(), false,).map_err(|e| 
DataFusionError::Internal(format!("Arrow error: {}", e)))
+}
+
+fn extract_lambda_expression(lambda_arg: &ColumnarValue) -> 
DataFusionResult<String> {
+    // Handle string based lambda expressions
+    match lambda_arg {
+        ColumnarValue::Scalar(scalar_value) => {
+            match scalar_value {
+                ScalarValue::Utf8(Some(expr)) => Ok(expr.clone()),
+                _ => Err(DataFusionError::Internal("Lambda expression must be a
+  string".to_string())),
+            }
+        }
+        _ => Err(DataFusionError::Internal("Lambda expression must be a scalar
+  string".to_string())),
+    }
+}
+
+fn evaluate_lambda_on_pairs(
+    lambda_expr: &str,
+    keys: &ArrayRef,
+    values: &ArrayRef
+) -> DataFusionResult<BooleanArray> {
+    let mut results = Vec::new();
+    let len = keys.len();
+
+    for i in 0..len {
+        let result = evaluate_lambda_comparison(lambda_expr, keys, values, i)?;
+        results.push(result);
+    }
+
+    Ok(BooleanArray::from(results))
+}
+
+fn evaluate_lambda_comparison(
+    lambda_expr: &str,
+    keys: &ArrayRef,
+    values: &ArrayRef,
+    index: usize
+) -> DataFusionResult<Option<bool>> {
+    // Handle null values
+    if keys.is_null(index) || values.is_null(index) {
+        return Ok(Some(false)); // Spark behavior: nulls are filtered out
+    }
+
+    // Parse the lambda expression
+    if lambda_expr.contains(" >= ") {

Review Comment:
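   The lambda could be any expression, not just a simple comparison, so
   matching on operator substrings such as " >= " in the lambda string will
   not cover the general case. See Spark's implementation of higher-order
   functions:
   https://github.com/apache/spark/blob/885bfc22cb0d315519384568c9cb0dce2c0f556f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala#L113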
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Feat : Bringing in support for map_filter expression. [datafusion-comet]

Reply via email to