Copilot commented on code in PR #1886:
URL: https://github.com/apache/auron/pull/1886#discussion_r2681262117


##########
native-engine/datafusion-ext-plans/src/orc_exec.rs:
##########
@@ -414,3 +430,256 @@ impl OrcFileMetrics {
         Self { bytes_scanned }
     }
 }
+
+fn convert_predicate_to_orc(
+    predicate: Option<PhysicalExprRef>,
+    file_schema: &SchemaRef,
+) -> Option<Predicate> {
+    let predicate = predicate?;
+    convert_expr_to_orc(&predicate, file_schema)
+}
+
+/// Recursively collect all AND sub-conditions and flatten nested AND
+/// structures.
+fn collect_and_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            // Recursively collect AND sub-conditions from both sides
+            collect_and_predicates(binary.left(), schema, predicates);
+            collect_and_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an AND expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+/// Recursively collect all OR sub-conditions and flatten nested OR
+/// structures.
+fn collect_or_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::Or) {
+            // Recursively collect OR sub-conditions from both sides
+            collect_or_predicates(binary.left(), schema, predicates);
+            collect_or_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an OR expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+fn convert_expr_to_orc(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+) -> Option<Predicate> {
+    // Handle top-level AND expression, flatten all AND conditions
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            let mut predicates = Vec::new();
+            collect_and_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::and(predicates));
+        }
+
+        // Handle top-level OR expression, flatten all OR conditions
+        if matches!(binary.op(), Operator::Or) {
+            let mut predicates = Vec::new();
+            collect_or_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::or(predicates));
+        }
+    }
+
+    convert_expr_to_orc_internal(expr, schema)
+}
+
+/// Internal conversion function for non-AND/OR expressions.
+fn convert_expr_to_orc_internal(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+) -> Option<Predicate> {
+    // Handle Literal expressions (WHERE true, WHERE false, etc.)
+    if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
+        match lit.value() {
+            ScalarValue::Boolean(Some(true)) => {
+                // WHERE true - no filtering needed, return None to skip 
predicate
+                return None;
+            }
+            ScalarValue::Boolean(Some(false)) => {
+                // WHERE false - need to filter all data
+                // ORC doesn't have a direct "false" predicate, return None
+                return None;

Review Comment:
   The handling of WHERE false is incorrect. Returning None causes no predicate 
to be pushed down, which means all data will be read from the ORC file. For 
WHERE false, the predicate should filter out all data. Consider creating a 
predicate that always evaluates to false, or handle this case at a higher level 
by returning an empty result set without reading the file.
   ```suggestion
                   // Construct an always-false predicate so ORC can skip all 
rows
                   let always_false = 
Predicate::not(Predicate::and(Vec::new()));
                   return Some(always_false);
   ```



##########
native-engine/datafusion-ext-plans/src/orc_exec.rs:
##########
@@ -414,3 +430,256 @@ impl OrcFileMetrics {
         Self { bytes_scanned }
     }
 }
+
+fn convert_predicate_to_orc(
+    predicate: Option<PhysicalExprRef>,
+    file_schema: &SchemaRef,
+) -> Option<Predicate> {
+    let predicate = predicate?;
+    convert_expr_to_orc(&predicate, file_schema)
+}
+
+/// Recursively collect all AND sub-conditions and flatten nested AND
+/// structures.
+fn collect_and_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            // Recursively collect AND sub-conditions from both sides
+            collect_and_predicates(binary.left(), schema, predicates);
+            collect_and_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an AND expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+/// Recursively collect all OR sub-conditions and flatten nested OR
+/// structures.
+fn collect_or_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::Or) {
+            // Recursively collect OR sub-conditions from both sides
+            collect_or_predicates(binary.left(), schema, predicates);
+            collect_or_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an OR expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+fn convert_expr_to_orc(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+) -> Option<Predicate> {
+    // Handle top-level AND expression, flatten all AND conditions
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            let mut predicates = Vec::new();
+            collect_and_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::and(predicates));
+        }
+
+        // Handle top-level OR expression, flatten all OR conditions
+        if matches!(binary.op(), Operator::Or) {
+            let mut predicates = Vec::new();
+            collect_or_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::or(predicates));
+        }
+    }
+
+    convert_expr_to_orc_internal(expr, schema)
+}
+
+/// Internal conversion function for non-AND/OR expressions.
+fn convert_expr_to_orc_internal(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+) -> Option<Predicate> {
+    // Handle Literal expressions (WHERE true, WHERE false, etc.)
+    if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
+        match lit.value() {
+            ScalarValue::Boolean(Some(true)) => {
+                // WHERE true - no filtering needed, return None to skip 
predicate
+                return None;
+            }
+            ScalarValue::Boolean(Some(false)) => {
+                // WHERE false - need to filter all data
+                // ORC doesn't have a direct "false" predicate, return None
+                return None;
+            }
+            _ => {
+                return None;
+            }
+        }
+    }
+
+    // Handle NOT expressions (WHERE NOT condition)
+    if let Some(not_expr) = expr.as_any().downcast_ref::<NotExpr>() {
+        if let Some(inner_pred) = convert_expr_to_orc(not_expr.arg(), schema) {
+            return Some(Predicate::not(inner_pred));
+        }
+        return None;
+    }
+
+    // Handle IS NULL expressions
+    if let Some(is_null) = expr.as_any().downcast_ref::<IsNullExpr>() {
+        if let Some(col) = is_null.arg().as_any().downcast_ref::<Column>() {
+            let col_name = col.name();
+            return Some(Predicate::is_null(col_name));
+        }
+        return None;
+    }
+
+    // Handle IS NOT NULL expressions
+    if let Some(is_not_null) = expr.as_any().downcast_ref::<IsNotNullExpr>() {
+        if let Some(col) = is_not_null.arg().as_any().downcast_ref::<Column>() 
{
+            let col_name = col.name();
+            return Some(Predicate::not(Predicate::is_null(col_name)));
+        }
+        return None;
+    }
+
+    // Handle IN expressions (WHERE col IN (val1, val2, ...))
+    if let Some(in_list) = expr.as_any().downcast_ref::<InListExpr>() {
+        if let Some(col) = in_list.expr().as_any().downcast_ref::<Column>() {
+            let col_name = col.name();
+
+            // Convert IN to multiple OR conditions: col = val1 OR col = val2 
OR ...
+            let mut predicates = Vec::new();
+            for list_expr in in_list.list() {
+                if let Some(lit) = 
list_expr.as_any().downcast_ref::<Literal>() {
+                    if let Some(pred_value) = 
convert_scalar_value(lit.value()) {
+                        predicates.push(Predicate::eq(col_name, pred_value));
+                    }
+                }
+            }
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            // If negated is true, it represents NOT IN
+            if in_list.negated() {
+                return Some(Predicate::not(Predicate::or(predicates)));
+            } else {
+                return Some(Predicate::or(predicates));
+            }
+        }
+        return None;
+    }
+
+    // Handle BinaryExpr (comparison operations)
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        let left = binary.left();
+        let right = binary.right();
+        let op = binary.op();
+
+        // AND/OR are already handled at the outer level, skip here
+        if matches!(op, Operator::And | Operator::Or) {
+            return None;
+        }
+
+        if let Some(col) = left.as_any().downcast_ref::<Column>() {
+            if let Some(lit) = right.as_any().downcast_ref::<Literal>() {
+                let col_name = col.name();
+                let value = lit.value();
+                return build_comparison_predicate(col_name, op, value);
+            }
+        }
+
+        if let Some(lit) = left.as_any().downcast_ref::<Literal>() {
+            if let Some(col) = right.as_any().downcast_ref::<Column>() {
+                let col_name = col.name();
+                let value = lit.value();
+                return build_comparison_predicate_reversed(col_name, op, 
value);
+            }
+        }
+    }
+
+    None
+}
+
+fn build_comparison_predicate(
+    col_name: &str,
+    op: &Operator,
+    value: &ScalarValue,
+) -> Option<Predicate> {
+    let predicate_value = convert_scalar_value(value)?;
+
+    match op {
+        Operator::Eq => Some(Predicate::eq(col_name, predicate_value)),
+        Operator::NotEq => Some(Predicate::ne(col_name, predicate_value)),
+        Operator::Lt => Some(Predicate::lt(col_name, predicate_value)),
+        Operator::LtEq => Some(Predicate::lte(col_name, predicate_value)),
+        Operator::Gt => Some(Predicate::gt(col_name, predicate_value)),
+        Operator::GtEq => Some(Predicate::gte(col_name, predicate_value)),
+        _ => None,
+    }
+}
+
+fn build_comparison_predicate_reversed(
+    col_name: &str,
+    op: &Operator,
+    value: &ScalarValue,
+) -> Option<Predicate> {
+    let predicate_value = convert_scalar_value(value)?;
+
+    match op {
+        Operator::Eq => Some(Predicate::eq(col_name, predicate_value)),
+        Operator::NotEq => Some(Predicate::ne(col_name, predicate_value)),
+        Operator::Lt => Some(Predicate::gt(col_name, predicate_value)),
+        Operator::LtEq => Some(Predicate::gte(col_name, predicate_value)),
+        Operator::Gt => Some(Predicate::lt(col_name, predicate_value)),
+        Operator::GtEq => Some(Predicate::lte(col_name, predicate_value)),
+        _ => None,
+    }
+}
+
+fn convert_scalar_value(value: &ScalarValue) -> Option<PredicateValue> {
+    match value {
+        ScalarValue::Boolean(v) => Some(PredicateValue::Boolean(*v)),
+        ScalarValue::Int8(v) => Some(PredicateValue::Int8(*v)),
+        ScalarValue::Int16(v) => Some(PredicateValue::Int16(*v)),
+        ScalarValue::Int32(v) => Some(PredicateValue::Int32(*v)),
+        ScalarValue::Int64(v) => Some(PredicateValue::Int64(*v)),
+        ScalarValue::Float32(v) => Some(PredicateValue::Float32(*v)),
+        ScalarValue::Float64(v) => Some(PredicateValue::Float64(*v)),
+        ScalarValue::Utf8(v) => Some(PredicateValue::Utf8(v.clone())),

Review Comment:
   The new predicate conversion logic lacks test coverage. Given the complexity 
of handling various expression types (AND, OR, NOT, IN, comparisons, IS NULL, 
etc.), consider adding unit tests to verify correct conversion for different 
predicate patterns. This is especially important for edge cases like empty IN 
lists, nested logical operators, and reversed comparison operators.



##########
native-engine/datafusion-ext-plans/src/orc_exec.rs:
##########
@@ -414,3 +430,256 @@ impl OrcFileMetrics {
         Self { bytes_scanned }
     }
 }
+
+fn convert_predicate_to_orc(
+    predicate: Option<PhysicalExprRef>,
+    file_schema: &SchemaRef,
+) -> Option<Predicate> {
+    let predicate = predicate?;
+    convert_expr_to_orc(&predicate, file_schema)
+}
+
+/// Recursively collect all AND sub-conditions and flatten nested AND
+/// structures.
+fn collect_and_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            // Recursively collect AND sub-conditions from both sides
+            collect_and_predicates(binary.left(), schema, predicates);
+            collect_and_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an AND expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+/// Recursively collect all OR sub-conditions and flatten nested OR
+/// structures.
+fn collect_or_predicates(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+    predicates: &mut Vec<Predicate>,
+) {
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::Or) {
+            // Recursively collect OR sub-conditions from both sides
+            collect_or_predicates(binary.left(), schema, predicates);
+            collect_or_predicates(binary.right(), schema, predicates);
+            return;
+        }
+    }
+
+    // Not an OR expression, try to convert to ORC predicate
+    if let Some(pred) = convert_expr_to_orc_internal(expr, schema) {
+        predicates.push(pred);
+    }
+}
+
+fn convert_expr_to_orc(
+    expr: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    schema: &SchemaRef,
+) -> Option<Predicate> {
+    // Handle top-level AND expression, flatten all AND conditions
+    if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
+        if matches!(binary.op(), Operator::And) {
+            let mut predicates = Vec::new();
+            collect_and_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::and(predicates));
+        }
+
+        // Handle top-level OR expression, flatten all OR conditions
+        if matches!(binary.op(), Operator::Or) {
+            let mut predicates = Vec::new();
+            collect_or_predicates(expr, schema, &mut predicates);
+
+            if predicates.is_empty() {
+                return None;
+            }
+
+            if predicates.len() == 1 {
+                return Some(predicates.into_iter().next().unwrap());
+            }
+
+            return Some(Predicate::or(predicates));
+        }
+    }
+
+    convert_expr_to_orc_internal(expr, schema)
+}
+
+/// Internal conversion function for non-AND/OR expressions.

Review Comment:
   The schema parameter is passed through all predicate conversion functions 
but is never used. While this might be intended for future use (e.g., 
validating column names against the schema), it adds unnecessary complexity. 
Consider either using the schema to validate column existence and data types, 
or removing it if not needed. If kept for future extensibility, add a comment 
explaining this intent.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to