Mazen-Ghanaym commented on code in PR #3000:
URL: https://github.com/apache/datafusion-comet/pull/3000#discussion_r2672600524
##########
native/core/src/execution/expressions/strings.rs:
##########
@@ -123,3 +123,56 @@ impl ExpressionBuilder for FromJsonBuilder {
Ok(Arc::new(FromJson::new(child, schema, &expr.timezone)))
}
}
+
+/// Builder for StartsWith expressions
+pub struct StartsWithBuilder;
+
+impl ExpressionBuilder for StartsWithBuilder {
+ fn build(
+ &self,
+ spark_expr: &Expr,
+ input_schema: SchemaRef,
+ planner: &PhysicalPlanner,
+ ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+ let expr = extract_expr!(spark_expr, StartsWith);
+ let left = planner.create_expr(expr.left.as_ref().unwrap(),
Arc::clone(&input_schema))?;
+ let right = planner.create_expr(expr.right.as_ref().unwrap(),
input_schema)?;
+
+ let pattern = extract_string_literal(&right)?;
+ Ok(Arc::new(StartsWithExpr::new(left, pattern)))
+ }
+}
+
+/// Builder for EndsWith expressions
+pub struct EndsWithBuilder;
+
+impl ExpressionBuilder for EndsWithBuilder {
+ fn build(
+ &self,
+ spark_expr: &Expr,
+ input_schema: SchemaRef,
+ planner: &PhysicalPlanner,
+ ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
+ let expr = extract_expr!(spark_expr, EndsWith);
+ let left = planner.create_expr(expr.left.as_ref().unwrap(),
Arc::clone(&input_schema))?;
+ let right = planner.create_expr(expr.right.as_ref().unwrap(),
input_schema)?;
+
+ let pattern = extract_string_literal(&right)?;
+ Ok(Arc::new(EndsWithExpr::new(left, pattern)))
+ }
+}
+
+/// Helper function to extract a string literal from a physical expression
+fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<String,
ExecutionError> {
+ match expr.as_any().downcast_ref::<Literal>() {
+ Some(literal) => match literal.value() {
+ ScalarValue::Utf8(Some(s)) => Ok(s.clone()),
+ _ => Err(ExecutionError::GeneralError(
+ "StartsWith/EndsWith pattern must be a string
literal".to_string(),
+ )),
+ },
+ None => Err(ExecutionError::GeneralError(
Review Comment:
Good point! The current implementation only accepts a string literal as the
pattern, because the pre-allocation optimization depends on it; any other
pattern expression is rejected with an error.
For cases like `startsWith(c1, substring(c1, 0, 3))`, where the pattern is a
dynamic expression, I can add a fallback that evaluates the pattern per batch.
Would falling back to DataFusion's built-in function for non-literal patterns
work, or would you prefer handling it entirely in the custom expression?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]