alamb commented on code in PR #12850:
URL: https://github.com/apache/datafusion/pull/12850#discussion_r1796084081


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1316,23 +1353,43 @@ const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20;
 /// expression that will evaluate to FALSE if it can be determined no
 /// rows between the min/max values could pass the predicates.
 ///
+/// Any predicates that can not be translated will be passed to 
`unhandled_hook`.
+///
 /// Returns the pruning predicate as an [`PhysicalExpr`]
 ///
-/// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which 
will be rewritten to TRUE
+/// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which 
will fall back to calling `unhandled_hook`

Review Comment:
   Since this is a new API I think it might be worth thinking a bit more about. 
What would you think about moving this logic into its own structure? 
   
   For example, instead of
   
   ```rust
           let known_expression = 
col("a").eq(lit(ScalarValue::Int32(Some(12))));
           let known_expression_transformed = 
rewrite_predicate_to_statistics_predicate(
               &logical2physical(&known_expression, &schema),
               &schema,
               None,
           );
   ```
   
   Something like
   
   ```rust
           let known_expression = 
col("a").eq(lit(ScalarValue::Int32(Some(12))));
           let known_expression_transformed = StatisticsPredicateRewriter::new()
              .rewrite(
                &logical2physical(&known_expression, &schema)
                &schema)
              );
   ```
   
   
   Or with a rewrite hook
   
   ```rust
               let expr = logical2physical(&expr, &schema_with_b);
               StatisticsPredicateRewriter::new()
                 .with_unhandledhook(|expr| {
                    // closure that handles expr
                 })
              .rewrite(
                   &expr,
                   &schema,
               )
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3397,6 +3462,75 @@ mod tests {
         // TODO: add test for other case and op
     }
 
+    #[test]
+    fn test_rewrite_expr_to_prunable_custom_unhandled_hook() {
+        struct CustomUnhandledHook;
+
+        impl UnhandledPredicateHook for CustomUnhandledHook {
+            /// This handles an arbitrary case of a column that doesn't exist 
in the schema
+            /// by renaming it to yet another column that doesn't exist in the 
schema
+            /// (the transformation is arbitrary, the point is that it can do 
whatever it wants)
+            fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn 
PhysicalExpr> {
+                Arc::new(phys_expr::Literal::new(ScalarValue::Int32(Some(42))))
+            }
+        }
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let schema_with_b = Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]);
+
+        let transform_expr = |expr| {
+            let expr = logical2physical(&expr, &schema_with_b);
+            rewrite_predicate_to_statistics_predicate(
+                &expr,
+                &schema,
+                Some(Arc::new(CustomUnhandledHook {})),
+            )
+        };
+
+        // transform an arbitrary valid expression that we know is handled
+        let known_expression = col("a").eq(lit(ScalarValue::Int32(Some(12))));
+        let known_expression_transformed = 
rewrite_predicate_to_statistics_predicate(
+            &logical2physical(&known_expression, &schema),
+            &schema,
+            None,
+        );
+
+        // an expression referencing an unknown column (that is not in the 
schema) gets passed to the hook
+        let input = col("b").eq(lit(ScalarValue::Int32(Some(12))));

Review Comment:
   I think you can write literals more concisely like this if you wanted
   
   ```suggestion
           let input = col("b").eq(lit(12i32));
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to