alamb commented on a change in pull request #508:
URL: https://github.com/apache/arrow-datafusion/pull/508#discussion_r646112932



##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'

Review comment:
       ```suggestion
           // test NOT LIKE operator that can't be converted to a 'starts_with'
   ```

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -586,8 +587,45 @@ fn build_predicate_expression(
                 .min_column_expr()?
                 .lt_eq(expr_builder.scalar_expr().clone())
         }
+        Operator::Like => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        
Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));
+                    // Behaves like Eq
+                    let min_column_expr = expr_builder.min_column_expr()?;
+                    let max_column_expr = expr_builder.max_column_expr()?;
+                    min_column_expr
+                        .lt_eq(scalar_expr.clone())
+                        .and(scalar_expr.lt_eq(max_column_expr))
+                }
+                _ => unhandled,
+            }
+        }
+        Operator::NotLike => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        
Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));

Review comment:
       I am not sure if just removing `%` is correct:
   
   For example in  a pattern like `foo%bar` would be converted to `foobar` and 
when compared with a value of `fooaaabar` would be deemed "out of range" by 
this logic, even though it matches the original predicate `foo%bar`.
   
   
   If instead, for `foo%bar` we used `foo` (only use the string up to the first 
unescaped `%`) I think then the logic applies.

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -548,7 +549,7 @@ fn build_predicate_expression(
         // allow partial failure in predicate expression generation
         // this can still produce a useful predicate when multiple conditions 
are joined using AND
         Err(_) => {
-            return Ok(logical_plan::lit(true));
+            return Ok(unhandled);

Review comment:
       👍 thanks I forgot that

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("Banana%"));

Review comment:
       I think there is a difference between `!(a LIKE b)` and `a NOT LIKE b` 
-- so to test the `NOT LIKE` operator above this should be something like
   
   ```suggestion
           let expr = col("c1").not_like(lit("Banana%");
   ```
   
   
https://github.com/apache/arrow-datafusion/blob/master/datafusion/src/logical_plan/expr.rs#L455-L457

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("Banana%"));
+        let expected_expr =
+            "NOT #c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq NOT 
#c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut 
RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'

Review comment:
       ```suggestion
           // test NOT LIKE operator that can't be converted to a 'starts_with'
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to