findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1818867108


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3577,7 +3585,11 @@ mod tests {
         // s1 ["A", NULL]  ==> unknown (must keep)
         // s1 ["", "A"]  ==> some rows could pass (must keep)
         // s1 ["", ""]  ==> no rows can pass (not keep)
-        let expected_ret = &[true, true, false, false, true, true, true, 
false];
+        // s1 ["AB", "A\u{10ffff}"]  ==> no rows can pass (not keep)

Review Comment:
   The fact that these tests are "vectorized" doesn't make them easy to follow.
   
   (no change requested here)



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")
+    }
+
+    // I *think* that ILIKE could be handled by making the min lowercase and 
max uppercase
+    // but that requires building the physical expressions that call lower() 
and upper()

Review Comment:
   ```suggestion
       // TODO Handle ILIKE perhaps by making the min lowercase and max 
uppercase
       //  this may involve building the physical expressions that call lower() 
and upper()
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max

Review Comment:
   Turn this into function documentation and place it above the function 
declaration:
   
   ```
   /// Convert `column LIKE literal` where P is a constant prefix of the literal
   /// to a range check on the column: `P <= column && column < P'`, where P' 
is the
   /// lowest string after all P* strings.
   ```
   
   



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),

Review Comment:
   ```suggestion
               _ => plan_err!("LIKE pattern literal must be a string"),
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3525,385 @@ mod tests {
         );
     }
 
+    /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+    /// with 5 different containers (e.g. RowGroups). They have [min,
+    /// max]:
+    /// s1 ["A", "Z"]
+    /// s1 ["A", "L"]
+    /// s1 ["N", "Z"]
+    /// s1 [NULL, NULL]
+    /// s1 ["A", NULL]
+    /// s1 ["", "A"]
+    /// s1 ["", ""]
+    /// s1 ["AB", "A\u{10ffff}"]
+    /// s1 ["A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+    fn utf8_setup() -> (SchemaRef, TestStatistics) {
+        let schema = Arc::new(Schema::new(vec![Field::new("s1", 
DataType::Utf8, true)]));
+
+        let statistics = TestStatistics::new().with(
+            "s1",
+            ContainerStats::new_utf8(
+                vec![
+                    Some("A"),
+                    Some("A"),
+                    Some("N"),
+                    Some("M"),
+                    None,
+                    Some("A"),
+                    Some(""),
+                    Some(""),
+                    Some("AB"),
+                    Some("A\u{10ffff}"),

Review Comment:
   ```suggestion
                       Some("A\u{10ffff}\u{10ffff}"),
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")

Review Comment:
   The LIKE pattern doesn't have to be a literal.
   See 
https://github.com/apache/datafusion/blob/8aafa5498024e45b58ad2068a80cf5942babe55b/datafusion/sqllogictest/test_files/select.slt#L670-L685
   
   We probably don't hit this here because `PruningExpressionBuilder::try_new` 
filters out the case where both sides of the operation contain column references.
   
   
   



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= 
split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Result<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Ok(s),
+            ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+            ScalarValue::Utf8View(Some(s)) => Ok(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => plan_err!("LIKE expression must be a string literal"),
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String> 
{
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Ok(s);
+        }
+        plan_err!("LIKE expression must be a string literal")

Review Comment:
   ```suggestion
           plan_err!("Unexpected LIKE expression: {expr:?}")
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3525,385 @@ mod tests {
         );
     }
 
+    /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+    /// with 5 different containers (e.g. RowGroups). They have [min,
+    /// max]:
+    /// s1 ["A", "Z"]
+    /// s1 ["A", "L"]
+    /// s1 ["N", "Z"]
+    /// s1 [NULL, NULL]
+    /// s1 ["A", NULL]
+    /// s1 ["", "A"]
+    /// s1 ["", ""]
+    /// s1 ["AB", "A\u{10ffff}"]
+    /// s1 ["A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+    fn utf8_setup() -> (SchemaRef, TestStatistics) {
+        let schema = Arc::new(Schema::new(vec![Field::new("s1", 
DataType::Utf8, true)]));
+
+        let statistics = TestStatistics::new().with(
+            "s1",
+            ContainerStats::new_utf8(
+                vec![
+                    Some("A"),
+                    Some("A"),
+                    Some("N"),
+                    Some("M"),
+                    None,
+                    Some("A"),
+                    Some(""),
+                    Some(""),
+                    Some("AB"),
+                    Some("A\u{10ffff}"),

Review Comment:
   This will expose the problem we're talking about here 
https://github.com/apache/datafusion/pull/12978#discussion_r1817937062



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to