alamb commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1823373972


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
+        return None;
+    }
+    let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index {

Review Comment:
   so the idea here is that any string before the first wild card character must be a prefix of the input?
   
   BTW I think you could potentially use `split_once` here and it might be a little more natural
   https://doc.rust-lang.org/std/primitive.str.html#method.split_once
   
   Like
   ```rust
   let (prefix, _) = s.split_once(['%', '_'])?; // return None if no match
   ```
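   
   For illustration, a minimal standalone sketch of how `split_once` behaves on the patterns this function handles (note that with no wildcard at all it returns `None`, so the exact-match `column LIKE 'foo'` case still needs its own branch):
   ```rust
   // Standalone illustration of `str::split_once` on LIKE patterns.
   fn main() {
       // Wildcard in the middle: everything before the first wildcard is the prefix.
       assert_eq!("foo%bar".split_once(['%', '_']), Some(("foo", "bar")));
       // Wildcard at position 0: the prefix is empty, mirroring the `Some(0)` early return.
       assert_eq!("%foo".split_once(['%', '_']), Some(("", "foo")));
       // No wildcard: `split_once` returns `None`, so the plain `column LIKE 'foo'`
       // case still needs to be handled separately.
       assert_eq!("foo".split_once(['%', '_']), None);
   }
   ```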



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
+        return None;
+    }
+    let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,
+        ))));
+        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            prefix.to_string(),
+        ))));
+        (upper_bound_lit, lower_bound_lit)
+    } else {
+        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));

Review Comment:
   ```suggestion
           // the like expression is a literal and can be converted into a comparison
           let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
+        return None;
+    }
+    let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,
+        ))));
+        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            prefix.to_string(),
+        ))));
+        (upper_bound_lit, lower_bound_lit)
+    } else {
+        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+        (bound.clone(), bound)
+    };
+    let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        min_column_expr.clone(),
+        Operator::LtEq,
+        upper_bound,
+    ));
+    let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        lower_bound,
+        Operator::LtEq,
+        max_column_expr.clone(),
+    ));
+    let combined = Arc::new(phys_expr::BinaryExpr::new(
+        upper_bound_expr,
+        Operator::And,
+        lower_bound_expr,
+    ));
+    Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {
+    // Helper function to check if a character is valid to use
+    fn is_valid_unicode(c: char) -> bool {
+        let cp = c as u32;
+
+        // Filter out Unicode surrogate pairs range
+        if (0xD800..=0xDFFF).contains(&cp) {
+            return false;
+        }
+
+        // Filter out non-characters
+        if (0xFDD0..=0xFDEF).contains(&cp) {
+            return false;
+        }
+
+        // Filter out private use areas and other invalid ranges
+        match cp {
+            0xFFFE | 0xFFFF => false,
+            _ if cp >= 0x110000 => false,
+            _ => true,
+        }
+    }
+
+    // Convert string to vector of code points
+    let mut code_points: Vec<char> = data.chars().collect();
+
+    // Work backwards through code points
+    for idx in (0..code_points.len()).rev() {
+        let original = code_points[idx] as u32;
+
+        // Try incrementing the code point
+        if let Some(next_char) = char::from_u32(original + 1) {
+            // Check if it's a valid continuation

Review Comment:
   I don't understand the continuation note here -- this is dealing with `chars` (aka codepoints, not utf8 bytes, right?) I think continuation is a notion of the utf8 encoding (aka a byte with a 1 in the top spot)
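   
   For context, a small self-contained sketch of that distinction (code points vs. UTF-8 bytes), using only the standard library:
   ```rust
   // `char` arithmetic works on Unicode scalar values (code points), not UTF-8 bytes.
   fn main() {
       let c = 'é'; // U+00E9, encoded in UTF-8 as the two bytes 0xC3 0xA9
       let next = char::from_u32(c as u32 + 1).unwrap();
       assert_eq!(next, 'ê'); // U+00EA
       assert_eq!("é".len(), 2);           // length in UTF-8 bytes
       assert_eq!("é".chars().count(), 1); // length in code points
   }
   ```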



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
+        return None;
+    }
+    let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,
+        ))));
+        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            prefix.to_string(),
+        ))));
+        (upper_bound_lit, lower_bound_lit)
+    } else {
+        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+        (bound.clone(), bound)
+    };
+    let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        min_column_expr.clone(),
+        Operator::LtEq,
+        upper_bound,
+    ));
+    let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+        lower_bound,
+        Operator::LtEq,
+        max_column_expr.clone(),
+    ));
+    let combined = Arc::new(phys_expr::BinaryExpr::new(
+        upper_bound_expr,
+        Operator::And,
+        lower_bound_expr,
+    ));
+    Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {

Review Comment:
   this is very clever -- I need to study the utf8 spec carefully to know if this is correct in all cases. I am not enough of an expert in how utf8 is compared to know for sure.
   
   Is this based on any other implementation we could link to?
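   
   One rough way to sanity-check the claimed ordering property is a self-contained sketch with a simplified, ASCII-only successor (hypothetical, not the PR's implementation, which handles full Unicode and code-point validity); byte-wise `&str` comparison in Rust agrees with code-point order, so the assertions carry over:
   ```rust
   // Hypothetical ASCII-only successor, just to make the ordering property concrete.
   fn increment_ascii(prefix: &str) -> Option<String> {
       let mut bytes = prefix.as_bytes().to_vec();
       for i in (0..bytes.len()).rev() {
           if bytes[i] < 0x7E {
               // Increment this byte and drop everything after it.
               bytes[i] += 1;
               bytes.truncate(i + 1);
               return String::from_utf8(bytes).ok();
           }
       }
       None // every byte is already at the maximum this sketch allows
   }
   
   fn main() {
       let bound = increment_ascii("foo").unwrap(); // "fop"
       assert!(bound.as_str() > "foo");
       assert!(bound.as_str() > "fooz");
       assert!(bound.as_str() > "foo\u{10FFFF}"); // greater than any string with prefix "foo"
   }
   ```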



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
     Ok(statistics_expr)
 }
 
+fn build_like_match(
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
+    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+    // column LIKE '%foo' => min <= '' && '' <= max => true
+    // column LIKE '%foo%' => min <= '' && '' <= max => true
+    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+    fn unpack_string(s: &ScalarValue) -> Option<&String> {
+        match s {
+            ScalarValue::Utf8(Some(s)) => Some(s),
+            ScalarValue::LargeUtf8(Some(s)) => Some(s),
+            ScalarValue::Utf8View(Some(s)) => Some(s),
+            ScalarValue::Dictionary(_, value) => unpack_string(value),
+            _ => None,
+        }
+    }
+
+    fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+        if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+            let s = unpack_string(lit.value())?;
+            return Some(s);
+        }
+        None
+    }
+
+    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+    //  this may involve building the physical expressions that call lower() and upper()
+    let min_column_expr = expr_builder.min_column_expr().ok()?;
+    let max_column_expr = expr_builder.max_column_expr().ok()?;
+    let scalar_expr = expr_builder.scalar_expr();
+    // check that the scalar is a string literal
+    let s = extract_string_literal(scalar_expr)?;
+    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+    let first_wildcard_index = s.find(['%', '_']);
+    if first_wildcard_index == Some(0) {
+        // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
+        return None;
+    }
+    let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index {
+        let prefix = &s[..wildcard_index];
+        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+            increment_utf8(prefix)?,

Review Comment:
   The rationale for requiring incrementing the utf8 string is non-obvious in my mind (it is required by the fact that the statistics values may be truncated).
   
   Perhaps we can document that rationale somehow 🤔 
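   
   A sketch of the bound arithmetic such a comment could describe (illustration only, not taken from the PR): every value matching `'foo%'` sorts at or after the prefix `"foo"` and strictly before the incremented prefix `"fop"`, which is why the prefix alone cannot serve as the bound compared against `min`:
   ```rust
   // Sketch of the bounds for `column LIKE 'foo%'`:
   // every matching value v satisfies "foo" <= v < increment("foo") = "fop".
   fn main() {
       let matching = ["foo", "foobar", "fooz", "foo\u{10FFFF}"];
       for v in matching {
           assert!(v >= "foo"); // lower bound: prefix <= v, hence prefix <= max
           assert!(v < "fop");  // upper bound: v < incremented prefix, hence min < it
       }
       // Using the prefix itself as the upper bound would not work: a row group
       // whose min is "foobar" contains a match, yet min <= "foo" is false.
       assert!("foobar" > "foo");
   }
   ```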



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

