findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1816614140
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("LIKE expression must be a string literal")
+ }
+
+ // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
+ // but that requires building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr()?;
+ let max_column_expr = expr_builder.max_column_expr()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // **IMPORTANT** we need to make sure that the min and max are in the
range of the prefix
+ // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than
'AB' so that
+ // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <=
'AB'.
Review Comment:
for `A%` pattern the lower bound is `A` (obvious)
what should be the upper bound?
`A\u{10ffff}` is not a correct upper bound since `A\u{10ffff}\u{10ffff}` is
even bigger but still matches `A%` input.
The correct upper bound would be:
- `A\u{10ffff}\u{10ffff}\u{10ffff}...\u{10ffff}` include -- up to max
length of the column, so potentially very very long, so absolutely not practical
- `B` (exclusive).
Thus to calculate upper bound you need (pseudo-code)
```rust
let s = extract_string_literal(scalar_expr)?;
let first_wildcard_index = ...;
let prefix = &s[..wildcard_index];
let last_incrementable_character = /* find last code point of `prefix` that
can be incremented
if we choose to stay within ascii, this will be a code point < 127
otherwise it will be any code point != the max code point (0x10FFFF) */;
if last_incrementable_character not found {
// For `%`, or `\u{10ffff}...\u{10ffff}%` patterns, we cannot calculate an
upper bound
return None
}
let upper_bound =
prefix[..last_incrementable_character-1] + // take prefix of the prefix
up to and excluding the last character that can be incremented
str(prefix[last_incrementable_character] + 1) // take last character and
increment it
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,79 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("LIKE expression must be a string literal")
+ }
+
+ // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
+ // but that requires building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr()?;
+ let max_column_expr = expr_builder.max_column_expr()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // **IMPORTANT** we need to make sure that the min and max are in the
range of the prefix
+ // If we truncate 'A%' to 'A', we need to make sure that 'A' is less than
'AB' so that
+ // when we make this a range query we get 'AB' <= 'A\u{10ffff}' AND 'A' <=
'AB'.
+ // Otherwise 'AB' <= 'A' AND 'A' <= 'AB' would be *wrong* because 'AB'
LIKE 'A%' is should be true!
+ // Credit to https://stackoverflow.com/a/35881551 for inspiration on this
approach.
Review Comment:
This link isn't useful. That page conveniently avoids any details that are
important. Please remove the link.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]