findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1823898227
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let upper_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ let lower_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ (upper_bound_lit, lower_bound_lit)
+ } else {
+ let bound =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (bound.clone(), bound)
+ };
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ min_column_expr.clone(),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ max_column_expr.clone(),
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be
incremented.
+/// This makes it so that the returned string will always compare greater than
the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
Review Comment:
Nice example.
This could also say that `increment_utf8("foo")` returns "fop"
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let upper_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ let lower_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ (upper_bound_lit, lower_bound_lit)
+ } else {
+ let bound =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (bound.clone(), bound)
+ };
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ min_column_expr.clone(),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ max_column_expr.clone(),
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be
incremented.
+/// This makes it so that the returned string will always compare greater than
the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out Unicode surrogate pairs range
+ if (0xD800..=0xDFFF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out non-characters
+ if (0xFDD0..=0xFDEF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out private use areas and other invalid ranges
+ match cp {
+ 0xFFFE | 0xFFFF => false,
+ _ if cp >= 0x110000 => false,
+ _ => true,
+ }
Review Comment:
There should be helpers to inspect a char in Rust sdk or icu.
Or we can "stay within ASCII" to keep things simple for now.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
Review Comment:
nice!
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let upper_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ let lower_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ (upper_bound_lit, lower_bound_lit)
+ } else {
+ let bound =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (bound.clone(), bound)
+ };
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ min_column_expr.clone(),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ max_column_expr.clone(),
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be
incremented.
+/// This makes it so that the returned string will always compare greater than
the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out Unicode surrogate pairs range
+ if (0xD800..=0xDFFF).contains(&cp) {
+ return false;
+ }
Review Comment:
According to https://doc.rust-lang.org/std/primitive.char.html, this is
redundant. `char` cannot be a code point within surrogate range.
```suggestion
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let upper_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ let lower_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ (upper_bound_lit, lower_bound_lit)
+ } else {
+ let bound =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (bound.clone(), bound)
+ };
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ min_column_expr.clone(),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ max_column_expr.clone(),
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be
incremented.
+/// This makes it so that the returned string will always compare greater than
the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out Unicode surrogate pairs range
+ if (0xD800..=0xDFFF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out non-characters
+ if (0xFDD0..=0xFDEF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out private use areas and other invalid ranges
+ match cp {
+ 0xFFFE | 0xFFFF => false,
+ _ if cp >= 0x110000 => false,
+ _ => true,
+ }
Review Comment:
After applying the above suggestions, this can be
```suggestion
// Filter out private use area
if cp >= 0x110000 {
return false;
}
true
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1629,126 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower()
and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more
characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do, return an error and have
this be handled by the unhandled hook
+ return None;
+ }
+ let (upper_bound, lower_bound) = if let Some(wildcard_index) =
first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let upper_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ let lower_bound_lit =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ (upper_bound_lit, lower_bound_lit)
+ } else {
+ let bound =
Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (bound.clone(), bound)
+ };
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ min_column_expr.clone(),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ max_column_expr.clone(),
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be
incremented.
+/// This makes it so that the returned string will always compare greater than
the input string
+/// or any other string with the same prefix.
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out Unicode surrogate pairs range
+ if (0xD800..=0xDFFF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out non-characters
+ if (0xFDD0..=0xFDEF).contains(&cp) {
+ return false;
+ }
Review Comment:
Per https://www.unicode.org/versions/corrigendum9.html, 0xFFFE | 0xFFFF are
also non-characters
```suggestion
// Filter out non-characters
(https://www.unicode.org/versions/corrigendum9.html)
if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp)
{
return false;
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]