findepi commented on code in PR #12978:
URL: https://github.com/apache/datafusion/pull/12978#discussion_r1818867108
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3577,7 +3585,11 @@ mod tests {
// s1 ["A", NULL] ==> unknown (must keep)
// s1 ["", "A"] ==> some rows could pass (must keep)
// s1 ["", ""] ==> no rows can pass (not keep)
- let expected_ret = &[true, true, false, false, true, true, true,
false];
+ // s1 ["AB", "A\u{10ffff}"] ==> no rows can pass (not keep)
Review Comment:
The fact that these tests are "vectorized" doesn't make them easy to follow.
(no change requested here)
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("LIKE expression must be a string literal")
+ }
+
+ // I *think* that ILIKE could be handled by making the min lowercase and
max uppercase
+ // but that requires building the physical expressions that call lower()
and upper()
Review Comment:
```suggestion
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
// this may involve building the physical expressions that call lower() and upper()
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
Review Comment:
Turn this into function documentation and place it above the function
declaration:
```
/// Convert `column LIKE literal` where P is a constant prefix of the literal
/// to a range check on the column: `P <= column && column < P'`, where P' is the
/// lowest string after all P* strings.
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
Review Comment:
```suggestion
_ => plan_err!("LIKE pattern literal must be a string"),
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3525,385 @@ mod tests {
);
}
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 5 different containers (e.g. RowGroups). They have [min,
+ /// max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ /// s1 ["", "A"]
+ /// s1 ["", ""]
+ /// s1 ["AB", "A\u{10ffff}"]
+ /// s1 ["A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![
+ Some("A"),
+ Some("A"),
+ Some("N"),
+ Some("M"),
+ None,
+ Some("A"),
+ Some(""),
+ Some(""),
+ Some("AB"),
+ Some("A\u{10ffff}"),
Review Comment:
```suggestion
Some("A\u{10ffff}\u{10ffff}"),
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("LIKE expression must be a string literal")
Review Comment:
The LIKE pattern doesn't have to be a literal.
See
https://github.com/apache/datafusion/blob/8aafa5498024e45b58ad2068a80cf5942babe55b/datafusion/sqllogictest/test_files/select.slt#L670-L685
We are probably not hitting that case here because `PruningExpressionBuilder::try_new`
filters out the case where both sides of the operation contain column references.
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1610,6 +1624,74 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => min <=
split literal && split literal <= max
+ // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
+ // column LIKE '%foo' => min <= '' && '' <= max => true
+ // column LIKE '%foo%' => min <= '' && '' <= max => true
+ // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
+
+ fn unpack_string(s: &ScalarValue) -> Result<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Ok(s),
+ ScalarValue::LargeUtf8(Some(s)) => Ok(s),
+ ScalarValue::Utf8View(Some(s)) => Ok(s),
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => plan_err!("LIKE expression must be a string literal"),
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Result<&String>
{
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Ok(s);
+ }
+ plan_err!("LIKE expression must be a string literal")
Review Comment:
```suggestion
plan_err!("Unexpected LIKE expression: {expr:?}")
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -3443,6 +3525,385 @@ mod tests {
);
}
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 5 different containers (e.g. RowGroups). They have [min,
+ /// max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ /// s1 ["", "A"]
+ /// s1 ["", ""]
+ /// s1 ["AB", "A\u{10ffff}"]
+ /// s1 ["A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1",
DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![
+ Some("A"),
+ Some("A"),
+ Some("N"),
+ Some("M"),
+ None,
+ Some("A"),
+ Some(""),
+ Some(""),
+ Some("AB"),
+ Some("A\u{10ffff}"),
Review Comment:
This will expose the problem we're talking about here
https://github.com/apache/datafusion/pull/12978#discussion_r1817937062
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]