etseidl commented on code in PR #12978: URL: https://github.com/apache/datafusion/pull/12978#discussion_r1871921760
########## datafusion/core/src/physical_optimizer/pruning.rs: ########## @@ -1610,6 +1629,126 @@ fn build_statistics_expr( Ok(statistics_expr) } +fn build_like_match( + expr_builder: &mut PruningExpressionBuilder, +) -> Option<Arc<dyn PhysicalExpr>> { + // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max + // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max + // column LIKE '%foo' => min <= '' && '' <= max => true + // column LIKE '%foo%' => min <= '' && '' <= max => true + // column LIKE 'foo' => min <= 'foo' && 'foo' <= max + + fn unpack_string(s: &ScalarValue) -> Option<&String> { + match s { + ScalarValue::Utf8(Some(s)) => Some(s), + ScalarValue::LargeUtf8(Some(s)) => Some(s), + ScalarValue::Utf8View(Some(s)) => Some(s), + ScalarValue::Dictionary(_, value) => unpack_string(value), + _ => None, + } + } + + fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> { + if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() { + let s = unpack_string(lit.value())?; + return Some(s); + } + None + } + + // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase + // this may involve building the physical expressions that call lower() and upper() + let min_column_expr = expr_builder.min_column_expr().ok()?; + let max_column_expr = expr_builder.max_column_expr().ok()?; + let scalar_expr = expr_builder.scalar_expr(); + // check that the scalar is a string literal + let s = extract_string_literal(scalar_expr)?; + // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character. + let first_wildcard_index = s.find(['%', '_']); + if first_wildcard_index == Some(0) { + // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook + return None; + } + let (upper_bound, lower_bound) = if let Some(wildcard_index) = first_wildcard_index { + let prefix = &s[..wildcard_index]; + let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some( + increment_utf8(prefix)?, + )))); + let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some( + prefix.to_string(), + )))); + (upper_bound_lit, lower_bound_lit) + } else { + let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone())))); + (bound.clone(), bound) + }; + let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new( + min_column_expr.clone(), + Operator::LtEq, + upper_bound, + )); + let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new( + lower_bound, + Operator::LtEq, + max_column_expr.clone(), + )); + let combined = Arc::new(phys_expr::BinaryExpr::new( + upper_bound_expr, + Operator::And, + lower_bound_expr, + )); + Some(combined) +} + +/// Increment a UTF8 string by one, returning `None` if it can't be incremented. +/// This makes it so that the returned string will always compare greater than the input string +/// or any other string with the same prefix. +/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"` +fn increment_utf8(data: &str) -> Option<String> { Review Comment: Also not a UTF-8 expert 😛. The code @alamb is referring to is [here](https://github.com/rapidsai/cudf/blob/1b82963df736f3ad71b003443a4de1414f3ce2e5/cpp/src/io/parquet/page_enc.cu#L2759), and unit test is [here](https://github.com/rapidsai/cudf/blob/1b82963df736f3ad71b003443a4de1414f3ce2e5/cpp/tests/io/parquet_writer_test.cpp#L882). I had to do it this way because STL isn't usable in code that runs on the GPU, and I couldn't handle cases where incrementing the character increases the byte count. Here we have the luxury of being able to convert the encoded bytes into a u32 representation of the code point, increment that and then re-encode. Much nicer :) The testing of the increment is pretty thorough...U+7F to U+80 is the tricky one. U+00FF to U+0100 would be another to test as that also overflows the lower 6 bits of a 2-byte character. Looks pretty sound to me. 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org