This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fb1d4bcf6f Implement predicate pruning for `like` expressions (prefix matching) (#12978)
fb1d4bcf6f is described below
commit fb1d4bcf6f500a7689999606f49a4ffc1802bcaa
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Mon Dec 30 07:38:47 2024 -0600
Implement predicate pruning for `like` expressions (prefix matching) (#12978)
* Implement predicate pruning for like expressions
* add function docstring
* re-order bounds calculations
* fmt
* add fuzz tests
* fix clippy
* Update datafusion/core/tests/fuzz_cases/pruning.rs
Co-authored-by: Andrew Lamb <[email protected]>
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/core/tests/fuzz_cases/mod.rs | 2 +
datafusion/core/tests/fuzz_cases/pruning.rs | 247 ++++++++++++
datafusion/physical-optimizer/src/pruning.rs | 563 +++++++++++++++++++++++++++
3 files changed, 812 insertions(+)
diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs
index 49db0d31a8..d5511e2970 100644
--- a/datafusion/core/tests/fuzz_cases/mod.rs
+++ b/datafusion/core/tests/fuzz_cases/mod.rs
@@ -24,6 +24,8 @@ mod sort_fuzz;
mod aggregation_fuzzer;
mod equivalence;
+mod pruning;
+
mod limit_fuzz;
mod sort_preserving_repartition_fuzz;
mod window_fuzz;
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
new file mode 100644
index 0000000000..ad35850a5f
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use bytes::{BufMut, Bytes, BytesMut};
+use datafusion::{
+ datasource::{
+ listing::PartitionedFile,
+ physical_plan::{parquet::ParquetExecBuilder, FileScanConfig},
+ },
+ prelude::*,
+};
+use datafusion_common::DFSchema;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use itertools::Itertools;
+use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use parquet::{
+ arrow::ArrowWriter,
+ file::properties::{EnabledStatistics, WriterProperties},
+};
+use rand::seq::SliceRandom;
+use url::Url;
+
+#[tokio::test]
+async fn test_fuzz_utf8() {
+ // Fuzz testing for UTF8 predicate pruning
+ // The basic idea is that query results should always be the same with or without stats/pruning.
+ // If we get this right we at least guarantee that there are no incorrect results.
+ // There may still be suboptimal pruning or stats but that's something we can try to catch
+ // with more targeted tests.
+
+ // Since we know where the edge cases might be we don't do random black box fuzzing.
+ // Instead we fuzz on specific pre-defined axes:
+ //
+ // - Which characters are in each value. We want to make sure to include characters that when
+ //   incremented, truncated or otherwise manipulated might cause issues.
+ // - The values in each row group. This impacts which min/max stats are generated for each rg.
+ //   We'll generate combinations of the characters with lengths ranging from 1 to 3
+ //   (see the example below).
+ // - Truncation of statistics to 1, 2 or 3 characters as well as no truncation.
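+ //
+ // For example, with characters ["z", "0"] and length 2, the combination
+ // ["z", "0"] yields the candidate values "z0" and "0z" (its permutations).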
+
+ let mut rng = rand::thread_rng();
+
+ let characters = [
+ "z",
+ "0",
+ "~",
+ "ß",
+ "℣",
+ "%", // this one is useful for like/not like tests since it will
result in randomly inserted wildcards
+ "_", // this one is useful for like/not like tests since it will
result in randomly inserted wildcards
+ "\u{7F}",
+ "\u{7FF}",
+ "\u{FF}",
+ "\u{10FFFF}",
+ "\u{D7FF}",
+ "\u{FDCF}",
+ // null character
+ "\u{0}",
+ ];
+
+ let value_lengths = [1, 2, 3];
+
+ // generate all combinations of characters with lengths ranging from 1 to 3
+ let mut values = vec![];
+ for length in &value_lengths {
+ values.extend(
+ characters
+ .iter()
+ .cloned()
+ .combinations(*length)
+ // now get all permutations of each combination
+ .flat_map(|c| c.into_iter().permutations(*length))
+ // and join them into strings
+ .map(|c| c.join("")),
+ );
+ }
+
+ println!("Generated {} values", values.len());
+
+ // randomly pick 100 values
+ values.shuffle(&mut rng);
+ values.truncate(100);
+
+ let mut row_groups = vec![];
+ // generate all combinations of values for row groups (1 or 2 values per rg, more is unnecessary since we only get min/max stats out)
+ for rg_length in [1, 2] {
+ row_groups.extend(values.iter().cloned().combinations(rg_length));
+ }
+
+ println!("Generated {} row groups", row_groups.len());
+
+ // Randomly pick 100 row groups (combinations of said values)
+ row_groups.shuffle(&mut rng);
+ row_groups.truncate(100);
+
+ let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
+ let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+
+ let store = InMemory::new();
+ let mut files = vec![];
+ for (idx, truncation_length) in [Some(1), Some(2), None].iter().enumerate() {
+ // parquet files only support 32767 row groups per file, so chunk up into multiple files so we don't error if running on a large number of row groups
+ for (rg_idx, row_groups) in row_groups.chunks(32766).enumerate() {
+ let buf = write_parquet_file(
+ *truncation_length,
+ schema.clone(),
+ row_groups.to_vec(),
+ )
+ .await;
+ let filename = format!("test_fuzz_utf8_{idx}_{rg_idx}.parquet");
+ files.push((filename.clone(), buf.len()));
+ let payload = PutPayload::from(buf);
+ let path = Path::from(filename);
+ store.put(&path, payload).await.unwrap();
+ }
+ }
+
+ println!("Generated {} parquet files", files.len());
+
+ let ctx = SessionContext::new();
+
+ ctx.register_object_store(&Url::parse("memory://").unwrap(), Arc::new(store));
+
+ let mut predicates = vec![];
+ for value in values {
+ predicates.push(col("a").eq(lit(value.clone())));
+ predicates.push(col("a").not_eq(lit(value.clone())));
+ predicates.push(col("a").lt(lit(value.clone())));
+ predicates.push(col("a").lt_eq(lit(value.clone())));
+ predicates.push(col("a").gt(lit(value.clone())));
+ predicates.push(col("a").gt_eq(lit(value.clone())));
+ predicates.push(col("a").like(lit(value.clone())));
+ predicates.push(col("a").not_like(lit(value.clone())));
+ predicates.push(col("a").like(lit(format!("%{}", value.clone()))));
+ predicates.push(col("a").like(lit(format!("{}%", value.clone()))));
+ predicates.push(col("a").not_like(lit(format!("%{}", value.clone()))));
+ predicates.push(col("a").not_like(lit(format!("{}%", value.clone()))));
+ }
+
+ for predicate in predicates {
+ println!("Testing predicate {:?}", predicate);
+ let phys_expr_predicate = ctx
+ .create_physical_expr(predicate.clone(), &df_schema)
+ .unwrap();
+ let expected = execute_with_predicate(
+ &files,
+ phys_expr_predicate.clone(),
+ false,
+ schema.clone(),
+ &ctx,
+ )
+ .await;
+ let with_pruning = execute_with_predicate(
+ &files,
+ phys_expr_predicate,
+ true,
+ schema.clone(),
+ &ctx,
+ )
+ .await;
+ assert_eq!(expected, with_pruning);
+ }
+}
+
+async fn execute_with_predicate(
+ files: &[(String, usize)],
+ predicate: Arc<dyn PhysicalExpr>,
+ prune_stats: bool,
+ schema: Arc<Schema>,
+ ctx: &SessionContext,
+) -> Vec<String> {
+ let scan =
+ FileScanConfig::new(ObjectStoreUrl::parse("memory://").unwrap(), schema.clone())
+ .with_file_group(
+ files
+ .iter()
+ .map(|(path, size)| PartitionedFile::new(path.clone(), *size as u64))
+ .collect(),
+ );
+ let mut builder = ParquetExecBuilder::new(scan);
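+ // only hand the predicate to the scan when testing pruning, so it can
+ // prune row groups using their statistics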
+ if prune_stats {
+ builder = builder.with_predicate(predicate.clone())
+ }
+ let exec = Arc::new(builder.build()) as Arc<dyn ExecutionPlan>;
+ let exec =
+ Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc<dyn ExecutionPlan>;
+
+ let batches = collect(exec, ctx.task_ctx()).await.unwrap();
+ let mut values = vec![];
+ for batch in batches {
+ let column = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+ for i in 0..column.len() {
+ values.push(column.value(i).to_string());
+ }
+ }
+ values
+}
+
+async fn write_parquet_file(
+ truncation_length: Option<usize>,
+ schema: Arc<Schema>,
+ row_groups: Vec<Vec<String>>,
+) -> Bytes {
+ let mut buf = BytesMut::new().writer();
+ let mut props = WriterProperties::builder();
+ if let Some(truncation_length) = truncation_length {
+ props = props.set_max_statistics_size(truncation_length);
+ }
+ props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
+ let props = props.build();
+ {
+ let mut writer =
+ ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
+ for rg_values in row_groups.iter() {
+ let arr = StringArray::from_iter_values(rg_values.iter());
+ let batch =
+ RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap();
+ writer.write(&batch).unwrap();
+ writer.flush().unwrap(); // finishes the current row group and starts a new one
+ }
+ writer.finish().unwrap();
+ }
+ buf.into_inner().freeze()
+}
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 77fc76c353..c16ed306ef 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1190,6 +1190,8 @@ fn is_compare_op(op: Operator) -> bool {
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
+ | Operator::LikeMatch
+ | Operator::NotLikeMatch
)
}
@@ -1482,6 +1484,21 @@ fn build_predicate_expression(
*bin_expr.op(),
Arc::clone(bin_expr.right()),
)
+ } else if let Some(like_expr) = expr_any.downcast_ref::<phys_expr::LikeExpr>() {
+ if like_expr.case_insensitive() {
+ return unhandled_hook.handle(expr);
+ }
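+ // after the early return above only the case-sensitive arms below are
+ // reachable; the ILIKE arms keep the match exhaustive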
+ let op = match (like_expr.negated(), like_expr.case_insensitive()) {
+ (false, false) => Operator::LikeMatch,
+ (true, false) => Operator::NotLikeMatch,
+ (false, true) => Operator::ILikeMatch,
+ (true, true) => Operator::NotILikeMatch,
+ };
+ (
+ Arc::clone(like_expr.expr()),
+ op,
+ Arc::clone(like_expr.pattern()),
+ )
} else {
return unhandled_hook.handle(expr);
}
@@ -1562,6 +1579,11 @@ fn build_statistics_expr(
)),
))
}
+ Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
+ plan_datafusion_err!(
+ "LIKE expression with wildcard at the beginning is not
supported"
+ )
+ })?,
Operator::Gt => {
// column > literal => (min, max) > literal => max > literal
Arc::new(phys_expr::BinaryExpr::new(
@@ -1605,7 +1627,129 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+/// Convert `column LIKE literal`, where the literal has a constant prefix P,
+/// to a range check on the column: `P <= column && column < P'`, where P' is
+/// the lowest string greater than every string starting with P.
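+/// For example, `column LIKE 'foo%'` prunes with `'foo' <= max && min <= 'fop'`,
+/// where `'fop'` is `increment_utf8("foo")`.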
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => prefix <= max && min <= incremented prefix
+ // column LIKE 'foo%' => 'foo' <= max && min <= 'fop' ('fop' = increment_utf8("foo"))
+ // column LIKE '%foo' => no constant prefix => left to the unhandled hook => true
+ // column LIKE '%foo%' => no constant prefix => left to the unhandled hook => true
+ // column LIKE 'foo' => 'foo' <= max && min <= 'foo'
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
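+ // dictionary-encoded strings wrap another scalar value; recurse to unpack it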
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower() and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do; return None so this is handled by the unhandled hook
+ return None;
+ }
+ let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ (lower_bound_lit, upper_bound_lit)
+ } else {
+ // the like expression is a literal and can be converted into a comparison
+ let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (Arc::clone(&bound), bound)
+ };
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ Arc::clone(&max_column_expr),
+ ));
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ Arc::clone(&min_column_expr),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`.
+/// In this example `increment_utf8("foo") == "fop"`.
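+/// If the last character can't be incremented, it is dropped and the previous
+/// character is incremented instead, e.g. `increment_utf8("a\u{10FFFF}") == "b"`.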
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
+ if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out code points above the maximum valid code point (0x10FFFF)
+ if cp >= 0x110000 {
+ return false;
+ }
+
+ true
+ }
+
+ // Convert string to vector of code points
+ let mut code_points: Vec<char> = data.chars().collect();
+
+ // Work backwards through code points
+ for idx in (0..code_points.len()).rev() {
+ let original = code_points[idx] as u32;
+
+ // Try incrementing the code point
+ if let Some(next_char) = char::from_u32(original + 1) {
+ if is_valid_unicode(next_char) {
+ code_points[idx] = next_char;
+ // truncate the string to the current index
+ code_points.truncate(idx + 1);
+ return Some(code_points.into_iter().collect());
+ }
+ }
+ }
+
+ None
+}
+
/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
+///
/// This is important not only as an optimization but also because statistics may not be
/// accurate for columns that are all nulls.
/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
@@ -3462,6 +3606,425 @@ mod tests {
);
}
+ #[test]
+ fn test_increment_utf8() {
+ // Basic ASCII
+ assert_eq!(increment_utf8("abc").unwrap(), "abd");
+ assert_eq!(increment_utf8("abz").unwrap(), "ab{");
+
+ // Test around ASCII 127 (DEL)
+ assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127
+ assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128
+
+ // Test 2-byte UTF-8 sequences
+ assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0
+
+ // Test 3-byte UTF-8 sequences
+ assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124
+
+ // Test at UTF-8 boundaries
+ assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte
to 3-byte boundary
+ assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); //
3-byte to 4-byte boundary
+
+ // Test that if we can't increment we return None
+ assert!(increment_utf8("").is_none());
+ assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point
+
+ // Test that if we can't increment the last character we do the previous one and truncate
+ assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+
+ // Test surrogate pair range (0xD800..=0xDFFF)
+ assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b");
+ assert!(increment_utf8("\u{D7FF}").is_none());
+
+ // Test non-characters range (0xFDD0..=0xFDEF)
+ assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b");
+ assert!(increment_utf8("\u{FDCF}").is_none());
+
+ // Test the limit at the maximum valid code point (0x110000 is past U+10FFFF)
+ assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+ assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint
+ }
+
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 10 different containers (e.g. RowGroups). They have [min, max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 ["M", "M"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ /// s1 ["", "A"]
+ /// s1 ["", ""]
+ /// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]
+ /// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![
+ Some("A"),
+ Some("A"),
+ Some("N"),
+ Some("M"),
+ None,
+ Some("A"),
+ Some(""),
+ Some(""),
+ Some("AB"),
+ Some("A\u{10ffff}\u{10ffff}"),
+ ], // min
+ vec![
+ Some("Z"),
+ Some("L"),
+ Some("Z"),
+ Some("M"),
+ None,
+ None,
+ Some("A"),
+ Some(""),
+ Some("A\u{10ffff}\u{10ffff}\u{10ffff}"),
+ Some("A\u{10ffff}\u{10ffff}"),
+ ], // max
+ ),
+ );
+ (schema, statistics)
+ }
+
+ #[test]
+ fn prune_utf8_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").eq(lit("A"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").eq(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_not_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").not_eq(lit("A"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass
(must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").not_eq(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_like_one() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").like(lit("A_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("_A_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> some rows could pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_like_many() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").like(lit("A%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("%A%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> some rows could pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
#[test]
fn test_rewrite_expr_to_prunable() {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]