This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fb1d4bcf6f Implement predicate pruning for `like` expressions (prefix matching) (#12978)
fb1d4bcf6f is described below
commit fb1d4bcf6f500a7689999606f49a4ffc1802bcaa
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Mon Dec 30 07:38:47 2024 -0600
Implement predicate pruning for `like` expressions (prefix matching) (#12978)
* Implement predicate pruning for like expressions
* add function docstring
* re-order bounds calculations
* fmt
* add fuzz tests
* fix clippy
* Update datafusion/core/tests/fuzz_cases/pruning.rs
Co-authored-by: Andrew Lamb <[email protected]>
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/core/tests/fuzz_cases/mod.rs | 2 +
datafusion/core/tests/fuzz_cases/pruning.rs | 247 ++++++++++++
datafusion/physical-optimizer/src/pruning.rs | 563 +++++++++++++++++++++++++++
3 files changed, 812 insertions(+)
diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs
index 49db0d31a8..d5511e2970 100644
--- a/datafusion/core/tests/fuzz_cases/mod.rs
+++ b/datafusion/core/tests/fuzz_cases/mod.rs
@@ -24,6 +24,8 @@ mod sort_fuzz;
mod aggregation_fuzzer;
mod equivalence;
+mod pruning;
+
mod limit_fuzz;
mod sort_preserving_repartition_fuzz;
mod window_fuzz;
diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs
new file mode 100644
index 0000000000..ad35850a5f
--- /dev/null
+++ b/datafusion/core/tests/fuzz_cases/pruning.rs
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema};
+use bytes::{BufMut, Bytes, BytesMut};
+use datafusion::{
+ datasource::{
+ listing::PartitionedFile,
+ physical_plan::{parquet::ParquetExecBuilder, FileScanConfig},
+ },
+ prelude::*,
+};
+use datafusion_common::DFSchema;
+use datafusion_execution::object_store::ObjectStoreUrl;
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan};
+use itertools::Itertools;
+use object_store::{memory::InMemory, path::Path, ObjectStore, PutPayload};
+use parquet::{
+ arrow::ArrowWriter,
+ file::properties::{EnabledStatistics, WriterProperties},
+};
+use rand::seq::SliceRandom;
+use url::Url;
+
+#[tokio::test]
+async fn test_fuzz_utf8() {
+ // Fuzz testing for UTF8 predicate pruning
+ // The basic idea is that query results should always be the same with or without stats/pruning.
+ // If we get this right we at least guarantee that there are no incorrect results.
+ // There may still be suboptimal pruning or stats but that's something we can try to catch
+ // with more targeted tests.
+
+ // Since we know where the edge cases might be we don't do random black box fuzzing.
+ // Instead we fuzz on specific pre-defined axes:
+ //
+ // - Which characters are in each value. We want to make sure to include characters that when
+ //   incremented, truncated or otherwise manipulated might cause issues.
+ // - The values in each row group. This impacts which min/max stats are generated for each rg.
+ //   We'll generate combinations of the characters with lengths ranging from 1 to 3
+ //   (see the example below).
+ // - Truncation of statistics to 1, 2 or 3 characters as well as no truncation.
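+ //
+ // For example, with characters ["z", "0"] and length 2, the combination
+ // ["z", "0"] yields the candidate values "z0" and "0z" (its permutations).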
+
+ let mut rng = rand::thread_rng();
+
+ let characters = [
+ "z",
+ "0",
+ "~",
+ "ß",
+ "℣",
+ "%", // this one is useful for like/not like tests since it will
result in randomly inserted wildcards
+ "_", // this one is useful for like/not like tests since it will
result in randomly inserted wildcards
+ "\u{7F}",
+ "\u{7FF}",
+ "\u{FF}",
+ "\u{10FFFF}",
+ "\u{D7FF}",
+ "\u{FDCF}",
+ // null character
+ "\u{0}",
+ ];
+
+ let value_lengths = [1, 2, 3];
+
+ // generate all combinations of characters with lengths ranging from 1 to 3
+ let mut values = vec![];
+ for length in &value_lengths {
+ values.extend(
+ characters
+ .iter()
+ .cloned()
+ .combinations(*length)
+ // now get all permutations of each combination
+ .flat_map(|c| c.into_iter().permutations(*length))
+ // and join them into strings
+ .map(|c| c.join("")),
+ );
+ }
+
+ println!("Generated {} values", values.len());
+
+ // randomly pick 100 values
+ values.shuffle(&mut rng);
+ values.truncate(100);
+
+ let mut row_groups = vec![];
+ // generate all combinations of values for row groups (1 or 2 values per rg, more is unnecessary since we only get min/max stats out)
+ for rg_length in [1, 2] {
+ row_groups.extend(values.iter().cloned().combinations(rg_length));
+ }
+
+ println!("Generated {} row groups", row_groups.len());
+
+ // Randomly pick 100 row groups (combinations of said values)
+ row_groups.shuffle(&mut rng);
+ row_groups.truncate(100);
+
+ let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
+ let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+
+ let store = InMemory::new();
+ let mut files = vec![];
+ for (idx, truncation_length) in [Some(1), Some(2), None].iter().enumerate() {
+ // parquet files only support 32767 row groups per file, so chunk up into multiple files so we don't error if running on a large number of row groups
+ for (rg_idx, row_groups) in row_groups.chunks(32766).enumerate() {
+ let buf = write_parquet_file(
+ *truncation_length,
+ schema.clone(),
+ row_groups.to_vec(),
+ )
+ .await;
+ let filename = format!("test_fuzz_utf8_{idx}_{rg_idx}.parquet");
+ files.push((filename.clone(), buf.len()));
+ let payload = PutPayload::from(buf);
+ let path = Path::from(filename);
+ store.put(&path, payload).await.unwrap();
+ }
+ }
+
+ println!("Generated {} parquet files", files.len());
+
+ let ctx = SessionContext::new();
+
+ ctx.register_object_store(&Url::parse("memory://").unwrap(), Arc::new(store));
+
+ let mut predicates = vec![];
+ for value in values {
+ predicates.push(col("a").eq(lit(value.clone())));
+ predicates.push(col("a").not_eq(lit(value.clone())));
+ predicates.push(col("a").lt(lit(value.clone())));
+ predicates.push(col("a").lt_eq(lit(value.clone())));
+ predicates.push(col("a").gt(lit(value.clone())));
+ predicates.push(col("a").gt_eq(lit(value.clone())));
+ predicates.push(col("a").like(lit(value.clone())));
+ predicates.push(col("a").not_like(lit(value.clone())));
+ predicates.push(col("a").like(lit(format!("%{}", value.clone()))));
+ predicates.push(col("a").like(lit(format!("{}%", value.clone()))));
+ predicates.push(col("a").not_like(lit(format!("%{}", value.clone()))));
+ predicates.push(col("a").not_like(lit(format!("{}%", value.clone()))));
+ }
+
+ for predicate in predicates {
+ println!("Testing predicate {:?}", predicate);
+ let phys_expr_predicate = ctx
+ .create_physical_expr(predicate.clone(), &df_schema)
+ .unwrap();
+ let expected = execute_with_predicate(
+ &files,
+ phys_expr_predicate.clone(),
+ false,
+ schema.clone(),
+ &ctx,
+ )
+ .await;
+ let with_pruning = execute_with_predicate(
+ &files,
+ phys_expr_predicate,
+ true,
+ schema.clone(),
+ &ctx,
+ )
+ .await;
+ assert_eq!(expected, with_pruning);
+ }
+}
+
+async fn execute_with_predicate(
+ files: &[(String, usize)],
+ predicate: Arc<dyn PhysicalExpr>,
+ prune_stats: bool,
+ schema: Arc<Schema>,
+ ctx: &SessionContext,
+) -> Vec<String> {
+ let scan =
+ FileScanConfig::new(ObjectStoreUrl::parse("memory://").unwrap(), schema.clone())
+ .with_file_group(
+ files
+ .iter()
+ .map(|(path, size)| PartitionedFile::new(path.clone(), *size as u64))
+ .collect(),
+ );
+ let mut builder = ParquetExecBuilder::new(scan);
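+ // only hand the predicate to the scan when testing pruning, so it can
+ // prune row groups using their statistics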
+ if prune_stats {
+ builder = builder.with_predicate(predicate.clone())
+ }
+ let exec = Arc::new(builder.build()) as Arc<dyn ExecutionPlan>;
+ let exec =
+ Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc<dyn ExecutionPlan>;
+
+ let batches = collect(exec, ctx.task_ctx()).await.unwrap();
+ let mut values = vec![];
+ for batch in batches {
+ let column = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+ for i in 0..column.len() {
+ values.push(column.value(i).to_string());
+ }
+ }
+ values
+}
+
+async fn write_parquet_file(
+ truncation_length: Option<usize>,
+ schema: Arc<Schema>,
+ row_groups: Vec<Vec<String>>,
+) -> Bytes {
+ let mut buf = BytesMut::new().writer();
+ let mut props = WriterProperties::builder();
+ if let Some(truncation_length) = truncation_length {
+ props = props.set_max_statistics_size(truncation_length);
+ }
+ props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level
+ let props = props.build();
+ {
+ let mut writer =
+ ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
+ for rg_values in row_groups.iter() {
+ let arr = StringArray::from_iter_values(rg_values.iter());
+ let batch =
+ RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap();
+ writer.write(&batch).unwrap();
+ writer.flush().unwrap(); // finishes the current row group and starts a new one
+ }
+ writer.finish().unwrap();
+ }
+ buf.into_inner().freeze()
+}
diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs
index 77fc76c353..c16ed306ef 100644
--- a/datafusion/physical-optimizer/src/pruning.rs
+++ b/datafusion/physical-optimizer/src/pruning.rs
@@ -1190,6 +1190,8 @@ fn is_compare_op(op: Operator) -> bool {
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
+ | Operator::LikeMatch
+ | Operator::NotLikeMatch
)
}
@@ -1482,6 +1484,21 @@ fn build_predicate_expression(
*bin_expr.op(),
Arc::clone(bin_expr.right()),
)
+ } else if let Some(like_expr) = expr_any.downcast_ref::<phys_expr::LikeExpr>() {
+ if like_expr.case_insensitive() {
+ return unhandled_hook.handle(expr);
+ }
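+ // after the early return above only the case-sensitive arms below are
+ // reachable; the ILIKE arms keep the match exhaustive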
+ let op = match (like_expr.negated(), like_expr.case_insensitive()) {
+ (false, false) => Operator::LikeMatch,
+ (true, false) => Operator::NotLikeMatch,
+ (false, true) => Operator::ILikeMatch,
+ (true, true) => Operator::NotILikeMatch,
+ };
+ (
+ Arc::clone(like_expr.expr()),
+ op,
+ Arc::clone(like_expr.pattern()),
+ )
} else {
return unhandled_hook.handle(expr);
}
@@ -1562,6 +1579,11 @@ fn build_statistics_expr(
)),
))
}
+ Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
+ plan_datafusion_err!(
+ "LIKE expression with wildcard at the beginning is not
supported"
+ )
+ })?,
Operator::Gt => {
// column > literal => (min, max) > literal => max > literal
Arc::new(phys_expr::BinaryExpr::new(
@@ -1605,7 +1627,129 @@ fn build_statistics_expr(
Ok(statistics_expr)
}
+/// Convert `column LIKE literal`, where the literal has a constant prefix P,
+/// to a range check on the column: `P <= column && column < P'`, where P' is
+/// the lowest string greater than every string starting with P.
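+/// For example, `column LIKE 'foo%'` prunes with `'foo' <= max && min <= 'fop'`,
+/// where `'fop'` is `increment_utf8("foo")`.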
+fn build_like_match(
+ expr_builder: &mut PruningExpressionBuilder,
+) -> Option<Arc<dyn PhysicalExpr>> {
+ // column LIKE literal => (min, max) LIKE literal split at % => prefix <= max && min <= incremented prefix
+ // column LIKE 'foo%' => 'foo' <= max && min <= 'fop' ('fop' = increment_utf8("foo"))
+ // column LIKE '%foo' => no constant prefix => left to the unhandled hook => true
+ // column LIKE '%foo%' => no constant prefix => left to the unhandled hook => true
+ // column LIKE 'foo' => 'foo' <= max && min <= 'foo'
+
+ fn unpack_string(s: &ScalarValue) -> Option<&String> {
+ match s {
+ ScalarValue::Utf8(Some(s)) => Some(s),
+ ScalarValue::LargeUtf8(Some(s)) => Some(s),
+ ScalarValue::Utf8View(Some(s)) => Some(s),
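+ // dictionary-encoded strings wrap another scalar value; recurse to unpack it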
+ ScalarValue::Dictionary(_, value) => unpack_string(value),
+ _ => None,
+ }
+ }
+
+ fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
+ if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
+ let s = unpack_string(lit.value())?;
+ return Some(s);
+ }
+ None
+ }
+
+ // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
+ // this may involve building the physical expressions that call lower() and upper()
+ let min_column_expr = expr_builder.min_column_expr().ok()?;
+ let max_column_expr = expr_builder.max_column_expr().ok()?;
+ let scalar_expr = expr_builder.scalar_expr();
+ // check that the scalar is a string literal
+ let s = extract_string_literal(scalar_expr)?;
+ // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
+ let first_wildcard_index = s.find(['%', '_']);
+ if first_wildcard_index == Some(0) {
+ // there's no filtering we could possibly do; return None so this is handled by the unhandled hook
+ return None;
+ }
+ let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
+ let prefix = &s[..wildcard_index];
+ let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ prefix.to_string(),
+ ))));
+ let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
+ increment_utf8(prefix)?,
+ ))));
+ (lower_bound_lit, upper_bound_lit)
+ } else {
+ // the like expression is a literal and can be converted into a comparison
+ let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
+ (Arc::clone(&bound), bound)
+ };
+ let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ lower_bound,
+ Operator::LtEq,
+ Arc::clone(&max_column_expr),
+ ));
+ let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
+ Arc::clone(&min_column_expr),
+ Operator::LtEq,
+ upper_bound,
+ ));
+ let combined = Arc::new(phys_expr::BinaryExpr::new(
+ upper_bound_expr,
+ Operator::And,
+ lower_bound_expr,
+ ));
+ Some(combined)
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`.
+/// In this example `increment_utf8("foo") == "fop"`.
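+/// If the last character can't be incremented, it is dropped and the previous
+/// character is incremented instead, e.g. `increment_utf8("a\u{10FFFF}") == "b"`.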
+fn increment_utf8(data: &str) -> Option<String> {
+ // Helper function to check if a character is valid to use
+ fn is_valid_unicode(c: char) -> bool {
+ let cp = c as u32;
+
+ // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
+ if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
+ return false;
+ }
+
+ // Filter out code points above the maximum valid code point (0x10FFFF)
+ if cp >= 0x110000 {
+ return false;
+ }
+
+ true
+ }
+
+ // Convert string to vector of code points
+ let mut code_points: Vec<char> = data.chars().collect();
+
+ // Work backwards through code points
+ for idx in (0..code_points.len()).rev() {
+ let original = code_points[idx] as u32;
+
+ // Try incrementing the code point
+ if let Some(next_char) = char::from_u32(original + 1) {
+ if is_valid_unicode(next_char) {
+ code_points[idx] = next_char;
+ // truncate the string to the current index
+ code_points.truncate(idx + 1);
+ return Some(code_points.into_iter().collect());
+ }
+ }
+ }
+
+ None
+}
+
/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
+///
/// This is important not only as an optimization but also because statistics may not be
/// accurate for columns that are all nulls.
/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
@@ -3462,6 +3606,425 @@ mod tests {
);
}
+ #[test]
+ fn test_increment_utf8() {
+ // Basic ASCII
+ assert_eq!(increment_utf8("abc").unwrap(), "abd");
+ assert_eq!(increment_utf8("abz").unwrap(), "ab{");
+
+ // Test around ASCII 127 (DEL)
+ assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127
+ assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128
+
+ // Test 2-byte UTF-8 sequences
+ assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0
+
+ // Test 3-byte UTF-8 sequences
+ assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124
+
+ // Test at UTF-8 boundaries
+ assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte
to 3-byte boundary
+ assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); //
3-byte to 4-byte boundary
+
+ // Test that if we can't increment we return None
+ assert!(increment_utf8("").is_none());
+ assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point
+
+ // Test that if we can't increment the last character we do the previous one and truncate
+ assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+
+ // Test surrogate pair range (0xD800..=0xDFFF)
+ assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b");
+ assert!(increment_utf8("\u{D7FF}").is_none());
+
+ // Test non-characters range (0xFDD0..=0xFDEF)
+ assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b");
+ assert!(increment_utf8("\u{FDCF}").is_none());
+
+ // Test the limit at the maximum valid code point (0x110000 is past U+10FFFF)
+ assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+ assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint
+ }
+
+ /// Creates a setup for chunk pruning, modeling a utf8 column "s1"
+ /// with 10 different containers (e.g. RowGroups). They have [min, max]:
+ /// s1 ["A", "Z"]
+ /// s1 ["A", "L"]
+ /// s1 ["N", "Z"]
+ /// s1 ["M", "M"]
+ /// s1 [NULL, NULL]
+ /// s1 ["A", NULL]
+ /// s1 ["", "A"]
+ /// s1 ["", ""]
+ /// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]
+ /// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
+ fn utf8_setup() -> (SchemaRef, TestStatistics) {
+ let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "s1",
+ ContainerStats::new_utf8(
+ vec![
+ Some("A"),
+ Some("A"),
+ Some("N"),
+ Some("M"),
+ None,
+ Some("A"),
+ Some(""),
+ Some(""),
+ Some("AB"),
+ Some("A\u{10ffff}\u{10ffff}"),
+ ], // min
+ vec![
+ Some("Z"),
+ Some("L"),
+ Some("Z"),
+ Some("M"),
+ None,
+ None,
+ Some("A"),
+ Some(""),
+ Some("A\u{10ffff}\u{10ffff}\u{10ffff}"),
+ Some("A\u{10ffff}\u{10ffff}"),
+ ], // max
+ ),
+ );
+ (schema, statistics)
+ }
+
+ #[test]
+ fn prune_utf8_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").eq(lit("A"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").eq(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_not_eq() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").not_eq(lit("A"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass
(must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").not_eq(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_like_one() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").like(lit("A_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("_A_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> some rows could pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("_"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
+ #[test]
+ fn prune_utf8_like_many() {
+ let (schema, statistics) = utf8_setup();
+
+ let expr = col("s1").like(lit("A%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> no rows can pass (not keep)
+ false,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("%A%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> some rows could pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows
could pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some
rows could pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit("%"));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["A", "L"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["N", "Z"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["M", "M"] ==> all rows must pass (must keep)
+ true,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> unknown (must keep)
+ true,
+ // s1 ["", "A"] ==> all rows must pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must
pass (must keep)
+ true,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all
rows must pass (must keep)
+ true,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+ let expr = col("s1").like(lit(""));
+ #[rustfmt::skip]
+ let expected_ret = &[
+ // s1 ["A", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["A", "L"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["N", "Z"] ==> no rows can pass (not keep)
+ false,
+ // s1 ["M", "M"] ==> no rows can pass (not keep)
+ false,
+ // s1 [NULL, NULL] ==> unknown (must keep)
+ true,
+ // s1 ["A", NULL] ==> no rows can pass (not keep)
+ false,
+ // s1 ["", "A"] ==> some rows could pass (must keep)
+ true,
+ // s1 ["", ""] ==> all rows must pass (must keep)
+ true,
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can
pass (not keep)
+ false,
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no
rows can pass (not keep)
+ false,
+ ];
+ prune_with_expr(expr, &schema, &statistics, expected_ret);
+ }
+
#[test]
fn test_rewrite_expr_to_prunable() {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]