This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 2ef055c635 [core] Optimize Like for wildcard patterns via chained 
indexOf (#7817)
2ef055c635 is described below

commit 2ef055c6354b76557ea1e47ccc1d3322a792cbc3
Author: Arnav Balyan <[email protected]>
AuthorDate: Tue May 12 15:13:53 2026 +0530

    [core] Optimize Like for wildcard patterns via chained indexOf (#7817)
    
    - Today all Like patterns with `%%` go through the Java regex matcher
    which is inefficient and can be improved by string matching algorithms.
    We only cater to basic cases of startwith/endwith, everything else falls
    through java regex.
    - Introduce new chainedpatternmatcher which can tokenize %% into
    segments and handle with byte level Memorysegmentutils per row.
    - 13x improvement for 10k input row for `xyz%xyz` type strings over java
    regex, Can be enabled with
    `paimon.predicate.like.chain-checker.enabled`.
    - In the future we can further improve with Boyer Moore Sunday algo
    similar to Duckdb pattern matching. Similar matching is used by Flink
    and other analytical engines
---
 .../java/org/apache/paimon/predicate/Like.java     |   6 +-
 .../apache/paimon/predicate/LikeChainChecker.java  | 166 ++++++++++++
 .../paimon/predicate/LikeChainCheckerTest.java     | 296 +++++++++++++++++++++
 3 files changed, 466 insertions(+), 2 deletions(-)

diff --git a/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java 
b/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
index bbb54f095a..e12fc8b2a4 100644
--- a/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
+++ b/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
@@ -69,8 +69,10 @@ public class Like extends LeafBinaryFunction {
             Object literal = optimized.get().getValue();
             return field -> func.test(type, field, literal);
         }
-        // TODO optimize for chain checkers when there is no '_'
-        // TODO for example: "abc%def%","%abc%def","%abc%def%","abc%def"
+        LikeChainChecker chainChecker = 
LikeChainChecker.compile(patternLiteral.toString());
+        if (chainChecker != null) {
+            return chainChecker::test;
+        }
         String regex = sqlToRegexLike(patternLiteral.toString(), null);
         Pattern pattern = Pattern.compile(regex);
         return input -> pattern.matcher(input.toString()).matches();
diff --git 
a/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java 
b/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java
new file mode 100644
index 0000000000..c05bb85616
--- /dev/null
+++ 
b/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.predicate;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.memory.MemorySegmentUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Like matcher for patterns containing only literal text and % wildcards and 
no escape characters.
+ * Avoids the regex engine for %% clauses.
+ *
+ * <p>Algorithm to tokenise the pattern into literal segments split by %, 
anchor the first/last
+ * segment if the pattern lacks a leading/trailing %, and walk the input via 
byte level for each
+ * interior segment.
+ */
+public final class LikeChainChecker {
+
+    private final BinaryString[] segments;
+    private final boolean leftAnchored;
+    private final boolean rightAnchored;
+    private final int minLen;
+
+    private LikeChainChecker(BinaryString[] segments, boolean leftAnchored, 
boolean rightAnchored) {
+        this.segments = segments;
+        this.leftAnchored = leftAnchored;
+        this.rightAnchored = rightAnchored;
+        int sum = 0;
+        for (BinaryString seg : segments) {
+            sum += seg.getSizeInBytes();
+        }
+        this.minLen = sum;
+    }
+
+    public static LikeChainChecker compile(String pattern) {
+        if (pattern.indexOf('_') >= 0 || pattern.indexOf('\\') >= 0) {
+            return null;
+        }
+        boolean leftAnchored = pattern.isEmpty() || pattern.charAt(0) != '%';
+        boolean rightAnchored = pattern.isEmpty() || 
pattern.charAt(pattern.length() - 1) != '%';
+
+        List<BinaryString> parts = new ArrayList<>();
+        int start = 0;
+        for (int i = 0; i < pattern.length(); i++) {
+            if (pattern.charAt(i) == '%') {
+                if (i > start) {
+                    parts.add(BinaryString.fromString(pattern.substring(start, 
i)));
+                }
+                start = i + 1;
+            }
+        }
+        if (start < pattern.length()) {
+            parts.add(BinaryString.fromString(pattern.substring(start)));
+        }
+        return new LikeChainChecker(
+                parts.toArray(new BinaryString[0]), leftAnchored, 
rightAnchored);
+    }
+
+    public boolean test(BinaryString input) {
+        int inputLen = input.getSizeInBytes();
+        if (inputLen < minLen) {
+            return false;
+        }
+        if (segments.length == 0) {
+            return !(leftAnchored && rightAnchored) || inputLen == 0;
+        }
+
+        if (segments.length == 1) {
+            BinaryString only = segments[0];
+            if (leftAnchored && rightAnchored) {
+                return inputLen == only.getSizeInBytes() && 
input.startsWith(only);
+            } else if (leftAnchored) {
+                return input.startsWith(only);
+            } else if (rightAnchored) {
+                return input.endsWith(only);
+            } else {
+                return findIn(input, only, 0) >= 0;
+            }
+        }
+
+        int pos = 0;
+
+        BinaryString first = segments[0];
+        if (leftAnchored) {
+            if (!input.startsWith(first)) {
+                return false;
+            }
+            pos = first.getSizeInBytes();
+        } else {
+            int found = findIn(input, first, 0);
+            if (found < 0) {
+                return false;
+            }
+            pos = found + first.getSizeInBytes();
+        }
+
+        int lastIdx = segments.length - 1;
+        for (int i = 1; i < lastIdx; i++) {
+            BinaryString seg = segments[i];
+            int found = findIn(input, seg, pos);
+            if (found < 0) {
+                return false;
+            }
+            pos = found + seg.getSizeInBytes();
+        }
+
+        BinaryString last = segments[lastIdx];
+        int lastLen = last.getSizeInBytes();
+        if (rightAnchored) {
+            if (!input.endsWith(last)) {
+                return false;
+            }
+            int lastStart = inputLen - lastLen;
+            if (lastStart < pos) {
+                return false;
+            }
+        } else {
+            if (findIn(input, last, pos) < 0) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    private static int findIn(BinaryString input, BinaryString needle, int 
fromByte) {
+        int inputLen = input.getSizeInBytes();
+        int needleLen = needle.getSizeInBytes();
+        if (needleLen == 0) {
+            return fromByte;
+        }
+        if (fromByte + needleLen > inputLen) {
+            return -1;
+        }
+        int absoluteOffset =
+                MemorySegmentUtils.find(
+                        input.getSegments(),
+                        input.getOffset() + fromByte,
+                        inputLen - fromByte,
+                        needle.getSegments(),
+                        needle.getOffset(),
+                        needleLen);
+        if (absoluteOffset < 0) {
+            return -1;
+        }
+        return absoluteOffset - input.getOffset();
+    }
+}
diff --git 
a/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
 
b/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
new file mode 100644
index 0000000000..577a6b3396
--- /dev/null
+++ 
b/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.predicate;
+
+import org.apache.paimon.data.BinaryString;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.regex.Pattern;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link LikeChainChecker}. */
+public class LikeChainCheckerTest {
+
+    @Test
+    public void testCompileRejectsUnderscore() {
+        assertThat(LikeChainChecker.compile("a_b")).isNull();
+        assertThat(LikeChainChecker.compile("_")).isNull();
+        assertThat(LikeChainChecker.compile("%abc_def%")).isNull();
+    }
+
+    @Test
+    public void testCompileRejectsBackslashEscape() {
+        assertThat(LikeChainChecker.compile("a\\%b")).isNull();
+        assertThat(LikeChainChecker.compile("\\")).isNull();
+    }
+
+    @Test
+    public void testCompileAcceptsPureWildcardPatterns() {
+        assertThat(LikeChainChecker.compile("")).isNotNull();
+        assertThat(LikeChainChecker.compile("%")).isNotNull();
+        assertThat(LikeChainChecker.compile("%%")).isNotNull();
+        assertThat(LikeChainChecker.compile("abc")).isNotNull();
+        assertThat(LikeChainChecker.compile("abc%")).isNotNull();
+        assertThat(LikeChainChecker.compile("%abc")).isNotNull();
+        assertThat(LikeChainChecker.compile("%abc%")).isNotNull();
+        assertThat(LikeChainChecker.compile("abc%def")).isNotNull();
+        assertThat(LikeChainChecker.compile("abc%def%ghi")).isNotNull();
+        assertThat(LikeChainChecker.compile("%abc%def%")).isNotNull();
+    }
+
+    @Test
+    public void testExactMatch() {
+        LikeChainChecker checker = LikeChainChecker.compile("abc");
+        assertThat(checker.test(b("abc"))).isTrue();
+        assertThat(checker.test(b("abcd"))).isFalse();
+        assertThat(checker.test(b("ab"))).isFalse();
+        assertThat(checker.test(b(""))).isFalse();
+    }
+
+    @Test
+    public void testStartsWith() {
+        LikeChainChecker checker = LikeChainChecker.compile("abc%");
+        assertThat(checker.test(b("abc"))).isTrue();
+        assertThat(checker.test(b("abcdef"))).isTrue();
+        assertThat(checker.test(b("ab"))).isFalse();
+        assertThat(checker.test(b("xabc"))).isFalse();
+    }
+
+    @Test
+    public void testEndsWith() {
+        LikeChainChecker checker = LikeChainChecker.compile("%abc");
+        assertThat(checker.test(b("abc"))).isTrue();
+        assertThat(checker.test(b("xyzabc"))).isTrue();
+        assertThat(checker.test(b("abcd"))).isFalse();
+        assertThat(checker.test(b("ab"))).isFalse();
+    }
+
+    @Test
+    public void testContains() {
+        LikeChainChecker checker = LikeChainChecker.compile("%abc%");
+        assertThat(checker.test(b("abc"))).isTrue();
+        assertThat(checker.test(b("xabcy"))).isTrue();
+        assertThat(checker.test(b("xabc"))).isTrue();
+        assertThat(checker.test(b("abcy"))).isTrue();
+        assertThat(checker.test(b("ab"))).isFalse();
+        assertThat(checker.test(b(""))).isFalse();
+    }
+
+    @Test
+    public void testBothAnchoredTwoSegments() {
+        LikeChainChecker checker = LikeChainChecker.compile("abc%def");
+        assertThat(checker.test(b("abcdef"))).isTrue();
+        assertThat(checker.test(b("abcXdef"))).isTrue();
+        assertThat(checker.test(b("abcXXXdef"))).isTrue();
+        assertThat(checker.test(b("abc"))).isFalse();
+        assertThat(checker.test(b("def"))).isFalse();
+        assertThat(checker.test(b("abdef"))).isFalse();
+        assertThat(checker.test(b("abcde"))).isFalse();
+    }
+
+    @Test
+    public void testLeftAnchoredOnly() {
+        LikeChainChecker checker = LikeChainChecker.compile("abc%def%");
+        assertThat(checker.test(b("abcdef"))).isTrue();
+        assertThat(checker.test(b("abcdefX"))).isTrue();
+        assertThat(checker.test(b("abcXdef"))).isTrue();
+        assertThat(checker.test(b("abcXdefY"))).isTrue();
+        assertThat(checker.test(b("abc"))).isFalse();
+        assertThat(checker.test(b("xabcdef"))).isFalse();
+    }
+
+    @Test
+    public void testRightAnchoredOnly() {
+        LikeChainChecker checker = LikeChainChecker.compile("%abc%def");
+        assertThat(checker.test(b("abcdef"))).isTrue();
+        assertThat(checker.test(b("XabcXdef"))).isTrue();
+        assertThat(checker.test(b("abcXXXdef"))).isTrue();
+        assertThat(checker.test(b("abc"))).isFalse();
+        assertThat(checker.test(b("def"))).isFalse();
+        assertThat(checker.test(b("abcdefX"))).isFalse();
+    }
+
+    @Test
+    public void testNeitherAnchored() {
+        LikeChainChecker checker = LikeChainChecker.compile("%abc%def%");
+        assertThat(checker.test(b("abcdef"))).isTrue();
+        assertThat(checker.test(b("XabcXdefX"))).isTrue();
+        assertThat(checker.test(b("abcXdef"))).isTrue();
+        assertThat(checker.test(b("abcdefY"))).isTrue();
+        assertThat(checker.test(b("abc"))).isFalse();
+        assertThat(checker.test(b("def"))).isFalse();
+        assertThat(checker.test(b("defXabc"))).isFalse();
+    }
+
+    @Test
+    public void testThreeSegmentsBothAnchored() {
+        LikeChainChecker checker = 
LikeChainChecker.compile("/api/%/users/%/profile");
+        assertThat(checker.test(b("/api/v1/users/123/profile"))).isTrue();
+        assertThat(checker.test(b("/api/X/users/Y/profile"))).isTrue();
+        assertThat(checker.test(b("/api//users//profile"))).isTrue();
+        assertThat(checker.test(b("/api/v1/profile"))).isFalse();
+        
assertThat(checker.test(b("/api/v1/users/123/profile/extra"))).isFalse();
+        assertThat(checker.test(b("X/api/v1/users/123/profile"))).isFalse();
+    }
+
+    @Test
+    public void testLongChain() {
+        LikeChainChecker checker = LikeChainChecker.compile("a%b%c%d%e");
+        assertThat(checker.test(b("abcde"))).isTrue();
+        assertThat(checker.test(b("aXbYcZdWe"))).isTrue();
+        assertThat(checker.test(b("aXXXbYYYcZZZdWWWe"))).isTrue();
+        assertThat(checker.test(b("aebcd"))).isFalse();
+        assertThat(checker.test(b("abcd"))).isFalse();
+    }
+
+    @Test
+    public void testEmptyPattern() {
+        LikeChainChecker checker = LikeChainChecker.compile("");
+        assertThat(checker.test(b(""))).isTrue();
+        assertThat(checker.test(b("a"))).isFalse();
+    }
+
+    @Test
+    public void testJustPercent() {
+        LikeChainChecker checker = LikeChainChecker.compile("%");
+        assertThat(checker.test(b(""))).isTrue();
+        assertThat(checker.test(b("a"))).isTrue();
+        assertThat(checker.test(b("hello world"))).isTrue();
+    }
+
+    @Test
+    public void testConsecutivePercents() {
+        LikeChainChecker checker = LikeChainChecker.compile("%%");
+        assertThat(checker.test(b(""))).isTrue();
+        assertThat(checker.test(b("anything"))).isTrue();
+
+        LikeChainChecker checker2 = LikeChainChecker.compile("a%%b");
+        assertThat(checker2.test(b("ab"))).isTrue();
+        assertThat(checker2.test(b("aXb"))).isTrue();
+        assertThat(checker2.test(b("aXXXb"))).isTrue();
+        assertThat(checker2.test(b("a"))).isFalse();
+        assertThat(checker2.test(b("b"))).isFalse();
+    }
+
+    @Test
+    public void testOverlapRejection() {
+        LikeChainChecker checker = LikeChainChecker.compile("a%a");
+        assertThat(checker.test(b("a"))).isFalse();
+        assertThat(checker.test(b("aa"))).isTrue();
+        assertThat(checker.test(b("aXa"))).isTrue();
+        assertThat(checker.test(b("aXXa"))).isTrue();
+    }
+
+    @Test
+    public void testInputShorterThanPattern() {
+        LikeChainChecker checker = LikeChainChecker.compile("abc%def");
+        assertThat(checker.test(b("abc"))).isFalse();
+        assertThat(checker.test(b("def"))).isFalse();
+        assertThat(checker.test(b("ab"))).isFalse();
+        assertThat(checker.test(b(""))).isFalse();
+    }
+
+    @Test
+    public void testUtf8MultiByteCharacters() {
+        LikeChainChecker checker = LikeChainChecker.compile("café%");
+        assertThat(checker.test(b("café latte"))).isTrue();
+        assertThat(checker.test(b("café"))).isTrue();
+        assertThat(checker.test(b("cafe"))).isFalse();
+
+        LikeChainChecker checker2 = LikeChainChecker.compile("%日本%");
+        assertThat(checker2.test(b("こんにちは日本語"))).isTrue();
+        assertThat(checker2.test(b("hello world"))).isFalse();
+    }
+
+    @Test
+    public void testParityWithRegex() {
+        String[] patterns = {
+            "",
+            "%",
+            "%%",
+            "abc",
+            "abc%",
+            "%abc",
+            "%abc%",
+            "abc%def",
+            "abc%def%",
+            "%abc%def",
+            "%abc%def%",
+            "a%b%c",
+            "%a%b%c%",
+            "a%a",
+            "/api/%/users/%/profile",
+            "%hello%",
+            "prefix%",
+            "%suffix",
+        };
+        String[] inputs = {
+            "",
+            "a",
+            "abc",
+            "abcdef",
+            "abc def",
+            "xabcy",
+            "/api/v1/users/123/profile",
+            "/api//users//profile",
+            "aaa",
+            "ababab",
+            "hello world",
+            "the quick brown fox",
+            "a%b%c",
+            "log_x.txt",
+        };
+
+        for (String pattern : patterns) {
+            LikeChainChecker checker = LikeChainChecker.compile(pattern);
+            assertThat(checker).as("compile should succeed for pattern: %s", 
pattern).isNotNull();
+            String regex = sqlLikeToRegex(pattern);
+            Pattern p = Pattern.compile(regex, Pattern.DOTALL);
+            for (String input : inputs) {
+                boolean expected = p.matcher(input).matches();
+                boolean actual = checker.test(b(input));
+                assertThat(actual)
+                        .as("pattern=[%s] input=[%s] regex=[%s]", pattern, 
input, regex)
+                        .isEqualTo(expected);
+            }
+        }
+    }
+
+    private static BinaryString b(String s) {
+        return BinaryString.fromString(s);
+    }
+
+    private static String sqlLikeToRegex(String sqlPattern) {
+        StringBuilder out = new StringBuilder(sqlPattern.length() * 2);
+        for (int i = 0; i < sqlPattern.length(); i++) {
+            char c = sqlPattern.charAt(i);
+            if ("[]()|^-+*?{}$\\.".indexOf(c) >= 0) {
+                out.append('\\');
+                out.append(c);
+            } else if (c == '%') {
+                out.append("(?s:.*)");
+            } else {
+                out.append(c);
+            }
+        }
+        return out.toString();
+    }
+}

Reply via email to