This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 2ef055c635 [core] Optimize Like for wildcard patterns via chained
indexOf (#7817)
2ef055c635 is described below
commit 2ef055c6354b76557ea1e47ccc1d3322a792cbc3
Author: Arnav Balyan <[email protected]>
AuthorDate: Tue May 12 15:13:53 2026 +0530
[core] Optimize Like for wildcard patterns via chained indexOf (#7817)
- Today all Like patterns with `%%` go through the Java regex matcher
which is inefficient and can be improved by string matching algorithms.
We only cater to basic cases of startwith/endwith, everything else falls
through java regex.
- Introduce new chainedpatternmatcher which can tokenize %% into
segments and handle with byte level Memorysegmentutils per row.
- 13x improvement for 10k input row for `xyz%xyz` type strings over java
regex, Can be enabled with
`paimon.predicate.like.chain-checker.enabled`.
- In the future we can further improve with Boyer Moore Sunday algo
similar to Duckdb pattern matching. Similar matching is used by Flink
and other analytical engines
---
.../java/org/apache/paimon/predicate/Like.java | 6 +-
.../apache/paimon/predicate/LikeChainChecker.java | 166 ++++++++++++
.../paimon/predicate/LikeChainCheckerTest.java | 296 +++++++++++++++++++++
3 files changed, 466 insertions(+), 2 deletions(-)
diff --git a/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
b/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
index bbb54f095a..e12fc8b2a4 100644
--- a/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
+++ b/paimon-common/src/main/java/org/apache/paimon/predicate/Like.java
@@ -69,8 +69,10 @@ public class Like extends LeafBinaryFunction {
Object literal = optimized.get().getValue();
return field -> func.test(type, field, literal);
}
- // TODO optimize for chain checkers when there is no '_'
- // TODO for example: "abc%def%","%abc%def","%abc%def%","abc%def"
+ LikeChainChecker chainChecker =
LikeChainChecker.compile(patternLiteral.toString());
+ if (chainChecker != null) {
+ return chainChecker::test;
+ }
String regex = sqlToRegexLike(patternLiteral.toString(), null);
Pattern pattern = Pattern.compile(regex);
return input -> pattern.matcher(input.toString()).matches();
diff --git
a/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java
b/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java
new file mode 100644
index 0000000000..c05bb85616
--- /dev/null
+++
b/paimon-common/src/main/java/org/apache/paimon/predicate/LikeChainChecker.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.predicate;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.memory.MemorySegmentUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Like matcher for patterns containing only literal text and % wildcards and
no escape characters.
+ * Avoids the regex engine for %% clauses.
+ *
+ * <p>Algorithm to tokenise the pattern into literal segments split by %,
anchor the first/last
+ * segment if the pattern lacks a leading/trailing %, and walk the input via
byte level for each
+ * interior segment.
+ */
+public final class LikeChainChecker {
+
+ private final BinaryString[] segments;
+ private final boolean leftAnchored;
+ private final boolean rightAnchored;
+ private final int minLen;
+
+ private LikeChainChecker(BinaryString[] segments, boolean leftAnchored,
boolean rightAnchored) {
+ this.segments = segments;
+ this.leftAnchored = leftAnchored;
+ this.rightAnchored = rightAnchored;
+ int sum = 0;
+ for (BinaryString seg : segments) {
+ sum += seg.getSizeInBytes();
+ }
+ this.minLen = sum;
+ }
+
+ public static LikeChainChecker compile(String pattern) {
+ if (pattern.indexOf('_') >= 0 || pattern.indexOf('\\') >= 0) {
+ return null;
+ }
+ boolean leftAnchored = pattern.isEmpty() || pattern.charAt(0) != '%';
+ boolean rightAnchored = pattern.isEmpty() ||
pattern.charAt(pattern.length() - 1) != '%';
+
+ List<BinaryString> parts = new ArrayList<>();
+ int start = 0;
+ for (int i = 0; i < pattern.length(); i++) {
+ if (pattern.charAt(i) == '%') {
+ if (i > start) {
+ parts.add(BinaryString.fromString(pattern.substring(start,
i)));
+ }
+ start = i + 1;
+ }
+ }
+ if (start < pattern.length()) {
+ parts.add(BinaryString.fromString(pattern.substring(start)));
+ }
+ return new LikeChainChecker(
+ parts.toArray(new BinaryString[0]), leftAnchored,
rightAnchored);
+ }
+
+ public boolean test(BinaryString input) {
+ int inputLen = input.getSizeInBytes();
+ if (inputLen < minLen) {
+ return false;
+ }
+ if (segments.length == 0) {
+ return !(leftAnchored && rightAnchored) || inputLen == 0;
+ }
+
+ if (segments.length == 1) {
+ BinaryString only = segments[0];
+ if (leftAnchored && rightAnchored) {
+ return inputLen == only.getSizeInBytes() &&
input.startsWith(only);
+ } else if (leftAnchored) {
+ return input.startsWith(only);
+ } else if (rightAnchored) {
+ return input.endsWith(only);
+ } else {
+ return findIn(input, only, 0) >= 0;
+ }
+ }
+
+ int pos = 0;
+
+ BinaryString first = segments[0];
+ if (leftAnchored) {
+ if (!input.startsWith(first)) {
+ return false;
+ }
+ pos = first.getSizeInBytes();
+ } else {
+ int found = findIn(input, first, 0);
+ if (found < 0) {
+ return false;
+ }
+ pos = found + first.getSizeInBytes();
+ }
+
+ int lastIdx = segments.length - 1;
+ for (int i = 1; i < lastIdx; i++) {
+ BinaryString seg = segments[i];
+ int found = findIn(input, seg, pos);
+ if (found < 0) {
+ return false;
+ }
+ pos = found + seg.getSizeInBytes();
+ }
+
+ BinaryString last = segments[lastIdx];
+ int lastLen = last.getSizeInBytes();
+ if (rightAnchored) {
+ if (!input.endsWith(last)) {
+ return false;
+ }
+ int lastStart = inputLen - lastLen;
+ if (lastStart < pos) {
+ return false;
+ }
+ } else {
+ if (findIn(input, last, pos) < 0) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private static int findIn(BinaryString input, BinaryString needle, int
fromByte) {
+ int inputLen = input.getSizeInBytes();
+ int needleLen = needle.getSizeInBytes();
+ if (needleLen == 0) {
+ return fromByte;
+ }
+ if (fromByte + needleLen > inputLen) {
+ return -1;
+ }
+ int absoluteOffset =
+ MemorySegmentUtils.find(
+ input.getSegments(),
+ input.getOffset() + fromByte,
+ inputLen - fromByte,
+ needle.getSegments(),
+ needle.getOffset(),
+ needleLen);
+ if (absoluteOffset < 0) {
+ return -1;
+ }
+ return absoluteOffset - input.getOffset();
+ }
+}
diff --git
a/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
b/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
new file mode 100644
index 0000000000..577a6b3396
--- /dev/null
+++
b/paimon-common/src/test/java/org/apache/paimon/predicate/LikeChainCheckerTest.java
@@ -0,0 +1,296 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.predicate;
+
+import org.apache.paimon.data.BinaryString;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.regex.Pattern;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link LikeChainChecker}. */
+public class LikeChainCheckerTest {
+
+ @Test
+ public void testCompileRejectsUnderscore() {
+ assertThat(LikeChainChecker.compile("a_b")).isNull();
+ assertThat(LikeChainChecker.compile("_")).isNull();
+ assertThat(LikeChainChecker.compile("%abc_def%")).isNull();
+ }
+
+ @Test
+ public void testCompileRejectsBackslashEscape() {
+ assertThat(LikeChainChecker.compile("a\\%b")).isNull();
+ assertThat(LikeChainChecker.compile("\\")).isNull();
+ }
+
+ @Test
+ public void testCompileAcceptsPureWildcardPatterns() {
+ assertThat(LikeChainChecker.compile("")).isNotNull();
+ assertThat(LikeChainChecker.compile("%")).isNotNull();
+ assertThat(LikeChainChecker.compile("%%")).isNotNull();
+ assertThat(LikeChainChecker.compile("abc")).isNotNull();
+ assertThat(LikeChainChecker.compile("abc%")).isNotNull();
+ assertThat(LikeChainChecker.compile("%abc")).isNotNull();
+ assertThat(LikeChainChecker.compile("%abc%")).isNotNull();
+ assertThat(LikeChainChecker.compile("abc%def")).isNotNull();
+ assertThat(LikeChainChecker.compile("abc%def%ghi")).isNotNull();
+ assertThat(LikeChainChecker.compile("%abc%def%")).isNotNull();
+ }
+
+ @Test
+ public void testExactMatch() {
+ LikeChainChecker checker = LikeChainChecker.compile("abc");
+ assertThat(checker.test(b("abc"))).isTrue();
+ assertThat(checker.test(b("abcd"))).isFalse();
+ assertThat(checker.test(b("ab"))).isFalse();
+ assertThat(checker.test(b(""))).isFalse();
+ }
+
+ @Test
+ public void testStartsWith() {
+ LikeChainChecker checker = LikeChainChecker.compile("abc%");
+ assertThat(checker.test(b("abc"))).isTrue();
+ assertThat(checker.test(b("abcdef"))).isTrue();
+ assertThat(checker.test(b("ab"))).isFalse();
+ assertThat(checker.test(b("xabc"))).isFalse();
+ }
+
+ @Test
+ public void testEndsWith() {
+ LikeChainChecker checker = LikeChainChecker.compile("%abc");
+ assertThat(checker.test(b("abc"))).isTrue();
+ assertThat(checker.test(b("xyzabc"))).isTrue();
+ assertThat(checker.test(b("abcd"))).isFalse();
+ assertThat(checker.test(b("ab"))).isFalse();
+ }
+
+ @Test
+ public void testContains() {
+ LikeChainChecker checker = LikeChainChecker.compile("%abc%");
+ assertThat(checker.test(b("abc"))).isTrue();
+ assertThat(checker.test(b("xabcy"))).isTrue();
+ assertThat(checker.test(b("xabc"))).isTrue();
+ assertThat(checker.test(b("abcy"))).isTrue();
+ assertThat(checker.test(b("ab"))).isFalse();
+ assertThat(checker.test(b(""))).isFalse();
+ }
+
+ @Test
+ public void testBothAnchoredTwoSegments() {
+ LikeChainChecker checker = LikeChainChecker.compile("abc%def");
+ assertThat(checker.test(b("abcdef"))).isTrue();
+ assertThat(checker.test(b("abcXdef"))).isTrue();
+ assertThat(checker.test(b("abcXXXdef"))).isTrue();
+ assertThat(checker.test(b("abc"))).isFalse();
+ assertThat(checker.test(b("def"))).isFalse();
+ assertThat(checker.test(b("abdef"))).isFalse();
+ assertThat(checker.test(b("abcde"))).isFalse();
+ }
+
+ @Test
+ public void testLeftAnchoredOnly() {
+ LikeChainChecker checker = LikeChainChecker.compile("abc%def%");
+ assertThat(checker.test(b("abcdef"))).isTrue();
+ assertThat(checker.test(b("abcdefX"))).isTrue();
+ assertThat(checker.test(b("abcXdef"))).isTrue();
+ assertThat(checker.test(b("abcXdefY"))).isTrue();
+ assertThat(checker.test(b("abc"))).isFalse();
+ assertThat(checker.test(b("xabcdef"))).isFalse();
+ }
+
+ @Test
+ public void testRightAnchoredOnly() {
+ LikeChainChecker checker = LikeChainChecker.compile("%abc%def");
+ assertThat(checker.test(b("abcdef"))).isTrue();
+ assertThat(checker.test(b("XabcXdef"))).isTrue();
+ assertThat(checker.test(b("abcXXXdef"))).isTrue();
+ assertThat(checker.test(b("abc"))).isFalse();
+ assertThat(checker.test(b("def"))).isFalse();
+ assertThat(checker.test(b("abcdefX"))).isFalse();
+ }
+
+ @Test
+ public void testNeitherAnchored() {
+ LikeChainChecker checker = LikeChainChecker.compile("%abc%def%");
+ assertThat(checker.test(b("abcdef"))).isTrue();
+ assertThat(checker.test(b("XabcXdefX"))).isTrue();
+ assertThat(checker.test(b("abcXdef"))).isTrue();
+ assertThat(checker.test(b("abcdefY"))).isTrue();
+ assertThat(checker.test(b("abc"))).isFalse();
+ assertThat(checker.test(b("def"))).isFalse();
+ assertThat(checker.test(b("defXabc"))).isFalse();
+ }
+
+ @Test
+ public void testThreeSegmentsBothAnchored() {
+ LikeChainChecker checker =
LikeChainChecker.compile("/api/%/users/%/profile");
+ assertThat(checker.test(b("/api/v1/users/123/profile"))).isTrue();
+ assertThat(checker.test(b("/api/X/users/Y/profile"))).isTrue();
+ assertThat(checker.test(b("/api//users//profile"))).isTrue();
+ assertThat(checker.test(b("/api/v1/profile"))).isFalse();
+
assertThat(checker.test(b("/api/v1/users/123/profile/extra"))).isFalse();
+ assertThat(checker.test(b("X/api/v1/users/123/profile"))).isFalse();
+ }
+
+ @Test
+ public void testLongChain() {
+ LikeChainChecker checker = LikeChainChecker.compile("a%b%c%d%e");
+ assertThat(checker.test(b("abcde"))).isTrue();
+ assertThat(checker.test(b("aXbYcZdWe"))).isTrue();
+ assertThat(checker.test(b("aXXXbYYYcZZZdWWWe"))).isTrue();
+ assertThat(checker.test(b("aebcd"))).isFalse();
+ assertThat(checker.test(b("abcd"))).isFalse();
+ }
+
+ @Test
+ public void testEmptyPattern() {
+ LikeChainChecker checker = LikeChainChecker.compile("");
+ assertThat(checker.test(b(""))).isTrue();
+ assertThat(checker.test(b("a"))).isFalse();
+ }
+
+ @Test
+ public void testJustPercent() {
+ LikeChainChecker checker = LikeChainChecker.compile("%");
+ assertThat(checker.test(b(""))).isTrue();
+ assertThat(checker.test(b("a"))).isTrue();
+ assertThat(checker.test(b("hello world"))).isTrue();
+ }
+
+ @Test
+ public void testConsecutivePercents() {
+ LikeChainChecker checker = LikeChainChecker.compile("%%");
+ assertThat(checker.test(b(""))).isTrue();
+ assertThat(checker.test(b("anything"))).isTrue();
+
+ LikeChainChecker checker2 = LikeChainChecker.compile("a%%b");
+ assertThat(checker2.test(b("ab"))).isTrue();
+ assertThat(checker2.test(b("aXb"))).isTrue();
+ assertThat(checker2.test(b("aXXXb"))).isTrue();
+ assertThat(checker2.test(b("a"))).isFalse();
+ assertThat(checker2.test(b("b"))).isFalse();
+ }
+
+ @Test
+ public void testOverlapRejection() {
+ LikeChainChecker checker = LikeChainChecker.compile("a%a");
+ assertThat(checker.test(b("a"))).isFalse();
+ assertThat(checker.test(b("aa"))).isTrue();
+ assertThat(checker.test(b("aXa"))).isTrue();
+ assertThat(checker.test(b("aXXa"))).isTrue();
+ }
+
+ @Test
+ public void testInputShorterThanPattern() {
+ LikeChainChecker checker = LikeChainChecker.compile("abc%def");
+ assertThat(checker.test(b("abc"))).isFalse();
+ assertThat(checker.test(b("def"))).isFalse();
+ assertThat(checker.test(b("ab"))).isFalse();
+ assertThat(checker.test(b(""))).isFalse();
+ }
+
+ @Test
+ public void testUtf8MultiByteCharacters() {
+ LikeChainChecker checker = LikeChainChecker.compile("café%");
+ assertThat(checker.test(b("café latte"))).isTrue();
+ assertThat(checker.test(b("café"))).isTrue();
+ assertThat(checker.test(b("cafe"))).isFalse();
+
+ LikeChainChecker checker2 = LikeChainChecker.compile("%日本%");
+ assertThat(checker2.test(b("こんにちは日本語"))).isTrue();
+ assertThat(checker2.test(b("hello world"))).isFalse();
+ }
+
+ @Test
+ public void testParityWithRegex() {
+ String[] patterns = {
+ "",
+ "%",
+ "%%",
+ "abc",
+ "abc%",
+ "%abc",
+ "%abc%",
+ "abc%def",
+ "abc%def%",
+ "%abc%def",
+ "%abc%def%",
+ "a%b%c",
+ "%a%b%c%",
+ "a%a",
+ "/api/%/users/%/profile",
+ "%hello%",
+ "prefix%",
+ "%suffix",
+ };
+ String[] inputs = {
+ "",
+ "a",
+ "abc",
+ "abcdef",
+ "abc def",
+ "xabcy",
+ "/api/v1/users/123/profile",
+ "/api//users//profile",
+ "aaa",
+ "ababab",
+ "hello world",
+ "the quick brown fox",
+ "a%b%c",
+ "log_x.txt",
+ };
+
+ for (String pattern : patterns) {
+ LikeChainChecker checker = LikeChainChecker.compile(pattern);
+ assertThat(checker).as("compile should succeed for pattern: %s",
pattern).isNotNull();
+ String regex = sqlLikeToRegex(pattern);
+ Pattern p = Pattern.compile(regex, Pattern.DOTALL);
+ for (String input : inputs) {
+ boolean expected = p.matcher(input).matches();
+ boolean actual = checker.test(b(input));
+ assertThat(actual)
+ .as("pattern=[%s] input=[%s] regex=[%s]", pattern,
input, regex)
+ .isEqualTo(expected);
+ }
+ }
+ }
+
+ private static BinaryString b(String s) {
+ return BinaryString.fromString(s);
+ }
+
+ private static String sqlLikeToRegex(String sqlPattern) {
+ StringBuilder out = new StringBuilder(sqlPattern.length() * 2);
+ for (int i = 0; i < sqlPattern.length(); i++) {
+ char c = sqlPattern.charAt(i);
+ if ("[]()|^-+*?{}$\\.".indexOf(c) >= 0) {
+ out.append('\\');
+ out.append(c);
+ } else if (c == '%') {
+ out.append("(?s:.*)");
+ } else {
+ out.append(c);
+ }
+ }
+ return out.toString();
+ }
+}