This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new b999672dfe8 branch-4.0: [Opt](function) Optimize like function for 
non-literal modes #59866 (#59943)
b999672dfe8 is described below

commit b999672dfe8914975085878ce9ede87044e64a9e
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Jan 16 14:27:36 2026 +0800

    branch-4.0: [Opt](function) Optimize like function for non-literal modes 
#59866 (#59943)
    
    Cherry-picked from #59866
    
    Co-authored-by: zclllyybb <[email protected]>
---
 be/src/vec/functions/like.cpp |  42 ++++++++++++++--
 be/src/vec/functions/like.h   | 110 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 141 insertions(+), 11 deletions(-)

diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index b609bbd0382..4cc3d124dff 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -720,11 +720,45 @@ Status FunctionLike::like_fn(const LikeSearchState* 
state, const ColumnString& v
 
 Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const 
StringRef& val,
                                     const StringRef& pattern, unsigned char* 
result) {
-    std::string re_pattern;
-    convert_like_pattern(state, std::string(pattern.data, pattern.size), 
&re_pattern);
+    // Classify the pattern first: simple shapes are matched with memcmp/memmem, no regex
+    std::string search_string;
+    LikeFastPath fast_path = extract_like_fast_path(pattern.data, 
pattern.size, search_string);
 
-    return regexp_fn_scalar(state, StringRef(val.data, val.size),
-                            {re_pattern.c_str(), re_pattern.size()}, result);
+    switch (fast_path) {
+    case LikeFastPath::ALLPASS: // pattern is only '%': every val matches
+        *result = 1;
+        return Status::OK();
+    case LikeFastPath::EQUALS: // same length and same bytes
+        *result = (val.size == search_string.size() &&
+                   (search_string.empty() || // skip memcmp when there are no bytes to compare
+                    memcmp(val.data, search_string.data(), 
search_string.size()) == 0));
+        return Status::OK();
+    case LikeFastPath::STARTS_WITH: // compare the first search_string.size() bytes of val
+        *result = (val.size >= search_string.size() &&
+                   memcmp(val.data, search_string.data(), 
search_string.size()) == 0);
+        return Status::OK();
+    case LikeFastPath::ENDS_WITH: // compare the last search_string.size() bytes of val
+        *result = (val.size >= search_string.size() && // keeps the tail offset below in range
+                   memcmp(val.data + val.size - search_string.size(), 
search_string.data(),
+                          search_string.size()) == 0);
+        return Status::OK();
+    case LikeFastPath::SUBSTRING:
+        if (search_string.empty()) {
+            *result = 1;
+        } else {
+            // Non-empty needle: a non-null memmem result means val contains it
+            *result = (memmem(val.data, val.size, search_string.data(), 
search_string.size()) !=
+                       nullptr);
+        }
+        return Status::OK();
+    case LikeFastPath::REGEX:
+    default:
+        // No fast path applies: convert the LIKE pattern to a regex and match with it
+        std::string re_pattern;
+        convert_like_pattern(state, std::string(pattern.data, pattern.size), 
&re_pattern);
+        return regexp_fn_scalar(state, StringRef(val.data, val.size),
+                                {re_pattern.c_str(), re_pattern.size()}, 
result);
+    }
 }
 
 void FunctionLike::convert_like_pattern(const LikeSearchState* state, const 
std::string& pattern,
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index 085bea5bcd2..26a2f2a96bb 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -20,12 +20,12 @@
 #include <hs/hs_common.h>
 #include <hs/hs_runtime.h>
 #include <re2/re2.h>
-#include <stddef.h>
-#include <stdint.h>
 
 #include <algorithm>
 #include <boost/iterator/iterator_facade.hpp>
 #include <boost/regex.hpp>
+#include <cstddef>
+#include <cstdint>
 #include <functional>
 #include <memory>
 #include <string>
@@ -43,13 +43,109 @@
 #include "vec/data_types/data_type_number.h"
 #include "vec/functions/function.h"
 
-namespace doris {
-namespace vectorized {
+namespace doris::vectorized {
 class Block;
-} // namespace vectorized
-} // namespace doris
 
-namespace doris::vectorized {
+// Classification of a LIKE pattern, used to pick a matching strategy
+// Patterns are analysed per row so that simple shapes can be matched without regex
+enum class LikeFastPath {
+    ALLPASS,     // Pattern consists only of '%' - matches every string
+    EQUALS,      // No unescaped wildcards (or empty pattern) - exact string match
+    STARTS_WITH, // Literal followed by trailing '%' only - prefix match
+    ENDS_WITH,   // Leading '%' followed by a literal only - suffix match
+    SUBSTRING,   // Literal wrapped in leading and trailing '%' - substring search
+    REGEX        // Unescaped '_' or a '%' followed by more content - needs regex
+};
+
+// Lightweight LIKE pattern analysis that scans the pattern bytes directly, without RE2
+// Returns the fast path type and extracts the search string (without 
wildcards)
+// Handles escape sequences: backslash-backslash -> literal backslash, backslash-% -> literal %, backslash-_ 
-> literal _
+inline LikeFastPath extract_like_fast_path(const char* pattern, size_t len,
+                                           std::string& search_string) {
+    search_string.clear(); // output starts empty; on a REGEX result it may hold a partial literal
+    if (len == 0) { // empty pattern: exact match against the empty search_string
+        return LikeFastPath::EQUALS;
+    }
+
+    // Returns true if the character is NOT escaped (even number of preceding 
backslashes)
+    auto is_unescaped = [&pattern](size_t pos) -> bool {
+        size_t backslash_count = 0;
+        while (pos > 0 && pattern[pos - 1] == '\\') {
+            backslash_count++;
+            pos--;
+        }
+        return (backslash_count % 2 == 0);
+    };
+
+    bool starts_with_percent = (pattern[0] == '%'); // nothing precedes index 0, so it cannot be escaped
+    bool ends_with_percent = (pattern[len - 1] == '%' && is_unescaped(len - 
1));
+
+    // Quick check: if starts or ends with unescaped '_', need regex
+    if (pattern[0] == '_') {
+        return LikeFastPath::REGEX;
+    }
+    if (pattern[len - 1] == '_' && is_unescaped(len - 1)) {
+        return LikeFastPath::REGEX;
+    }
+
+    // Helper lambda: true for the characters that a preceding backslash turns into literals
+    auto is_wildcard = [](char c) { return c == '%' || c == '_' || c == '\\'; 
};
+
+    size_t i = 0;
+    // Skip leading '%' characters (unescaped)
+    while (i < len && pattern[i] == '%') {
+        i++;
+    }
+    // If pattern is all '%', it's ALLPASS
+    if (i >= len) {
+        return LikeFastPath::ALLPASS;
+    }
+
+    search_string.reserve(len); // the literal is never longer than the pattern
+    while (i < len) {
+        char c = pattern[i];
+        // Escaped character - add the literal and skip both bytes of the escape sequence
+        if (c == '\\' && i + 1 < len && is_wildcard(pattern[i + 1])) {
+            search_string.push_back(pattern[i + 1]);
+            i += 2;
+            continue;
+        }
+
+        // Unescaped '_' requires regex
+        if (c == '_') {
+            return LikeFastPath::REGEX;
+        }
+
+        // Check for trailing '%' or middle '%' (which needs regex)
+        if (c == '%') {
+            // Check if this is a trailing '%' sequence
+            size_t j = i;
+            while (j < len && pattern[j] == '%') {
+                j++;
+            }
+            if (j >= len) {
+                // All remaining chars are '%', we're done parsing
+                break;
+            }
+            // '%' in the middle with more content after - need regex
+            return LikeFastPath::REGEX;
+        }
+
+        search_string.push_back(c); // ordinary character, including a backslash that escapes nothing
+        i++;
+    }
+
+    // Determine the pattern type based on '%' positions
+    if (starts_with_percent && ends_with_percent) {
+        return LikeFastPath::SUBSTRING;
+    } else if (starts_with_percent) {
+        return LikeFastPath::ENDS_WITH;
+    } else if (ends_with_percent) {
+        return LikeFastPath::STARTS_WITH;
+    } else {
+        return LikeFastPath::EQUALS;
+    }
+}
 
 inline std::string replace_pattern_by_escape(const StringRef& pattern, char 
escape_char) {
     std::string result;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to