This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new b999672dfe8 branch-4.0: [Opt](function) Optimize like function for
non-literal modes #59866 (#59943)
b999672dfe8 is described below
commit b999672dfe8914975085878ce9ede87044e64a9e
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Jan 16 14:27:36 2026 +0800
branch-4.0: [Opt](function) Optimize like function for non-literal modes
#59866 (#59943)
Cherry-picked from #59866
Co-authored-by: zclllyybb <[email protected]>
---
be/src/vec/functions/like.cpp | 42 ++++++++++++++--
be/src/vec/functions/like.h | 110 +++++++++++++++++++++++++++++++++++++++---
2 files changed, 141 insertions(+), 11 deletions(-)
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index b609bbd0382..4cc3d124dff 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -720,11 +720,45 @@ Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& v
Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val,
const StringRef& pattern, unsigned char* result) {
- std::string re_pattern;
- convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
+ // Try to use fast path to avoid regex compilation
+ std::string search_string;
+ LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string);
- return regexp_fn_scalar(state, StringRef(val.data, val.size),
- {re_pattern.c_str(), re_pattern.size()}, result);
+ switch (fast_path) {
+ case LikeFastPath::ALLPASS:
+ *result = 1;
+ return Status::OK();
+ case LikeFastPath::EQUALS:
+ *result = (val.size == search_string.size() &&
+ (search_string.empty() ||
+ memcmp(val.data, search_string.data(), search_string.size()) == 0));
+ return Status::OK();
+ case LikeFastPath::STARTS_WITH:
+ *result = (val.size >= search_string.size() &&
+ memcmp(val.data, search_string.data(), search_string.size()) == 0);
+ return Status::OK();
+ case LikeFastPath::ENDS_WITH:
+ *result = (val.size >= search_string.size() &&
+ memcmp(val.data + val.size - search_string.size(), search_string.data(),
+ search_string.size()) == 0);
+ return Status::OK();
+ case LikeFastPath::SUBSTRING:
+ if (search_string.empty()) {
+ *result = 1;
+ } else {
+ // Use memmem for substring search
+ *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) !=
+ nullptr);
+ }
+ return Status::OK();
+ case LikeFastPath::REGEX:
+ default:
+ // Fall back to regex matching
+ std::string re_pattern;
+ convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
+ return regexp_fn_scalar(state, StringRef(val.data, val.size),
+ {re_pattern.c_str(), re_pattern.size()}, result);
+ }
}
void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index 085bea5bcd2..26a2f2a96bb 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -20,12 +20,12 @@
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>
#include <re2/re2.h>
-#include <stddef.h>
-#include <stdint.h>
#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <boost/regex.hpp>
+#include <cstddef>
+#include <cstdint>
#include <functional>
#include <memory>
#include <string>
@@ -43,13 +43,109 @@
#include "vec/data_types/data_type_number.h"
#include "vec/functions/function.h"
-namespace doris {
-namespace vectorized {
+namespace doris::vectorized {
class Block;
-} // namespace vectorized
-} // namespace doris
-namespace doris::vectorized {
+// FastPath types for LIKE pattern matching optimization
+// This allows per-row pattern analysis to avoid regex when possible
+enum class LikeFastPath {
+ ALLPASS, // Pattern is just '%' or '%%...' - matches everything
+ EQUALS, // No wildcards - exact string match
+ STARTS_WITH, // Pattern ends with '%' only - prefix match
+ ENDS_WITH, // Pattern starts with '%' only - suffix match
+ SUBSTRING, // Pattern is '%xxx%' - substring search
+ REGEX // Contains '_' or multiple '%' - needs regex
+};
+
+// Lightweight pattern analysis without RE2
+// Returns the fast path type and extracts the search string (without wildcards)
+// Correctly handles escape sequences: backslash-% -> literal %, backslash-_ -> literal _
+inline LikeFastPath extract_like_fast_path(const char* pattern, size_t len,
+ std::string& search_string) {
+ search_string.clear();
+ if (len == 0) {
+ return LikeFastPath::EQUALS;
+ }
+
+ // Returns true if the character is NOT escaped (even number of preceding backslashes)
+ auto is_unescaped = [&pattern](size_t pos) -> bool {
+ size_t backslash_count = 0;
+ while (pos > 0 && pattern[pos - 1] == '\\') {
+ backslash_count++;
+ pos--;
+ }
+ return (backslash_count % 2 == 0);
+ };
+
+ bool starts_with_percent = (pattern[0] == '%');
+ bool ends_with_percent = (pattern[len - 1] == '%' && is_unescaped(len - 1));
+
+ // Quick check: if starts or ends with unescaped '_', need regex
+ if (pattern[0] == '_') {
+ return LikeFastPath::REGEX;
+ }
+ if (pattern[len - 1] == '_' && is_unescaped(len - 1)) {
+ return LikeFastPath::REGEX;
+ }
+
+ // Helper lambda: check if character is a wildcard that needs escaping
+ auto is_wildcard = [](char c) { return c == '%' || c == '_' || c == '\\'; };
+
+ size_t i = 0;
+ // Skip leading '%' characters (unescaped)
+ while (i < len && pattern[i] == '%') {
+ i++;
+ }
+ // If pattern is all '%', it's ALLPASS
+ if (i >= len) {
+ return LikeFastPath::ALLPASS;
+ }
+
+ search_string.reserve(len);
+ while (i < len) {
+ char c = pattern[i];
+ // Escaped character - add the literal
+ if (c == '\\' && i + 1 < len && is_wildcard(pattern[i + 1])) {
+ search_string.push_back(pattern[i + 1]);
+ i += 2;
+ continue;
+ }
+
+ // Unescaped '_' requires regex
+ if (c == '_') {
+ return LikeFastPath::REGEX;
+ }
+
+ // Check for trailing '%' or middle '%' (which needs regex)
+ if (c == '%') {
+ // Check if this is a trailing '%' sequence
+ size_t j = i;
+ while (j < len && pattern[j] == '%') {
+ j++;
+ }
+ if (j >= len) {
+ // All remaining chars are '%', we're done parsing
+ break;
+ }
+ // '%' in the middle with more content after - need regex
+ return LikeFastPath::REGEX;
+ }
+
+ search_string.push_back(c);
+ i++;
+ }
+
+ // Determine the pattern type based on '%' positions
+ if (starts_with_percent && ends_with_percent) {
+ return LikeFastPath::SUBSTRING;
+ } else if (starts_with_percent) {
+ return LikeFastPath::ENDS_WITH;
+ } else if (ends_with_percent) {
+ return LikeFastPath::STARTS_WITH;
+ } else {
+ return LikeFastPath::EQUALS;
+ }
+}
inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) {
std::string result;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]