This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
     new 8df05d6d81 [Enhancement](like) fallback to re2 if hyperscan failed 
pick #18350 (#18468)
8df05d6d81 is described below

commit 8df05d6d81036b08673676e7757eab5d45fab37f
Author: Kang <[email protected]>
AuthorDate: Sun Apr 9 09:19:56 2023 +0800

    [Enhancement](like) fallback to re2 if hyperscan failed pick #18350 (#18468)
    
    cherry-pick #18350
---
 be/src/vec/functions/like.cpp | 218 ++++++++++++++++++++++++++++++------------
 1 file changed, 157 insertions(+), 61 deletions(-)

diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 629e8fe1fc..9f4b10bae2 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -48,17 +48,26 @@ Status LikeSearchState::clone(LikeSearchState& cloned) {
     cloned.escape_char = escape_char;
     cloned.set_search_string(search_string);
 
-    if (hs_database) {
-        std::string re_pattern;
-        FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
-
+    std::string re_pattern;
+    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
+    if (hs_database) { // use hyperscan
         hs_database_t* database = nullptr;
         hs_scratch_t* scratch = nullptr;
         RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), 
&database, &scratch));
-
         cloned.hs_database.reset(database);
         cloned.hs_scratch.reset(scratch);
-    }
+    } else { // fallback to re2
+        cloned.hs_database.reset();
+        cloned.hs_scratch.reset();
+
+        RE2::Options opts;
+        opts.set_never_nl(false);
+        opts.set_dot_nl(true);
+        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
+        if (!cloned.regex->ok()) {
+            return Status::InternalError("Invalid regex expression: {}", 
re_pattern);
+        }
+     }
 
     return Status::OK();
 }
@@ -200,10 +209,14 @@ Status 
FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, co
 Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, 
const StringRef& val,
                                                   const StringValue& pattern,
                                                   unsigned char* result) {
-    auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, 
state->hs_scratch.get(),
-                       state->hs_match_handler, (void*)result);
-    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
+    if (state->hs_database) { // use hyperscan
+        auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, 
state->hs_scratch.get(),
+                        state->hs_match_handler, (void*)result);
+        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+        }
+    } else { // fallback to re2
+        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), 
*state->regex.get());
     }
 
     return Status::OK();
@@ -215,16 +228,26 @@ Status 
FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRe
 
     hs_database_t* database = nullptr;
     hs_scratch_t* scratch = nullptr;
-    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, 
&scratch));
-
-    auto ret = hs_scan(database, val.data, val.size, 0, scratch, 
state->hs_match_handler,
-                       (void*)result);
-    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
-    }
+    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { 
// use hyperscan
+        auto ret = hs_scan(database, val.data, val.size, 0, scratch, 
state->hs_match_handler,
+                        (void*)result);
+        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+        }
 
-    hs_free_scratch(scratch);
-    hs_free_database(database);
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    } else { // fallback to re2
+        RE2::Options opts;
+        opts.set_never_nl(false);
+        opts.set_dot_nl(true);
+        re2::RE2 re(re_pattern, opts);
+        if (re.ok()) {
+            *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), 
re);
+        } else {
+            return Status::RuntimeError("Invalid pattern: {}", 
pattern.debug_string());
+        }
+    }    
 
     return Status::OK();
 }
@@ -233,13 +256,21 @@ Status 
FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS
                                            const StringValue& pattern,
                                            ColumnUInt8::Container& result) {
     auto sz = val.size();
-    for (size_t i = 0; i < sz; i++) {
-        const auto& str_ref = val.get_data_at(i);
-        auto ret = hs_scan(state->hs_database.get(), str_ref.data, 
str_ref.size, 0,
-                           state->hs_scratch.get(), state->hs_match_handler,
-                           (void*)(result.data() + i));
-        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+    if (state->hs_database) { // use hyperscan
+        for (size_t i = 0; i < sz; i++) {
+            const auto& str_ref = val.get_data_at(i);
+            auto ret = hs_scan(state->hs_database.get(), str_ref.data, 
str_ref.size, 0,
+                            state->hs_scratch.get(), state->hs_match_handler,
+                            (void*)(result.data() + i));
+            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+                return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+            }
+        }
+    } else { // fallback to re2
+        for (size_t i = 0; i < sz; i++) {
+            const auto& str_ref = val.get_data_at(i);
+            *(result.data() + i) = 
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
+                                                     *state->regex.get());
         }
     }
 
@@ -252,21 +283,36 @@ Status FunctionLikeBase::regexp_fn(LikeSearchState* 
state, const ColumnString& v
 
     hs_database_t* database = nullptr;
     hs_scratch_t* scratch = nullptr;
-    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, 
&scratch));
+    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { 
// use hyperscan
+        auto sz = val.size();
+        for (size_t i = 0; i < sz; i++) {
+            const auto& str_ref = val.get_data_at(i);
+            auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, 
scratch,
+                            state->hs_match_handler, (void*)(result.data() + 
i));
+            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+                return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+            }
+        }
 
-    auto sz = val.size();
-    for (size_t i = 0; i < sz; i++) {
-        const auto& str_ref = val.get_data_at(i);
-        auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch,
-                           state->hs_match_handler, (void*)(result.data() + 
i));
-        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    } else { // fallback to re2
+        RE2::Options opts;
+        opts.set_never_nl(false);
+        opts.set_dot_nl(true);
+        re2::RE2 re(re_pattern, opts);
+        if (re.ok()) {
+            auto sz = val.size();
+            for (size_t i = 0; i < sz; i++) {
+                const auto& str_ref = val.get_data_at(i);
+                *(result.data() + i) =
+                        RE2::PartialMatch(re2::StringPiece(str_ref.data, 
str_ref.size), re);
+            }
+        } else {
+            return Status::RuntimeError("Invalid pattern: {}", 
pattern.debug_string());
         }
     }
 
-    hs_free_scratch(scratch);
-    hs_free_database(database);
-
     return Status::OK();
 }
 
@@ -276,12 +322,20 @@ Status 
FunctionLikeBase::constant_regex_fn_predicate(LikeSearchState* state,
                                                      ColumnUInt8::Container& 
result, uint16_t* sel,
                                                      size_t sz) {
     auto data_ptr = reinterpret_cast<const StringRef*>(val.get_data().data());
-    for (size_t i = 0; i < sz; i++) {
-        auto ret = hs_scan(state->hs_database.get(), data_ptr[sel[i]].data, 
data_ptr[sel[i]].size,
-                           0, state->hs_scratch.get(), state->hs_match_handler,
-                           (void*)(result.data() + i));
-        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+    if (state->hs_database) { // use hyperscan
+        for (size_t i = 0; i < sz; i++) {
+            auto ret = hs_scan(state->hs_database.get(), 
data_ptr[sel[i]].data, data_ptr[sel[i]].size,
+                            0, state->hs_scratch.get(), 
state->hs_match_handler,
+                            (void*)(result.data() + i));
+            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+                return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+            }
+        }
+    } else { // fallback to re2
+        for (size_t i = 0; i < sz; i++) {
+            *(result.data() + i) = RE2::PartialMatch(
+                    re2::StringPiece(data_ptr[sel[i]].data, 
data_ptr[sel[i]].size),
+                    *state->regex.get());
         }
     }
 
@@ -297,26 +351,41 @@ Status 
FunctionLikeBase::regexp_fn_predicate(LikeSearchState* state,
 
     hs_database_t* database = nullptr;
     hs_scratch_t* scratch = nullptr;
-    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, 
&scratch));
+    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { 
// use hyperscan
+        auto data_ptr = reinterpret_cast<const 
StringRef*>(val.get_data().data());
+        for (size_t i = 0; i < sz; i++) {
+            auto ret = hs_scan(database, data_ptr[sel[i]].data, 
data_ptr[sel[i]].size, 0, scratch,
+                            state->hs_match_handler, (void*)(result.data() + 
i));
+            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+                return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+            }
+        }
 
-    auto data_ptr = reinterpret_cast<const StringRef*>(val.get_data().data());
-    for (size_t i = 0; i < sz; i++) {
-        auto ret = hs_scan(database, data_ptr[sel[i]].data, 
data_ptr[sel[i]].size, 0, scratch,
-                           state->hs_match_handler, (void*)(result.data() + 
i));
-        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
-            return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    } else { // fallback to re2
+        RE2::Options opts;
+        opts.set_never_nl(false);
+        opts.set_dot_nl(true);
+        re2::RE2 re(re_pattern, opts);
+        if (re.ok()) {
+            auto data_ptr = reinterpret_cast<const 
StringRef*>(val.get_data().data());
+            for (size_t i = 0; i < sz; i++) {
+                *(result.data() + i) = RE2::PartialMatch(
+                        re2::StringPiece(data_ptr[sel[i]].data, 
data_ptr[sel[i]].size), re);
+            }
+        } else {
+            return Status::RuntimeError("Invalid pattern: {}", 
pattern.debug_string());
         }
     }
 
-    hs_free_scratch(scratch);
-    hs_free_database(database);
-
     return Status::OK();
 }
 
 // hyperscan compile expression to database and allocate scratch space
 Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* 
expression,
                                     hs_database_t** database, hs_scratch_t** 
scratch) {
+    return Status::RuntimeError("hs_compile inject error:");
     hs_compile_error_t* compile_err;
     auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY, 
HS_MODE_BLOCK, NULL,
                           database, &compile_err);
@@ -651,10 +720,24 @@ Status FunctionLike::prepare(FunctionContext* context, 
FunctionContext::Function
 
             hs_database_t* database = nullptr;
             hs_scratch_t* scratch = nullptr;
-            RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, 
&scratch));
-
-            state->search_state.hs_database.reset(database);
-            state->search_state.hs_scratch.reset(scratch);
+            if (hs_prepare(context, re_pattern.c_str(), &database, 
&scratch).ok()) {
+                // use hyperscan
+                state->search_state.hs_database.reset(database);
+                state->search_state.hs_scratch.reset(scratch);
+            } else {
+                // fallback to re2
+                // reset hs_database to nullptr to indicate not use hyperscan
+                state->search_state.hs_database.reset();
+                state->search_state.hs_scratch.reset();
+
+                RE2::Options opts;
+                opts.set_never_nl(false);
+                opts.set_dot_nl(true);
+                state->search_state.regex = std::make_unique<RE2>(re_pattern, 
opts);
+                if (!state->search_state.regex->ok()) {
+                    return Status::InternalError("Invalid regex expression: 
{}", pattern_str);
+                }
+            }
 
             state->function = constant_regex_fn;
             state->predicate_like_function = constant_regex_fn_predicate;
@@ -703,10 +786,23 @@ Status FunctionRegexp::prepare(FunctionContext* context,
         } else {
             hs_database_t* database = nullptr;
             hs_scratch_t* scratch = nullptr;
-            RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), 
&database, &scratch));
-
-            state->search_state.hs_database.reset(database);
-            state->search_state.hs_scratch.reset(scratch);
+            if (hs_prepare(context, pattern_str.c_str(), &database, 
&scratch).ok()) {
+                // use hyperscan
+                state->search_state.hs_database.reset(database);
+                state->search_state.hs_scratch.reset(scratch);
+            } else {
+                // fallback to re2
+                // reset hs_database to nullptr to indicate not use hyperscan
+                state->search_state.hs_database.reset();
+                state->search_state.hs_scratch.reset();
+                RE2::Options opts;
+                opts.set_never_nl(false);
+                opts.set_dot_nl(true);
+                state->search_state.regex = std::make_unique<RE2>(pattern_str, 
opts);
+                if (!state->search_state.regex->ok()) {
+                    return Status::InternalError("Invalid regex expression: 
{}", pattern_str);
+                }
+            }
 
             state->function = constant_regex_fn;
             state->predicate_like_function = constant_regex_fn_predicate;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to