This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 1c18f2c34f1 branch-4.0: [Enhancement](regexp) Support zero-width 
assertions in some regexp functions #57643 (#57948)
1c18f2c34f1 is described below

commit 1c18f2c34f1a5dfe05df9d011be2bdc36dc6954a
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Nov 13 10:27:04 2025 +0800

    branch-4.0: [Enhancement](regexp) Support zero-width assertions in some 
regexp functions #57643 (#57948)
    
    Cherry-picked from #57643
    
    Co-authored-by: linrrarity <[email protected]>
---
 be/src/runtime/runtime_state.h                     |   6 +
 be/src/vec/functions/function_regexp.cpp           | 216 ++++++++++++++++-----
 be/src/vec/functions/like.cpp                      |  26 ++-
 be/src/vec/functions/like.h                        |   4 +
 .../java/org/apache/doris/qe/SessionVariable.java  |   8 +
 gensrc/thrift/PaloInternalService.thrift           |   2 +
 .../test_string_function_regexp.out                |  56 +++++-
 .../test_string_function_regexp.groovy             |  52 ++++-
 8 files changed, 317 insertions(+), 53 deletions(-)

diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index 3d89f4aa0d4..24993123bf3 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -157,6 +157,12 @@ public:
                        : 1;
     }
 
+    // Reports whether extended regex syntax is enabled for this query, e.g.
+    // look-around zero-width assertions (`?=`, `?!`, `?<=`, `?<!`).
+    // Evaluates to false when the option was never set in the query options.
+    bool enable_extended_regex() const {
+        if (!_query_options.__isset.enable_extended_regex) {
+            return false;
+        }
+        return _query_options.enable_extended_regex;
+    }
+
     TQueryType::type query_type() const { return _query_options.query_type; }
     int64_t timestamp_ms() const { return _timestamp_ms; }
     int32_t nano_seconds() const { return _nano_seconds; }
diff --git a/be/src/vec/functions/function_regexp.cpp 
b/be/src/vec/functions/function_regexp.cpp
index 1007487e4ce..d24a0538c04 100644
--- a/be/src/vec/functions/function_regexp.cpp
+++ b/be/src/vec/functions/function_regexp.cpp
@@ -20,6 +20,7 @@
 #include <re2/stringpiece.h>
 #include <stddef.h>
 
+#include <boost/regex.hpp>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -51,6 +52,137 @@
 
 namespace doris::vectorized {
 #include "common/compile_check_begin.h"
+
+// Holds one compiled regular expression for the regexp_extract family of functions:
+// an RE2 regex when RE2 accepts the pattern, otherwise (only when extended regex is
+// enabled) a Boost.Regex, which accepts constructs RE2 rejects such as look-around
+// zero-width assertions. After a successful compile() exactly one member is non-null;
+// after a failed compile() both are null.
+struct RegexpExtractEngine {
+    std::unique_ptr<re2::RE2> re2_regex;
+    std::unique_ptr<boost::regex> boost_regex;
+
+    bool is_boost() const { return boost_regex != nullptr; }
+    bool is_re2() const { return re2_regex != nullptr; }
+
+    // Compiles `pattern` into `engine`. RE2 is tried first; if it rejects the pattern
+    // and `enable_extended_regex` is true, Boost.Regex is tried next.
+    // Returns true on success. On failure returns false and, when `error_str` is
+    // non-null, stores a human-readable message in it.
+    static bool compile(const StringRef& pattern, std::string* error_str,
+                        RegexpExtractEngine& engine, bool enable_extended_regex) {
+        engine.re2_regex =
+                std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size));
+        if (engine.re2_regex->ok()) {
+            return true;
+        }
+
+        if (!enable_extended_regex) {
+            // `error_str` is optional here, exactly as in the Boost.Regex failure path.
+            if (error_str != nullptr) {
+                *error_str = fmt::format(
+                        "Invalid regex pattern: {}. Error: {}. If you need advanced regex "
+                        "features, try setting enable_extended_regex=true",
+                        std::string(pattern.data, pattern.size), engine.re2_regex->error());
+            }
+            // Do not leave a non-ok RE2 object behind: is_re2() must imply a usable regex.
+            engine.re2_regex.reset();
+            return false;
+        }
+
+        // RE2 failed, try Boost.Regex for advanced features like zero-width assertions
+        engine.re2_regex.reset();
+        try {
+            boost::regex::flag_type flags = boost::regex::normal;
+            engine.boost_regex = std::make_unique<boost::regex>(
+                    pattern.data, pattern.data + pattern.size, flags);
+            return true;
+        } catch (const boost::regex_error& e) {
+            if (error_str != nullptr) {
+                *error_str = fmt::format("Invalid regex pattern: {}. Error: {}",
+                                         std::string(pattern.data, pattern.size), e.what());
+            }
+            return false;
+        }
+    }
+
+    // Number of capturing groups in the compiled pattern; 0 when nothing is compiled.
+    int number_of_capturing_groups() const {
+        if (is_re2()) {
+            return re2_regex->NumberOfCapturingGroups();
+        }
+        if (is_boost()) {
+            return static_cast<int>(boost_regex->mark_count());
+        }
+        return 0;
+    }
+
+    // Searches [data, data + size) for the first match and copies capture group `index`
+    // (0 = whole match) into `result`.
+    // Returns false when there is no match, when `index` is negative or larger than the
+    // number of groups, or when no regex is compiled; `result` is untouched in that case.
+    bool match_and_extract(const char* data, size_t size, int index, std::string& result) const {
+        // Reject negative indices explicitly instead of relying on a signed/unsigned
+        // comparison further down.
+        if (index < 0) {
+            return false;
+        }
+        if (is_re2()) {
+            int max_matches = 1 + re2_regex->NumberOfCapturingGroups();
+            if (index >= max_matches) {
+                return false;
+            }
+            std::vector<re2::StringPiece> matches(max_matches);
+            if (!re2_regex->Match(re2::StringPiece(data, size), 0, size, re2::RE2::UNANCHORED,
+                                  matches.data(), max_matches)) {
+                return false;
+            }
+            const re2::StringPiece& match = matches[index];
+            result.assign(match.data(), match.size());
+            return true;
+        }
+        if (is_boost()) {
+            boost::cmatch matches;
+            if (boost::regex_search(data, data + size, matches, *boost_regex) &&
+                static_cast<size_t>(index) < matches.size()) {
+                result = matches[index].str();
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Scans [data, data + size) for successive non-overlapping matches and appends the
+    // first capturing group of each match to `results`.
+    // RE2 path: matches whose first group is empty are not appended.
+    // Boost path: the first group is appended whenever it participated in the match.
+    // Does nothing when the pattern has no capturing group or no regex is compiled.
+    void match_all_and_extract(const char* data, size_t size,
+                               std::vector<std::string>& results) const {
+        if (is_re2()) {
+            int max_matches = 1 + re2_regex->NumberOfCapturingGroups();
+            if (max_matches < 2) {
+                return; // No capturing groups
+            }
+
+            size_t pos = 0;
+            while (pos < size) {
+                const char* str_pos = data + pos;
+                size_t str_size = size - pos;
+                std::vector<re2::StringPiece> matches(max_matches);
+                bool success = re2_regex->Match(re2::StringPiece(str_pos, str_size), 0, str_size,
+                                                re2::RE2::UNANCHORED, matches.data(), max_matches);
+                if (!success) {
+                    break;
+                }
+                if (matches[0].empty()) {
+                    pos += 1;
+                    continue;
+                }
+                // Extract first capturing group
+                if (!matches[1].empty()) {
+                    results.emplace_back(matches[1].data(), matches[1].size());
+                }
+                // matches[0] is a view into [str_pos, str_pos + str_size), so the match
+                // offset is a plain pointer difference. Searching the haystack for the
+                // matched text instead can land on an earlier identical substring (e.g.
+                // `\b(foo)` on "xfoo foo"), which under-advances `pos` and makes the same
+                // match get extracted twice.
+                auto offset = static_cast<size_t>(matches[0].data() - str_pos);
+                pos += offset + matches[0].size();
+            }
+        } else if (is_boost()) {
+            const char* search_start = data;
+            const char* search_end = data + size;
+            boost::match_results<const char*> matches;
+
+            while (boost::regex_search(search_start, search_end, matches, *boost_regex)) {
+                if (matches.size() > 1 && matches[1].matched) {
+                    results.emplace_back(matches[1].str());
+                }
+                if (matches[0].length() == 0) {
+                    // A zero-length match may sit after `search_start`. Resume one byte
+                    // past the match itself; stepping from `search_start` would find the
+                    // same match again and append its capture repeatedly.
+                    if (matches[0].first == search_end) {
+                        break;
+                    }
+                    search_start = matches[0].first + 1;
+                } else {
+                    search_start = matches[0].second;
+                }
+            }
+        }
+    }
+};
+
 struct RegexpCountImpl {
     static void execute_impl(FunctionContext* context, ColumnPtr 
argument_columns[],
                              size_t input_rows_count, ColumnInt32::Container& 
result_data) {
@@ -469,42 +601,45 @@ struct RegexpExtractImpl {
                                     ColumnString::Chars& result_data,
                                     ColumnString::Offsets& result_offset, 
NullMap& null_map,
                                     const size_t index_now) {
-        re2::RE2* re = reinterpret_cast<re2::RE2*>(
+        auto* engine = reinterpret_cast<RegexpExtractEngine*>(
                 context->get_function_state(FunctionContext::THREAD_LOCAL));
-        std::unique_ptr<re2::RE2> scoped_re;
-        if (re == nullptr) {
+        std::unique_ptr<RegexpExtractEngine> scoped_engine;
+
+        if (engine == nullptr) {
             std::string error_str;
             const auto& pattern = 
pattern_col->get_data_at(index_check_const(index_now, Const));
-            bool st = StringFunctions::compile_regex(pattern, &error_str, 
StringRef(), StringRef(),
-                                                     scoped_re);
+            scoped_engine = std::make_unique<RegexpExtractEngine>();
+            bool st = RegexpExtractEngine::compile(pattern, &error_str, 
*scoped_engine,
+                                                   
context->state()->enable_extended_regex());
             if (!st) {
                 context->add_warning(error_str.c_str());
                 StringOP::push_null_string(index_now, result_data, 
result_offset, null_map);
                 return;
             }
-            re = scoped_re.get();
+            engine = scoped_engine.get();
         }
+
         const auto& str = str_col->get_data_at(index_now);
-        re2::StringPiece str_sp = re2::StringPiece(str.data, str.size);
 
-        int max_matches = 1 + re->NumberOfCapturingGroups();
+        int max_matches = 1 + engine->number_of_capturing_groups();
         if (index_data >= max_matches) {
             ReturnNull ? StringOP::push_null_string(index_now, result_data, 
result_offset, null_map)
                        : StringOP::push_empty_string(index_now, result_data, 
result_offset);
             return;
         }
 
-        std::vector<re2::StringPiece> matches(max_matches);
-        bool success =
-                re->Match(str_sp, 0, str.size, re2::RE2::UNANCHORED, 
&matches[0], max_matches);
+        std::string match_result;
+        bool success = engine->match_and_extract(str.data, str.size, 
static_cast<int>(index_data),
+                                                 match_result);
+
         if (!success) {
             ReturnNull ? StringOP::push_null_string(index_now, result_data, 
result_offset, null_map)
                        : StringOP::push_empty_string(index_now, result_data, 
result_offset);
             return;
         }
-        const re2::StringPiece& match = matches[index_data];
-        StringOP::push_value_string(std::string_view(match.data(), 
match.size()), index_now,
-                                    result_data, result_offset);
+
+        StringOP::push_value_string(std::string_view(match_result.data(), 
match_result.size()),
+                                    index_now, result_data, result_offset);
     }
 };
 
@@ -548,49 +683,31 @@ struct RegexpExtractAllImpl {
                                     ColumnString::Chars& result_data,
                                     ColumnString::Offsets& result_offset, 
NullMap& null_map,
                                     const size_t index_now) {
-        re2::RE2* re = reinterpret_cast<re2::RE2*>(
+        auto* engine = reinterpret_cast<RegexpExtractEngine*>(
                 context->get_function_state(FunctionContext::THREAD_LOCAL));
-        std::unique_ptr<re2::RE2> scoped_re;
-        if (re == nullptr) {
+        std::unique_ptr<RegexpExtractEngine> scoped_engine;
+
+        if (engine == nullptr) {
             std::string error_str;
             const auto& pattern = 
pattern_col->get_data_at(index_check_const(index_now, Const));
-            bool st = StringFunctions::compile_regex(pattern, &error_str, 
StringRef(), StringRef(),
-                                                     scoped_re);
+            scoped_engine = std::make_unique<RegexpExtractEngine>();
+            bool st = RegexpExtractEngine::compile(pattern, &error_str, 
*scoped_engine,
+                                                   
context->state()->enable_extended_regex());
             if (!st) {
                 context->add_warning(error_str.c_str());
                 StringOP::push_null_string(index_now, result_data, 
result_offset, null_map);
                 return;
             }
-            re = scoped_re.get();
+            engine = scoped_engine.get();
         }
-        if (re->NumberOfCapturingGroups() == 0) {
+
+        if (engine->number_of_capturing_groups() == 0) {
             StringOP::push_empty_string(index_now, result_data, result_offset);
             return;
         }
         const auto& str = str_col->get_data_at(index_now);
-        int max_matches = 1 + re->NumberOfCapturingGroups();
-        std::vector<re2::StringPiece> res_matches;
-        size_t pos = 0;
-        while (pos < str.size) {
-            auto str_pos = str.data + pos;
-            auto str_size = str.size - pos;
-            re2::StringPiece str_sp = re2::StringPiece(str_pos, str_size);
-            std::vector<re2::StringPiece> matches(max_matches);
-            bool success =
-                    re->Match(str_sp, 0, str_size, re2::RE2::UNANCHORED, 
&matches[0], max_matches);
-            if (!success) {
-                StringOP::push_empty_string(index_now, result_data, 
result_offset);
-                break;
-            }
-            if (matches[0].empty()) {
-                StringOP::push_empty_string(index_now, result_data, 
result_offset);
-                pos += 1;
-                continue;
-            }
-            res_matches.push_back(matches[1]);
-            auto offset = std::string(str_pos, 
str_size).find(std::string(matches[0].as_string()));
-            pos += offset + matches[0].size();
-        }
+        std::vector<std::string> res_matches;
+        engine->match_all_and_extract(str.data, str.size, res_matches);
 
         if (res_matches.empty()) {
             StringOP::push_empty_string(index_now, result_data, result_offset);
@@ -599,7 +716,7 @@ struct RegexpExtractAllImpl {
 
         std::string res = "[";
         for (int j = 0; j < res_matches.size(); ++j) {
-            res += "'" + res_matches[j].as_string() + "'";
+            res += "'" + res_matches[j] + "'";
             if (j < res_matches.size() - 1) {
                 res += ",";
             }
@@ -641,15 +758,14 @@ public:
                 }
 
                 std::string error_str;
-                std::unique_ptr<re2::RE2> scoped_re;
-                bool st = StringFunctions::compile_regex(pattern, &error_str, 
StringRef(),
-                                                         StringRef(), 
scoped_re);
+                auto engine = std::make_shared<RegexpExtractEngine>();
+                bool st = RegexpExtractEngine::compile(pattern, &error_str, 
*engine,
+                                                       
context->state()->enable_extended_regex());
                 if (!st) {
                     context->set_error(error_str.c_str());
                     return Status::InvalidArgument(error_str);
                 }
-                std::shared_ptr<re2::RE2> re(scoped_re.release());
-                context->set_function_state(scope, re);
+                context->set_function_state(scope, engine);
             }
         }
         return Status::OK();
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 95703197990..b609bbd0382 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -392,6 +392,8 @@ Status FunctionLikeBase::constant_regex_fn_scalar(const 
LikeSearchState* state,
         if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
             return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
         }
+    } else if (state->boost_regex) { // use boost::regex for advanced features
+        *result = boost::regex_search(val.data, val.data + val.size, 
*state->boost_regex);
     } else { // fallback to re2
         *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), 
*state->regex);
     }
@@ -429,6 +431,12 @@ Status FunctionLikeBase::constant_regex_fn(const 
LikeSearchState* state, const C
                 return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
             }
         }
+    } else if (state->boost_regex) { // use boost::regex for advanced features
+        for (size_t i = 0; i < sz; i++) {
+            const auto& str_ref = val.get_data_at(i);
+            *(result.data() + i) = boost::regex_search(str_ref.data, 
str_ref.data + str_ref.size,
+                                                       *state->boost_regex);
+        }
     } else { // fallback to re2
         for (size_t i = 0; i < sz; i++) {
             const auto& str_ref = val.get_data_at(i);
@@ -1009,7 +1017,23 @@ Status FunctionRegexpLike::open(FunctionContext* context,
                 opts.set_dot_nl(true);
                 state->search_state.regex = std::make_unique<RE2>(pattern_str, 
opts);
                 if (!state->search_state.regex->ok()) {
-                    return Status::InternalError("Invalid regex expression: 
{}", pattern_str);
+                    if (!context->state()->enable_extended_regex()) {
+                        return Status::InternalError(
+                                "Invalid regex expression: {}. Error: {}. If 
you need advanced "
+                                "regex features, try setting 
enable_extended_regex=true",
+                                pattern_str, 
state->search_state.regex->error());
+                    }
+
+                    // RE2 failed, fallback to Boost.Regex
+                    // This handles advanced regex features like zero-width 
assertions
+                    state->search_state.regex.reset();
+                    try {
+                        state->search_state.boost_regex =
+                                std::make_unique<boost::regex>(pattern_str);
+                    } catch (const boost::regex_error& e) {
+                        return Status::InternalError("Invalid regex 
expression: {}. Error: {}",
+                                                     pattern_str, e.what());
+                    }
                 }
             }
             state->function = constant_regex_fn;
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index 1128e4f3f69..085bea5bcd2 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -25,6 +25,7 @@
 
 #include <algorithm>
 #include <boost/iterator/iterator_facade.hpp>
+#include <boost/regex.hpp>
 #include <functional>
 #include <memory>
 #include <string>
@@ -100,6 +101,9 @@ struct LikeSearchState {
     /// Used for RLIKE and REGEXP predicates if the pattern is a constant 
argument.
     std::unique_ptr<re2::RE2> regex;
 
+    /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g., 
zero-width assertions like `?=`, `?!`, `?<=`, `?<!`)
+    std::unique_ptr<boost::regex> boost_regex;
+
     template <typename Deleter, Deleter deleter>
     struct HyperscanDeleter {
         template <typename T>
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 0f875787fe7..00d9b965c9f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -726,6 +726,8 @@ public class SessionVariable implements Serializable, 
Writable {
 
     public static final String SKIP_CHECKING_ACID_VERSION_FILE = 
"skip_checking_acid_version_file";
 
+    public static final String ENABLE_EXTENDED_REGEX = "enable_extended_regex";
+
     // NOTE: if you want to add some debug variables, please disable sql cache 
in `CacheAnalyzer.commonCacheCondition`,
     //       and set affectQueryResult=true
     public static final List<String> DEBUG_VARIABLES = ImmutableList.of(
@@ -3055,6 +3057,11 @@ public class SessionVariable implements Serializable, 
Writable {
     )
     public int defaultVariantMaxSparseColumnStatisticsSize = 10000;
 
+    @VariableMgr.VarAttr(name = ENABLE_EXTENDED_REGEX, needForward = true, 
affectQueryResult = true,
+            description = {"是否启用扩展的正则表达式, 支持如 look-around 类的零宽断言",
+                    "Enable extended regular expressions, support look-around 
zero-width assertions"})
+    public boolean enableExtendedRegex = false;
+
     @VariableMgr.VarAttr(
             name = DEFAULT_VARIANT_SPARSE_HASH_SHARD_COUNT,
             needForward = true,
@@ -4817,6 +4824,7 @@ public class SessionVariable implements Serializable, 
Writable {
         tResult.setHnswCheckRelativeDistance(hnswCheckRelativeDistance);
         tResult.setHnswBoundedQueue(hnswBoundedQueue);
         tResult.setMergeReadSliceSize(mergeReadSliceSizeBytes);
+        tResult.setEnableExtendedRegex(enableExtendedRegex);
         return tResult;
     }
 
diff --git a/gensrc/thrift/PaloInternalService.thrift 
b/gensrc/thrift/PaloInternalService.thrift
index 1248c93ef35..bac2030f5fc 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -410,6 +410,8 @@ struct TQueryOptions {
 
   175: optional bool enable_fuzzy_blockable_task = false;
 
+  177: optional bool enable_extended_regex = false;
+
   // For cloud, to control if the content would be written into file cache
   // In write path, to control if the content would be written into file cache.
   // In read path, read from file cache or remote storage when execute query.
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
index b0a07aad777..d7994943fd5 100644
--- 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
+++ 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out
@@ -154,6 +154,15 @@ d
 -- !sql --
 
 
+-- !regexp_extract_1 --
+123
+
+-- !regexp_extract_2 --
+EdgeCase1
+
+-- !regexp_extract_3 --
+AA-1
+
 -- !sql --
 b
 
@@ -163,6 +172,12 @@ d
 -- !sql --
 \N
 
+-- !regexp_extract_or_null_1 --
+123
+
+-- !regexp_extract_or_null_2 --
+B
+
 -- !sql --
 ['18','17']
 
@@ -181,7 +196,7 @@ d
 -- !sql --
 ['ab','c','c','c']
 
--- !sql_regexp_extract_all --
+-- !sql_regexp_extract_all_1 --
        0
        0
        0
@@ -190,6 +205,18 @@ d
        0
        0
 
+-- !sql_regexp_extract_all_2 --
+['Apache/Doris']
+
+-- !sql_regexp_extract_all_3 --
+['123','456']
+
+-- !sql_regexp_extract_all_4 --
+['AA-1','BB-2','CC-3']
+
+-- !sql_regexp_extract_all_5 --
+['Case1','Case2','Case3']
+
 -- !sql --
 a-b-c
 
@@ -202,6 +229,33 @@ a-b c
 -- !sql --
 a <b> b
 
+-- !regexp_fn_1 --
+true
+
+-- !regexp_fn_2 --
+false
+
+-- !regexp_fn_3 --
+true
+
+-- !regexp_fn_4 --
+true
+
+-- !regexp_fn_5 --
+false
+
+-- !regexp_fn_6 --
+false
+
+-- !regexp_fn_7 --
+true
+
+-- !regexp_fn_8 --
+false
+
+-- !regexp_fn_9 --
+true
+
 -- !sql_utf1 --
 true
 
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
index 7b78d5865b6..7c9876d32d6 100644
--- 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy
@@ -65,17 +65,50 @@ suite("test_string_function_regexp") {
     qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 
2);"
     qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 
3);"
 
+    sql "set enable_extended_regex = false;"
+    test {
+        sql 'SELECT regexp_extract(\'foo123bar456baz\', 
\'(?<=foo)(\\\\d+)(?=bar)\', 1);'
+        exception "Invalid regex pattern"
+    }
+    sql "set enable_extended_regex = true;"
+    qt_regexp_extract_1 'SELECT regexp_extract(\'foo123bar456baz\', 
\'(?<=foo)(\\\\d+)(?=bar)\', 1);'
+    qt_regexp_extract_2 'SELECT regexp_extract(\'EdgeCase1 EdgeCase2 
EdgeCase3\', \'(EdgeCase\\\\d)(?= EdgeCase|$)\', 1);'
+    qt_regexp_extract_3 'SELECT regexp_extract(\'ID:AA-1,ID:BB-2,ID:CC-3\', 
\'(?<=ID:)([A-Z]{2}-\\\\d)(?=,ID|$)\', 1);'
+    sql "set enable_extended_regex = false;"
+
     qt_sql "SELECT regexp_extract_or_null('AbCdE', 
'([[:lower:]]+)C([[:lower:]]+)', 1);"
     qt_sql "SELECT regexp_extract_or_null('AbCdE', 
'([[:lower:]]+)C([[:lower:]]+)', 2);"
     qt_sql "SELECT regexp_extract_or_null('AbCdE', 
'([[:lower:]]+)C([[:lower:]]+)', 3);"
 
+    sql "SET enable_extended_regex = false;"
+    test {
+        sql "SELECT regexp_extract_or_null('foo123bar', 
'(?<=foo)(\\\\d+)(?=bar)', 1);"
+        exception "Invalid regex pattern"
+    }
+    sql "set enable_extended_regex = true;"
+    qt_regexp_extract_or_null_1 "SELECT regexp_extract_or_null('foo123bar', 
'(?<=foo)(\\\\d+)(?=bar)', 1);"
+    qt_regexp_extract_or_null_2 "SELECT regexp_extract_or_null('TokenA TokenB 
TokenC', '(?<=Token)([A-Z])(?= TokenC)', 1);"
+    sql "set enable_extended_regex = false;"
+
     qt_sql "SELECT regexp_extract_all('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd', 
'x=([0-9]+)([a-z]+)');"
     qt_sql "SELECT regexp_extract_all('http://a.m.baidu.com/i41915i73660.htm', 
'i([0-9]+)');"
     qt_sql "SELECT regexp_extract_all('abc=111, def=222, ghi=333', 
'(\"[^\"]+\"|\\\\w+)=(\"[^\"]+\"|\\\\w+)');"
     qt_sql "select regexp_extract_all('xxfs','f');"
     qt_sql "select regexp_extract_all('asdfg', '(z|x|c|)');"
     qt_sql "select regexp_extract_all('abcdfesscca', '(ab|c|)');"
-    qt_sql_regexp_extract_all "select regexp_extract_all('', '\"([^\"]+)\":'), 
length(regexp_extract_all('', '\"([^\"]+)\":')) from 
test_string_function_regexp;"
+
+    sql "set enable_extended_regex = false"
+    test {
+        sql 'SELECT REGEXP_EXTRACT_ALL(\'Apache/Doris\', 
\'([a-zA-Z_+-]+(?:\\/[a-zA-Z_0-9+-]+)*)(?=s|$)\');'
+        exception "Invalid regex pattern"
+    }
+    sql "set enable_extended_regex = true;"
+    qt_sql_regexp_extract_all_1 "select regexp_extract_all('', 
'\"([^\"]+)\":'), length(regexp_extract_all('', '\"([^\"]+)\":')) from 
test_string_function_regexp;"
+    qt_sql_regexp_extract_all_2 'SELECT REGEXP_EXTRACT_ALL(\'Apache/Doris\', 
\'([a-zA-Z_+-]+(?:\\/[a-zA-Z_0-9+-]+)*)(?=s|$)\');'
+    qt_sql_regexp_extract_all_3 'SELECT 
REGEXP_EXTRACT_ALL(\'foo123bar456baz\', \'(\\\\d{3})(?=bar|baz)\');'
+    qt_sql_regexp_extract_all_4 'SELECT 
REGEXP_EXTRACT_ALL(\'ID:AA-1,ID:BB-2,ID:CC-3\', \'(?<=ID:)([A-Z]{2}-\\\\d)\');'
+    qt_sql_regexp_extract_all_5 'SELECT 
REGEXP_EXTRACT_ALL(\'EdgeCase1EdgeCase2EdgeCase3\', 
\'(?<=Edge)(Case\\\\d)(?=Edge|$)\');'
+    sql "set enable_extended_regex = false;"
 
     qt_sql "SELECT regexp_replace('a b c', \" \", \"-\");"
     qt_sql "SELECT regexp_replace('a b c','(b)','<\\\\1>');"
@@ -83,6 +116,23 @@ suite("test_string_function_regexp") {
     qt_sql "SELECT regexp_replace_one('a b c', \" \", \"-\");"
     qt_sql "SELECT regexp_replace_one('a b b','(b)','<\\\\1>');"
 
+    sql "set enable_extended_regex = false"
+    test {
+        sql 'SELECT regexp(\'foobar\', \'(?<=foo)bar\');'
+        exception "Invalid regex expression"
+    }
+    sql "set enable_extended_regex = true;"
+    qt_regexp_fn_1 'SELECT regexp(\'abc123def\', \'abc[0-9]+\');'
+    qt_regexp_fn_2 'SELECT regexp(\'edge case test\', \'\\bcase\\b\');'
+    qt_regexp_fn_3 'SELECT regexp(\'foo123bar\', \'foo(?=123)\');'
+    qt_regexp_fn_4 'SELECT regexp(\'fooXYZbar\', \'foo(?!123)\');'
+    qt_regexp_fn_5 'SELECT regexp(\'123abc\', \'^\\d+\');'
+    qt_regexp_fn_6 'SELECT regexp(\'abc123\', \'^\\d+\');'
+    qt_regexp_fn_7 'SELECT regexp(\'foobar\', \'(?<=foo)bar\');'
+    qt_regexp_fn_8 'SELECT regexp(\'foobar\', \'(?<!foo)bar\');'
+    qt_regexp_fn_9 'SELECT regexp(\'Hello\', \'(?i)hello\');'
+    sql "set enable_extended_regex = false;"
+
     qt_sql_utf1 """ select '皖12345' REGEXP '^[皖][0-9]{5}\$'; """
     qt_sql_utf2 """ select '皖 12345' REGEXP '^[皖] [0-9]{5}\$'; """
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to