Copilot commented on code in PR #57643:
URL: https://github.com/apache/doris/pull/57643#discussion_r2493198865
##########
be/src/vec/functions/function_regexp.cpp:
##########
@@ -51,6 +52,136 @@
namespace doris::vectorized {
#include "common/compile_check_begin.h"
+
+// Helper structure to hold either RE2 or Boost.Regex
+struct RegexpExtractEngine {
+ std::unique_ptr<re2::RE2> re2_regex;
+ std::unique_ptr<boost::regex> boost_regex;
+
+ bool is_boost() const { return boost_regex != nullptr; }
+ bool is_re2() const { return re2_regex != nullptr; }
+
+ // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails
+ static bool compile(const StringRef& pattern, std::string* error_str,
+ RegexpExtractEngine& engine, bool
enable_extended_regex) {
+ engine.re2_regex =
std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size));
+ if (engine.re2_regex->ok()) {
+ return true;
+ } else if (!enable_extended_regex) {
+ *error_str =
+ fmt::format("Invalid regex pattern: {}. Error: {}",
+ std::string(pattern.data, pattern.size),
engine.re2_regex->error());
+ return false;
+ }
+
+ // RE2 failed, try Boost.Regex for advanced features like zero-width
assertions
+ engine.re2_regex.reset();
+ try {
+ boost::regex::flag_type flags = boost::regex::normal;
+ engine.boost_regex = std::make_unique<boost::regex>(pattern.data,
+ pattern.data +
pattern.size, flags);
+ return true;
+ } catch (const boost::regex_error& e) {
+ if (error_str) {
+ *error_str = fmt::format("Invalid regex pattern: {}. Error:
{}",
+ std::string(pattern.data,
pattern.size), e.what());
+ }
+ return false;
+ }
+ }
+
+ // Get number of capturing groups
+ int number_of_capturing_groups() const {
+ if (is_re2()) {
+ return re2_regex->NumberOfCapturingGroups();
+ } else if (is_boost()) {
+ return static_cast<int>(boost_regex->mark_count());
+ }
+ return 0;
+ }
+
+ // Match function for extraction
+ bool match_and_extract(const char* data, size_t size, int index,
std::string& result) const {
+ if (is_re2()) {
+ int max_matches = 1 + re2_regex->NumberOfCapturingGroups();
+ if (index >= max_matches) {
+ return false;
+ }
+ std::vector<re2::StringPiece> matches(max_matches);
+ bool success = re2_regex->Match(re2::StringPiece(data, size), 0,
size,
+ re2::RE2::UNANCHORED,
matches.data(), max_matches);
+ if (success && index < matches.size()) {
+ const re2::StringPiece& match = matches[index];
+ result.assign(match.data(), match.size());
+ return true;
+ }
+ return false;
+ } else if (is_boost()) {
+ boost::cmatch matches;
+ bool success = boost::regex_search(data, data + size, matches,
*boost_regex);
+ if (success && index < matches.size()) {
+ result = matches[index].str();
+ return true;
+ }
+ return false;
+ }
+ return false;
+ }
Review Comment:
Potential type safety issue: `index` parameter is `int`, but it's compared
with `matches.size()` which returns `size_t` (unsigned). At line 122, if
`index` is negative, the comparison `index < matches.size()` converts it to a
large unsigned value, so the check fails and the function returns false only
as a side effect of the implicit conversion rather than by an explicit bounds
check (the mixed signed/unsigned comparison can also trigger sign-compare
warnings). Consider adding a check `if (index < 0) return false;` before the
comparisons.
##########
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java:
##########
@@ -3094,6 +3096,11 @@ public boolean isEnableESParallelScroll() {
)
public int defaultVariantMaxSparseColumnStatisticsSize = 10000;
+ @VariableMgr.VarAttr(name = ENABLE_EXTENDED_REGEX, needForward = true,
affectQueryResult = true,
+ description = {"是否启用扩展的正则表达式, 支持如 look-round 类的零宽断言",
+ "Enable extended regular expressions, support look-round
zero-width assertions"})
Review Comment:
Typo in both the Chinese and English descriptions: 'look-round' should be
'look-around' to match the terminology used elsewhere in the codebase.
```suggestion
description = {"是否启用扩展的正则表达式, 支持如 look-around 类的零宽断言",
"Enable extended regular expressions, support
look-around zero-width assertions"})
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]