github-actions[bot] commented on code in PR #60727:
URL: https://github.com/apache/doris/pull/60727#discussion_r2881626712


##########
be/src/olap/tablet_meta.cpp:
##########
@@ -82,6 +82,14 @@ bvar::Window<bvar::Adder<uint64_t>> 
g_contains_agg_with_cache_if_eligible_full_h
         "g_contains_agg_with_cache_if_eligible_full_hit_1m",
         &g_contains_agg_with_cache_if_eligible_full_hit, 60);
 
+namespace {
+
+inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) {
+    return static_cast<PatternTypePB>(pattern_type);

Review Comment:
   **[Moderate] Fragile enum conversion via static_cast**
   
   This `static_cast` relies on `TPatternType` and `PatternTypePB` having 
identical ordinal values, which is currently true but fragile. The previous 
`switch` approach provided compiler warnings (`-Wswitch`) for missing cases. 
Consider adding compile-time guards:
   
   ```cpp
   static_assert(static_cast<int>(TPatternType::MATCH_NAME) == 
static_cast<int>(PatternTypePB::MATCH_NAME));
   static_assert(static_cast<int>(TPatternType::MATCH_NAME_GLOB) == 
static_cast<int>(PatternTypePB::MATCH_NAME_GLOB));
   static_assert(static_cast<int>(TPatternType::SKIP_NAME) == 
static_cast<int>(PatternTypePB::SKIP_NAME));
   static_assert(static_cast<int>(TPatternType::SKIP_NAME_GLOB) == 
static_cast<int>(PatternTypePB::SKIP_NAME_GLOB));
   ```
   
   Alternatively, restore a `switch` statement with a `default` case that 
returns an error or logs a warning.



##########
be/src/vec/common/variant_util.cpp:
##########
@@ -103,162 +105,126 @@
 namespace doris::vectorized::variant_util {
 #include "common/compile_check_begin.h"
 
-inline void append_escaped_regex_char(std::string* regex_output, char ch) {
-    switch (ch) {
-    case '.':
-    case '^':
-    case '$':
-    case '+':
-    case '*':
-    case '?':
-    case '(':
-    case ')':
-    case '|':
-    case '{':
-    case '}':
-    case '[':
-    case ']':
-    case '\\':
-        regex_output->push_back('\\');
-        regex_output->push_back(ch);
-        break;
-    default:
-        regex_output->push_back(ch);
-        break;
-    }
-}
+// Enable RE2::Set when glob pattern count is large enough to amortize setup 
cost.
+constexpr size_t kSkipRe2SetThreshold = 32;
 
-// Small LRU to cap compiled glob patterns
-constexpr size_t kGlobRegexCacheCapacity = 256;
+Status build_compiled_skip_patterns(
+        const std::vector<std::pair<std::string, PatternTypePB>>& 
skip_patterns,
+        bool enable_re2_set, ParseConfig* parse_config) {
+    if (parse_config == nullptr) {
+        return Status::InvalidArgument("ParseConfig for compiled skip patterns 
is null");
+    }
 
-struct GlobRegexCacheEntry {
-    std::shared_ptr<RE2> re2;
-    std::list<std::string>::iterator lru_it;
-};
+    parse_config->skip_exact_patterns.clear();
+    parse_config->compiled_skip_glob_regexes.clear();
+    parse_config->compiled_skip_glob_regex_set.reset();
+    parse_config->compiled_skip_use_re2_set = false;
+    parse_config->skip_exact_patterns.reserve(skip_patterns.size());
 
-static std::mutex g_glob_regex_cache_mutex;
-static std::list<std::string> g_glob_regex_cache_lru;
-static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
+    std::vector<std::string> glob_regex_patterns;
+    glob_regex_patterns.reserve(skip_patterns.size());
+    for (const auto& [pattern, pt] : skip_patterns) {
+        if (is_skip_exact_path_pattern_type(pt)) {
+            parse_config->skip_exact_patterns.insert(pattern);
+            continue;
+        }
+        if (!is_skip_glob_path_pattern_type(pt)) {
+            continue;
+        }
 
-std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
-    {
-        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
-        auto it = g_glob_regex_cache.find(glob_pattern);
-        if (it != g_glob_regex_cache.end()) {
-            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), 
g_glob_regex_cache_lru,
-                                          it->second.lru_it);
-            return it->second.re2;
+        std::string regex_pattern;
+        auto st = glob_to_regex(pattern, &regex_pattern);
+        if (!st.ok()) {
+            continue;
         }
+        glob_regex_patterns.emplace_back(std::move(regex_pattern));
     }
-    std::string regex_pattern;
-    Status st = glob_to_regex(glob_pattern, &regex_pattern);
-    if (!st.ok()) {
-        return nullptr;
-    }
-    auto compiled = std::make_shared<RE2>(regex_pattern);
-    if (!compiled->ok()) {
-        return nullptr;
+
+    if (glob_regex_patterns.empty()) {
+        return Status::OK();
     }
-    {
-        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
-        auto it = g_glob_regex_cache.find(glob_pattern);
-        if (it != g_glob_regex_cache.end()) {
-            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), 
g_glob_regex_cache_lru,
-                                          it->second.lru_it);
-            return it->second.re2;
+
+    if (enable_re2_set && glob_regex_patterns.size() >= kSkipRe2SetThreshold) {
+        RE2::Options options;
+        auto set = std::make_unique<RE2::Set>(options, RE2::ANCHOR_BOTH);
+        for (const auto& regex_pattern : glob_regex_patterns) {
+            if (set->Add(regex_pattern, nullptr) < 0) {
+                return Status::InvalidArgument(
+                        "Failed to add regexp '{}' into skip pattern matcher 
set", regex_pattern);
+            }
         }
-        g_glob_regex_cache_lru.push_front(glob_pattern);
-        g_glob_regex_cache.emplace(glob_pattern,
-                                   GlobRegexCacheEntry {compiled, 
g_glob_regex_cache_lru.begin()});
-        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
-            const std::string& evict_key = g_glob_regex_cache_lru.back();
-            g_glob_regex_cache.erase(evict_key);
-            g_glob_regex_cache_lru.pop_back();
+        if (!set->Compile()) {
+            return Status::InvalidArgument("Failed to compile skip pattern 
matcher set");
+        }
+        parse_config->compiled_skip_glob_regex_set = std::move(set);
+        parse_config->compiled_skip_use_re2_set = true;
+    } else {
+        
parse_config->compiled_skip_glob_regexes.reserve(glob_regex_patterns.size());
+        for (const auto& regex_pattern : glob_regex_patterns) {
+            auto compiled = std::make_unique<RE2>(regex_pattern);
+            if (!compiled->ok()) {
+                return Status::InvalidArgument(
+                        "Invalid regexp '{}' generated from skip glob pattern: 
{}", regex_pattern,
+                        compiled->error());
+            }
+            
parse_config->compiled_skip_glob_regexes.emplace_back(std::move(compiled));
         }
     }
-    return compiled;
+
+    return Status::OK();
 }
 
-// Convert a restricted glob pattern into a regex.
-// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
-Status glob_to_regex(const std::string& glob_pattern, std::string* 
regex_pattern) {
-    regex_pattern->clear();
-    regex_pattern->append("^");
-    bool is_escaped = false;
-    size_t pattern_length = glob_pattern.size();
-    for (size_t index = 0; index < pattern_length; ++index) {

Review Comment:
   **[Low] Heap allocation per call in hot parsing path**
   
   `std::vector<int> matched_indexes` is allocated on every call to 
`should_skip_path` when using the RE2::Set code path. Since this function is 
called for every key in every JSON object during ingestion, the repeated 
allocations could add up. Consider:
   
   1. Using a `thread_local static std::vector<int>` that is `.clear()`ed each 
call, or
   2. Accepting a pre-allocated vector as a parameter
   
   This is a micro-optimization, so it may not be significant in practice 
depending on pattern count.



##########
be/src/vec/json/json_parser.cpp:
##########
@@ -25,11 +25,11 @@
 #include <glog/logging.h>
 
 #include <algorithm>
-#include <string_view>
 
 #include "common/cast_set.h"
 #include "common/config.h"

Review Comment:
   **[Low-Medium] Redundant recomputation inside inner loop**
   
   The `has_skip_patterns` check depends only on `ctx.parse_config` and 
`ctx.parse_config->skip_patterns`, neither of which change during the loop 
iteration. Consider hoisting this before the `for` loop:
   
   ```cpp
   const bool has_skip_patterns = ctx.parse_config != nullptr &&
                                  ctx.parse_config->skip_patterns != nullptr &&
                                  !ctx.parse_config->skip_patterns->empty();
   for (auto it = object.begin(); it != object.end(); ++it) {
       // ...
   }
   ```
   
   This improves readability and avoids re-evaluating the condition on every 
key.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to