github-actions[bot] commented on code in PR #25632:
URL: https://github.com/apache/doris/pull/25632#discussion_r1365183482


##########
be/src/olap/itoken_extractor.cpp:
##########
@@ -76,4 +80,214 @@ bool NgramTokenExtractor::next_in_string_like(const char* 
data, size_t length, s
 
     return false;
 }
+
+bool AlphaNumTokenExtractor::next_in_string(const char* data, size_t length, 
size_t* __restrict pos,

Review Comment:
   warning: method 'next_in_string' can be made static 
[readability-convert-member-functions-to-static]
   
   be/src/olap/itoken_extractor.h:103:
   ```diff
   -     bool next_in_string(const char* data, size_t length, size_t* 
__restrict pos,
   +     static bool next_in_string(const char* data, size_t length, size_t* 
__restrict pos,
   ```
   
   be/src/olap/itoken_extractor.h:105:
   ```diff
   -                         size_t* __restrict token_length) const override;
   +                         size_t* __restrict token_length) override;
   ```
   
   be/src/olap/itoken_extractor.cpp:85:
   ```diff
   -                                             size_t* __restrict 
token_length) const {
   +                                             size_t* __restrict 
token_length) {
   ```
   



##########
be/src/olap/itoken_extractor.cpp:
##########
@@ -76,4 +80,214 @@ bool NgramTokenExtractor::next_in_string_like(const char* 
data, size_t length, s
 
     return false;
 }
+
+bool AlphaNumTokenExtractor::next_in_string(const char* data, size_t length, 
size_t* __restrict pos,
+                                            size_t* __restrict token_start,
+                                            size_t* __restrict token_length) 
const {
+    *token_start = *pos;
+    *token_length = 0;
+    while (*pos < length) {
+        if (!is_alpha_numeric_ascii(data[*pos])) {
+            /// Finish current token if any
+            if (*token_length > 0) {
+                return true;
+            }
+            *token_start = ++*pos;
+        } else {
+            /// Note that UTF-8 sequence is completely consisted of non-ASCII 
bytes.
+            ++*pos;
+            ++*token_length;
+        }
+    }
+    return *token_length > 0;
+}
+
+bool AlphaNumTokenExtractor::next_in_string_like(const char* data, size_t 
length, size_t* pos,

Review Comment:
   warning: method 'next_in_string_like' can be made static 
[readability-convert-member-functions-to-static]
   
   be/src/olap/itoken_extractor.h:107:
   ```diff
   -     bool next_in_string_like(const char* data, size_t length, size_t* pos,
   -                              std::string& token) const override;
   +     static bool next_in_string_like(const char* data, size_t length, 
size_t* pos,
   +                              std::string& token) override;
   ```
   
   be/src/olap/itoken_extractor.cpp:105:
   ```diff
   -                                                  std::string& token) const 
{
   +                                                  std::string& token) {
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   
   ```suggestion
           auto iter = begin;
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;

Review Comment:
   warning: use auto when initializing with new to avoid duplicating the type 
name [modernize-use-auto]
   
   ```suggestion
                   auto* nextNode = new TrieNode;
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                return;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                break;
+            }
+            ptNode->next->erase(*iter);
+            ptNode = kmIter->second;
+            delete ptNode;
+            break;
+        }
+    }
+
+private:
+    void create_trie(const vector<Unicode>& keys) const {
+        if (keys.empty()) {
+            return;
+        }
+
+        for (size_t i = 0; i < keys.size(); i++) {
+            insert_node(keys[i]);
+        }
+    }
+
+    void delete_node(TrieNode* node) {
+        if (nullptr == node) {
+            return;
+        }
+        if (nullptr != node->next) {
+            for (TrieNode::NextMap::iterator iter = node->next->begin(); iter 
!= node->next->end();
+                 ++iter) {
+                delete_node(iter->second);
+            }
+            delete node->next;
+        }
+        delete node;
+    }
+
+public:
+    TrieNode* root_;
+}; // class Trie
+
+inline bool make_node_info(Unicode& node_info, const string& word) {
+    if (!decode_runes_in_string(word.data(), word.length(), node_info)) {
+        LOG(ERROR) << "Decode " << word << " failed.";

Review Comment:
   warning: use of undeclared identifier 'ERROR' [clang-diagnostic-error]
   ```cpp
           LOG(ERROR) << "Decode " << word << " failed.";
               ^
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   ```cpp
           for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
                ^
   ```
   this fix will not be applied because it overlaps with another fix



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset

Review Comment:
   warning: escaped string literal can be written as a raw string literal 
[modernize-raw-string-literal]
   
   ```suggestion
       return os << "{\"rune\": \"" << r.rune << R"(", "offset": )" << r.offset
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset
+              << ", \"len\": " << r.len << "}";
+}
+
+using Unicode = std::vector<Rune>;
+using RuneStrArray = std::vector<struct RuneStr>;
+
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) 
: left(l), right(r) {}
+    size_t Length() const { return right - left + 1; }
+}; // struct WordRange
+
+struct RuneStrLite {
+    uint32_t rune;
+    uint32_t len;
+    RuneStrLite() : rune(0), len(0) {}
+    RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {}
+}; // struct RuneStrLite
+
+inline RuneStrLite decode_rune_in_string(const char* str, size_t len) {
+    RuneStrLite rp(0, 0);
+    if (str == nullptr || len == 0) {
+        return rp;
+    }
+    if (!(str[0] & 0x80)) { // 0xxxxxxx
+        // 7bit, total 7bit
+        rp.rune = (uint8_t)(str[0]) & 0x7f;
+        rp.len = 1;
+    } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {

Review Comment:
   warning: 0xdf is a magic number; consider replacing it with a named constant 
[readability-magic-numbers]
   ```cpp
       } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
                                     ^
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {

Review Comment:
   warning: use range-based for loop instead [modernize-loop-convert]
   
   ```suggestion
           for (unsigned int iter : key) {
   ```
   
   be/src/util/trie.h:59:
   ```diff
   -             kmIter = ptNode->next->find(*iter);
   +             kmIter = ptNode->next->find(iter);
   ```
   
   be/src/util/trie.h:62:
   ```diff
   -                 ptNode->next->insert(make_pair(*iter, nextNode));
   +                 ptNode->next->insert(make_pair(iter, nextNode));
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   
   ```suggestion
           for (auto it = begin; it != end; ++it) {
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);

Review Comment:
   warning: use of undeclared identifier 'assert' [clang-diagnostic-error]
   ```cpp
           assert(root_ != nullptr);
           ^
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   ```cpp
           for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
                ^
   ```
   this fix will not be applied because it overlaps with another fix



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   
   ```suggestion
               auto innerIter = iter + 1;
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   
   ```suggestion
           for (auto it = begin; it != end; ++it) {
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {

Review Comment:
   warning: use range-based for loop instead [modernize-loop-convert]
   
   ```suggestion
           for (unsigned int iter : key) {
   ```
   
   be/src/util/trie.h:201:
   ```diff
   -             kmIter = ptNode->next->find(*iter);
   +             kmIter = ptNode->next->find(iter);
   ```
   
   be/src/util/trie.h:205:
   ```diff
   -             ptNode->next->erase(*iter);
   +             ptNode->next->erase(iter);
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>

Review Comment:
   warning: inclusion of deprecated C++ header 'stdint.h'; consider using 
'cstdint' instead [modernize-deprecated-headers]
   
   ```suggestion
   #include <cstdint>
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                return;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                break;
+            }
+            ptNode->next->erase(*iter);
+            ptNode = kmIter->second;
+            delete ptNode;
+            break;
+        }
+    }
+
+private:
+    void create_trie(const vector<Unicode>& keys) const {
+        if (keys.empty()) {
+            return;
+        }
+
+        for (size_t i = 0; i < keys.size(); i++) {
+            insert_node(keys[i]);

Review Comment:
   warning: use range-based for loop instead [modernize-loop-convert]
   
   ```suggestion
           for (const auto & key : keys) {
               insert_node(key);
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                return;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                break;
+            }
+            ptNode->next->erase(*iter);
+            ptNode = kmIter->second;
+            delete ptNode;
+            break;
+        }
+    }
+
+private:
+    void create_trie(const vector<Unicode>& keys) const {
+        if (keys.empty()) {
+            return;
+        }
+
+        for (size_t i = 0; i < keys.size(); i++) {
+            insert_node(keys[i]);
+        }
+    }
+
+    void delete_node(TrieNode* node) {
+        if (nullptr == node) {
+            return;
+        }
+        if (nullptr != node->next) {
+            for (TrieNode::NextMap::iterator iter = node->next->begin(); iter 
!= node->next->end();
+                 ++iter) {

Review Comment:
   warning: use range-based for loop instead [modernize-loop-convert]
   
   ```suggestion
               for (auto & iter : *node->next) {
   ```
   
   be/src/util/trie.h:230:
   ```diff
   -                 delete_node(iter->second);
   +                 delete_node(iter.second);
   ```
   



##########
be/src/util/trie.h:
##########
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "unicode.h"
+
+namespace doris {
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+using TrieKey = Rune;
+
+struct Dag {
+    vector<uint32_t> nexts;
+}; // struct Dag
+
+class TrieNode {
+public:
+    using NextMap = unordered_map<TrieKey, TrieNode*>;
+    TrieNode() = default;
+
+    NextMap* next {};
+    const Unicode* ptValue {};
+};
+
+class Trie {
+public:
+    Trie(const vector<Unicode>& keys) : root_(new TrieNode) { 
create_trie(keys); }
+    ~Trie() { delete_node(root_); }
+
+    void insert_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                ptNode->next = new TrieNode::NextMap;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                TrieNode* nextNode = new TrieNode;
+                ptNode->next->insert(make_pair(*iter, nextNode));
+                ptNode = nextNode;
+            } else {
+                ptNode = kmIter->second;
+            }
+        }
+        ptNode->ptValue = &key;
+    }
+
+    [[nodiscard]] const Unicode* find(RuneStrArray::const_iterator begin,
+                                      RuneStrArray::const_iterator end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+        const Unicode* longestWord = nullptr;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                longestWord = ptNode->ptValue;
+            }
+        }
+
+        return longestWord;
+    }
+
+    void find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator 
end,
+              vector<struct Dag>& res, size_t max_word_len = MAX_WORD_LENGTH) 
const {
+        assert(root_ != nullptr);
+
+        const TrieNode* ptNode = nullptr;
+        TrieNode::NextMap::const_iterator citer;
+        if (root_->next == nullptr) {
+            return;
+        }
+
+        RuneStrArray::const_iterator iter = begin;
+        for (; iter != end; ++iter) {
+            if (root_->next->end() != (citer = root_->next->find(iter->rune))) 
{
+                ptNode = citer->second;
+            } else {
+                ptNode = nullptr;
+            }
+
+            if (ptNode != nullptr) {
+                size_t distance = std::distance(begin, iter);
+                res[distance].nexts.emplace_back(distance);
+            } else {
+                continue;
+            }
+
+            RuneStrArray::const_iterator innerIter = iter + 1;
+            for (; innerIter != end && std::distance(iter, innerIter) <= 
max_word_len;
+                 ++innerIter) {
+                if (ptNode->next == nullptr ||
+                    ptNode->next->end() == (citer = 
ptNode->next->find(innerIter->rune))) {
+                    break;
+                }
+                ptNode = citer->second;
+                if (ptNode->ptValue != nullptr) {
+                    size_t distance = std::distance(begin, innerIter);
+                    res[std::distance(begin, 
iter)].nexts.emplace_back(distance);
+                }
+            }
+        }
+    }
+
+    const Unicode* find_next(RuneStrArray::const_iterator& it,
+                             const RuneStrArray::const_iterator& end) const {
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                return ptNode->ptValue;
+            }
+        }
+
+        return nullptr;
+    }
+
+    [[nodiscard]] std::vector<const Unicode*> 
find_all(RuneStrArray::const_iterator begin,
+                                                       
RuneStrArray::const_iterator end) const {
+        std::vector<const Unicode*> matchedWords;
+
+        if (begin == end) {
+            return matchedWords;
+        }
+
+        const TrieNode* ptNode = root_;
+        TrieNode::NextMap::const_iterator citer;
+
+        for (RuneStrArray::const_iterator it = begin; it != end; ++it) {
+            if (nullptr == ptNode->next) {
+                break;
+            }
+            citer = ptNode->next->find(it->rune);
+            if (ptNode->next->end() == citer) {
+                break;
+            }
+            ptNode = citer->second;
+            if (ptNode->ptValue) {
+                matchedWords.push_back(ptNode->ptValue);
+            }
+        }
+
+        if (begin != end && !matchedWords.empty()) {
+            std::vector<const Unicode*> subsequentMatches = find_all(begin + 
1, end);
+            matchedWords.insert(matchedWords.end(), subsequentMatches.begin(),
+                                subsequentMatches.end());
+        }
+
+        return matchedWords;
+    }
+
+    void delete_node(const Unicode& key) const {
+        if (key.begin() == key.end()) {
+            return;
+        }
+        TrieNode::NextMap::const_iterator kmIter;
+        TrieNode* ptNode = root_;
+        for (Unicode::const_iterator iter = key.begin(); iter != key.end(); 
++iter) {
+            if (nullptr == ptNode->next) {
+                return;
+            }
+            kmIter = ptNode->next->find(*iter);
+            if (ptNode->next->end() == kmIter) {
+                break;
+            }
+            ptNode->next->erase(*iter);
+            ptNode = kmIter->second;
+            delete ptNode;
+            break;
+        }
+    }
+
+private:
+    void create_trie(const vector<Unicode>& keys) const {
+        if (keys.empty()) {
+            return;
+        }
+
+        for (size_t i = 0; i < keys.size(); i++) {
+            insert_node(keys[i]);
+        }
+    }
+
+    void delete_node(TrieNode* node) {
+        if (nullptr == node) {
+            return;
+        }
+        if (nullptr != node->next) {
+            for (TrieNode::NextMap::iterator iter = node->next->begin(); iter 
!= node->next->end();

Review Comment:
   warning: use auto when declaring iterators [modernize-use-auto]
   ```cpp
               for (TrieNode::NextMap::iterator iter = node->next->begin(); 
iter != node->next->end();
                    ^
   ```
   this fix will not be applied because it overlaps with another fix



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset

Review Comment:
   warning: escaped string literal can be written as a raw string literal 
[modernize-raw-string-literal]
   
   ```suggestion
       return os << R"({"rune": ")" << r.rune << "\", \"offset\": " << r.offset
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset
+              << ", \"len\": " << r.len << "}";
+}
+
+using Unicode = std::vector<Rune>;
+using RuneStrArray = std::vector<struct RuneStr>;
+
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) 
: left(l), right(r) {}
+    size_t Length() const { return right - left + 1; }
+}; // struct WordRange
+
+struct RuneStrLite {
+    uint32_t rune;
+    uint32_t len;
+    RuneStrLite() : rune(0), len(0) {}
+    RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {}
+}; // struct RuneStrLite
+
+inline RuneStrLite decode_rune_in_string(const char* str, size_t len) {
+    RuneStrLite rp(0, 0);
+    if (str == nullptr || len == 0) {
+        return rp;
+    }
+    if (!(str[0] & 0x80)) { // 0xxxxxxx
+        // 7bit, total 7bit
+        rp.rune = (uint8_t)(str[0]) & 0x7f;

Review Comment:
   warning: 0x7f is a magic number; consider replacing it with a named constant 
[readability-magic-numbers]
   ```cpp
           rp.rune = (uint8_t)(str[0]) & 0x7f;
                                         ^
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset
+              << ", \"len\": " << r.len << "}";
+}
+
+using Unicode = std::vector<Rune>;
+using RuneStrArray = std::vector<struct RuneStr>;
+
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) 
: left(l), right(r) {}
+    size_t Length() const { return right - left + 1; }
+}; // struct WordRange
+
+struct RuneStrLite {
+    uint32_t rune;
+    uint32_t len;
+    RuneStrLite() : rune(0), len(0) {}
+    RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {}
+}; // struct RuneStrLite
+
+inline RuneStrLite decode_rune_in_string(const char* str, size_t len) {
+    RuneStrLite rp(0, 0);
+    if (str == nullptr || len == 0) {
+        return rp;
+    }
+    if (!(str[0] & 0x80)) { // 0xxxxxxx

Review Comment:
   warning: 0x80 is a magic number; consider replacing it with a named constant 
[readability-magic-numbers]
   ```cpp
       if (!(str[0] & 0x80)) { // 0xxxxxxx
                      ^
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>

Review Comment:
   warning: inclusion of deprecated C++ header 'stdlib.h'; consider using 
'cstdlib' instead [modernize-deprecated-headers]
   
   ```suggestion
   #include <cstdlib>
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset
+              << ", \"len\": " << r.len << "}";
+}
+
+using Unicode = std::vector<Rune>;
+using RuneStrArray = std::vector<struct RuneStr>;
+
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) 
: left(l), right(r) {}
+    size_t Length() const { return right - left + 1; }
+}; // struct WordRange
+
+struct RuneStrLite {
+    uint32_t rune;
+    uint32_t len;
+    RuneStrLite() : rune(0), len(0) {}
+    RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {}
+}; // struct RuneStrLite
+
+inline RuneStrLite decode_rune_in_string(const char* str, size_t len) {
+    RuneStrLite rp(0, 0);
+    if (str == nullptr || len == 0) {
+        return rp;
+    }
+    if (!(str[0] & 0x80)) { // 0xxxxxxx
+        // 7bit, total 7bit
+        rp.rune = (uint8_t)(str[0]) & 0x7f;
+        rp.len = 1;
+    } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
+        // 110xxxxxx
+        // 5bit, total 5bit
+        rp.rune = (uint8_t)(str[0]) & 0x1f;

Review Comment:
   warning: 0x1f is a magic number; consider replacing it with a named constant 
[readability-magic-numbers]
   ```cpp
           rp.rune = (uint8_t)(str[0]) & 0x1f;
                                         ^
   ```
   



##########
be/src/util/unicode.h:
##########
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace doris {
+using std::string;
+using std::string_view;
+using std::vector;
+
+using Rune = uint32_t;
+
+struct RuneStr {
+    Rune rune;
+    uint32_t offset;
+    uint32_t len;
+    RuneStr() : rune(0), offset(0), len(0) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l) : rune(r), offset(o), len(l) {}
+    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t 
unicode_length)
+            : rune(r), offset(o), len(l) {}
+}; // struct RuneStr
+
+inline std::ostream& operator<<(std::ostream& os, const RuneStr& r) {
+    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset
+              << ", \"len\": " << r.len << "}";
+}
+
+using Unicode = std::vector<Rune>;
+using RuneStrArray = std::vector<struct RuneStr>;
+
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) 
: left(l), right(r) {}
+    size_t Length() const { return right - left + 1; }
+}; // struct WordRange
+
+struct RuneStrLite {
+    uint32_t rune;
+    uint32_t len;
+    RuneStrLite() : rune(0), len(0) {}
+    RuneStrLite(uint32_t r, uint32_t l) : rune(r), len(l) {}
+}; // struct RuneStrLite
+
+inline RuneStrLite decode_rune_in_string(const char* str, size_t len) {
+    RuneStrLite rp(0, 0);
+    if (str == nullptr || len == 0) {
+        return rp;
+    }
+    if (!(str[0] & 0x80)) { // 0xxxxxxx
+        // 7bit, total 7bit
+        rp.rune = (uint8_t)(str[0]) & 0x7f;
+        rp.len = 1;
+    } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
+        // 110xxxxxx
+        // 5bit, total 5bit
+        rp.rune = (uint8_t)(str[0]) & 0x1f;
+
+        // 6bit, total 11bit
+        rp.rune <<= 6;

Review Comment:
   warning: 6 is a magic number; consider replacing it with a named constant 
[readability-magic-numbers]
   ```cpp
           rp.rune <<= 6;
                       ^
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to