This is an automated email from the ASF dual-hosted git repository.

zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d5493fb  Replace std::regex with RE2 (#1930)
d5493fb is described below

commit d5493fb20a1fd77765e899e95d1514c3ef30c1c8
Author: ZHAO Chun <buaa.zh...@gmail.com>
AuthorDate: Fri Oct 11 15:57:53 2019 +0800

    Replace std::regex with RE2 (#1930)
    
    In Storage Engine GC, TabletManager uses std::regex to extract the
    tablet id and schema hash from a path. However, it constructs a new
    regex pattern for every path it checks, which is a huge waste. This
    change makes the pattern a static that is constructed only once, and
    replaces std::regex with RE2, which has better performance.
---
 be/src/olap/tablet_manager.cpp   | 53 ++++++++++++-----------------
 be/src/olap/tablet_manager.h     |  9 +++++
 be/test/olap/tablet_mgr_test.cpp | 73 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 32 deletions(-)

diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index 2ff7726..d209496 100755
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -25,7 +25,6 @@
 #include <queue>
 #include <set>
 #include <random>
-#include <regex>
 #include <stdlib.h>
 
 #include <boost/algorithm/string/classification.hpp>
@@ -34,6 +33,7 @@
 #include <boost/filesystem.hpp>
 #include <rapidjson/document.h>
 #include <thrift/protocol/TDebugProtocol.h>
+#include <re2/re2.h>
 
 #include "olap/base_compaction.h"
 #include "olap/cumulative_compaction.h"
@@ -682,42 +682,31 @@ TabletSharedPtr TabletManager::get_tablet(TTabletId 
tablet_id, SchemaHash schema
     return nullptr;
 } // get_tablet
 
-bool TabletManager::get_tablet_id_and_schema_hash_from_path(const std::string& 
path,
-        TTabletId* tablet_id, TSchemaHash* schema_hash) {
-    std::vector<DataDir*> data_dirs = 
StorageEngine::instance()->get_stores<true>();
-    for (auto data_dir : data_dirs) {
-        const std::string& data_dir_path = data_dir->path();
-        if (path.find(data_dir_path) != std::string::npos) {
-            std::string pattern = data_dir_path + "/data/\\d+/(\\d+)/?(\\d+)?";
-            std::regex rgx (pattern.c_str());
-            std::smatch sm;
-            bool ret = std::regex_search(path, sm, rgx);
-            if (ret) {
-                if (sm.size() == 3) {
-                    *tablet_id = std::strtoll(sm.str(1).c_str(), nullptr, 10);
-                    *schema_hash = std::strtoll(sm.str(2).c_str(), nullptr, 
10);
-                    return true;
-                } else {
-                    LOG(WARNING) << "invalid match. match size:" << sm.size();
-                    return false;
-                }
-            }
-        }
+bool TabletManager::get_tablet_id_and_schema_hash_from_path(
+        const std::string& path, TTabletId* tablet_id, TSchemaHash* 
schema_hash) {
+    static re2::RE2 normal_re("/data/\\d+/(\\d+)/(\\d+)($|/)");
+    if (RE2::PartialMatch(path, normal_re, tablet_id, schema_hash)) {
+        return true;
     }
-    return false;
+
+    // If we can't match normal path pattern, this may be a path which is a 
empty tablet
+    // directory. Use this pattern to match empty tablet directory. In this 
case schema_hash
+    // will be set to zero.
+    static re2::RE2 empty_tablet_re("/data/\\d+/(\\d+)($|/$)");
+    if (!RE2::PartialMatch(path, empty_tablet_re, tablet_id)) {
+        return false;
+    }
+    *schema_hash = 0;
+    return true;
 }
 
 bool TabletManager::get_rowset_id_from_path(const std::string& path, RowsetId* 
rowset_id) {
-    static std::regex rgx ("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*");
-    std::smatch sm;
-    bool ret = std::regex_search(path, sm, rgx);
+    static re2::RE2 re("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*");
+    std::string id_str;
+    bool ret = RE2::PartialMatch(path, re, &id_str);
     if (ret) {
-        if (sm.size() == 2) {
-            rowset_id->init(sm.str(1));
-            return true;
-        } else {
-            return false;
-        }
+        rowset_id->init(id_str);
+        return true;
     }
     return false;
 }
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index 3cf1fc8..bf1c1d1 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -96,6 +96,15 @@ public:
                                TabletUid tablet_uid, bool include_deleted = 
false,
                                std::string* err = nullptr);
 
+    // Extract tablet_id and schema_hash from given path.
+    //
+    // The normal path pattern is like 
"/data/{shard_id}/{tablet_id}/{schema_hash}/xxx.data".
+    // Besides that, this also support empty tablet path, which path looks like
+    // "/data/{shard_id}/{tablet_id}"
+    //
+    // Return true when the path matches the path pattern, and tablet_id and 
schema_hash is
+    // saved in input params. When input path is an empty tablet directory, 
schema_hash will
+    // be set to 0. Return false if the path don't match valid pattern.
     bool get_tablet_id_and_schema_hash_from_path(const std::string& path,
             TTabletId* tablet_id, TSchemaHash* schema_hash);
 
diff --git a/be/test/olap/tablet_mgr_test.cpp b/be/test/olap/tablet_mgr_test.cpp
index 8745688..6bdd098 100644
--- a/be/test/olap/tablet_mgr_test.cpp
+++ b/be/test/olap/tablet_mgr_test.cpp
@@ -209,6 +209,79 @@ TEST_F(TabletMgrTest, DropTablet) {
     ASSERT_TRUE(!dir_exist);
 }
 
+TEST_F(TabletMgrTest, GetRowsetId) {
+    // normal case
+    {
+        std::string path = _engine_data_path + "/data/0/15007/368169781";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+        ASSERT_EQ(15007, tid);
+        ASSERT_EQ(368169781, schema_hash);
+    }
+    {
+        std::string path = _engine_data_path + "/data/0/15007/368169781/";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+        ASSERT_EQ(15007, tid);
+        ASSERT_EQ(368169781, schema_hash);
+    }
+    // normal case
+    {
+        std::string path = _engine_data_path + 
"/data/0/15007/368169781/020000000000000100000000000000020000000000000003_0_0.dat";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+        ASSERT_EQ(15007, tid);
+        ASSERT_EQ(368169781, schema_hash);
+
+        RowsetId id;
+        ASSERT_TRUE(_tablet_mgr.get_rowset_id_from_path(path, &id));
+        EXPECT_EQ(2UL << 56 | 1, id.hi);
+        ASSERT_EQ(2, id.mi);
+        ASSERT_EQ(3, id.lo);
+    }
+    // empty tablet directory
+    {
+        std::string path = _engine_data_path + "/data/0/15007";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+        ASSERT_EQ(15007, tid);
+        ASSERT_EQ(0, schema_hash);
+
+        RowsetId id;
+        ASSERT_FALSE(_tablet_mgr.get_rowset_id_from_path(path, &id));
+    }
+    // empty tablet directory
+    {
+        std::string path = _engine_data_path + "/data/0/15007/";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+        ASSERT_EQ(15007, tid);
+        ASSERT_EQ(0, schema_hash);
+    }
+    // empty tablet directory
+    {
+        std::string path = _engine_data_path + "/data/0/15007abc";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_FALSE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+    }
+    // not match pattern
+    {
+        std::string path = _engine_data_path + 
"/data/0/15007/123abc/020000000000000100000000000000020000000000000003_0_0.dat";
+        TTabletId tid;
+        TSchemaHash schema_hash;
+        ASSERT_FALSE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, 
&tid, &schema_hash));
+
+        RowsetId id;
+        ASSERT_FALSE(_tablet_mgr.get_rowset_id_from_path(path, &id));
+    }
+}
+
 }  // namespace doris
 
 int main(int argc, char **argv) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to