This is an automated email from the ASF dual-hosted git repository. zhaoc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new d5493fb Replace std::regex with RE2 (#1930) d5493fb is described below commit d5493fb20a1fd77765e899e95d1514c3ef30c1c8 Author: ZHAO Chun <buaa.zh...@gmail.com> AuthorDate: Fri Oct 11 15:57:53 2019 +0800 Replace std::regex with RE2 (#1930) In Storage Engine GC, TabletManager uses std::regex to extract the tablet id and schema hash from a path. However, it constructs a regex pattern for every path it checks, which is a huge waste. This change list makes this pattern a global static pattern, and replaces std::regex with RE2, which has better performance. --- be/src/olap/tablet_manager.cpp | 53 ++++++++++++----------------- be/src/olap/tablet_manager.h | 9 +++++ be/test/olap/tablet_mgr_test.cpp | 73 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 32 deletions(-) diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 2ff7726..d209496 100755 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -25,7 +25,6 @@ #include <queue> #include <set> #include <random> -#include <regex> #include <stdlib.h> #include <boost/algorithm/string/classification.hpp> @@ -34,6 +33,7 @@ #include <boost/filesystem.hpp> #include <rapidjson/document.h> #include <thrift/protocol/TDebugProtocol.h> +#include <re2/re2.h> #include "olap/base_compaction.h" #include "olap/cumulative_compaction.h" @@ -682,42 +682,31 @@ TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, SchemaHash schema return nullptr; } // get_tablet -bool TabletManager::get_tablet_id_and_schema_hash_from_path(const std::string& path, - TTabletId* tablet_id, TSchemaHash* schema_hash) { - std::vector<DataDir*> data_dirs = StorageEngine::instance()->get_stores<true>(); - for (auto data_dir : data_dirs) { - const std::string& data_dir_path = data_dir->path(); - if (path.find(data_dir_path) != std::string::npos) { - std::string pattern = data_dir_path + "/data/\\d+/(\\d+)/?(\\d+)?"; - std::regex rgx 
(pattern.c_str()); - std::smatch sm; - bool ret = std::regex_search(path, sm, rgx); - if (ret) { - if (sm.size() == 3) { - *tablet_id = std::strtoll(sm.str(1).c_str(), nullptr, 10); - *schema_hash = std::strtoll(sm.str(2).c_str(), nullptr, 10); - return true; - } else { - LOG(WARNING) << "invalid match. match size:" << sm.size(); - return false; - } - } - } +bool TabletManager::get_tablet_id_and_schema_hash_from_path( + const std::string& path, TTabletId* tablet_id, TSchemaHash* schema_hash) { + static re2::RE2 normal_re("/data/\\d+/(\\d+)/(\\d+)($|/)"); + if (RE2::PartialMatch(path, normal_re, tablet_id, schema_hash)) { + return true; } - return false; + + // If we can't match normal path pattern, this may be a path which is a empty tablet + // directory. Use this pattern to match empty tablet directory. In this case schema_hash + // will be set to zero. + static re2::RE2 empty_tablet_re("/data/\\d+/(\\d+)($|/$)"); + if (!RE2::PartialMatch(path, empty_tablet_re, tablet_id)) { + return false; + } + *schema_hash = 0; + return true; } bool TabletManager::get_rowset_id_from_path(const std::string& path, RowsetId* rowset_id) { - static std::regex rgx ("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*"); - std::smatch sm; - bool ret = std::regex_search(path, sm, rgx); + static re2::RE2 re("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*"); + std::string id_str; + bool ret = RE2::PartialMatch(path, re, &id_str); if (ret) { - if (sm.size() == 2) { - rowset_id->init(sm.str(1)); - return true; - } else { - return false; - } + rowset_id->init(id_str); + return true; } return false; } diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index 3cf1fc8..bf1c1d1 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -96,6 +96,15 @@ public: TabletUid tablet_uid, bool include_deleted = false, std::string* err = nullptr); + // Extract tablet_id and schema_hash from given path. 
+ // + // The normal path pattern is like "/data/{shard_id}/{tablet_id}/{schema_hash}/xxx.data". + // Besides that, this also support empty tablet path, which path looks like + // "/data/{shard_id}/{tablet_id}" + // + // Return true when the path matches the path pattern, and tablet_id and schema_hash is + // saved in input params. When input path is an empty tablet directory, schema_hash will + // be set to 0. Return false if the path don't match valid pattern. bool get_tablet_id_and_schema_hash_from_path(const std::string& path, TTabletId* tablet_id, TSchemaHash* schema_hash); diff --git a/be/test/olap/tablet_mgr_test.cpp b/be/test/olap/tablet_mgr_test.cpp index 8745688..6bdd098 100644 --- a/be/test/olap/tablet_mgr_test.cpp +++ b/be/test/olap/tablet_mgr_test.cpp @@ -209,6 +209,79 @@ TEST_F(TabletMgrTest, DropTablet) { ASSERT_TRUE(!dir_exist); } +TEST_F(TabletMgrTest, GetRowsetId) { + // normal case + { + std::string path = _engine_data_path + "/data/0/15007/368169781"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + ASSERT_EQ(15007, tid); + ASSERT_EQ(368169781, schema_hash); + } + { + std::string path = _engine_data_path + "/data/0/15007/368169781/"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + ASSERT_EQ(15007, tid); + ASSERT_EQ(368169781, schema_hash); + } + // normal case + { + std::string path = _engine_data_path + "/data/0/15007/368169781/020000000000000100000000000000020000000000000003_0_0.dat"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + ASSERT_EQ(15007, tid); + ASSERT_EQ(368169781, schema_hash); + + RowsetId id; + ASSERT_TRUE(_tablet_mgr.get_rowset_id_from_path(path, &id)); + EXPECT_EQ(2UL << 56 | 1, id.hi); + ASSERT_EQ(2, id.mi); + ASSERT_EQ(3, id.lo); + } + // empty tablet 
directory + { + std::string path = _engine_data_path + "/data/0/15007"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + ASSERT_EQ(15007, tid); + ASSERT_EQ(0, schema_hash); + + RowsetId id; + ASSERT_FALSE(_tablet_mgr.get_rowset_id_from_path(path, &id)); + } + // empty tablet directory + { + std::string path = _engine_data_path + "/data/0/15007/"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_TRUE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + ASSERT_EQ(15007, tid); + ASSERT_EQ(0, schema_hash); + } + // empty tablet directory + { + std::string path = _engine_data_path + "/data/0/15007abc"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_FALSE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + } + // not match pattern + { + std::string path = _engine_data_path + "/data/0/15007/123abc/020000000000000100000000000000020000000000000003_0_0.dat"; + TTabletId tid; + TSchemaHash schema_hash; + ASSERT_FALSE(_tablet_mgr.get_tablet_id_and_schema_hash_from_path(path, &tid, &schema_hash)); + + RowsetId id; + ASSERT_FALSE(_tablet_mgr.get_rowset_id_from_path(path, &id)); + } +} + } // namespace doris int main(int argc, char **argv) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org