This is an automated email from the ASF dual-hosted git repository.
wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/master by this push:
new 162f39986 refactor: use state machine to improve split_args for
strings (#1240)
162f39986 is described below
commit 162f39986ff4ce44a94a4132eecb9326c6692a94
Author: Dan Wang <[email protected]>
AuthorDate: Thu Nov 17 14:16:28 2022 +0800
refactor: use state machine to improve split_args for strings (#1240)
---
src/utils/strings.cpp | 231 ++++++++++++++++++++++++++++++++++-----------
src/utils/strings.h | 29 ++++--
src/utils/test/utils.cpp | 238 ++++++++++++++++++++++++++++++-----------------
3 files changed, 347 insertions(+), 151 deletions(-)
diff --git a/src/utils/strings.cpp b/src/utils/strings.cpp
index aef9af5e5..fb90cf844 100644
--- a/src/utils/strings.cpp
+++ b/src/utils/strings.cpp
@@ -24,9 +24,15 @@
* THE SOFTWARE.
*/
+#include <algorithm>
#include <cstring>
#include <sstream>
+#include <utility>
+
#include <openssl/md5.h>
+#include "utils/api_utilities.h"
+#include "utils/fmt_logging.h"
+#include "utils/ports.h"
#include "utils/strings.h"
namespace dsn {
@@ -50,75 +56,190 @@ std::string get_last_component(const std::string &input,
const char splitters[])
return input;
}
-void split_args(const char *args,
- /*out*/ std::vector<std::string> &sargs,
- char splitter,
- bool keep_place_holder)
+namespace {
+
+// The states while scan each split.
+enum class split_args_state : int
{
- sargs.clear();
- std::string v(args);
- uint64_t last_pos = 0;
- while (true) {
- auto pos = v.find(splitter, last_pos);
- if (pos != std::string::npos) {
- std::string s = trim_string((char *)v.substr(last_pos, pos -
last_pos).c_str());
- if (!s.empty()) {
- sargs.push_back(s);
- } else if (keep_place_holder) {
- sargs.emplace_back("");
- }
- last_pos = pos + 1;
- } else {
- std::string s = trim_string((char *)v.substr(last_pos).c_str());
- if (!s.empty()) {
- sargs.push_back(s);
- } else if (keep_place_holder) {
- sargs.emplace_back("");
- }
- break;
- }
- }
+ kSplitBeginning, // The initial state while starting to scan a split
+ kSplitLeadingSpaces, // While meeting leading spaces, if any
+ kSplitToken, // While running into token (after leading spaces)
+};
+
+const std::string kLeadingSpaces = " \t";
+const std::string kTrailingSpaces = " \t\r\n";
+
+inline bool is_leading_space(char ch)
+{
+ return std::any_of(
+ kLeadingSpaces.begin(), kLeadingSpaces.end(), [ch](char space) {
return ch == space; });
}
-void split_args(const char *args,
- /*out*/ std::unordered_set<std::string> &sargs,
- char splitter,
- bool keep_place_holder)
+inline bool is_trailing_space(char ch)
+{
+ return std::any_of(
+ kTrailingSpaces.begin(), kTrailingSpaces.end(), [ch](char space) {
return ch == space; });
+}
+
+// Skip trailing spaces and find the end of token.
+const char *find_token_end(const char *token_begin, const char *end)
{
- std::vector<std::string> sargs_vec;
- split_args(args, sargs_vec, splitter, keep_place_holder);
- sargs.insert(sargs_vec.begin(), sargs_vec.end());
+ CHECK(token_begin < end, "");
+
+ for (; end > token_begin && is_trailing_space(*(end - 1)); --end) {
+ }
+ return end;
}
-void split_args(const char *args, /*out*/ std::list<std::string> &sargs, char
splitter)
+// Append new element to sequence containers, such as std::vector and
std::list.
+struct SequenceInserter
{
- sargs.clear();
+ // The new element is constructed through variadic template and appended
at the end
+ // of the sequence container.
+ template <typename SequenceContainer, typename... Args>
+ void emplace(SequenceContainer &container, Args &&... args) const
+ {
+ container.emplace_back(std::forward<Args>(args)...);
+ }
+};
- std::string v(args);
+// Insert new element to associative containers, such as std::unordered_set
and std::set.
+struct AssociativeInserter
+{
+ // The new element is constructed through variadic template and inserted
into the associative
+ // container.
+ template <typename AssociativeContainer, typename... Args>
+ void emplace(AssociativeContainer &container, Args &&... args) const
+ {
+ container.emplace(std::forward<Args>(args)...);
+ }
+};
- int lastPos = 0;
- while (true) {
- auto pos = v.find(splitter, lastPos);
- if (pos != std::string::npos) {
- std::string s = v.substr(lastPos, pos - lastPos);
- if (s.length() > 0) {
- std::string s2 = trim_string((char *)s.c_str());
- if (s2.length() > 0)
- sargs.push_back(s2);
+// Split the `input` string by the only character `separator` into tokens.
Leading and trailing
+// spaces of each token will be stripped. Once the token is empty, or become
empty after
+// stripping, an empty string will be added into `output` if
`keep_place_holder` is enabled.
+//
+// `inserter` provides the only interface for all types of containers. By
`inserter`, all tokens
+// will be collected to each type of container: for sequence containers, such
as std::vector and
+// std::list, tokens will be "appended"; for associative containers, such as
std::unordered_set
+// and std::set, tokens will be "inserted".
+template <typename Inserter, typename Container>
+void split(const char *input,
+ char separator,
+ bool keep_place_holder,
+ const Inserter &inserter,
+ Container &output)
+{
+ CHECK_NOTNULL(input, "");
+
+ output.clear();
+
+ auto state = split_args_state::kSplitBeginning;
+ const char *token_begin = nullptr;
+ for (auto p = input;; ++p) {
+ if (*p == separator || *p == '\0') {
+ switch (state) {
+ case split_args_state::kSplitBeginning:
+ case split_args_state::kSplitLeadingSpaces:
+ if (keep_place_holder) {
+ // Will add an empty string as output, since all characters
+ // in this split are spaces.
+ inserter.emplace(output);
+ }
+ break;
+ case split_args_state::kSplitToken: {
+ auto token_end = find_token_end(token_begin, p);
+ CHECK(token_begin <= token_end, "");
+ if (token_begin == token_end && !keep_place_holder) {
+ // Current token is empty, and place holder is not needed.
+ break;
+ }
+ inserter.emplace(output, token_begin, token_end);
+ } break;
+ default:
+ break;
}
- lastPos = static_cast<int>(pos + 1);
- } else {
- std::string s = v.substr(lastPos);
- if (s.length() > 0) {
- std::string s2 = trim_string((char *)s.c_str());
- if (s2.length() > 0)
- sargs.push_back(s2);
+
+ if (*p == '\0') {
+ // The whole string has been scanned, just break from the loop.
+ break;
+ }
+
+ // Current token has been scanned, continue next split.
+ state = split_args_state::kSplitBeginning;
+ continue;
+ }
+
+ // Current scanned character is not `separator`.
+ switch (state) {
+ case split_args_state::kSplitBeginning:
+ if (is_leading_space(*p)) {
+ state = split_args_state::kSplitLeadingSpaces;
+ } else {
+ state = split_args_state::kSplitToken;
+ token_begin = p;
}
break;
+ case split_args_state::kSplitLeadingSpaces:
+ if (!is_leading_space(*p)) {
+ // Any character that is not leading space will be considered
as
+ // the beginning of a token. Whether all of the scanned
characters
+ // belong to the token will be decided while the `separator` is
+ // found.
+ state = split_args_state::kSplitToken;
+ token_begin = p;
+ }
+ break;
+ default:
+ break;
}
}
}
+template <typename Container>
+inline void split_to_sequence_container(const char *input,
+ char separator,
+ bool keep_place_holder,
+ Container &output)
+{
+ split(input, separator, keep_place_holder, SequenceInserter(), output);
+}
+
+template <typename Container>
+inline void split_to_associative_container(const char *input,
+ char separator,
+ bool keep_place_holder,
+ Container &output)
+{
+ split(input, separator, keep_place_holder, AssociativeInserter(), output);
+}
+
+} // anonymous namespace
+
+void split_args(const char *input,
+ std::vector<std::string> &output,
+ char separator,
+ bool keep_place_holder)
+{
+ split_to_sequence_container(input, separator, keep_place_holder, output);
+}
+
+void split_args(const char *input,
+ std::list<std::string> &output,
+ char separator,
+ bool keep_place_holder)
+{
+ split_to_sequence_container(input, separator, keep_place_holder, output);
+}
+
+void split_args(const char *input,
+ std::unordered_set<std::string> &output,
+ char separator,
+ bool keep_place_holder)
+{
+ split_to_associative_container(input, separator, keep_place_holder,
output);
+}
+
bool parse_kv_map(const char *args,
/*out*/ std::map<std::string, std::string> &kv_map,
char item_splitter,
@@ -181,12 +302,12 @@ replace_string(std::string subject, const std::string
&search, const std::string
char *trim_string(char *s)
{
- while (*s != '\0' && (*s == ' ' || *s == '\t')) {
+ while (*s != '\0' && is_leading_space(*s)) {
s++;
}
char *r = s;
s += strlen(s);
- while (s >= r && (*s == '\0' || *s == ' ' || *s == '\t' || *s == '\r' ||
*s == '\n')) {
+ while (s >= r && (*s == '\0' || is_trailing_space(*s))) {
*s = '\0';
s--;
}
diff --git a/src/utils/strings.h b/src/utils/strings.h
index dc9fb7835..79cdb7a7e 100644
--- a/src/utils/strings.h
+++ b/src/utils/strings.h
@@ -26,29 +26,38 @@
#pragma once
-#include <string>
-#include <vector>
+#include <iostream>
#include <list>
#include <map>
+#include <string>
#include <unordered_set>
-#include <iostream>
+#include <vector>
namespace dsn {
namespace utils {
inline bool is_empty(const char *str) { return str == nullptr || *str == '\0';
}
-void split_args(const char *args,
- /*out*/ std::vector<std::string> &sargs,
- char splitter = ' ',
+// Split the `input` string by the only character `separator` into tokens.
Leading and trailing
+// spaces of each token will be stripped. Once the token is empty, or become
empty after
+// stripping, an empty string will be added into `output` if
`keep_place_holder` is enabled.
+//
+// There are several overloaded `split_args` functions in the following all of
which are the
+// same except that the split tokens are collected to the different containers.
+void split_args(const char *input,
+ std::vector<std::string> &output,
+ char separator = ' ',
bool keep_place_holder = false);
-void split_args(const char *args,
- /*out*/ std::unordered_set<std::string> &sargs,
- char splitter = ' ',
+void split_args(const char *input,
+ std::list<std::string> &output,
+ char separator = ' ',
bool keep_place_holder = false);
-void split_args(const char *args, /*out*/ std::list<std::string> &sargs, char
splitter = ' ');
+void split_args(const char *input,
+ std::unordered_set<std::string> &output,
+ char separator = ' ',
+ bool keep_place_holder = false);
// kv_map sample (when item_splitter = ',' and kv_splitter = ':'):
// k1:v1,k2:v2,k3:v3
diff --git a/src/utils/test/utils.cpp b/src/utils/test/utils.cpp
index 35f6cebdd..7b8b4608e 100644
--- a/src/utils/test/utils.cpp
+++ b/src/utils/test/utils.cpp
@@ -115,96 +115,162 @@ TEST(core, check_c_string_empty)
}
}
-TEST(core, split_args)
+// For containers such as std::unordered_set, the expected result will be
deduplicated
+// at initialization. Therefore, it can be used to compare with actual result
safely.
+template <typename Container>
+void test_split_args()
{
- std::string value = "a ,b, c ";
- std::vector<std::string> sargs;
- std::list<std::string> sargs2;
- ::dsn::utils::split_args(value.c_str(), sargs, ',');
- ::dsn::utils::split_args(value.c_str(), sargs2, ',');
-
- EXPECT_EQ(sargs.size(), 3);
- EXPECT_EQ(sargs[0], "a");
- EXPECT_EQ(sargs[1], "b");
- EXPECT_EQ(sargs[2], "c");
-
- EXPECT_EQ(sargs2.size(), 3);
- auto it = sargs2.begin();
- EXPECT_EQ(*it++, "a");
- EXPECT_EQ(*it++, "b");
- EXPECT_EQ(*it++, "c");
-
- std::unordered_set<std::string> sargs_set;
- dsn::utils::split_args(value.c_str(), sargs_set, ',');
- EXPECT_EQ(sargs_set.size(), 3);
-
- // test value = ""
- value = "";
- sargs.clear();
- dsn::utils::split_args(value.c_str(), sargs, ',');
- EXPECT_EQ(sargs.size(), 0);
-
- sargs2.clear();
- dsn::utils::split_args(value.c_str(), sargs2, ',');
- EXPECT_EQ(sargs2.size(), 0);
-
- sargs_set.clear();
- dsn::utils::split_args(value.c_str(), sargs_set, ',');
- EXPECT_EQ(sargs_set.size(), 0);
+ // Test cases:
+ // - split empty string by ' ' without place holder
+ // - split empty string by ' ' with place holder
+ // - split empty string by ',' without place holder
+ // - split empty string by ',' with place holder
+ // - split a space (' ') by ' ' without place holder
+ // - split a space (' ') by ' ' with place holder
+ // - split a space (' ') by ',' without place holder
+ // - split a space (' ') by ',' with place holder
+ // - split a comma (',') by ' ' without place holder
+ // - split a comma (',') by ' ' with place holder
+ // - split a comma (',') by ',' without place holder
+ // - split a comma (',') by ',' with place holder
+ // - split 2 leading spaces by ' ' without place holder
+ // - split 2 leading spaces by ' ' with place holder
+ // - split 2 leading spaces by ',' without place holder
+ // - split 2 leading spaces by ',' with place holder
+ // - split 3 leading spaces by ' ' without place holder
+ // - split 3 leading spaces by ' ' with place holder
+ // - split 3 leading spaces by ',' without place holder
+ // - split 3 leading spaces by ',' with place holder
+ // - split 2 trailing spaces by ' ' without place holder
+ // - split 2 trailing spaces by ' ' with place holder
+ // - split 2 trailing spaces by ',' without place holder
+ // - split 2 trailing spaces by ',' with place holder
+ // - split 3 trailing spaces by ' ' without place holder
+ // - split 3 trailing spaces by ' ' with place holder
+ // - split 3 trailing spaces by ',' without place holder
+ // - split 3 trailing spaces by ',' with place holder
+ // - split a string including "\\t", "\\r" and "\\n" by ' ' without place
holder
+ // - split a string including "\\t", "\\r" and "\\n" by ' ' with place
holder
+ // - split a string including "\\t", "\\r" and "\\n" by ',' without place
holder
+ // - split a string including "\\t", "\\r" and "\\n" by ',' with place
holder
+ // - split a single letter by ' ' without place holder
+ // - split a single letter by ' ' with place holder
+ // - split a single letter by ',' without place holder
+ // - split a single letter by ',' with place holder
+ // - split a single word by ' ' without place holder
+ // - split a single word by ' ' with place holder
+ // - split a single word by ',' without place holder
+ // - split a single word by ',' with place holder
+ // - split a string including letters and words by ' ' without place holder
+ // - split a string including letters and words by ' ' with place holder
+ // - split a string including letters and words by ',' without place holder
+ // - split a string including letters and words by ',' with place holder
+ // - split a string that includes multiple letters by ' ' without place
holder
+ // - split a string that includes multiple letters by ' ' with place holder
+ // - split a string that includes multiple letters by ',' without place
holder
+ // - split a string that includes multiple letters by ',' with place holder
+ // - split a string that includes multiple words by ' ' without place
holder
+ // - split a string that includes multiple words by ' ' with place holder
+ // - split a string that includes multiple words by ',' without place
holder
+ // - split a string that includes multiple words by ',' with place holder
+ struct test_case
+ {
+ const char *input;
+ char separator;
+ bool keep_place_holder;
+ Container expected_output;
+ } tests[] = {{"", ' ', false, {}},
+ {"", ' ', true, {""}},
+ {"", ',', false, {}},
+ {"", ',', true, {""}},
+ {" ", ' ', false, {}},
+ {" ", ' ', true, {"", ""}},
+ {" ", ',', false, {}},
+ {" ", ',', true, {""}},
+ {",", ' ', false, {","}},
+ {",", ' ', true, {","}},
+ {",", ',', false, {}},
+ {",", ',', true, {"", ""}},
+ {" ", ' ', false, {}},
+ {"\t ", ' ', true, {"", ""}},
+ {" \t", ',', false, {}},
+ {"\t\t", ',', true, {""}},
+ {" \t ", ' ', false, {}},
+ {"\t ", ' ', true, {"", "", ""}},
+ {"\t\t ", ',', false, {}},
+ {" \t", ',', true, {""}},
+ {"\r ", ' ', false, {}},
+ {"\t\n", ' ', true, {""}},
+ {"\r\t", ',', false, {}},
+ {" \n", ',', true, {""}},
+ {"\n\t\r", ' ', false, {}},
+ {" \r ", ' ', true, {"", "", ""}},
+ {"\r \n", ',', false, {}},
+ {"\t\n\r", ',', true, {""}},
+ {" \\n,,\\t \\r ", ' ', false, {"\\n,,\\t", "\\r"}},
+ {" \\n,,\\t \\r ", ' ', true, {"", "\\n,,\\t", "\\r", ""}},
+ {" \\n,,\\t \\r ", ',', false, {"\\n", "\\t \\r"}},
+ {" \\n,,\\t \\r ", ',', true, {"\\n", "", "\\t \\r"}},
+ {"a", ' ', false, {"a"}},
+ {"a", ' ', true, {"a"}},
+ {"a", ',', false, {"a"}},
+ {"a", ',', true, {"a"}},
+ {"dinner", ' ', false, {"dinner"}},
+ {"dinner", ' ', true, {"dinner"}},
+ {"dinner", ',', false, {"dinner"}},
+ {"dinner", ',', true, {"dinner"}},
+ {"\t\r\na\t\tdog,\t\r\n \t\r\nand\t\r\n \t\ta\t\tcat",
+ ' ',
+ false,
+ {"\r\na\t\tdog,", "\r\nand", "a\t\tcat"}},
+ {"\t\r\na\t\tdog,\t\r\n \t\r\nand\t\r\n \t\ta\t\tcat",
+ ' ',
+ true,
+ {"\r\na\t\tdog,", "", "\r\nand", "a\t\tcat"}},
+ {"\t\r\na\t\tdog,\t\r\n \t\r\nand\t\r\n \t\ta\t\tcat",
+ ',',
+ false,
+ {"\r\na\t\tdog", "\r\n \t\r\nand\t\r\n \t\ta\t\tcat"}},
+ {"\t\r\na\t\tdog,\t\r\n \t\r\nand\t\r\n \t\ta\t\tcat",
+ ',',
+ true,
+ {"\r\na\t\tdog", "\r\n \t\r\nand\t\r\n \t\ta\t\tcat"}},
+ {"a ,b, ,c ", ' ', false, {"a", ",b,", ",c"}},
+ {"a ,b, ,c ", ' ', true, {"a", ",b,", ",c", ""}},
+ {"a ,b, ,c ", ',', false, {"a", "b", "c"}},
+ {"a ,b, ,c ", ',', true, {"a", "b", "", "c"}},
+ {" in early 2000s , too, ", ' ', false, {"in", "early",
"2000s", ",", "too,"}},
+ {" in early 2000s , too, ",
+ ' ',
+ true,
+ {"", "in", "", "early", "2000s", ",", "", "too,", ""}},
+ {" in early 2000s , too, ", ',', false, {"in early 2000s",
"too"}},
+ {" in early 2000s , too, ", ',', true, {"in early 2000s",
"too", ""}}};
+
+ for (const auto &test : tests) {
+ Container actual_output;
+ split_args(test.input, actual_output, test.separator,
test.keep_place_holder);
+ EXPECT_EQ(actual_output, test.expected_output);
+
+ // Test default value (i.e. false) for keep_place_holder.
+ if (!test.keep_place_holder) {
+ split_args(test.input, actual_output, test.separator);
+ EXPECT_EQ(actual_output, test.expected_output);
+
+ // Test default value (i.e. ' ') for separator.
+ if (test.separator == ' ') {
+ split_args(test.input, actual_output);
+ EXPECT_EQ(actual_output, test.expected_output);
+ }
+ }
+ }
}
-TEST(core, split_args_keep_place_holder)
+TEST(core, split_args)
{
- std::string value = "a ,b, c ";
- std::vector<std::string> sargs;
- ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
- EXPECT_EQ(sargs.size(), 3);
- EXPECT_EQ(sargs[0], "a");
- EXPECT_EQ(sargs[1], "b");
- EXPECT_EQ(sargs[2], "c");
-
- value = " , a ,b, c ";
- sargs.clear();
- ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
- EXPECT_EQ(sargs.size(), 4);
- EXPECT_EQ(sargs[0], "");
- EXPECT_EQ(sargs[1], "a");
- EXPECT_EQ(sargs[2], "b");
- EXPECT_EQ(sargs[3], "c");
-
- value = "a ,b, , c";
- sargs.clear();
- ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
- EXPECT_EQ(sargs.size(), 4);
- EXPECT_EQ(sargs[0], "a");
- EXPECT_EQ(sargs[1], "b");
- EXPECT_EQ(sargs[2], "");
- EXPECT_EQ(sargs[3], "c");
-
- value = "a ,b, c , ";
- sargs.clear();
- ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
- EXPECT_EQ(sargs.size(), 4);
- EXPECT_EQ(sargs[0], "a");
- EXPECT_EQ(sargs[1], "b");
- EXPECT_EQ(sargs[2], "c");
- EXPECT_EQ(sargs[3], "");
-
- value = ", a ,b, ,c , ";
- sargs.clear();
- ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
- EXPECT_EQ(sargs.size(), 6);
- EXPECT_EQ(sargs[0], "");
- EXPECT_EQ(sargs[1], "a");
- EXPECT_EQ(sargs[2], "b");
- EXPECT_EQ(sargs[3], "");
- EXPECT_EQ(sargs[4], "c");
- EXPECT_EQ(sargs[5], "");
+ test_split_args<std::vector<std::string>>();
+ test_split_args<std::list<std::string>>();
+ test_split_args<std::unordered_set<std::string>>();
}
TEST(core, trim_string)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]