[incubator-pegasus] branch master updated: refactor: use state machine to improve split_args for strings (#1240)

wangdan Wed, 16 Nov 2022 22:16:43 -0800

This is an automated email from the ASF dual-hosted git repository.

wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git



The following commit(s) were added to refs/heads/master by this push:
     new 162f39986 refactor: use state machine to improve split_args for 
strings (#1240)
162f39986 is described below

commit 162f39986ff4ce44a94a4132eecb9326c6692a94
Author: Dan Wang <[email protected]>
AuthorDate: Thu Nov 17 14:16:28 2022 +0800

    refactor: use state machine to improve split_args for strings (#1240)
---
 src/utils/strings.cpp    | 231 ++++++++++++++++++++++++++++++++++-----------
 src/utils/strings.h      |  29 ++++--
 src/utils/test/utils.cpp | 238 ++++++++++++++++++++++++++++++-----------------
 3 files changed, 347 insertions(+), 151 deletions(-)

diff --git a/src/utils/strings.cpp b/src/utils/strings.cpp
index aef9af5e5..fb90cf844 100644
--- a/src/utils/strings.cpp
+++ b/src/utils/strings.cpp
@@ -24,9 +24,15 @@
  * THE SOFTWARE.
  */
 
+#include <algorithm>
 #include <cstring>
 #include <sstream>
+#include <utility>
+
 #include <openssl/md5.h>
+#include "utils/api_utilities.h"
+#include "utils/fmt_logging.h"
+#include "utils/ports.h"
 #include "utils/strings.h"
 
 namespace dsn {
@@ -50,75 +56,190 @@ std::string get_last_component(const std::string &input, 
const char splitters[])
         return input;
 }
 
-void split_args(const char *args,
-                /*out*/ std::vector<std::string> &sargs,
-                char splitter,
-                bool keep_place_holder)
+namespace {
+
+// The states while scan each split.
+enum class split_args_state : int
 {
-    sargs.clear();
-    std::string v(args);
-    uint64_t last_pos = 0;
-    while (true) {
-        auto pos = v.find(splitter, last_pos);
-        if (pos != std::string::npos) {
-            std::string s = trim_string((char *)v.substr(last_pos, pos - 
last_pos).c_str());
-            if (!s.empty()) {
-                sargs.push_back(s);
-            } else if (keep_place_holder) {
-                sargs.emplace_back("");
-            }
-            last_pos = pos + 1;
-        } else {
-            std::string s = trim_string((char *)v.substr(last_pos).c_str());
-            if (!s.empty()) {
-                sargs.push_back(s);
-            } else if (keep_place_holder) {
-                sargs.emplace_back("");
-            }
-            break;
-        }
-    }
+    kSplitBeginning,     // The initial state while starting to scan a split
+    kSplitLeadingSpaces, // While meeting leading spaces, if any
+    kSplitToken,         // While running into token (after leading spaces)
+};
+
+const std::string kLeadingSpaces = " \t";
+const std::string kTrailingSpaces = " \t\r\n";
+
+inline bool is_leading_space(char ch)
+{
+    return std::any_of(
+        kLeadingSpaces.begin(), kLeadingSpaces.end(), [ch](char space) { 
return ch == space; });
 }
 
-void split_args(const char *args,
-                /*out*/ std::unordered_set<std::string> &sargs,
-                char splitter,
-                bool keep_place_holder)
+inline bool is_trailing_space(char ch)
+{
+    return std::any_of(
+        kTrailingSpaces.begin(), kTrailingSpaces.end(), [ch](char space) { 
return ch == space; });
+}
+
+// Skip trailing spaces and find the end of token.
+const char *find_token_end(const char *token_begin, const char *end)
 {
-    std::vector<std::string> sargs_vec;
-    split_args(args, sargs_vec, splitter, keep_place_holder);
-    sargs.insert(sargs_vec.begin(), sargs_vec.end());
+    CHECK(token_begin < end, "");
+
+    for (; end > token_begin && is_trailing_space(*(end - 1)); --end) {
+    }
+    return end;
 }
 
-void split_args(const char *args, /*out*/ std::list<std::string> &sargs, char 
splitter)
+// Append new element to sequence containers, such as std::vector and 
std::list.
+struct SequenceInserter
 {
-    sargs.clear();
+    // The new element is constructed through variadic template and appended 
at the end
+    // of the sequence container.
+    template <typename SequenceContainer, typename... Args>
+    void emplace(SequenceContainer &container, Args &&... args) const
+    {
+        container.emplace_back(std::forward<Args>(args)...);
+    }
+};
 
-    std::string v(args);
+// Insert new element to associative containers, such as std::unordered_set 
and std::set.
+struct AssociativeInserter
+{
+    // The new element is constructed through variadic template and inserted 
into the associative
+    // container.
+    template <typename AssociativeContainer, typename... Args>
+    void emplace(AssociativeContainer &container, Args &&... args) const
+    {
+        container.emplace(std::forward<Args>(args)...);
+    }
+};
 
-    int lastPos = 0;
-    while (true) {
-        auto pos = v.find(splitter, lastPos);
-        if (pos != std::string::npos) {
-            std::string s = v.substr(lastPos, pos - lastPos);
-            if (s.length() > 0) {
-                std::string s2 = trim_string((char *)s.c_str());
-                if (s2.length() > 0)
-                    sargs.push_back(s2);
+// Split the `input` string by the only character `separator` into tokens. 
Leading and trailing
+// spaces of each token will be stripped. Once the token is empty, or become 
empty after
+// stripping, an empty string will be added into `output` if 
`keep_place_holder` is enabled.
+//
+// `inserter` provides the only interface for all types of containers. By 
`inserter`, all tokens
+// will be collected to each type of container: for sequence containers, such 
as std::vector and
+// std::list, tokens will be "appended"; for associative containers, such as 
std::unordered_set
+// and std::set, tokens will be "inserted".
+template <typename Inserter, typename Container>
+void split(const char *input,
+           char separator,
+           bool keep_place_holder,
+           const Inserter &inserter,
+           Container &output)
+{
+    CHECK_NOTNULL(input, "");
+
+    output.clear();
+
+    auto state = split_args_state::kSplitBeginning;
+    const char *token_begin = nullptr;
+    for (auto p = input;; ++p) {
+        if (*p == separator || *p == '\0') {
+            switch (state) {
+            case split_args_state::kSplitBeginning:
+            case split_args_state::kSplitLeadingSpaces:
+                if (keep_place_holder) {
+                    // Will add an empty string as output, since all characters
+                    // in this split are spaces.
+                    inserter.emplace(output);
+                }
+                break;
+            case split_args_state::kSplitToken: {
+                auto token_end = find_token_end(token_begin, p);
+                CHECK(token_begin <= token_end, "");
+                if (token_begin == token_end && !keep_place_holder) {
+                    // Current token is empty, and place holder is not needed.
+                    break;
+                }
+                inserter.emplace(output, token_begin, token_end);
+            } break;
+            default:
+                break;
             }
-            lastPos = static_cast<int>(pos + 1);
-        } else {
-            std::string s = v.substr(lastPos);
-            if (s.length() > 0) {
-                std::string s2 = trim_string((char *)s.c_str());
-                if (s2.length() > 0)
-                    sargs.push_back(s2);
+
+            if (*p == '\0') {
+                // The whole string has been scanned, just break from the loop.
+                break;
+            }
+
+            // Current token has been scanned, continue next split.
+            state = split_args_state::kSplitBeginning;
+            continue;
+        }
+
+        // Current scanned character is not `separator`.
+        switch (state) {
+        case split_args_state::kSplitBeginning:
+            if (is_leading_space(*p)) {
+                state = split_args_state::kSplitLeadingSpaces;
+            } else {
+                state = split_args_state::kSplitToken;
+                token_begin = p;
             }
             break;
+        case split_args_state::kSplitLeadingSpaces:
+            if (!is_leading_space(*p)) {
+                // Any character that is not leading space will be considered 
as
+                // the beginning of a token. Whether all of the scanned 
characters
+                // belong to the token will be decided while the `separator` is
+                // found.
+                state = split_args_state::kSplitToken;
+                token_begin = p;
+            }
+            break;
+        default:
+            break;
         }
     }
 }
 
+template <typename Container>
+inline void split_to_sequence_container(const char *input,
+                                        char separator,
+                                        bool keep_place_holder,
+                                        Container &output)
+{
+    split(input, separator, keep_place_holder, SequenceInserter(), output);
+}
+
+template <typename Container>
+inline void split_to_associative_container(const char *input,
+                                           char separator,
+                                           bool keep_place_holder,
+                                           Container &output)
+{
+    split(input, separator, keep_place_holder, AssociativeInserter(), output);
+}
+
+} // anonymous namespace
+
+void split_args(const char *input,
+                std::vector<std::string> &output,
+                char separator,
+                bool keep_place_holder)
+{
+    split_to_sequence_container(input, separator, keep_place_holder, output);
+}
+
+void split_args(const char *input,
+                std::list<std::string> &output,
+                char separator,
+                bool keep_place_holder)
+{
+    split_to_sequence_container(input, separator, keep_place_holder, output);
+}
+
+void split_args(const char *input,
+                std::unordered_set<std::string> &output,
+                char separator,
+                bool keep_place_holder)
+{
+    split_to_associative_container(input, separator, keep_place_holder, 
output);
+}
+
 bool parse_kv_map(const char *args,
                   /*out*/ std::map<std::string, std::string> &kv_map,
                   char item_splitter,
@@ -181,12 +302,12 @@ replace_string(std::string subject, const std::string 
&search, const std::string
 
 char *trim_string(char *s)
 {
-    while (*s != '\0' && (*s == ' ' || *s == '\t')) {
+    while (*s != '\0' && is_leading_space(*s)) {
         s++;
     }
     char *r = s;
     s += strlen(s);
-    while (s >= r && (*s == '\0' || *s == ' ' || *s == '\t' || *s == '\r' || 
*s == '\n')) {
+    while (s >= r && (*s == '\0' || is_trailing_space(*s))) {
         *s = '\0';
         s--;
     }
diff --git a/src/utils/strings.h b/src/utils/strings.h
index dc9fb7835..79cdb7a7e 100644
--- a/src/utils/strings.h
+++ b/src/utils/strings.h
@@ -26,29 +26,38 @@
 
 #pragma once
 
-#include <string>
-#include <vector>
+#include <iostream>
 #include <list>
 #include <map>
+#include <string>
 #include <unordered_set>
-#include <iostream>
+#include <vector>
 
 namespace dsn {
 namespace utils {
 
 inline bool is_empty(const char *str) { return str == nullptr || *str == '\0'; 
}
 
-void split_args(const char *args,
-                /*out*/ std::vector<std::string> &sargs,
-                char splitter = ' ',
+// Split the `input` string by the only character `separator` into tokens. 
Leading and trailing
+// spaces of each token will be stripped. Once the token is empty, or become 
empty after
+// stripping, an empty string will be added into `output` if 
`keep_place_holder` is enabled.
+//
+// There are several overloaded `split_args` functions in the following all of 
which are the
+// same except that the split tokens are collected to the different containers.
+void split_args(const char *input,
+                std::vector<std::string> &output,
+                char separator = ' ',
                 bool keep_place_holder = false);
 
-void split_args(const char *args,
-                /*out*/ std::unordered_set<std::string> &sargs,
-                char splitter = ' ',
+void split_args(const char *input,
+                std::list<std::string> &output,
+                char separator = ' ',
                 bool keep_place_holder = false);
 
-void split_args(const char *args, /*out*/ std::list<std::string> &sargs, char 
splitter = ' ');
+void split_args(const char *input,
+                std::unordered_set<std::string> &output,
+                char separator = ' ',
+                bool keep_place_holder = false);
 
 // kv_map sample (when item_splitter = ',' and kv_splitter = ':'):
 //   k1:v1,k2:v2,k3:v3
diff --git a/src/utils/test/utils.cpp b/src/utils/test/utils.cpp
index 35f6cebdd..7b8b4608e 100644
--- a/src/utils/test/utils.cpp
+++ b/src/utils/test/utils.cpp
@@ -115,96 +115,162 @@ TEST(core, check_c_string_empty)
     }
 }
 
-TEST(core, split_args)
+// For containers such as std::unordered_set, the expected result will be 
deduplicated
+// at initialization. Therefore, it can be used to compare with actual result 
safely.
+template <typename Container>
+void test_split_args()
 {
-    std::string value = "a ,b, c ";
-    std::vector<std::string> sargs;
-    std::list<std::string> sargs2;
-    ::dsn::utils::split_args(value.c_str(), sargs, ',');
-    ::dsn::utils::split_args(value.c_str(), sargs2, ',');
-
-    EXPECT_EQ(sargs.size(), 3);
-    EXPECT_EQ(sargs[0], "a");
-    EXPECT_EQ(sargs[1], "b");
-    EXPECT_EQ(sargs[2], "c");
-
-    EXPECT_EQ(sargs2.size(), 3);
-    auto it = sargs2.begin();
-    EXPECT_EQ(*it++, "a");
-    EXPECT_EQ(*it++, "b");
-    EXPECT_EQ(*it++, "c");
-
-    std::unordered_set<std::string> sargs_set;
-    dsn::utils::split_args(value.c_str(), sargs_set, ',');
-    EXPECT_EQ(sargs_set.size(), 3);
-
-    // test value = ""
-    value = "";
-    sargs.clear();
-    dsn::utils::split_args(value.c_str(), sargs, ',');
-    EXPECT_EQ(sargs.size(), 0);
-
-    sargs2.clear();
-    dsn::utils::split_args(value.c_str(), sargs2, ',');
-    EXPECT_EQ(sargs2.size(), 0);
-
-    sargs_set.clear();
-    dsn::utils::split_args(value.c_str(), sargs_set, ',');
-    EXPECT_EQ(sargs_set.size(), 0);
+    // Test cases:
+    // - split empty string by ' ' without place holder
+    // - split empty string by ' ' with place holder
+    // - split empty string by ',' without place holder
+    // - split empty string by ',' with place holder
+    // - split a space (' ') by ' ' without place holder
+    // - split a space (' ') by ' ' with place holder
+    // - split a space (' ') by ',' without place holder
+    // - split a space (' ') by ',' with place holder
+    // - split a comma (',') by ' ' without place holder
+    // - split a comma (',') by ' ' with place holder
+    // - split a comma (',') by ',' without place holder
+    // - split a comma (',') by ',' with place holder
+    // - split 2 leading spaces by ' ' without place holder
+    // - split 2 leading spaces by ' ' with place holder
+    // - split 2 leading spaces by ',' without place holder
+    // - split 2 leading spaces by ',' with place holder
+    // - split 3 leading spaces by ' ' without place holder
+    // - split 3 leading spaces by ' ' with place holder
+    // - split 3 leading spaces by ',' without place holder
+    // - split 3 leading spaces by ',' with place holder
+    // - split 2 trailing spaces by ' ' without place holder
+    // - split 2 trailing spaces by ' ' with place holder
+    // - split 2 trailing spaces by ',' without place holder
+    // - split 2 trailing spaces by ',' with place holder
+    // - split 3 trailing spaces by ' ' without place holder
+    // - split 3 trailing spaces by ' ' with place holder
+    // - split 3 trailing spaces by ',' without place holder
+    // - split 3 trailing spaces by ',' with place holder
+    // - split a string including "\\t", "\\r" and "\\n" by ' ' without place 
holder
+    // - split a string including "\\t", "\\r" and "\\n" by ' ' with place 
holder
+    // - split a string including "\\t", "\\r" and "\\n" by ',' without place 
holder
+    // - split a string including "\\t", "\\r" and "\\n" by ',' with place 
holder
+    // - split a single letter by ' ' without place holder
+    // - split a single letter by ' ' with place holder
+    // - split a single letter by ',' without place holder
+    // - split a single letter by ',' with place holder
+    // - split a single word by ' ' without place holder
+    // - split a single word by ' ' with place holder
+    // - split a single word by ',' without place holder
+    // - split a single word by ',' with place holder
+    // - split a string including letters and words by ' ' without place holder
+    // - split a string including letters and words by ' ' with place holder
+    // - split a string including letters and words by ',' without place holder
+    // - split a string including letters and words by ',' with place holder
+    // - split a string that includes multiple letters by ' ' without place 
holder
+    // - split a string that includes multiple letters by ' ' with place holder
+    // - split a string that includes multiple letters by ',' without place 
holder
+    // - split a string that includes multiple letters by ',' with place holder
+    // - split a string that includes multiple words by ' ' without place 
holder
+    // - split a string that includes multiple words by ' ' with place holder
+    // - split a string that includes multiple words by ',' without place 
holder
+    // - split a string that includes multiple words by ',' with place holder
+    struct test_case
+    {
+        const char *input;
+        char separator;
+        bool keep_place_holder;
+        Container expected_output;
+    } tests[] = {{"", ' ', false, {}},
+                 {"", ' ', true, {""}},
+                 {"", ',', false, {}},
+                 {"", ',', true, {""}},
+                 {" ", ' ', false, {}},
+                 {" ", ' ', true, {"", ""}},
+                 {" ", ',', false, {}},
+                 {" ", ',', true, {""}},
+                 {",", ' ', false, {","}},
+                 {",", ' ', true, {","}},
+                 {",", ',', false, {}},
+                 {",", ',', true, {"", ""}},
+                 {"  ", ' ', false, {}},
+                 {"\t ", ' ', true, {"", ""}},
+                 {" \t", ',', false, {}},
+                 {"\t\t", ',', true, {""}},
+                 {" \t ", ' ', false, {}},
+                 {"\t  ", ' ', true, {"", "", ""}},
+                 {"\t\t ", ',', false, {}},
+                 {"  \t", ',', true, {""}},
+                 {"\r ", ' ', false, {}},
+                 {"\t\n", ' ', true, {""}},
+                 {"\r\t", ',', false, {}},
+                 {" \n", ',', true, {""}},
+                 {"\n\t\r", ' ', false, {}},
+                 {" \r ", ' ', true, {"", "", ""}},
+                 {"\r \n", ',', false, {}},
+                 {"\t\n\r", ',', true, {""}},
+                 {" \\n,,\\t \\r ", ' ', false, {"\\n,,\\t", "\\r"}},
+                 {" \\n,,\\t \\r ", ' ', true, {"", "\\n,,\\t", "\\r", ""}},
+                 {" \\n,,\\t \\r ", ',', false, {"\\n", "\\t \\r"}},
+                 {" \\n,,\\t \\r ", ',', true, {"\\n", "", "\\t \\r"}},
+                 {"a", ' ', false, {"a"}},
+                 {"a", ' ', true, {"a"}},
+                 {"a", ',', false, {"a"}},
+                 {"a", ',', true, {"a"}},
+                 {"dinner", ' ', false, {"dinner"}},
+                 {"dinner", ' ', true, {"dinner"}},
+                 {"dinner", ',', false, {"dinner"}},
+                 {"dinner", ',', true, {"dinner"}},
+                 {"\t\r\na\t\tdog,\t\r\n  \t\r\nand\t\r\n \t\ta\t\tcat",
+                  ' ',
+                  false,
+                  {"\r\na\t\tdog,", "\r\nand", "a\t\tcat"}},
+                 {"\t\r\na\t\tdog,\t\r\n  \t\r\nand\t\r\n \t\ta\t\tcat",
+                  ' ',
+                  true,
+                  {"\r\na\t\tdog,", "", "\r\nand", "a\t\tcat"}},
+                 {"\t\r\na\t\tdog,\t\r\n  \t\r\nand\t\r\n \t\ta\t\tcat",
+                  ',',
+                  false,
+                  {"\r\na\t\tdog", "\r\n  \t\r\nand\t\r\n \t\ta\t\tcat"}},
+                 {"\t\r\na\t\tdog,\t\r\n  \t\r\nand\t\r\n \t\ta\t\tcat",
+                  ',',
+                  true,
+                  {"\r\na\t\tdog", "\r\n  \t\r\nand\t\r\n \t\ta\t\tcat"}},
+                 {"a ,b, ,c ", ' ', false, {"a", ",b,", ",c"}},
+                 {"a ,b, ,c ", ' ', true, {"a", ",b,", ",c", ""}},
+                 {"a ,b, ,c ", ',', false, {"a", "b", "c"}},
+                 {"a ,b, ,c ", ',', true, {"a", "b", "", "c"}},
+                 {" in  early 2000s ,  too, ", ' ', false, {"in", "early", 
"2000s", ",", "too,"}},
+                 {" in  early 2000s ,  too, ",
+                  ' ',
+                  true,
+                  {"", "in", "", "early", "2000s", ",", "", "too,", ""}},
+                 {" in  early 2000s ,  too, ", ',', false, {"in  early 2000s", 
"too"}},
+                 {" in  early 2000s ,  too, ", ',', true, {"in  early 2000s", 
"too", ""}}};
+
+    for (const auto &test : tests) {
+        Container actual_output;
+        split_args(test.input, actual_output, test.separator, 
test.keep_place_holder);
+        EXPECT_EQ(actual_output, test.expected_output);
+
+        // Test default value (i.e. false) for keep_place_holder.
+        if (!test.keep_place_holder) {
+            split_args(test.input, actual_output, test.separator);
+            EXPECT_EQ(actual_output, test.expected_output);
+
+            // Test default value (i.e. ' ') for separator.
+            if (test.separator == ' ') {
+                split_args(test.input, actual_output);
+                EXPECT_EQ(actual_output, test.expected_output);
+            }
+        }
+    }
 }
 
-TEST(core, split_args_keep_place_holder)
+TEST(core, split_args)
 {
-    std::string value = "a ,b, c ";
-    std::vector<std::string> sargs;
-    ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
-    EXPECT_EQ(sargs.size(), 3);
-    EXPECT_EQ(sargs[0], "a");
-    EXPECT_EQ(sargs[1], "b");
-    EXPECT_EQ(sargs[2], "c");
-
-    value = " ,  a ,b, c ";
-    sargs.clear();
-    ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
-    EXPECT_EQ(sargs.size(), 4);
-    EXPECT_EQ(sargs[0], "");
-    EXPECT_EQ(sargs[1], "a");
-    EXPECT_EQ(sargs[2], "b");
-    EXPECT_EQ(sargs[3], "c");
-
-    value = "a ,b, , c";
-    sargs.clear();
-    ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
-    EXPECT_EQ(sargs.size(), 4);
-    EXPECT_EQ(sargs[0], "a");
-    EXPECT_EQ(sargs[1], "b");
-    EXPECT_EQ(sargs[2], "");
-    EXPECT_EQ(sargs[3], "c");
-
-    value = "a ,b, c , ";
-    sargs.clear();
-    ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
-    EXPECT_EQ(sargs.size(), 4);
-    EXPECT_EQ(sargs[0], "a");
-    EXPECT_EQ(sargs[1], "b");
-    EXPECT_EQ(sargs[2], "c");
-    EXPECT_EQ(sargs[3], "");
-
-    value = ", a ,b, ,c , ";
-    sargs.clear();
-    ::dsn::utils::split_args(value.c_str(), sargs, ',', true);
-
-    EXPECT_EQ(sargs.size(), 6);
-    EXPECT_EQ(sargs[0], "");
-    EXPECT_EQ(sargs[1], "a");
-    EXPECT_EQ(sargs[2], "b");
-    EXPECT_EQ(sargs[3], "");
-    EXPECT_EQ(sargs[4], "c");
-    EXPECT_EQ(sargs[5], "");
+    test_split_args<std::vector<std::string>>();
+    test_split_args<std::list<std::string>>();
+    test_split_args<std::unordered_set<std::string>>();
 }
 
 TEST(core, trim_string)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[incubator-pegasus] branch master updated: refactor: use state machine to improve split_args for strings (#1240)

Reply via email to