This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git


The following commit(s) were added to refs/heads/master by this push:
     new 1beb24a76 fix(ut): fix a flaky test integration_test.write_corrupt_db 
(#1442)
1beb24a76 is described below

commit 1beb24a765ba009499a6348f82f1ea95ca2a663d
Author: Yingchun Lai <[email protected]>
AuthorDate: Mon Apr 17 14:33:03 2023 +0800

    fix(ut): fix a flaky test integration_test.write_corrupt_db (#1442)
    
    https://github.com/apache/incubator-pegasus/issues/1383
    
    Commit 9303c3aba87c8392705e52baeeee768ad385d5b3 introduced a flaky test;
    this patch tries to fix it.
    
    This patch also introduces some integration test utilities, which will be
    helpful for subsequent patches.
---
 .../base_api_test/integration_test.cpp             | 56 ++++++++++++----------
 src/test/function_test/utils/test_util.cpp         | 45 +++++++++++++++++
 src/test/function_test/utils/test_util.h           |  8 +++-
 3 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/src/test/function_test/base_api_test/integration_test.cpp 
b/src/test/function_test/base_api_test/integration_test.cpp
index 8ab3cc56c..027635e2d 100644
--- a/src/test/function_test/base_api_test/integration_test.cpp
+++ b/src/test/function_test/base_api_test/integration_test.cpp
@@ -21,14 +21,13 @@
 #include <gtest/gtest-message.h>
 #include <gtest/gtest-test-part.h>
 #include <gtest/gtest.h>
-#include <unistd.h>
 #include <iostream>
 #include <string>
 
 #include "include/pegasus/client.h"
 #include "pegasus/error.h"
 #include "test/function_test/utils/test_util.h"
-#include "test/function_test/utils/utils.h"
+#include "test_util/test_util.h"
 
 using namespace ::pegasus;
 
@@ -40,7 +39,13 @@ class integration_test : public test_util
 
 TEST_F(integration_test, write_corrupt_db)
 {
-    // Inject a write error kCorruption to RS-0.
+    // Make best effort to rebalance the cluster,
+    ASSERT_NO_FATAL_FAILURE(
+        run_cmd_from_project_root("echo 'set_meta_level lively' | ./run.sh 
shell"));
+    // Make sure RS-1 has some primaries of table 'temp'.
+    ASSERT_IN_TIME([&] { ASSERT_GT(get_leader_count("temp", 1), 0); }, 120);
+
+    // Inject a write error kCorruption to RS-1.
     ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root(
         "curl 'localhost:34801/updateConfig?inject_write_error_for_test=2'"));
 
@@ -57,13 +62,13 @@ TEST_F(integration_test, write_corrupt_db)
                 ok_count++;
                 break;
             } else if (ret == PERR_CORRUPTION) {
-                // Suppose there must some primaries on RS-0.
+                // Suppose there must some primaries on RS-1.
                 corruption_count++;
                 break;
             } else if (ret == PERR_TIMEOUT) {
-                // If RS-0 crashed before (learn failed when write storage 
engine but get
-                // kCorruption),
-                // a new write operation on the primary replica it ever held 
will cause timeout.
+                // If RS-1 crashed before (learn failed when write storage 
engine but get
+                // kCorruption), a new write operation on the primary replica 
it ever held will
+                // cause timeout.
                 // Force to fetch the latest route table.
                 client_ =
                     pegasus_client_factory::get_client(cluster_name_.c_str(), 
app_name_.c_str());
@@ -92,27 +97,33 @@ TEST_F(integration_test, write_corrupt_db)
 
     EXPECT_GT(ok_count, 0);
     EXPECT_GT(corruption_count, 0);
-    std::cout << "ok_count: " << ok_count << ", corruption_count: " << 
corruption_count;
+    std::cout << "ok_count: " << ok_count << ", corruption_count: " << 
corruption_count
+              << std::endl;
 
-    // Now only 2 RS left.
-    std::string rs_count;
-    ASSERT_NO_FATAL_FAILURE(run_cmd(
-        "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v 
grep | wc -l",
-        &rs_count));
-    ASSERT_EQ("2", rs_count);
+    // Now only 2 RSs left, or RS-1 has no leader replicas.
+    ASSERT_IN_TIME(
+        [&] {
+            ASSERT_TRUE(get_alive_replica_server_count() == 2 || 
get_leader_count("temp", 1) == 0);
+        },
+        60);
 
     // Replica server 0 is able to start normally.
     // After restart, the 'inject_write_error_for_test' config value will be 
reset to 0 (i.e. OK).
-    ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh 
start_onebox_instance -r 1"));
-    ASSERT_NO_FATAL_FAILURE(run_cmd(
-        "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v 
grep | wc -l",
-        &rs_count));
-    ASSERT_EQ("3", rs_count);
+    if (get_alive_replica_server_count() == 2) {
+        ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh 
start_onebox_instance -r 1"));
+    } else {
+        ASSERT_EQ(3, get_alive_replica_server_count());
+        ASSERT_EQ(0, get_leader_count("temp", 1));
+        ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh 
restart_onebox_instance -r 1"));
+    }
+
+    ASSERT_IN_TIME([&] { ASSERT_EQ(3, get_alive_replica_server_count()); }, 
60);
 
     // Make best effort to rebalance the cluster,
     ASSERT_NO_FATAL_FAILURE(
         run_cmd_from_project_root("echo 'set_meta_level lively' | ./run.sh 
shell"));
-    usleep(10 * 1000 * 1000);
+    // Make sure RS-1 has some primaries of table 'temp'.
+    ASSERT_IN_TIME([&] { ASSERT_GT(get_leader_count("temp", 1), 0); }, 120);
 
     for (int i = 0; i < 1000; i++) {
         std::string hkey = fmt::format("hkey2_{}", i);
@@ -123,8 +134,5 @@ TEST_F(integration_test, write_corrupt_db)
         ASSERT_EQ(value, got_value);
     }
 
-    ASSERT_NO_FATAL_FAILURE(run_cmd(
-        "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v 
grep | wc -l",
-        &rs_count));
-    ASSERT_EQ("3", rs_count);
+    ASSERT_IN_TIME([&] { ASSERT_EQ(3, get_alive_replica_server_count()); }, 
60);
 }
diff --git a/src/test/function_test/utils/test_util.cpp 
b/src/test/function_test/utils/test_util.cpp
index 74dcb0fae..f417379e9 100644
--- a/src/test/function_test/utils/test_util.cpp
+++ b/src/test/function_test/utils/test_util.cpp
@@ -19,26 +19,37 @@
 
 #include "test_util.h"
 
+#include <nlohmann/json.hpp>
 #include <unistd.h>
+#include <fstream>
+#include <initializer_list>
 #include <utility>
 #include <vector>
 
 #include "base/pegasus_const.h"
 #include "client/replication_ddl_client.h"
 #include "common/replication_other_types.h"
+#include "fmt/core.h"
 #include "gtest/gtest-message.h"
 #include "gtest/gtest-test-part.h"
 #include "gtest/gtest.h"
 #include "include/pegasus/client.h"
+#include "nlohmann/detail/iterators/iter_impl.hpp"
+#include "nlohmann/json_fwd.hpp"
 #include "runtime/rpc/rpc_address.h"
 #include "test/function_test/utils/global_env.h"
 #include "test/function_test/utils/utils.h"
+#include "utils/defer.h"
 #include "utils/error_code.h"
+#include "utils/filesystem.h"
+#include "utils/rand.h"
+#include "utils/string_conv.h"
 
 using dsn::partition_configuration;
 using dsn::replication::replica_helper;
 using dsn::replication::replication_ddl_client;
 using dsn::rpc_address;
+using nlohmann::json;
 using std::vector;
 
 namespace pegasus {
@@ -88,4 +99,38 @@ void test_util::run_cmd_from_project_root(const std::string 
&cmd)
     ASSERT_NO_FATAL_FAILURE(run_cmd(cmd));
 }
 
+int test_util::get_alive_replica_server_count()
+{
+    const auto json_filename = fmt::format("test_json_file.{}", 
dsn::rand::next_u32());
+    auto cleanup =
+        dsn::defer([json_filename]() { 
dsn::utils::filesystem::remove_path(json_filename); });
+    run_cmd_from_project_root(fmt::format("echo 'nodes -djo {}' | ./run.sh 
shell", json_filename));
+    std::ifstream f(json_filename);
+    const auto data = json::parse(f);
+    int replica_server_count = 0;
+    if (!dsn::buf2int32(data["summary"]["alive_node_count"], 
replica_server_count)) {
+        return -1;
+    }
+    return replica_server_count;
+}
+
+int test_util::get_leader_count(const std::string &table_name, int 
replica_server_index)
+{
+    const auto json_filename = fmt::format("test_json_file.{}", 
dsn::rand::next_u32());
+    auto cleanup =
+        dsn::defer([json_filename]() { 
dsn::utils::filesystem::remove_path(json_filename); });
+    run_cmd_from_project_root(
+        fmt::format("echo 'app {} -djo {}' | ./run.sh shell", table_name, 
json_filename));
+    std::ifstream f(json_filename);
+    const auto data = json::parse(f);
+    int leader_count = 0;
+    for (const auto &replica : data["replicas"]) {
+        const auto &primary = to_string(replica["primary"]);
+        if (primary.find(fmt::format("3480{}", replica_server_index)) != 
std::string::npos) {
+            leader_count++;
+        }
+    }
+    return leader_count;
+}
+
 } // namespace pegasus
diff --git a/src/test/function_test/utils/test_util.h 
b/src/test/function_test/utils/test_util.h
index 1a5993abb..d296bd3ff 100644
--- a/src/test/function_test/utils/test_util.h
+++ b/src/test/function_test/utils/test_util.h
@@ -55,7 +55,13 @@ public:
 
     void SetUp() override;
 
-    void run_cmd_from_project_root(const std::string &cmd);
+    static void run_cmd_from_project_root(const std::string &cmd);
+
+    static int get_alive_replica_server_count();
+
+    // Get the leader replica count of the 'replica_server_index' (based on 1) 
replica server
+    // on the 'table_name'.
+    static int get_leader_count(const std::string &table_name, int 
replica_server_index);
 
 protected:
     const std::string cluster_name_;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to