This is an automated email from the ASF dual-hosted git repository.
laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/master by this push:
new 1beb24a76 fix(ut): fix a flaky test integration_test.write_corrupt_db
(#1442)
1beb24a76 is described below
commit 1beb24a765ba009499a6348f82f1ea95ca2a663d
Author: Yingchun Lai <[email protected]>
AuthorDate: Mon Apr 17 14:33:03 2023 +0800
fix(ut): fix a flaky test integration_test.write_corrupt_db (#1442)
https://github.com/apache/incubator-pegasus/issues/1383
Commit 9303c3aba87c8392705e52baeeee768ad385d5b3 introduced a flaky test;
this patch tries to fix it.
This patch also introduces some integration test utilities, which will be
helpful for follow-up patches.
---
.../base_api_test/integration_test.cpp | 56 ++++++++++++----------
src/test/function_test/utils/test_util.cpp | 45 +++++++++++++++++
src/test/function_test/utils/test_util.h | 8 +++-
3 files changed, 84 insertions(+), 25 deletions(-)
diff --git a/src/test/function_test/base_api_test/integration_test.cpp
b/src/test/function_test/base_api_test/integration_test.cpp
index 8ab3cc56c..027635e2d 100644
--- a/src/test/function_test/base_api_test/integration_test.cpp
+++ b/src/test/function_test/base_api_test/integration_test.cpp
@@ -21,14 +21,13 @@
#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>
#include <gtest/gtest.h>
-#include <unistd.h>
#include <iostream>
#include <string>
#include "include/pegasus/client.h"
#include "pegasus/error.h"
#include "test/function_test/utils/test_util.h"
-#include "test/function_test/utils/utils.h"
+#include "test_util/test_util.h"
using namespace ::pegasus;
@@ -40,7 +39,13 @@ class integration_test : public test_util
TEST_F(integration_test, write_corrupt_db)
{
- // Inject a write error kCorruption to RS-0.
+ // Make best effort to rebalance the cluster,
+ ASSERT_NO_FATAL_FAILURE(
+ run_cmd_from_project_root("echo 'set_meta_level lively' | ./run.sh
shell"));
+ // Make sure RS-1 has some primaries of table 'temp'.
+ ASSERT_IN_TIME([&] { ASSERT_GT(get_leader_count("temp", 1), 0); }, 120);
+
+ // Inject a write error kCorruption to RS-1.
ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root(
"curl 'localhost:34801/updateConfig?inject_write_error_for_test=2'"));
@@ -57,13 +62,13 @@ TEST_F(integration_test, write_corrupt_db)
ok_count++;
break;
} else if (ret == PERR_CORRUPTION) {
- // Suppose there must some primaries on RS-0.
+ // Suppose there must some primaries on RS-1.
corruption_count++;
break;
} else if (ret == PERR_TIMEOUT) {
- // If RS-0 crashed before (learn failed when write storage
engine but get
- // kCorruption),
- // a new write operation on the primary replica it ever held
will cause timeout.
+ // If RS-1 crashed before (learn failed when write storage
engine but get
+ // kCorruption), a new write operation on the primary replica
it ever held will
+ // cause timeout.
// Force to fetch the latest route table.
client_ =
pegasus_client_factory::get_client(cluster_name_.c_str(),
app_name_.c_str());
@@ -92,27 +97,33 @@ TEST_F(integration_test, write_corrupt_db)
EXPECT_GT(ok_count, 0);
EXPECT_GT(corruption_count, 0);
- std::cout << "ok_count: " << ok_count << ", corruption_count: " <<
corruption_count;
+ std::cout << "ok_count: " << ok_count << ", corruption_count: " <<
corruption_count
+ << std::endl;
- // Now only 2 RS left.
- std::string rs_count;
- ASSERT_NO_FATAL_FAILURE(run_cmd(
- "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v
grep | wc -l",
- &rs_count));
- ASSERT_EQ("2", rs_count);
+ // Now only 2 RSs left, or RS-1 has no leader replicas.
+ ASSERT_IN_TIME(
+ [&] {
+ ASSERT_TRUE(get_alive_replica_server_count() == 2 ||
get_leader_count("temp", 1) == 0);
+ },
+ 60);
// Replica server 0 is able to start normally.
// After restart, the 'inject_write_error_for_test' config value will be
reset to 0 (i.e. OK).
- ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh
start_onebox_instance -r 1"));
- ASSERT_NO_FATAL_FAILURE(run_cmd(
- "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v
grep | wc -l",
- &rs_count));
- ASSERT_EQ("3", rs_count);
+ if (get_alive_replica_server_count() == 2) {
+ ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh
start_onebox_instance -r 1"));
+ } else {
+ ASSERT_EQ(3, get_alive_replica_server_count());
+ ASSERT_EQ(0, get_leader_count("temp", 1));
+ ASSERT_NO_FATAL_FAILURE(run_cmd_from_project_root("./run.sh
restart_onebox_instance -r 1"));
+ }
+
+ ASSERT_IN_TIME([&] { ASSERT_EQ(3, get_alive_replica_server_count()); },
60);
// Make best effort to rebalance the cluster,
ASSERT_NO_FATAL_FAILURE(
run_cmd_from_project_root("echo 'set_meta_level lively' | ./run.sh
shell"));
- usleep(10 * 1000 * 1000);
+ // Make sure RS-1 has some primaries of table 'temp'.
+ ASSERT_IN_TIME([&] { ASSERT_GT(get_leader_count("temp", 1), 0); }, 120);
for (int i = 0; i < 1000; i++) {
std::string hkey = fmt::format("hkey2_{}", i);
@@ -123,8 +134,5 @@ TEST_F(integration_test, write_corrupt_db)
ASSERT_EQ(value, got_value);
}
- ASSERT_NO_FATAL_FAILURE(run_cmd(
- "ps aux | grep 'pegasus_server config.ini -app_list replica' | grep -v
grep | wc -l",
- &rs_count));
- ASSERT_EQ("3", rs_count);
+ ASSERT_IN_TIME([&] { ASSERT_EQ(3, get_alive_replica_server_count()); },
60);
}
diff --git a/src/test/function_test/utils/test_util.cpp
b/src/test/function_test/utils/test_util.cpp
index 74dcb0fae..f417379e9 100644
--- a/src/test/function_test/utils/test_util.cpp
+++ b/src/test/function_test/utils/test_util.cpp
@@ -19,26 +19,37 @@
#include "test_util.h"
+#include <nlohmann/json.hpp>
#include <unistd.h>
+#include <fstream>
+#include <initializer_list>
#include <utility>
#include <vector>
#include "base/pegasus_const.h"
#include "client/replication_ddl_client.h"
#include "common/replication_other_types.h"
+#include "fmt/core.h"
#include "gtest/gtest-message.h"
#include "gtest/gtest-test-part.h"
#include "gtest/gtest.h"
#include "include/pegasus/client.h"
+#include "nlohmann/detail/iterators/iter_impl.hpp"
+#include "nlohmann/json_fwd.hpp"
#include "runtime/rpc/rpc_address.h"
#include "test/function_test/utils/global_env.h"
#include "test/function_test/utils/utils.h"
+#include "utils/defer.h"
#include "utils/error_code.h"
+#include "utils/filesystem.h"
+#include "utils/rand.h"
+#include "utils/string_conv.h"
using dsn::partition_configuration;
using dsn::replication::replica_helper;
using dsn::replication::replication_ddl_client;
using dsn::rpc_address;
+using nlohmann::json;
using std::vector;
namespace pegasus {
@@ -88,4 +99,38 @@ void test_util::run_cmd_from_project_root(const std::string
&cmd)
ASSERT_NO_FATAL_FAILURE(run_cmd(cmd));
}
+int test_util::get_alive_replica_server_count()
+{
+ const auto json_filename = fmt::format("test_json_file.{}",
dsn::rand::next_u32());
+ auto cleanup =
+ dsn::defer([json_filename]() {
dsn::utils::filesystem::remove_path(json_filename); });
+ run_cmd_from_project_root(fmt::format("echo 'nodes -djo {}' | ./run.sh
shell", json_filename));
+ std::ifstream f(json_filename);
+ const auto data = json::parse(f);
+ int replica_server_count = 0;
+ if (!dsn::buf2int32(data["summary"]["alive_node_count"],
replica_server_count)) {
+ return -1;
+ }
+ return replica_server_count;
+}
+
+int test_util::get_leader_count(const std::string &table_name, int
replica_server_index)
+{
+ const auto json_filename = fmt::format("test_json_file.{}",
dsn::rand::next_u32());
+ auto cleanup =
+ dsn::defer([json_filename]() {
dsn::utils::filesystem::remove_path(json_filename); });
+ run_cmd_from_project_root(
+ fmt::format("echo 'app {} -djo {}' | ./run.sh shell", table_name,
json_filename));
+ std::ifstream f(json_filename);
+ const auto data = json::parse(f);
+ int leader_count = 0;
+ for (const auto &replica : data["replicas"]) {
+ const auto &primary = to_string(replica["primary"]);
+ if (primary.find(fmt::format("3480{}", replica_server_index)) !=
std::string::npos) {
+ leader_count++;
+ }
+ }
+ return leader_count;
+}
+
} // namespace pegasus
diff --git a/src/test/function_test/utils/test_util.h
b/src/test/function_test/utils/test_util.h
index 1a5993abb..d296bd3ff 100644
--- a/src/test/function_test/utils/test_util.h
+++ b/src/test/function_test/utils/test_util.h
@@ -55,7 +55,13 @@ public:
void SetUp() override;
- void run_cmd_from_project_root(const std::string &cmd);
+ static void run_cmd_from_project_root(const std::string &cmd);
+
+ static int get_alive_replica_server_count();
+
+ // Get the leader replica count of the 'replica_server_index' (based on 1)
replica server
+ // on the 'table_name'.
+ static int get_leader_count(const std::string &table_name, int
replica_server_index);
protected:
const std::string cluster_name_;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]