This is an automated email from the ASF dual-hosted git repository.

mgreber pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit edb51241c154e411d84a52cd88400aab680f7639
Author: Alexey Serbin <[email protected]>
AuthorDate: Thu Oct 31 20:17:26 2024 -0700

    KUDU-3613 fix flakiness in replace_tablet-itest
    
    Before this patch, the ReplaceTabletITest.ReplaceTabletsWhileWriting
    test scenario would fail in about 2% of runs, mostly in ASAN builds.
    The issue was due to selecting the same tablet UUID by the
    ReplaceRandomTablet() test fixture's function two times in a row
    in case of scheduler anomalies when running on busy nodes.
    
    Change-Id: I8a8882da6276a1e7cc8d8f3cb3ce1c69f540a5d0
    Reviewed-on: http://gerrit.cloudera.org:8080/22012
    Tested-by: Alexey Serbin <[email protected]>
    Reviewed-by: Mahesh Reddy <[email protected]>
    Reviewed-by: Marton Greber <[email protected]>
---
 src/kudu/integration-tests/replace_tablet-itest.cc | 42 ++++++++++++++--------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/kudu/integration-tests/replace_tablet-itest.cc 
b/src/kudu/integration-tests/replace_tablet-itest.cc
index 0ae1ba9a9..47e2725e7 100644
--- a/src/kudu/integration-tests/replace_tablet-itest.cc
+++ b/src/kudu/integration-tests/replace_tablet-itest.cc
@@ -17,15 +17,15 @@
 
 #include <functional>
 #include <memory>
-#include <ostream>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
-#include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "kudu/gutil/map-util.h"
 #include "kudu/integration-tests/cluster_itest_util.h"
 #include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/integration-tests/external_mini_cluster-itest-base.h"
@@ -43,21 +43,21 @@
 #include "kudu/util/test_macros.h"
 #include "kudu/util/test_util.h"
 
+using kudu::master::MasterServiceProxy;
+using kudu::master::ReplaceTabletRequestPB;
+using kudu::master::ReplaceTabletResponsePB;
+using kudu::tools::LeaderMasterProxy;
+using kudu::tserver::ListTabletsResponsePB_StatusAndSchemaPB;
 using std::string;
+using std::unordered_set;
 using std::vector;
 
 namespace kudu {
 
-using master::MasterServiceProxy;
-using master::ReplaceTabletRequestPB;
-using master::ReplaceTabletResponsePB;
-using tools::LeaderMasterProxy;
-using tserver::ListTabletsResponsePB_StatusAndSchemaPB;
-
 class ReplaceTabletITest : public ExternalMiniClusterITestBase {
  public:
-  ReplaceTabletITest() :
-    rand_(SeedRandom()) {
+  ReplaceTabletITest()
+       : rand_(SeedRandom()) {
   }
 
   Status RandomTabletId(string* tablet_id) {
@@ -72,13 +72,20 @@ class ReplaceTabletITest : public 
ExternalMiniClusterITestBase {
     return Status::OK();
   }
 
-  Status ReplaceRandomTablet(LeaderMasterProxy* proxy) {
+  Status ReplaceRandomTablet(LeaderMasterProxy* proxy,
+                             unordered_set<string>* replaced_tablet_ids) {
     string tablet_id;
-    RETURN_NOT_OK(RandomTabletId(&tablet_id));
-    LOG(INFO) << "Replacing tablet " << tablet_id;
+    while (true) {
+      RETURN_NOT_OK(RandomTabletId(&tablet_id));
+      if (!ContainsKey(*replaced_tablet_ids, tablet_id)) {
+        EmplaceIfNotPresent(replaced_tablet_ids, tablet_id);
+        break;
+      }
+      // Same tablet is chosen again: need to retry.
+    }
     ReplaceTabletRequestPB req;
-    ReplaceTabletResponsePB resp;
     req.set_tablet_id(tablet_id);
+    ReplaceTabletResponsePB resp;
     return proxy->SyncRpc<ReplaceTabletRequestPB, ReplaceTabletResponsePB>(
         req, &resp, "ReplaceTablet", &MasterServiceProxy::ReplaceTabletAsync);
   }
@@ -118,7 +125,12 @@ TEST_F(ReplaceTabletITest, ReplaceTabletsWhileWriting) {
 
   // Replace tablets while inserts continue.
   for (int i = 0; i < kNumReplaceTablets; i++) {
-    ASSERT_OK(ReplaceRandomTablet(&proxy));
+    // Since ReplaceRandomTablet() is prone to a TOCTOU race, it's necessary
+    // to keep track of tablet IDs where requests to replace tablets have
+    // already been sent. Otherwise, if the same tablet was chosen on next
+    // cycle, Status::NotPresent() status would be returned.
+    unordered_set<string> replaced_tablet_ids;
+    ASSERT_OK(ReplaceRandomTablet(&proxy, &replaced_tablet_ids));
     SleepFor(MonoDelta::FromMilliseconds(100));
   }
 

Reply via email to