This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch branch-1.18.x
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit db2a92a7c4b0f3cedd71ee25c423fc8d3cc21e96
Author: zhangyifan27 <[email protected]>
AuthorDate: Wed Nov 27 18:11:37 2024 +0800

    KUDU-3571: fix flakiness in AutoIncrementingItest.BootstrapNoWalsNoData
    
    The test AutoIncrementingItest.BootstrapNoWalsNoData sometimes failed
    because the MVCC timestamp had not been initialized yet, or because it
    was unable to wait for in-flight ops to finish. This patch fixes the
    issue by waiting for everything to be consistent before scanning.
    
    Before this patch, when running the test with DEBUG configuration, 6/20
    tests failed. After this patch, 20/20 tests succeed.
    
    Change-Id: I5bd387c82b632dbb77aa5a45f831273392ae05b4
    Reviewed-on: http://gerrit.cloudera.org:8080/22133
    Tested-by: Kudu Jenkins
    Reviewed-by: Abhishek Chennaka <[email protected]>
    Reviewed-by: Alexey Serbin <[email protected]>
    Reviewed-by: Ashwani Raina <[email protected]>
    (cherry picked from commit 2b9a2012f6d7b59931119dfad03e8d40e3031a0e)
    Reviewed-on: http://gerrit.cloudera.org:8080/22240
    Reviewed-by: Yifan Zhang <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 src/kudu/integration-tests/auto_incrementing-itest.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/kudu/integration-tests/auto_incrementing-itest.cc b/src/kudu/integration-tests/auto_incrementing-itest.cc
index 4c49bde02..6f78b025f 100644
--- a/src/kudu/integration-tests/auto_incrementing-itest.cc
+++ b/src/kudu/integration-tests/auto_incrementing-itest.cc
@@ -18,6 +18,7 @@
 // Integration test for flexible partitioning (eg buckets, range partitioning
 // of PK subsets, etc).
 
+#include <functional>
 #include <memory>
 #include <ostream>
 #include <string>
@@ -38,6 +39,7 @@
 #include "kudu/common/wire_protocol.h"
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/integration-tests/cluster_verifier.h"
 #include "kudu/mini-cluster/external_mini_cluster.h"
 #include "kudu/rpc/rpc_controller.h"
 #include "kudu/tablet/tablet.pb.h"
@@ -394,7 +396,6 @@ TEST_F(AutoIncrementingItest, BootstrapWithNoWals) {
   }
 }
 
-
 TEST_F(AutoIncrementingItest, BootstrapNoWalsNoData) {
   string tablet_uuid;
   TestSetup(&tablet_uuid);
@@ -437,13 +438,17 @@ TEST_F(AutoIncrementingItest, BootstrapNoWalsNoData) {
     cluster_->tablet_server(i)->Shutdown();
     ASSERT_OK(cluster_->tablet_server(i)->Restart());
   }
+  // Ensure that the tablet is running and leader elected.
+  ASSERT_EVENTUALLY([&] { ASSERT_OK(ClusterVerifier(cluster_.get()).RunKsck()); });
 
   // Insert new data and verify auto_incrementing_id starts from 1.
   ASSERT_OK(InsertData(kNumRows, kNumRows * 2));
+  // Wait for all the replicas to converge.
+  NO_FATALS(ClusterVerifier(cluster_.get()).CheckCluster());
   for (int j = 0; j < kNumTabletServers; j++) {
     vector<string> results;
     ASSERT_OK(ScanTablet(j, tablet_uuid, &results));
-    ASSERT_EQ(200, results.size());
+    ASSERT_EQ(kNumRows, results.size());
     for (int i = 0; i < results.size(); i++) {
       ASSERT_EQ(Substitute("(int32 c0=$0, int64 $1=$2, string c1=\"string_val\")", i + kNumRows,
                            Schema::GetAutoIncrementingColumnName(), i + 1), results[i]);
@@ -503,7 +508,7 @@ TEST_F(AutoIncrementingItest, BootstrapWalsDiverge) {
   // Write 200 rows at the rate of 1 row every 5ms which are sent to the leader replica. After
   // 100ms of starting to insert data, we shutdown the followers and at this point the write
   // request is expected to 900ms more. Since the leader would mark the followers as
-  // unavailable after 3 lost hearbeats (1500ms), there will for sure be a situation where the
+  // unavailable after 3 lost heartbeats (1500ms), there will for sure be a situation where the
   // leader has sent a write op and hasn't gotten the response from majority-1 number of
   // followers. In this case the write op is not marked committed in the leader replica. All
   // the writes including this are considered failed.

Reply via email to