This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new bb654e5af Fix flaky TestDownloadSuperblockInBatch
bb654e5af is described below

commit bb654e5afb0dc7ce925b9e8f36ad4413f80cfd20
Author: Marton Greber <[email protected]>
AuthorDate: Mon Jul 17 13:08:44 2023 +0000

    Fix flaky TestDownloadSuperblockInBatch
    
    The test has been failing from time to time with the following message:
    Expected: has substring "recv error: Network error: RPC frame had a
    length" Actual: "... UNKNOWN_ERROR: received error code Illegal state:
    The tablet is not in a running state: BOOTSTRAPPING from remote service"
    
    The fix is to add a wait in the test after the restart.
    
    Previous to this patch, this fluke made the kudu-tool-test quite flaky:
    http://dist-test.cloudera.org/job?job_id=root.1689595728.503176
    * 125/400 failed
    With this patch:
    http://dist-test.cloudera.org/job?job_id=root.1689597615.525960
    * 6/400 failed
      ^ I verified that those 6 are unrelated failures, compared to what is
      being fixed in this patch.
    
    (The command which has been used to produce the above dist-test results:
    KUDU_ALLOW_SLOW_TESTS=1 ../../build-support/dist_test.py loop -n 100 \
    -- ./bin/kudu-tool-test --stress_cpu_threads=16)
    
    Moreover, TSAN reported a race condition on 'StringVectorSink
    capture_logs'. Moved ScopedRegisterSink and the tool command into a
    scope, such that it works as expected.
    
    Change-Id: Id65cf0586416f70c72f61b2e6886bfc5d0690c0f
    Reviewed-on: http://gerrit.cloudera.org:8080/20205
    Tested-by: Kudu Jenkins
    Reviewed-by: Mahesh Reddy <[email protected]>
    Reviewed-by: Alexey Serbin <[email protected]>
---
 src/kudu/tools/kudu-tool-test.cc | 53 +++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index cfd331a84..6feac6b32 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -9439,6 +9439,7 @@ TEST_P(DownloadSuperblockInBatchTest, TestDownloadSuperblockInBatch) {
   // So it is easy to make the size of superblock over the value of rpc_max_message_size.
   FLAGS_rpc_max_message_size = kSuperblockSize / 4;
   ASSERT_OK(src_tserver->Restart());
+  ASSERT_OK(src_tserver->WaitStarted());
 
   string source_tserver_rpc_addr = src_tserver->bound_rpc_addr().ToString();
   string wal_dir = dst_tserver->options()->fs_opts.wal_root;
@@ -9447,32 +9448,34 @@ TEST_P(DownloadSuperblockInBatchTest, TestDownloadSuperblockInBatch) {
 
   // Copy tablet replicas from source tserver to destination tserver.
   StringVectorSink capture_logs;
-  ScopedRegisterSink reg(&capture_logs);
   string stderr;
-  RunActionStdoutStderrString(
-      Substitute("local_replica copy_from_remote $0 $1 "
-                 "-fs_data_dirs=$2 -fs_wal_dir=$3 "
-                 "--tablet_copy_support_download_superblock_in_batch=$4 "
-                 // Disable --rpc_max_message_size_enable_validation, so
-                 // --rpc_max_message_size can be set a small value.
-                 "--rpc_max_message_size_enable_validation=false "
-                 // Set --rpc_max_message_size very small, so it is easy for the size of
-                 // superblock over --rpc_max_message_size. It is used to repeat the network
-                 // error, see line 9477.
-                 "--rpc_max_message_size=$5 "
-                 // This flag and --rpc_max_message_size are in a group flag validator, so
-                 // it is also should be set a small value.
-                 "--consensus_max_batch_size_bytes=$6 "
-                 "--encrypt_data_at_rest=$7 "
-                 "--tablet_copy_transfer_chunk_size_bytes=50",
-                 tablet_id_to_copy,
-                 source_tserver_rpc_addr,
-                 data_dirs,
-                 wal_dir,
-                 FLAGS_tablet_copy_support_download_superblock_in_batch,
-                 (kSuperblockSize / 4),
-                 (kSuperblockSize / 8),
-                 FLAGS_encrypt_data_at_rest), nullptr, &stderr);
+  {
+    ScopedRegisterSink rs(&capture_logs);
+    RunActionStderrString(
+        Substitute("local_replica copy_from_remote $0 $1 "
+                  "-fs_data_dirs=$2 -fs_wal_dir=$3 "
+                  "--tablet_copy_support_download_superblock_in_batch=$4 "
+                  // Disable --rpc_max_message_size_enable_validation, so
+                  // --rpc_max_message_size can be set a small value.
+                  "--rpc_max_message_size_enable_validation=false "
+                  // Set --rpc_max_message_size very small, so it is easy for the size of
+                  // superblock over --rpc_max_message_size. It is used to repeat the network
+                  // error, see line 9477.
+                  "--rpc_max_message_size=$5 "
+                  // This flag and --rpc_max_message_size are in a group flag validator, so
+                  // it is also should be set a small value.
+                  "--consensus_max_batch_size_bytes=$6 "
+                  "--encrypt_data_at_rest=$7 "
+                  "--tablet_copy_transfer_chunk_size_bytes=50",
+                  tablet_id_to_copy,
+                  source_tserver_rpc_addr,
+                  data_dirs,
+                  wal_dir,
+                  FLAGS_tablet_copy_support_download_superblock_in_batch,
+                  (kSuperblockSize / 4),
+                  (kSuperblockSize / 8),
+                  FLAGS_encrypt_data_at_rest), &stderr);
+  }
   // The size of superblock is larger than rpc_max_message_size, it will cause a network error.
   // Downloading superblock will fail.
   if (!FLAGS_tablet_copy_support_download_superblock_in_batch) {

Reply via email to