This is an automated email from the ASF dual-hosted git repository.

achennaka pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new dc64df6c1 [tests] fix flakiness in TestSpaceAvailableMetrics
dc64df6c1 is described below

commit dc64df6c1219dc2c4e930f6ea8d5106d3c7fb5cf
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Dec 19 19:27:42 2023 -0800

    [tests] fix flakiness in TestSpaceAvailableMetrics
    
    I saw TabletServerDiskErrorITest.TestSpaceAvailableMetrics failing
    with an error like below in a pre-commit test run [1] (TSAN build):
    
      src/kudu/integration-tests/disk_failure-itest.cc:352: Failure
      Expected equality of these values:
        wal_dir_space
          Which is: 40835236209
        wal_dir_space_refetch
          Which is: 40767713649
    
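    This patch addresses the issue by wrapping the assertions on the two
    consecutive metric snapshots into an ASSERT_EVENTUALLY block: the cached
    values can be refreshed between the two snapshots.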
    
    [1] http://dist-test.cloudera.org/job?job_id=jenkins-slave.1703028347.1027630
    
    Change-Id: Iec10ca3d19aa5e378cebd3ceef6368496797229f
    Reviewed-on: http://gerrit.cloudera.org:8080/20821
    Tested-by: Alexey Serbin <[email protected]>
    Reviewed-by: Abhishek Chennaka <[email protected]>
---
 src/kudu/integration-tests/disk_failure-itest.cc | 46 +++++++++++++++---------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/kudu/integration-tests/disk_failure-itest.cc b/src/kudu/integration-tests/disk_failure-itest.cc
index 3cbcddc62..c2e677068 100644
--- a/src/kudu/integration-tests/disk_failure-itest.cc
+++ b/src/kudu/integration-tests/disk_failure-itest.cc
@@ -332,38 +332,50 @@ TEST_P(TabletServerDiskErrorITest, TestSpaceAvailableMetrics) {
   // flags --fs_wal_dir_available_space_cache_seconds and
   // --fs_data_dirs_available_space_cache_seconds.
   const auto get_metrics = [&] (int64_t* wal_dir_space, int64_t* data_dir_space) {
-    RETURN_NOT_OK(itest::GetInt64Metric(cluster_->tablet_server(0)->bound_http_hostport(),
+    const auto& addr = cluster_->tablet_server(0)->bound_http_hostport();
+    RETURN_NOT_OK(itest::GetInt64Metric(addr,
                                         &METRIC_ENTITY_server, nullptr,
                                         &METRIC_wal_dir_space_available_bytes, "value",
                                         wal_dir_space));
-    return itest::GetInt64Metric(cluster_->tablet_server(0)->bound_http_hostport(),
+    return itest::GetInt64Metric(addr,
                                  &METRIC_ENTITY_server, nullptr,
                                  &METRIC_data_dirs_space_available_bytes, "value",
                                  data_dir_space);
-    };
-  int64_t wal_dir_space;
-  int64_t data_dir_space;
-  int64_t wal_dir_space_refetch;
-  int64_t data_dir_space_refetch;
-  ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
-  ASSERT_GT(wal_dir_space, 0);
-  ASSERT_GT(data_dir_space, 0);
-  ASSERT_OK(get_metrics(&wal_dir_space_refetch, &data_dir_space_refetch));
-  ASSERT_EQ(wal_dir_space, wal_dir_space_refetch);
-  ASSERT_EQ(data_dir_space, data_dir_space_refetch);
+  };
+
+  // Even with 10 seconds caching interval, it can happen that two metric
+  // snapshots are captured at different caching epochs, i.e. the metrics are
+  // refreshed in the cache between the two snapshots.  To avoid flakiness
+  // on busy test nodes due to scheduling anomalies or when running TSAN/ASAN
+  // binaries, the assertions below are wrapped into ASSERT_EVENTUALLY block.
+  ASSERT_EVENTUALLY([&] {
+    int64_t wal_dir_space = 0;
+    int64_t data_dir_space = 0;
+    ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
+    ASSERT_GT(wal_dir_space, 0);
+    ASSERT_GT(data_dir_space, 0);
+
+    int64_t wal_dir_space_refetch = 0;
+    int64_t data_dir_space_refetch = 0;
+    ASSERT_OK(get_metrics(&wal_dir_space_refetch, &data_dir_space_refetch));
+    ASSERT_EQ(wal_dir_space, wal_dir_space_refetch);
+    ASSERT_EQ(data_dir_space, data_dir_space_refetch);
+  });
 
   ExternalTabletServer* error_ts = cluster_->tablet_server(0);
 
-  // Inject EIO into one of the data directories and check if data_dirs_space_available_bytes
-  // now equals to -1
+  // Inject EIO into one of the data directories and check if
+  // data_dirs_space_available_bytes now equals to -1.
   error_ts->mutable_flags()->emplace_back(
-            Substitute("--env_inject_eio_globs=$0", JoinPathSegments(error_ts->data_dirs()[1],
-                                                                               "**")));
+      Substitute("--env_inject_eio_globs=$0",
+                 JoinPathSegments(error_ts->data_dirs()[1], "**")));
   error_ts->mutable_flags()->emplace_back("--env_inject_eio=1.0");
 
   error_ts->Shutdown();
   ASSERT_OK(error_ts->Restart());
 
+  int64_t wal_dir_space = 0;
+  int64_t data_dir_space = 0;
   ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
   ASSERT_NE(wal_dir_space, -1);
   ASSERT_EQ(data_dir_space, -1);

Reply via email to