This is an automated email from the ASF dual-hosted git repository.
achennaka pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new dc64df6c1 [tests] fix flakiness in TestSpaceAvailableMetrics
dc64df6c1 is described below
commit dc64df6c1219dc2c4e930f6ea8d5106d3c7fb5cf
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Dec 19 19:27:42 2023 -0800
[tests] fix flakiness in TestSpaceAvailableMetrics
I saw TabletServerDiskErrorITest.TestSpaceAvailableMetrics failing
with an error like below in a pre-commit test run [1] (TSAN build):
src/kudu/integration-tests/disk_failure-itest.cc:352: Failure
Expected equality of these values:
wal_dir_space
Which is: 40835236209
wal_dir_space_refetch
Which is: 40767713649
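This patch addresses the issue by wrapping the two metric fetches and their
assertions into an ASSERT_EVENTUALLY block, so a cache refresh that happens
between the two snapshots no longer fails the test.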
[1] http://dist-test.cloudera.org/job?job_id=jenkins-slave.1703028347.1027630
Change-Id: Iec10ca3d19aa5e378cebd3ceef6368496797229f
Reviewed-on: http://gerrit.cloudera.org:8080/20821
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Abhishek Chennaka <[email protected]>
---
src/kudu/integration-tests/disk_failure-itest.cc | 46 +++++++++++++++---------
1 file changed, 29 insertions(+), 17 deletions(-)
diff --git a/src/kudu/integration-tests/disk_failure-itest.cc b/src/kudu/integration-tests/disk_failure-itest.cc
index 3cbcddc62..c2e677068 100644
--- a/src/kudu/integration-tests/disk_failure-itest.cc
+++ b/src/kudu/integration-tests/disk_failure-itest.cc
@@ -332,38 +332,50 @@ TEST_P(TabletServerDiskErrorITest, TestSpaceAvailableMetrics) {
// flags --fs_wal_dir_available_space_cache_seconds and
// --fs_data_dirs_available_space_cache_seconds.
const auto get_metrics = [&] (int64_t* wal_dir_space, int64_t* data_dir_space) {
- RETURN_NOT_OK(itest::GetInt64Metric(cluster_->tablet_server(0)->bound_http_hostport(),
+ const auto& addr = cluster_->tablet_server(0)->bound_http_hostport();
+ RETURN_NOT_OK(itest::GetInt64Metric(addr,
&METRIC_ENTITY_server, nullptr,
&METRIC_wal_dir_space_available_bytes,
"value",
wal_dir_space));
- return itest::GetInt64Metric(cluster_->tablet_server(0)->bound_http_hostport(),
+ return itest::GetInt64Metric(addr,
&METRIC_ENTITY_server, nullptr,
&METRIC_data_dirs_space_available_bytes,
"value",
data_dir_space);
- };
- int64_t wal_dir_space;
- int64_t data_dir_space;
- int64_t wal_dir_space_refetch;
- int64_t data_dir_space_refetch;
- ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
- ASSERT_GT(wal_dir_space, 0);
- ASSERT_GT(data_dir_space, 0);
- ASSERT_OK(get_metrics(&wal_dir_space_refetch, &data_dir_space_refetch));
- ASSERT_EQ(wal_dir_space, wal_dir_space_refetch);
- ASSERT_EQ(data_dir_space, data_dir_space_refetch);
+ };
+
+ // Even with 10 seconds caching interval, it can happen that two metric
+ // snapshots are captured at different caching epochs, i.e. the metrics are
+ // refreshed in the cache between the two snapshots. To avoid flakiness
+ // on busy test nodes due to scheduling anomalies or when running TSAN/ASAN
+ // binaries, the assertions below are wrapped into ASSERT_EVENTUALLY block.
+ ASSERT_EVENTUALLY([&] {
+ int64_t wal_dir_space = 0;
+ int64_t data_dir_space = 0;
+ ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
+ ASSERT_GT(wal_dir_space, 0);
+ ASSERT_GT(data_dir_space, 0);
+
+ int64_t wal_dir_space_refetch = 0;
+ int64_t data_dir_space_refetch = 0;
+ ASSERT_OK(get_metrics(&wal_dir_space_refetch, &data_dir_space_refetch));
+ ASSERT_EQ(wal_dir_space, wal_dir_space_refetch);
+ ASSERT_EQ(data_dir_space, data_dir_space_refetch);
+ });
ExternalTabletServer* error_ts = cluster_->tablet_server(0);
- // Inject EIO into one of the data directories and check if data_dirs_space_available_bytes
- // now equals to -1
+ // Inject EIO into one of the data directories and check if
+ // data_dirs_space_available_bytes now equals to -1.
error_ts->mutable_flags()->emplace_back(
- Substitute("--env_inject_eio_globs=$0", JoinPathSegments(error_ts->data_dirs()[1],
- "**")));
+ Substitute("--env_inject_eio_globs=$0",
+ JoinPathSegments(error_ts->data_dirs()[1], "**")));
error_ts->mutable_flags()->emplace_back("--env_inject_eio=1.0");
error_ts->Shutdown();
ASSERT_OK(error_ts->Restart());
+ int64_t wal_dir_space = 0;
+ int64_t data_dir_space = 0;
ASSERT_OK(get_metrics(&wal_dir_space, &data_dir_space));
ASSERT_NE(wal_dir_space, -1);
ASSERT_EQ(data_dir_space, -1);