This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 2f9f62c7a74661b781550216302eb90833516ad7
Author: Alexey Serbin <[email protected]>
AuthorDate: Thu Aug 11 16:30:29 2022 -0700

    [tests] fix flakiness in TestTabletCopyEncryptedServers

    The TabletCopyITest.TestTabletCopyEncryptedServers scenario deletes
    a tablet, and then checks to see that the tablet data state is
    TABLET_DATA_COPYING. However, it's possible for the remote bootstrap
    to complete so quickly that it's already TABLET_DATA_READY at the time
    of sampling, so from time to time the test failed with

      src/kudu/integration-tests/tablet_copy-itest.cc:1014: Failure
      Failed
      Bad status: Timed out: Timed out after 30.002s waiting for correct tablet state: Illegal state: State TABLET_DATA_READY unexpected, expected TABLET_DATA_COPYING

    This patch updates the assertion to allow both the COPYING and READY
    tablet data states.

    Without the patch, the test was about 7% flaky [1].
    With the patch, it's not flaky [2].

    [1] http://dist-test.cloudera.org/job?job_id=aserbin.1660260668.94650
    [2] http://dist-test.cloudera.org/job?job_id=aserbin.1660261249.109365

    Change-Id: I22933cc9cb727711ee5fb45c811c2a759958fdfa
    Reviewed-on: http://gerrit.cloudera.org:8080/18842
    Tested-by: Alexey Serbin <[email protected]>
    Reviewed-by: Yingchun Lai <[email protected]>
    Reviewed-by: Abhishek Chennaka <[email protected]>
    Reviewed-by: Attila Bukor <[email protected]>
---
 src/kudu/integration-tests/tablet_copy-itest.cc | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc
index 5ba3f1ea9..ba3e4be45 100644
--- a/src/kudu/integration-tests/tablet_copy-itest.cc
+++ b/src/kudu/integration-tests/tablet_copy-itest.cc
@@ -27,6 +27,7 @@
 #include <set>
 #include <string>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -1003,15 +1004,17 @@ TEST_F(TabletCopyITest, TestTabletCopyEncryptedServers) {
   ExternalTabletServer* replica_ets = cluster_->tablet_server(2);
   TServerDetails* replica_ts = ts_map_[replica_ets->uuid()];
   ASSERT_OK(WaitForNumTabletsOnTS(replica_ts, 1, timeout, &tablets));
-  string tablet_id = tablets[0].tablet_status().tablet_id();
+  const auto& tablet_id = tablets[0].tablet_status().tablet_id();
 
   // Tombstone the follower.
   LOG(INFO) << "Tombstoning follower tablet " << tablet_id << " on TS " << replica_ts->uuid();
   ASSERT_OK(DeleteTablet(replica_ts, tablet_id, TABLET_DATA_TOMBSTONED, timeout));
 
-  // Wait for tablet copy to start.
-  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(2, tablet_id,
-                                                 { tablet::TABLET_DATA_COPYING }, timeout));
+  // Wait for tablet copy to start. The copying might complete fast or there
+  // might be some scheduler anomalies, so here it's necessary to count in
+  // TABLET_DATA_COPYING --> TABLET_DATA_READY transitions as well.
+  ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(
+      2, tablet_id, { tablet::TABLET_DATA_COPYING, tablet::TABLET_DATA_READY }, timeout));
   workload.StopAndJoin();
   ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
 
