Repository: kudu Updated Branches: refs/heads/master 801d00536 -> c9f1772e7
create-table-stress-test: deflake TestConcurrentCreateTableAndReloadMetadata First, if the ASSERT_OK fired, the test would hang. No bueno. Second, in TSAN environments the test would sometimes fail with a table already present error, typically happening either 30s or 60s after that table was created. I believe this is due to a client-side timeout _after_ the table was created but _before_ its RPC response was sent. We could try to address this by bumping the client's timeout, but because reload_metadata_thread is DoS'ing the cluster, that doesn't seem robust. Instead, let's have the thread ease up in TSAN environments. Without the change, the test failed 4/1000 times (--stress_cpu_threads=4). With the change, the test failed 0/1000 times (--stress_cpu_threads=8). Change-Id: I607a8e278e8e8f370cda512f365a97fbb81e1ae3 Reviewed-on: http://gerrit.cloudera.org:8080/11523 Reviewed-by: Alexey Serbin <[email protected]> Tested-by: Kudu Jenkins Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/c9f1772e Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/c9f1772e Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/c9f1772e Branch: refs/heads/master Commit: c9f1772e7683d8e2d2467c565091aeb8d12d6af3 Parents: 801d005 Author: Adar Dembo <[email protected]> Authored: Wed Sep 26 11:11:59 2018 -0700 Committer: Adar Dembo <[email protected]> Committed: Wed Sep 26 21:35:28 2018 +0000 ---------------------------------------------------------------------- .../create-table-stress-test.cc | 23 ++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/c9f1772e/src/kudu/integration-tests/create-table-stress-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/create-table-stress-test.cc b/src/kudu/integration-tests/create-table-stress-test.cc index 35cbdf0..3bf6c58 100644 --- a/src/kudu/integration-tests/create-table-stress-test.cc +++ b/src/kudu/integration-tests/create-table-stress-test.cc @@ -53,6 +53,7 @@ #include "kudu/util/monotime.h" #include "kudu/util/net/sockaddr.h" #include "kudu/util/pb_util.h" +#include "kudu/util/scoped_cleanup.h" #include "kudu/util/status.h" #include "kudu/util/stopwatch.h" #include "kudu/util/test_macros.h" @@ -359,10 +360,20 @@ TEST_F(CreateTableStressTest, TestConcurrentCreateTableAndReloadMetadata) { CHECK_OK(cluster_->mini_master()->master()->catalog_manager()-> VisitTablesAndTablets()); - // Give table creation a chance to run. - SleepFor(MonoDelta::FromMilliseconds(1 + rand() % 10)); + // Give table creation a chance to run. TSAN is especially brutal; so + // let's sleep for longer in such environments. +#ifdef THREAD_SANITIZER + int sleep_ms = 10 + rand() % 91; +#else + int sleep_ms = 1 + rand() % 10; +#endif + SleepFor(MonoDelta::FromMilliseconds(sleep_ms)); } }); + SCOPED_CLEANUP({ + stop.Store(true); + reload_metadata_thread.join(); + }); for (int num_tables_created = 0; num_tables_created < 20;) { string table_name = Substitute("test-$0", num_tables_created); @@ -388,6 +399,12 @@ TEST_F(CreateTableStressTest, TestConcurrentCreateTableAndReloadMetadata) { // it's hard do deduce the universal timeout constant and we prefer to // not introduce test flakiness. // + // Note: on timeout, we don't actually know with certainty whether the + // table was created or not. It's possible for the RPC to be accepted but + // processed very slowly by the master, for the client to eventually give + // up and time out the request, all while the master continues and + // eventually finishes creating the table. + // // TODO(aserbin): update the test keeping its racy essence but making it // cleaner regarding this timeout&retry workaround. continue; @@ -395,8 +412,6 @@ TEST_F(CreateTableStressTest, TestConcurrentCreateTableAndReloadMetadata) { ASSERT_OK(s); num_tables_created++; } - stop.Store(true); - reload_metadata_thread.join(); } } // namespace kudu
