KUDU-2394. Fix flaky RpcStubTest.TestDontHandleTimedOutCalls This test was flaky under stress+ASAN with three cases of failures:
(1) most commonly, the assertion to check that an RPC had timed out in the queue would run too quickly, just before the actual metric was incremented. Changing it to ASSERT_EVENTUALLY got rid of most of the failures. (2) in a few cases, the 100ms sleeps were processed completely before the later RPC was processed, so the RPC that was supposed to time out in the queue actually had a chance to run. Switching to 1000ms sleeps got rid of the remaining failure case. (3) in a few other cases, the call that was supposed to time out (with a 1ms timeout) timed out before it was even sent to the server. This patch bumps the timeout to 5ms and retries that until it succeeds with a server-side timeout. Ran 5000x with the following command: ./build-support/dist_test.py loop -n 500 build/latest/bin/rpc_stub-test \ --gtest_filter=\*DontHandle\* --gtest_repeat=10 --stress-cpu-threads=8 and got no failures after this patch, whereas I'd usually get 5-10 failures prior. Change-Id: If8cfebb5e9160fb11586abdbba6fc1ae06574a2c Reviewed-on: http://gerrit.cloudera.org:8080/9897 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo <a...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f3913658 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f3913658 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f3913658 Branch: refs/heads/master Commit: f3913658cf7a65668f2f9592122190aa0b093751 Parents: a9bc168 Author: Todd Lipcon <t...@apache.org> Authored: Fri Mar 30 12:03:53 2018 -0700 Committer: Todd Lipcon <t...@apache.org> Committed: Tue Apr 17 17:16:59 2018 +0000 ---------------------------------------------------------------------- src/kudu/rpc/rpc_stub-test.cc | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/f3913658/src/kudu/rpc/rpc_stub-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/rpc/rpc_stub-test.cc b/src/kudu/rpc/rpc_stub-test.cc index d5b85d1..e626276 100644 --- a/src/kudu/rpc/rpc_stub-test.cc +++ b/src/kudu/rpc/rpc_stub-test.cc @@ -62,6 +62,7 @@ #include "kudu/util/status.h" #include "kudu/util/subprocess.h" #include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" #include "kudu/util/thread_restrictions.h" #include "kudu/util/user.h" @@ -435,7 +436,7 @@ TEST_F(RpcStubTest, TestDontHandleTimedOutCalls) { for (int i = 0; i < n_worker_threads_; i++) { gscoped_ptr<AsyncSleep> sleep(new AsyncSleep); sleep->rpc.set_timeout(MonoDelta::FromSeconds(1)); - sleep->req.set_sleep_micros(100*1000); // 100ms + sleep->req.set_sleep_micros(1000*1000); // 1sec p.SleepAsync(sleep->req, &sleep->resp, &sleep->rpc, boost::bind(&CountDownLatch::CountDown, &sleep->latch)); sleeps.push_back(sleep.release()); @@ -453,21 +454,34 @@ TEST_F(RpcStubTest, TestDontHandleTimedOutCalls) { // Send another call with a short timeout. This shouldn't get processed, because // it'll get stuck in the queue for longer than its timeout. - RpcController rpc; - SleepRequestPB req; - SleepResponsePB resp; - req.set_sleep_micros(1000); - rpc.set_timeout(MonoDelta::FromMilliseconds(1)); - Status s = p.Sleep(req, &resp, &rpc); - ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + ASSERT_EVENTUALLY([&]() { + RpcController rpc; + SleepRequestPB req; + SleepResponsePB resp; + req.set_sleep_micros(1); // unused but required. + rpc.set_timeout(MonoDelta::FromMilliseconds(5)); + Status s = p.Sleep(req, &resp, &rpc); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + // Since our timeout was short, it's possible in rare circumstances + // that we time out the RPC on the outbound queue, in which case + // we won't trigger the desired behavior here. In that case, the + // timeout error status would have the string 'ON_OUTBOUND_QUEUE' + // instead of 'SENT', so this assertion would fail and cause the + // ASSERT_EVENTUALLY to loop. + ASSERT_STR_CONTAINS(s.ToString(), "SENT"); + }); for (AsyncSleep* s : sleeps) { s->latch.Wait(); } // Verify that the timedout call got short circuited before being processed. + // We may need to loop a short amount of time as we are racing with the reactor + // thread to process the remaining elements of the queue. const Counter* timed_out_in_queue = service_pool_->RpcsTimedOutInQueueMetricForTests(); - ASSERT_EQ(1, timed_out_in_queue->value()); + ASSERT_EVENTUALLY([&]{ + ASSERT_EQ(1, timed_out_in_queue->value()); + }); } // Test which ensures that the RPC queue accepts requests with the earliest