Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness. patch by Branimir Lambov; reviewed by Sylvain Lebresne for CASSANDRA-12277.
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/a5cbb026 Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/a5cbb026 Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/a5cbb026 Branch: refs/heads/cassandra-3.9 Commit: a5cbb026195ace450a514db22a64dc9c7c8fa66e Parents: 3f9b9ff Author: Branimir Lambov <[email protected]> Authored: Tue Jul 26 15:19:51 2016 +0300 Committer: Sylvain Lebresne <[email protected]> Committed: Thu Jul 28 16:25:07 2016 +0200 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../ReplicationAwareTokenAllocatorTest.java | 15 ++++++ test/unit/org/apache/cassandra/Util.java | 56 ++++++++++++++++++++ 3 files changed, 72 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index caecefb..8f6997b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 3.0.9 + * Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness (CASSANDRA-12277) * Exception when computing read-repair for range tombstones (CASSANDRA-12263) * Lost counter writes in compact table and static columns (CASSANDRA-12219) * AssertionError with MVs on updating a row that isn't indexed due to a null value (CASSANDRA-12247) http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java ---------------------------------------------------------------------- diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java index 0a27e1c..1b36c55 100644 --- a/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java +++ b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java @@ -29,6 +29,7 @@ import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.junit.Test; +import org.apache.cassandra.Util; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; @@ -545,6 +546,20 @@ public class ReplicationAwareTokenAllocatorTest @Test public void testNewCluster() { + Util.flakyTest(this::flakyTestNewCluster, + 5, + "It tends to fail sometimes due to the random selection of the tokens in the first few nodes."); + } + + public void flakyTestNewCluster() + { + // This test is flaky because the selection of the tokens for the first RF nodes (which is random, with an + // uncontrolled seed) can sometimes cause a pathological situation where the algorithm will find a (close to) + // ideal distribution of tokens for some number of nodes, which in turn will inevitably cause it to go into a + // bad (unacceptable to the test criteria) distribution after adding one more node. + + // This should happen very rarely, unless something is broken in the token allocation code. + for (int rf = 2; rf <= 5; ++rf) { for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4) http://git-wip-us.apache.org/repos/asf/cassandra/blob/a5cbb026/test/unit/org/apache/cassandra/Util.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java index 7ce8f04..e7b1ffa 100644 --- a/test/unit/org/apache/cassandra/Util.java +++ b/test/unit/org/apache/cassandra/Util.java @@ -536,6 +536,62 @@ public class Util thread.join(10000); } + public static AssertionError runCatchingAssertionError(Runnable test) + { + try + { + test.run(); + return null; + } + catch (AssertionError e) + { + return e; + } + } + + /** + * Wrapper function used to run a test that can sometimes flake for uncontrollable reasons. + * + * If the given test fails on the first run, it is executed the given number of times again, expecting all secondary + * runs to succeed. If they do, the failure is understood as a flake and the test is treated as passing. + * + * Do not use this if the test is deterministic and its success is not influenced by external factors (such as time, + * selection of random seed, network failures, etc.). If the test can be made independent of such factors, it is + * probably preferable to do so rather than use this method. + * + * @param test The test to run. + * @param rerunsOnFailure How many times to re-run it if it fails. All reruns must pass. + * @param message Message to send to System.err on initial failure. + */ + public static void flakyTest(Runnable test, int rerunsOnFailure, String message) + { + AssertionError e = runCatchingAssertionError(test); + if (e == null) + return; // success + System.err.format("Test failed. %s%n" + + "Re-running %d times to verify it isn't failing more often than it should.%n" + + "Failure was: %s%n", message, rerunsOnFailure, e); + e.printStackTrace(); + + int rerunsFailed = 0; + for (int i = 0; i < rerunsOnFailure; ++i) + { + AssertionError t = runCatchingAssertionError(test); + if (t != null) + { + ++rerunsFailed; + e.addSuppressed(t); + } + } + if (rerunsFailed > 0) + { + System.err.format("Test failed in %d of the %d reruns.%n", rerunsFailed, rerunsOnFailure); + throw e; + } + + System.err.println("All reruns succeeded. Failure treated as flake."); + } + // for use with Optional in tests, can be used as an argument to orElseThrow public static Supplier<AssertionError> throwAssert(final String message) {
