This is an automated email from the ASF dual-hosted git repository. adar pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit c550fe8f55ee0a4c6838d246e00e541ef4cff460 Author: Adar Dembo <[email protected]> AuthorDate: Mon Apr 1 21:59:21 2019 -0700 build: adapt new Java flaky test infrastructure to existing controls Now that Java tests are reporting success/failure, we can use the existing flaky test controls to drive it. As a refresher, the C++ tests rely on these environment variables: - RUN_FLAKY_ONLY: whether to run just flaky tests or all tests - KUDU_FLAKY_TEST_ATTEMPTS: number of attempts for flaky tests - KUDU_FLAKY_TEST_LIST: path to list of flaky tests, one on each line - KUDU_RETRY_ALL_FAILED_TESTS: whether to retry all tests or just the ones in the flaky test list The algorithm is roughly: if RUN_FLAKY_ONLY or KUDU_FLAKY_TEST_ATTEMPTS > 1: populate KUDU_FLAKY_TEST_LIST from test result server if RUN_FLAKY_ONLY: testset = tests listed in KUDU_FLAKY_TEST_LIST else: testset = all tests for t in testset: if KUDU_RETRY_ALL_FAILED_TESTS or (KUDU_FLAKY_TEST_LIST and t in KUDU_FLAKY_TEST_LIST): num_attempts = KUDU_FLAKY_TEST_ATTEMPTS (or 1 if unset) else: num_attempts = 1 run t up to num_attempts times You can see it at work in build-and-test.sh/run-test.sh. You can also see it in dist-test.py though notably, it doesn't care about RUN_FLAKY_ONLY because we never used that particular combination (presumably the list of flaky tests is short enough that it wouldn't benefit from distributed testing). This patch attempts to mirror these exact semantics for Java tests. Here are the interesting changes: - In RetryRule, rerunFailingTestsCount is gone. The behavior is informed via the aforementioned environment variables instead. - In build-and-test.sh, if RUN_FLAKY_ONLY is set, parse the flaky test list into a series of --tests gradle command line arguments. - In dist-test.py, opt into the C++ flaky test handling (which reflects the above algorithm). There are also some small changes to flaky handling to accommodate Java's per-method flaky test tracking. Note: all of this assumes that there's no overlap between the names of any C++ or Java tests, which is currently true as all C++ tests have names like "tablet-test" or "master_cert_authority-itest" while all Java tests are prefixed with "org.apache.kudu...". If this were to change, we'd need to properly "namespace" the test results in the reporting infrastructure and fetch the flaky test lists separately for C++ and Java tests. For now there's just one flaky test list, and both ctest and gradle are OK with being asked to run irrelevant tests (they'll just be ignored). Change-Id: Ia89598d7eeb5ab642ab4ebb7aa583adcce770eae Reviewed-on: http://gerrit.cloudera.org:8080/12917 Reviewed-by: Grant Henke <[email protected]> Tested-by: Adar Dembo <[email protected]> --- build-support/dist_test.py | 40 ++++++-- build-support/jenkins/build-and-test.sh | 31 +++++-- java/gradle/tests.gradle | 3 - .../java/org/apache/kudu/test/junit/RetryRule.java | 102 ++++++++++++++++++--- 4 files changed, 145 insertions(+), 31 deletions(-) diff --git a/build-support/dist_test.py b/build-support/dist_test.py index 992da9c..2e207f5 100755 --- a/build-support/dist_test.py +++ b/build-support/dist_test.py @@ -70,13 +70,17 @@ GRADLE_FLAGS = os.environ.get('EXTRA_GRADLE_FLAGS', "") PATH_TO_REPO = "../" # Matches the command line listings in 'ctest -V -N'. For example: -# 262: Test command: /src/kudu/build-support/run-test.sh "/src/kudu/build/debug/bin/jsonwriter-test" +# 262: Test command: /src/kudu/build-support/run-test.sh "/src/kudu/build/debug/bin/jsonwriter-test" TEST_COMMAND_RE = re.compile('Test command: (.+)$') # Matches the environment variable listings in 'ctest -V -N'. For example: # 262: GTEST_TOTAL_SHARDS=1 TEST_ENV_RE = re.compile('^\d+: (\S+)=(.+)') +# Matches test names that have a shard suffix. For example: +# master-stress-test.8 +TEST_SHARD_RE = re.compile("\.\d+$") + DEPS_FOR_ALL = \ ["build-support/stacktrace_addr2line.pl", "build-support/run-test.sh", @@ -347,6 +351,10 @@ def create_task_json(staging, If 'replicate_tasks' is higher than one, each .isolate file will be submitted multiple times. This can be useful for looping tests. + + The test name is compared with the contents of 'flaky_test_set' to decide + how many times the execution service should retry the test on failure. + Alternatively, if 'retry_all_tests' is True, all tests will be retried. """ tasks = [] with file(staging.archive_dump_path(), "r") as isolate_dump: @@ -356,9 +364,9 @@ def create_task_json(staging, # the dumped JSON. Others list it in an 'items' dictionary. items = inmap.get('items', inmap) for k, v in items.iteritems(): - # The key is 'foo-test.<shard>'. So, chop off the last component - # to get the test name - test_name = ".".join(k.split(".")[:-1]) + # The key may be 'foo-test.<shard>'. So, chop off the last component + # to get the test name. + test_name = ".".join(k.split(".")[:-1]) if TEST_SHARD_RE.search(k) else k max_retries = 0 if test_name in flaky_test_set or retry_all_tests: max_retries = FLAKY_TEST_RETRIES @@ -426,7 +434,20 @@ def get_flakies(): path = os.getenv('KUDU_FLAKY_TEST_LIST') if not path: return set() - return set(l.strip() for l in file(path)) + + # dist-test can only retry tests on a per-class basis, but + # KUDU_FLAKY_TEST_LIST lists Java flakes on a per-method basis. + flaky_classes = [] + with open(path) as f: + for l in f: + l = l.strip() + if '.' in l: + # "o.a.k.client.TestKuduClient.testFoo" -> "o.a.k.client.TestKuduClient" + flaky_classes.append('.'.join(l.split('.')[:-1])) + else: + flaky_classes.append(l) + + return set(flaky_classes) def run_tests(parser, options): """ @@ -545,10 +566,11 @@ def run_java_tests(parser, options): cwd=rel_to_abs("java")) staging = StagingDir(rel_to_abs("java/build/dist-test")) run_isolate(staging) - # TODO(ghenke): Add Java tests to the flaky dashboard - # KUDU_FLAKY_TEST_LIST doesn't included Java tests. - # Instead we will retry all Java tests in case they are flaky. - create_task_json(staging, 1, retry_all_tests=True) + retry_all = RETRY_ALL_TESTS > 0 + create_task_json(staging, + flaky_test_set=get_flakies(), + replicate_tasks=1, + retry_all_tests=retry_all) submit_tasks(staging, options) def loop_java_test(parser, options): diff --git a/build-support/jenkins/build-and-test.sh b/build-support/jenkins/build-and-test.sh index 23b20e4..4eaf556 100755 --- a/build-support/jenkins/build-and-test.sh +++ b/build-support/jenkins/build-and-test.sh @@ -223,6 +223,13 @@ if [ "$RUN_FLAKY_ONLY" == "1" -o "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then fi if [ "$RUN_FLAKY_ONLY" == "1" ]; then + # TODO(adar): we can't yet pass the flaky test list into the dist-test + # machinery (in order to control which tests are executed). + if [ "$ENABLE_DIST_TEST" == "1" ]; then + echo "Distributed testing is incompatible with RUN_FLAKY_ONLY=1" + exit 1 + fi + test_regex=$(perl -e ' chomp(my @lines = <>); print join("|", map { "^" . quotemeta($_) } @lines); @@ -232,14 +239,25 @@ if [ "$RUN_FLAKY_ONLY" == "1" -o "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then exit 0 fi + # Set up ctest/gradle to run only those tests found in the flaky test list. + # + # Note: the flaky test list contains both C++ and Java tests and we pass it + # in its entirety to both ctest and gradle. This is safe because: + # 1. There are no test name collisions between C++ and Java tests. + # 2. Both ctest and gradle will happily ignore tests they can't find. + # + # If either of these assumptions changes, we'll need to explicitly split the + # test list into two lists, either here or in the test result server. EXTRA_TEST_FLAGS="$EXTRA_TEST_FLAGS -R $test_regex" + while IFS="" read t || [ -n "$t" ] + do + EXTRA_GRADLE_TEST_FLAGS="--tests $t $EXTRA_GRADLE_TEST_FLAGS" + done < $KUDU_FLAKY_TEST_LIST - # We don't support detecting java and python flaky tests at the moment. - echo "RUN_FLAKY_ONLY=1: running flaky tests only,"\ - "disabling Java and python builds." + # We don't support detecting python flaky tests at the moment. + echo "RUN_FLAKY_ONLY=1: running flaky tests only, disabling python build." BUILD_PYTHON=0 BUILD_PYTHON3=0 - BUILD_JAVA=0 elif [ "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then echo Will retry the flaky tests up to $KUDU_FLAKY_TEST_ATTEMPTS times. fi @@ -373,7 +391,7 @@ EXIT_STATUS=0 FAILURES="" # If we're running distributed C++ tests, submit them asynchronously while -# we run the Java and Python tests. +# we run any local tests. if [ "$ENABLE_DIST_TEST" == "1" ]; then echo echo Submitting C++ distributed-test job. @@ -424,7 +442,6 @@ if [ "$BUILD_JAVA" == "1" ]; then export EXTRA_GRADLE_FLAGS="--console=plain" EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS --no-daemon" EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS --continue" - EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS -DrerunFailingTestsCount=3" # KUDU-2524: temporarily disable scalafmt until we can work out its JDK # incompatibility issue. EXTRA_GRADLE_FLAGS="$EXTRA_GRADLE_FLAGS -DskipFormat" @@ -443,7 +460,7 @@ if [ "$BUILD_JAVA" == "1" ]; then fi else # TODO: Run `gradle check` in BUILD_TYPE DEBUG when static code analysis is fixed - if ! ./gradlew $EXTRA_GRADLE_FLAGS clean test ; then + if ! ./gradlew $EXTRA_GRADLE_FLAGS clean test $EXTRA_GRADLE_TEST_FLAGS; then TESTS_FAILED=1 FAILURES="$FAILURES"$'Java Gradle build/test failed\n' fi diff --git a/java/gradle/tests.gradle b/java/gradle/tests.gradle index 2f43d96..2b541c4 100644 --- a/java/gradle/tests.gradle +++ b/java/gradle/tests.gradle @@ -62,9 +62,6 @@ tasks.withType(Test) { systemProperty "java.net.preferIPv4Stack", true systemProperty "java.security.egd", "file:/dev/urandom" // Improve RNG generation speed. - // Set rerunFailingTestsCount for use in BaseKuduTest.java to rerun failing tests. - systemProperty "rerunFailingTestsCount", propertyWithDefault("rerunFailingTestsCount", 0) - // Set kuduBinDir to the binaries to use with the MiniKuduCluster. systemProperty "kuduBinDir", propertyWithDefault("kuduBinDir", "$project.rootDir/../build/latest/bin") diff --git a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java index 31d8a95..a4611db 100644 --- a/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java +++ b/java/kudu-test-utils/src/main/java/org/apache/kudu/test/junit/RetryRule.java @@ -16,6 +16,8 @@ // under the License. package org.apache.kudu.test.junit; +import com.google.common.base.Preconditions; + import org.apache.kudu.test.CapturingToFileLogAppender; import org.apache.yetus.audience.InterfaceAudience; import org.apache.yetus.audience.InterfaceStability; @@ -24,15 +26,23 @@ import org.junit.runner.Description; import org.junit.runners.model.Statement; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; import java.io.Closeable; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.HashSet; +import java.util.Set; + +import static java.nio.charset.StandardCharsets.UTF_8; /** * JUnit rule to retry failed tests. * - * The number of retries is controlled by the "rerunFailingTestsCount" system - * property, mimicking Surefire in that regard. + * Uses the KUDU_FLAKY_TEST_LIST and KUDU_RETRY_ALL_FAILED_TESTS environment + * variables to determine whether a test should be retried, and the + * KUDU_FLAKY_TEST_ATTEMPTS environment variable to determine how many times. * * By default will use ResultReporter to report success/failure of each test * attempt to an external server; this may be skipped if desired. @@ -40,44 +50,112 @@ import java.io.IOException; @InterfaceAudience.Private @InterfaceStability.Unstable public class RetryRule implements TestRule { - - private static final String RETRY_PROP = "rerunFailingTestsCount"; - private static final Logger LOG = LoggerFactory.getLogger(RetryRule.class); + private static final int DEFAULT_RETRY_COUNT = 0; + private static final Set<String> FLAKY_TESTS = new HashSet<>(); private final int retryCount; private final ResultReporter reporter; + static { + // Initialize the flaky test set if it exists. The file wil have one test + // name per line. + String value = System.getenv("KUDU_FLAKY_TEST_LIST"); + if (value != null) { + try (BufferedReader br = Files.newBufferedReader(Paths.get(value), UTF_8)) { + for (String l = br.readLine(); l != null; l = br.readLine()) { + FLAKY_TESTS.add(l); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + public RetryRule() { - this(Integer.getInteger(RETRY_PROP, 0), /*skipReporting=*/ false); + this(DEFAULT_RETRY_COUNT, /*skipReporting=*/ false); } @InterfaceAudience.LimitedPrivate("Test") RetryRule(int retryCount, boolean skipReporting) { + Preconditions.checkArgument(retryCount >= 0); this.retryCount = retryCount; this.reporter = skipReporting ? null : new ResultReporter(); } + private static boolean retryAllTests() { + String value = System.getenv("KUDU_RETRY_ALL_FAILED_TESTS"); + return value != null && !value.isEmpty(); + } + + private static boolean retryThisTest(String humanReadableTestName) { + return FLAKY_TESTS.contains(humanReadableTestName); + } + + private static int getActualRetryCount() { + String value = System.getenv("KUDU_FLAKY_TEST_ATTEMPTS"); + if (value == null) { + return DEFAULT_RETRY_COUNT; + } + try { + int val = Integer.parseInt(value); + if (val < 1) { + throw new NumberFormatException( + String.format("expected non-zero positive value, got %d", val)); + } + + // Convert from number of "attempts" to number of "retries". + return Integer.parseInt(value) - 1; + } catch (NumberFormatException e) { + LOG.warn("Could not parse KUDU_FLAKY_TEST_ATTEMPTS, using default value ({})", + DEFAULT_RETRY_COUNT, e); + return DEFAULT_RETRY_COUNT; + } + } + @Override public Statement apply(Statement base, Description description) { - return new RetryStatement(base, description, retryCount, reporter); + String humanReadableTestName = + description.getClassName() + "." + description.getMethodName(); + + // Retrying and reporting are independent; the RetryStatement is used if + // either is enabled. We'll retry the test under one of the following + // circumstances: + // + // 1. The RetryRule was constructed with an explicit retry count. + // 2. We've been asked to retry all tests via KUDU_RETRY_ALL_FAILED_TESTS. + // 3. We've been asked to retry this test via KUDU_FLAKY_TEST_LIST. + // + // In the latter two cases, we consult KUDU_FLAKY_TEST_ATTEMPTS for the retry count. + boolean retryExplicit = retryCount != DEFAULT_RETRY_COUNT; + boolean retryAll = retryAllTests(); + boolean retryThis = retryThisTest(humanReadableTestName); + if (retryExplicit || retryAll || retryThis || reporter != null) { + int actualRetryCount = (retryAll || retryThis) ? getActualRetryCount() : retryCount; + LOG.info("Creating RetryStatement {} result reporter and retry count of {} ({})", + reporter != null ? "with" : "without", + actualRetryCount, + retryExplicit ? "explicit" : + retryAll ? "all tests" : + retryThis ? "this test" : "no retries"); + return new RetryStatement(base, actualRetryCount, reporter, humanReadableTestName); + } + return base; } private static class RetryStatement extends Statement { private final Statement base; - private final Description description; private final int retryCount; private final ResultReporter reporter; private final String humanReadableTestName; - RetryStatement(Statement base, Description description, - int retryCount, ResultReporter reporter) { + RetryStatement(Statement base, int retryCount, ResultReporter reporter, + String humanReadableTestName) { this.base = base; - this.description = description; this.retryCount = retryCount; this.reporter = reporter; - this.humanReadableTestName = description.getClassName() + "." + description.getMethodName(); + this.humanReadableTestName = humanReadableTestName; } private void report(ResultReporter.Result result, File logFile) {
