HBASE-16775 Fix flaky TestExportSnapshot#testExportRetry. Reason for flakyness: Current test is probability based fault injection and triggers failure 3% of the time. Earlier when test used LocalJobRunner which didn't honor "mapreduce.map.maxattempts", it'd pass 97% time (when no fault is injected) and fail 3% time (when fault was injected). Point being, even when the test was complete wrong, we couldn't catch it because it was probability based.
This change will inject fault in a deterministic manner. On design side, it encapsulates all testing hooks in ExportSnapshot.java into single inner class. Change-Id: Icba866e1d56a5281748df89f4dd374bc45bad249 Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/da5fb27e Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/da5fb27e Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/da5fb27e Branch: refs/heads/hbase-12439 Commit: da5fb27eabed4a4b4d251be973ee945fb52895bf Parents: cf3215d Author: Apekshit Sharma <a...@apache.org> Authored: Thu Oct 6 14:20:58 2016 -0700 Committer: Apekshit Sharma <a...@apache.org> Committed: Wed Apr 12 11:11:31 2017 -0700 ---------------------------------------------------------------------- .../hadoop/hbase/snapshot/ExportSnapshot.java | 58 +++++++------- .../hbase/snapshot/TestExportSnapshot.java | 84 +++++++++++--------- .../snapshot/TestExportSnapshotNoCluster.java | 2 +- .../hbase/snapshot/TestMobExportSnapshot.java | 7 +- .../snapshot/TestMobSecureExportSnapshot.java | 7 +- .../snapshot/TestSecureExportSnapshot.java | 7 +- 6 files changed, 93 insertions(+), 72 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/ExportSnapshot.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/ExportSnapshot.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/ExportSnapshot.java index e2086e9..e3ad951 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/ExportSnapshot.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/ExportSnapshot.java @@ -29,7 +29,6 @@ import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; -import java.util.Random; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; @@ -110,9 +109,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { private static final String CONF_BANDWIDTH_MB = "snapshot.export.map.bandwidth.mb"; protected static final String CONF_SKIP_TMP = "snapshot.export.skip.tmp"; - static final String CONF_TEST_FAILURE = "test.snapshot.export.failure"; - static final String CONF_TEST_RETRY = "test.snapshot.export.failure.retry"; - + static class Testing { + static final String CONF_TEST_FAILURE = "test.snapshot.export.failure"; + static final String CONF_TEST_FAILURE_COUNT = "test.snapshot.export.failure.count"; + int failuresCountToInject = 0; + int injectedFailureCount = 0; + } // Command line options and defaults. static final class Options { @@ -149,12 +151,10 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { private static class ExportMapper extends Mapper<BytesWritable, NullWritable, NullWritable, NullWritable> { + private static final Log LOG = LogFactory.getLog(ExportMapper.class); final static int REPORT_SIZE = 1 * 1024 * 1024; final static int BUFFER_SIZE = 64 * 1024; - private boolean testFailures; - private Random random; - private boolean verifyChecksum; private String filesGroup; private String filesUser; @@ -169,9 +169,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { private Path inputArchive; private Path inputRoot; + private static Testing testing = new Testing(); + @Override public void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); + Configuration srcConf = HBaseConfiguration.createClusterConf(conf, null, CONF_SOURCE_PREFIX); Configuration destConf = HBaseConfiguration.createClusterConf(conf, null, CONF_DEST_PREFIX); @@ -186,8 +189,6 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { inputArchive = new Path(inputRoot, HConstants.HFILE_ARCHIVE_DIRECTORY); outputArchive = new Path(outputRoot, HConstants.HFILE_ARCHIVE_DIRECTORY); - testFailures = conf.getBoolean(CONF_TEST_FAILURE, false); - try { srcConf.setBoolean("fs." + inputRoot.toUri().getScheme() + ".impl.disable.cache", true); inputFs = FileSystem.get(inputRoot.toUri(), srcConf); @@ -210,6 +211,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { for (Counter c : Counter.values()) { context.getCounter(c).increment(0); } + if (context.getConfiguration().getBoolean(Testing.CONF_TEST_FAILURE, false)) { + testing.failuresCountToInject = conf.getInt(Testing.CONF_TEST_FAILURE_COUNT, 0); + // Get number of times we have already injected failure based on attempt number of this + // task. + testing.injectedFailureCount = context.getTaskAttemptID().getId(); + } } @Override @@ -251,35 +258,23 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { return new Path(outputArchive, path); } - /* - * Used by TestExportSnapshot to simulate a failure + /** + * Used by TestExportSnapshot to test for retries when failures happen. + * Failure is injected in {@link #copyFile(Context, SnapshotFileInfo, Path)}. */ private void injectTestFailure(final Context context, final SnapshotFileInfo inputInfo) throws IOException { - if (testFailures) { - if (context.getConfiguration().getBoolean(CONF_TEST_RETRY, false)) { - if (random == null) { - random = new Random(); - } - - // FLAKY-TEST-WARN: lower is better, we can get some runs without the - // retry, but at least we reduce the number of test failures due to - // this test exception from the same map task. - if (random.nextFloat() < 0.03) { - throw new IOException("TEST RETRY FAILURE: Unable to copy input=" + inputInfo - + " time=" + System.currentTimeMillis()); - } - } else { - context.getCounter(Counter.COPY_FAILED).increment(1); - throw new IOException("TEST FAILURE: Unable to copy input=" + inputInfo); - } - } + if (!context.getConfiguration().getBoolean(Testing.CONF_TEST_FAILURE, false)) return; + if (testing.injectedFailureCount >= testing.failuresCountToInject) return; + testing.injectedFailureCount++; + context.getCounter(Counter.COPY_FAILED).increment(1); + LOG.debug("Injecting failure. Count: " + testing.injectedFailureCount); + throw new IOException(String.format("TEST FAILURE (%d of max %d): Unable to copy input=%s", + testing.injectedFailureCount, testing.failuresCountToInject, inputInfo)); } private void copyFile(final Context context, final SnapshotFileInfo inputInfo, final Path outputPath) throws IOException { - injectTestFailure(context, inputInfo); - // Get the file information FileStatus inputStat = getSourceFileStatus(context, inputInfo); @@ -318,6 +313,7 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool { } } finally { in.close(); + injectTestFailure(context, inputInfo); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshot.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshot.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshot.java index 1beb518..52412d9 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshot.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshot.java @@ -21,6 +21,7 @@ package org.apache.hadoop.hbase.snapshot; import static org.apache.hadoop.util.ToolRunner.run; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; import java.io.IOException; import java.net.URI; @@ -44,12 +45,10 @@ import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.master.snapshot.SnapshotManager; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.SnapshotDescription; -import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotFileInfo; import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotRegionManifest; import org.apache.hadoop.hbase.testclassification.VerySlowMapReduceTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.util.ToolRunner; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -57,6 +56,7 @@ import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; import org.junit.rules.TestRule; /** @@ -72,6 +72,9 @@ public class TestExportSnapshot { protected final static byte[] FAMILY = Bytes.toBytes("cf"); + @Rule + public final TestName testName = new TestName(); + protected TableName tableName; private byte[] emptySnapshotName; private byte[] snapshotName; @@ -85,17 +88,27 @@ public class TestExportSnapshot { conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 6); conf.setBoolean("hbase.master.enabletable.roundrobin", true); conf.setInt("mapreduce.map.maxattempts", 10); + // If a single node has enough failures (default 3), resource manager will blacklist it. + // With only 2 nodes and tests injecting faults, we don't want that. + conf.setInt("mapreduce.job.maxtaskfailures.per.tracker", 100); } @BeforeClass public static void setUpBeforeClass() throws Exception { setUpBaseConf(TEST_UTIL.getConfiguration()); - TEST_UTIL.startMiniCluster(3); + // Setup separate test-data directory for MR cluster and set corresponding configurations. + // Otherwise, different test classes running MR cluster can step on each other. + TEST_UTIL.getDataTestDir(); + TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.startMiniMapReduceCluster(); + TEST_UTIL.startMiniHBaseCluster(1, 3); } @AfterClass public static void tearDownAfterClass() throws Exception { - TEST_UTIL.shutdownMiniCluster(); + TEST_UTIL.shutdownMiniHBaseCluster(); + TEST_UTIL.shutdownMiniMapReduceCluster(); + TEST_UTIL.shutdownMiniZKCluster(); } /** @@ -105,10 +118,9 @@ public class TestExportSnapshot { public void setUp() throws Exception { this.admin = TEST_UTIL.getAdmin(); - long tid = System.currentTimeMillis(); - tableName = TableName.valueOf("testtb-" + tid); - snapshotName = Bytes.toBytes("snaptb0-" + tid); - emptySnapshotName = Bytes.toBytes("emptySnaptb0-" + tid); + tableName = TableName.valueOf("testtb-" + testName.getMethodName()); + snapshotName = Bytes.toBytes("snaptb0-" + testName.getMethodName()); + emptySnapshotName = Bytes.toBytes("emptySnaptb0-" + testName.getMethodName()); // create Table createTable(); @@ -191,16 +203,16 @@ public class TestExportSnapshot { Path copyDir, boolean overwrite) throws Exception { testExportFileSystemState(TEST_UTIL.getConfiguration(), tableName, snapshotName, targetName, filesExpected, TEST_UTIL.getDefaultRootDirPath(), copyDir, - overwrite, getBypassRegionPredicate()); + overwrite, getBypassRegionPredicate(), true); } /** - * Test ExportSnapshot + * Creates destination directory, runs ExportSnapshot() tool, and runs some verifications. */ protected static void testExportFileSystemState(final Configuration conf, final TableName tableName, final byte[] snapshotName, final byte[] targetName, final int filesExpected, final Path sourceDir, Path copyDir, final boolean overwrite, - final RegionPredicate bypassregionPredicate) throws Exception { + final RegionPredicate bypassregionPredicate, boolean success) throws Exception { URI hdfsUri = FileSystem.get(conf).getUri(); FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration()); copyDir = copyDir.makeQualified(fs); @@ -218,7 +230,12 @@ public class TestExportSnapshot { // Export Snapshot int res = run(conf, new ExportSnapshot(), opts.toArray(new String[opts.size()])); - assertEquals(0, res); + assertEquals(success ? 0 : 1, res); + if (!success) { + final Path targetDir = new Path(HConstants.SNAPSHOT_DIR_NAME, Bytes.toString(targetName)); + assertFalse(fs.exists(new Path(copyDir, targetDir))); + return; + } // Verify File-System state FileStatus[] rootFiles = fs.listStatus(copyDir); @@ -242,42 +259,35 @@ public class TestExportSnapshot { } /** - * Check that ExportSnapshot will return a failure if something fails. + * Check that ExportSnapshot will succeed if something fails but the retry succeed. */ @Test - public void testExportFailure() throws Exception { - assertEquals(1, runExportAndInjectFailures(snapshotName, false)); + public void testExportRetry() throws Exception { + Path copyDir = getLocalDestinationDir(); + FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration()); + copyDir = copyDir.makeQualified(fs); + Configuration conf = new Configuration(TEST_UTIL.getConfiguration()); + conf.setBoolean(ExportSnapshot.Testing.CONF_TEST_FAILURE, true); + conf.setInt(ExportSnapshot.Testing.CONF_TEST_FAILURE_COUNT, 2); + conf.setInt("mapreduce.map.maxattempts", 3); + testExportFileSystemState(conf, tableName, snapshotName, snapshotName, tableNumFiles, + TEST_UTIL.getDefaultRootDirPath(), copyDir, true, getBypassRegionPredicate(), true); } /** - * Check that ExportSnapshot will succede if something fails but the retry succede. + * Check that ExportSnapshot will fail if we inject failure more times than MR will retry. */ @Test - public void testExportRetry() throws Exception { - assertEquals(0, runExportAndInjectFailures(snapshotName, true)); - } - - /* - * Execute the ExportSnapshot job injecting failures - */ - private int runExportAndInjectFailures(final byte[] snapshotName, boolean retry) - throws Exception { + public void testExportFailure() throws Exception { Path copyDir = getLocalDestinationDir(); - URI hdfsUri = FileSystem.get(TEST_UTIL.getConfiguration()).getUri(); FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration()); copyDir = copyDir.makeQualified(fs); - Configuration conf = new Configuration(TEST_UTIL.getConfiguration()); - conf.setBoolean(ExportSnapshot.CONF_TEST_FAILURE, true); - conf.setBoolean(ExportSnapshot.CONF_TEST_RETRY, retry); - if (!retry) { - conf.setInt("mapreduce.map.maxattempts", 3); - } - // Export Snapshot - Path sourceDir = TEST_UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir(); - String[] args = new String[] { "--snapshot", Bytes.toString(snapshotName), - "--copy-from", sourceDir.toString(), "--copy-to", copyDir.toString() }; - return ToolRunner.run(conf, new ExportSnapshot(), args); + conf.setBoolean(ExportSnapshot.Testing.CONF_TEST_FAILURE, true); + conf.setInt(ExportSnapshot.Testing.CONF_TEST_FAILURE_COUNT, 4); + conf.setInt("mapreduce.map.maxattempts", 3); + testExportFileSystemState(conf, tableName, snapshotName, snapshotName, tableNumFiles, + TEST_UTIL.getDefaultRootDirPath(), copyDir, true, getBypassRegionPredicate(), false); } /* http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshotNoCluster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshotNoCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshotNoCluster.java index e2d7c11..cd5ff6c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshotNoCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestExportSnapshotNoCluster.java @@ -104,7 +104,7 @@ public class TestExportSnapshotNoCluster { TableName tableName = builder.getTableDescriptor().getTableName(); TestExportSnapshot.testExportFileSystemState(TEST_UTIL.getConfiguration(), tableName, snapshotName, snapshotName, snapshotFilesCount, - testDir, getDestinationDir(), false, null); + testDir, getDestinationDir(), false, null, true); } private Path getDestinationDir() { http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobExportSnapshot.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobExportSnapshot.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobExportSnapshot.java index c375b0a..55686b1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobExportSnapshot.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobExportSnapshot.java @@ -45,7 +45,12 @@ public class TestMobExportSnapshot extends TestExportSnapshot { @BeforeClass public static void setUpBeforeClass() throws Exception { setUpBaseConf(TEST_UTIL.getConfiguration()); - TEST_UTIL.startMiniCluster(3); + // Setup separate test-data directory for MR cluster and set corresponding configurations. + // Otherwise, different test classes running MR cluster can step on each other. + TEST_UTIL.getDataTestDir(); + TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.startMiniMapReduceCluster(); + TEST_UTIL.startMiniHBaseCluster(1, 3); } @Override http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobSecureExportSnapshot.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobSecureExportSnapshot.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobSecureExportSnapshot.java index 8154995..c0f31d5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobSecureExportSnapshot.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestMobSecureExportSnapshot.java @@ -37,6 +37,9 @@ public class TestMobSecureExportSnapshot extends TestMobExportSnapshot { @BeforeClass public static void setUpBeforeClass() throws Exception { setUpBaseConf(TEST_UTIL.getConfiguration()); + // Setup separate test-data directory for MR cluster and set corresponding configurations. + // Otherwise, different test classes running MR cluster can step on each other. + TEST_UTIL.getDataTestDir(); // set the always on security provider UserProvider.setUserProviderForTesting(TEST_UTIL.getConfiguration(), @@ -45,7 +48,9 @@ public class TestMobSecureExportSnapshot extends TestMobExportSnapshot { // setup configuration SecureTestUtil.enableSecurity(TEST_UTIL.getConfiguration()); - TEST_UTIL.startMiniCluster(3); + TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.startMiniMapReduceCluster(); + TEST_UTIL.startMiniHBaseCluster(1, 3); // Wait for the ACL table to become available TEST_UTIL.waitTableEnabled(AccessControlLists.ACL_TABLE_NAME); http://git-wip-us.apache.org/repos/asf/hbase/blob/da5fb27e/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSecureExportSnapshot.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSecureExportSnapshot.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSecureExportSnapshot.java index f335e4e..4d35b71 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSecureExportSnapshot.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/TestSecureExportSnapshot.java @@ -42,6 +42,9 @@ public class TestSecureExportSnapshot extends TestExportSnapshot { @BeforeClass public static void setUpBeforeClass() throws Exception { setUpBaseConf(TEST_UTIL.getConfiguration()); + // Setup separate test-data directory for MR cluster and set corresponding configurations. + // Otherwise, different test classes running MR cluster can step on each other. + TEST_UTIL.getDataTestDir(); // set the always on security provider UserProvider.setUserProviderForTesting(TEST_UTIL.getConfiguration(), @@ -50,7 +53,9 @@ public class TestSecureExportSnapshot extends TestExportSnapshot { // setup configuration SecureTestUtil.enableSecurity(TEST_UTIL.getConfiguration()); - TEST_UTIL.startMiniCluster(3); + TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.startMiniMapReduceCluster(); + TEST_UTIL.startMiniHBaseCluster(1, 3); // Wait for the ACL table to become available TEST_UTIL.waitTableEnabled(AccessControlLists.ACL_TABLE_NAME);