HDFS-8036. Use snapshot path as source when using snapshot diff report in DistCp. Contributed by Jing Zhao.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/75cb1d42 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/75cb1d42 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/75cb1d42 Branch: refs/heads/HDFS-7285 Commit: 75cb1d42abec54ef5484636e020949ceebe189e9 Parents: 3c7adaa Author: Haohui Mai <[email protected]> Authored: Wed Apr 1 16:50:59 2015 -0700 Committer: Haohui Mai <[email protected]> Committed: Wed Apr 1 16:50:59 2015 -0700 ---------------------------------------------------------------------- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../org/apache/hadoop/tools/DistCpSync.java | 21 +++++++++- .../hadoop/tools/mapred/CopyCommitter.java | 3 +- .../org/apache/hadoop/tools/TestDistCpSync.java | 40 +++++++++++++++++++- 4 files changed, 63 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/75cb1d42/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index f265ead..1d9e200 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1345,6 +1345,9 @@ Release 2.7.0 - UNRELEASED HDFS-7748. Separate ECN flags from the Status in the DataTransferPipelineAck. (Anu Engineer and Haohui Mai via wheat9) + HDFS-8036. Use snapshot path as source when using snapshot diff report in + DistCp. (Jing Zhao via wheat9) + BREAKDOWN OF HDFS-7584 SUBTASKS AND RELATED JIRAS HDFS-7720. Quota by Storage Type API, tools and ClientNameNode http://git-wip-us.apache.org/repos/asf/hadoop/blob/75cb1d42/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java index 26d7eb4..8e71b6f 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpSync.java @@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; import java.io.IOException; @@ -86,6 +87,22 @@ class DistCpSync { } finally { deleteTargetTmpDir(targetFs, tmpDir); // TODO: since we have tmp directory, we can support "undo" with failures + // set the source path using the snapshot path + inputOptions.setSourcePaths(Arrays.asList(getSourceSnapshotPath(sourceDir, + inputOptions.getToSnapshot()))); + } + } + + private static String getSnapshotName(String name) { + return Path.CUR_DIR.equals(name) ? "" : name; + } + + private static Path getSourceSnapshotPath(Path sourceDir, String snapshotName) { + if (Path.CUR_DIR.equals(snapshotName)) { + return sourceDir; + } else { + return new Path(sourceDir, + HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + snapshotName); } } @@ -136,8 +153,10 @@ class DistCpSync { static DiffInfo[] getDiffs(DistCpOptions inputOptions, DistributedFileSystem fs, Path sourceDir, Path targetDir) { try { + final String from = getSnapshotName(inputOptions.getFromSnapshot()); + final String to = getSnapshotName(inputOptions.getToSnapshot()); SnapshotDiffReport sourceDiff = fs.getSnapshotDiffReport(sourceDir, - inputOptions.getFromSnapshot(), inputOptions.getToSnapshot()); + from, to); return DiffInfo.getDiffs(sourceDiff, targetDir); } catch (IOException e) { DistCp.LOG.warn("Failed to compute snapshot diff on " + sourceDir, e); http://git-wip-us.apache.org/repos/asf/hadoop/blob/75cb1d42/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java index 9ec57f4..2b1e510 100644 --- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java @@ -90,8 +90,7 @@ public class CopyCommitter extends FileOutputCommitter { } try { - if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false) - && !(conf.getBoolean(DistCpConstants.CONF_LABEL_DIFF, false))) { + if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) { deleteMissing(conf); } else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) { commitData(conf); http://git-wip-us.apache.org/repos/asf/hadoop/blob/75cb1d42/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java ---------------------------------------------------------------------- diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java index 7d5dad0..75d1de5 100644 --- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCpSync.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; @@ -97,6 +98,8 @@ public class TestDistCpSync { dfs.createSnapshot(source, "s2"); dfs.createSnapshot(target, "s1"); Assert.assertTrue(DistCpSync.sync(options, conf)); + // reset source paths in options + options.setSourcePaths(Arrays.asList(source)); // changes have been made in target final Path subTarget = new Path(target, "sub"); @@ -183,9 +186,21 @@ public class TestDistCpSync { changeData(source); dfs.createSnapshot(source, "s2"); + // before sync, make some further changes on source. this should not affect + // the later distcp since we're copying (s2-s1) to target + final Path toDelete = new Path(source, "foo/d1/foo/f1"); + dfs.delete(toDelete, true); + final Path newdir = new Path(source, "foo/d1/foo/newdir"); + dfs.mkdirs(newdir); + // do the sync Assert.assertTrue(DistCpSync.sync(options, conf)); + // make sure the source path has been updated to the snapshot path + final Path spath = new Path(source, + HdfsConstants.DOT_SNAPSHOT_DIR + Path.SEPARATOR + "s2"); + Assert.assertEquals(spath, options.getSourcePaths().get(0)); + // build copy listing final Path listingPath = new Path("/tmp/META/fileList.seq"); CopyListing listing = new GlobbedCopyListing(conf, new Credentials()); @@ -209,7 +224,7 @@ public class TestDistCpSync { .getCounter(CopyMapper.Counter.BYTESCOPIED).getValue()); // verify the source and target now has the same structure - verifyCopy(dfs.getFileStatus(source), dfs.getFileStatus(target), false); + verifyCopy(dfs.getFileStatus(spath), dfs.getFileStatus(target), false); } private Map<Text, CopyListingFileStatus> getListing(Path listingPath) @@ -248,6 +263,29 @@ public class TestDistCpSync { } } + /** + * Similar test with testSync, but the "to" snapshot is specified as "." + * @throws Exception + */ + @Test + public void testSyncWithCurrent() throws Exception { + options.setUseDiff(true, "s1", "."); + initData(source); + initData(target); + dfs.allowSnapshot(source); + dfs.allowSnapshot(target); + dfs.createSnapshot(source, "s1"); + dfs.createSnapshot(target, "s1"); + + // make changes under source + changeData(source); + + // do the sync + Assert.assertTrue(DistCpSync.sync(options, conf)); + // make sure the source path is still unchanged + Assert.assertEquals(source, options.getSourcePaths().get(0)); + } + private void initData2(Path dir) throws Exception { final Path test = new Path(dir, "test"); final Path foo = new Path(dir, "foo");
