HADOOP-15297. Make S3A etag => checksum feature optional.
Contributed by Steve Loughran.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/dd05871b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/dd05871b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/dd05871b

Branch: refs/heads/HDFS-12996
Commit: dd05871b8b57303fe0b0c652e03257b59c191802
Parents: e1f5251
Author: Steve Loughran <ste...@apache.org>
Authored: Mon Mar 12 14:01:42 2018 +0000
Committer: Steve Loughran <ste...@apache.org>
Committed: Mon Mar 12 14:01:42 2018 +0000

----------------------------------------------------------------------
 .../src/main/resources/core-default.xml         | 11 ++++
 .../org/apache/hadoop/fs/s3a/Constants.java     | 11 ++++
 .../org/apache/hadoop/fs/s3a/S3AFileSystem.java | 38 +++++++++-----
 .../hadoop/fs/s3a/S3AInstrumentation.java       |  1 +
 .../org/apache/hadoop/fs/s3a/Statistic.java     |  2 +
 .../src/site/markdown/tools/hadoop-aws/index.md | 41 ++++++++++++++-
 .../hadoop/fs/s3a/ITestS3AMiscOperations.java   | 53 +++++++++++++++++---
 7 files changed, 136 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
index 6d6ed42..9074300 100644
--- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
+++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
@@ -1547,6 +1547,17 @@
   </description>
 </property>
 
+<property>
+  <name>fs.s3a.etag.checksum.enabled</name>
+  <value>false</value>
+  <description>
+    Should calls to getFileChecksum() return the etag value of the remote
+    object.
+    WARNING: if enabled, distcp operations between HDFS and S3 will fail unless
+    -skipcrccheck is set.
+  </description>
+</property>
+
 <!-- Azure file system properties -->
 <property>
   <name>fs.wasb.impl</name>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
index faec784..4c95843 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
@@ -542,4 +542,15 @@ public final class Constants {
    */
   public static final String RETRY_THROTTLE_INTERVAL_DEFAULT = "500ms";
 
+  /**
+   * Should etags be exposed as checksums?
+   */
+  public static final String ETAG_CHECKSUM_ENABLED =
+      "fs.s3a.etag.checksum.enabled";
+
+  /**
+   * Default value: false.
+   */
+  public static final boolean ETAG_CHECKSUM_ENABLED_DEFAULT = false;
+
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index eb65cfa..4b0c208 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -2993,17 +2993,21 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
   }
 
   /**
-   * Get the etag of a object at the path via HEAD request and return it
-   * as a checksum object. This has the whatever guarantees about equivalence
-   * the S3 implementation offers.
+   * When enabled, get the etag of an object at the path via HEAD request and
+   * return it as a checksum object.
    * <ol>
    *   <li>If a tag has not changed, consider the object unchanged.</li>
    *   <li>Two tags being different does not imply the data is different.</li>
    * </ol>
    * Different S3 implementations may offer different guarantees.
+   *
+   * This check is (currently) only made if
+   * {@link Constants#ETAG_CHECKSUM_ENABLED} is set; turning it on
+   * has caused problems with Distcp (HADOOP-15273).
+   *
    * @param f The file path
    * @param length The length of the file range for checksum calculation
-   * @return The EtagChecksum or null if checksums are not supported.
+   * @return The EtagChecksum or null if checksums are not enabled or supported.
    * @throws IOException IO failure
   * @see <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html">Common Response Headers</a>
    */
@@ -3012,15 +3016,23 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities {
   public EtagChecksum getFileChecksum(Path f, final long length)
       throws IOException {
     Preconditions.checkArgument(length >= 0);
-    Path path = qualify(f);
-    LOG.debug("getFileChecksum({})", path);
-    return once("getFileChecksum", path.toString(),
-        () -> {
-          // this always does a full HEAD to the object
-          ObjectMetadata headers = getObjectMetadata(path);
-          String eTag = headers.getETag();
-          return eTag != null ? new EtagChecksum(eTag) : null;
-        });
+    entryPoint(INVOCATION_GET_FILE_CHECKSUM);
+
+    if (getConf().getBoolean(ETAG_CHECKSUM_ENABLED,
+        ETAG_CHECKSUM_ENABLED_DEFAULT)) {
+      Path path = qualify(f);
+      LOG.debug("getFileChecksum({})", path);
+      return once("getFileChecksum", path.toString(),
+          () -> {
+            // this always does a full HEAD to the object
+            ObjectMetadata headers = getObjectMetadata(path);
+            String eTag = headers.getETag();
+            return eTag != null ? new EtagChecksum(eTag) : null;
+          });
+    } else {
+      // disabled
+      return null;
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java
index b883455..29ee0c5 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java
@@ -139,6 +139,7 @@ public class S3AInstrumentation implements Closeable, MetricsSource {
       INVOCATION_CREATE_NON_RECURSIVE,
       INVOCATION_DELETE,
       INVOCATION_EXISTS,
+      INVOCATION_GET_FILE_CHECKSUM,
       INVOCATION_GET_FILE_STATUS,
       INVOCATION_GLOB_STATUS,
       INVOCATION_IS_DIRECTORY,

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java
index bb30f1f..799a586 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java
@@ -57,6 +57,8 @@ public enum Statistic {
       "Calls of delete()"),
   INVOCATION_EXISTS(CommonStatisticNames.OP_EXISTS,
       "Calls of exists()"),
+  INVOCATION_GET_FILE_CHECKSUM(CommonStatisticNames.OP_GET_FILE_CHECKSUM,
+      "Calls of getFileChecksum()"),
   INVOCATION_GET_FILE_STATUS(CommonStatisticNames.OP_GET_FILE_STATUS,
       "Calls of getFileStatus()"),
   INVOCATION_GLOB_STATUS(CommonStatisticNames.OP_GLOB_STATUS,

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index 88cab37..1099d84 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -1656,11 +1656,48 @@ in these metrics.
 
 ##<a name="further_reading"></a> Other Topics
 
-### Copying Data with distcp
+### <a name="distcp"></a> Copying Data with distcp
 
-Hadoop's `distcp` application can be used to copy data between a Hadoop
+Hadoop's `distcp` tool is often used to copy data between a Hadoop
 cluster and Amazon S3.
 See [Copying Data Between a Cluster and Amazon S3](https://hortonworks.github.io/hdp-aws/s3-copy-data/index.html)
 for details on S3 copying specifically.
 
+The `distcp update` command tries to do incremental updates of data.
+It is straightforward to verify when files do not match when they are of
+different length, but not when they are the same size.
+
+Distcp addresses this by comparing file checksums on the source and destination
+filesystems, which it tries to do *even if the filesystems have incompatible
+checksum algorithms*.
+
+The S3A connector can provide the HTTP etag header to the caller as the
+checksum of the uploaded file. Doing so will break distcp operations
+between HDFS and S3A.
+
+For this reason, the etag-as-checksum feature is disabled by default.
+
+```xml
+<property>
+  <name>fs.s3a.etag.checksum.enabled</name>
+  <value>false</value>
+  <description>
+    Should calls to getFileChecksum() return the etag value of the remote
+    object.
+    WARNING: if enabled, distcp operations between HDFS and S3 will fail unless
+    -skipcrccheck is set.
+  </description>
+</property>
+```
+
+If enabled, `distcp` between two S3 buckets can use the checksum to compare
+objects. Their checksums should be identical if they were either each uploaded
+as a single file PUT, or, if in a multipart PUT, in blocks of the same size,
+as configured by the value `fs.s3a.multipart.size`.
+
+To disable checksum verification in `distcp`, use the `-skipcrccheck` option:
+
+```bash
+hadoop distcp -update -skipcrccheck /user/alice/datasets s3a://alice-backup/datasets
+```
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/dd05871b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMiscOperations.java
----------------------------------------------------------------------
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMiscOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMiscOperations.java
index 7a21876..fc8d872 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMiscOperations.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AMiscOperations.java
@@ -39,12 +39,30 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.touch;
 
 /**
  * Tests of the S3A FileSystem which don't have a specific home and can share
- * a filesystem instance with others..
+ * a filesystem instance with others.
+ * Checksums are turned on unless explicitly disabled for a test case.
  */
 public class ITestS3AMiscOperations extends AbstractS3ATestBase {
 
   private static final byte[] HELLO = "hello".getBytes(StandardCharsets.UTF_8);
 
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    // checksums are forced on.
+    enableChecksums(true);
+  }
+
+  /**
+   * Turn checksums on.
+   * Relies on the FS not caching the configuration option.
+   * @param enabled enabled flag.
+   */
+  protected void enableChecksums(final boolean enabled) {
+    getFileSystem().getConf().setBoolean(Constants.ETAG_CHECKSUM_ENABLED,
+        enabled);
+  }
+
   @Test
   public void testCreateNonRecursiveSuccess() throws IOException {
     Path shouldWork = path("nonrecursivenode");
@@ -124,13 +142,27 @@ public class ITestS3AMiscOperations extends AbstractS3ATestBase {
     Path file1 = touchFile("file1");
     EtagChecksum checksum1 = fs.getFileChecksum(file1, 0);
     LOG.info("Checksum for {}: {}", file1, checksum1);
-    assertNotNull("file 1 checksum", checksum1);
+    assertNotNull("Null file 1 checksum", checksum1);
     assertNotEquals("file 1 checksum", 0, checksum1.getLength());
     assertEquals("checksums", checksum1,
         fs.getFileChecksum(touchFile("file2"), 0));
   }
 
   /**
+   * Make sure that when checksums are disabled, the caller
+   * gets null back.
+   */
+  @Test
+  public void testChecksumDisabled() throws Throwable {
+    // checksums are forced off.
+    enableChecksums(false);
+    final S3AFileSystem fs = getFileSystem();
+    Path file1 = touchFile("file1");
+    EtagChecksum checksum1 = fs.getFileChecksum(file1, 0);
+    assertNull("Checksums are being generated", checksum1);
+  }
+
+  /**
    * Verify that different file contents have different
    * checksums, and that that they aren't the same as the empty file.
    * @throws Throwable failure
@@ -138,6 +170,7 @@ public class ITestS3AMiscOperations extends AbstractS3ATestBase {
   @Test
   public void testNonEmptyFileChecksums() throws Throwable {
     final S3AFileSystem fs = getFileSystem();
+
     final Path file3 = mkFile("file3", HELLO);
     final EtagChecksum checksum1 = fs.getFileChecksum(file3, 0);
     assertNotNull("file 3 checksum", checksum1);
@@ -178,12 +211,20 @@ public class ITestS3AMiscOperations extends AbstractS3ATestBase {
   }
 
   @Test
-  public void testLengthPastEOF() throws Throwable {
+  public void testNegativeLengthDisabledChecksum() throws Throwable {
+    enableChecksums(false);
+    LambdaTestUtils.intercept(IllegalArgumentException.class,
+        () -> getFileSystem().getFileChecksum(mkFile("negative", HELLO), -1));
+  }
+
+  @Test
+  public void testChecksumLengthPastEOF() throws Throwable {
+    enableChecksums(true);
     final S3AFileSystem fs = getFileSystem();
     Path f = mkFile("file5", HELLO);
-    assertEquals(
-        fs.getFileChecksum(f, HELLO.length),
-        fs.getFileChecksum(f, HELLO.length * 2));
+    EtagChecksum l = fs.getFileChecksum(f, HELLO.length);
+    assertNotNull("Null checksum", l);
+    assertEquals(l, fs.getFileChecksum(f, HELLO.length * 2));
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to