[ 
https://issues.apache.org/jira/browse/BEAM-4524?focusedWorklogId=165843&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-165843
 ]

ASF GitHub Bot logged work on BEAM-4524:
----------------------------------------

                Author: ASF GitHub Bot
            Created on: 14/Nov/18 09:37
            Start Date: 14/Nov/18 09:37
    Worklog Time Spent: 10m 
      Work Description: robertwb closed pull request #6583: [BEAM-4524] Use 
sha256 instead of insecure md5 for artifact checksums.
URL: https://github.com/apache/beam/pull/6583
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/model/job-management/src/main/proto/beam_artifact_api.proto 
b/model/job-management/src/main/proto/beam_artifact_api.proto
index b4972afaef4..e03631570b6 100644
--- a/model/job-management/src/main/proto/beam_artifact_api.proto
+++ b/model/job-management/src/main/proto/beam_artifact_api.proto
@@ -62,7 +62,11 @@ message ArtifactMetadata {
 
   // (Optional) The base64-encoded md5 checksum of the artifact. Used, among 
other things, by
   // harness boot code to validate the integrity of the artifact.
-  string md5 = 3;
+  string md5X = 3;
+
+  // (Optional) The hex-encoded sha256 checksum of the artifact. Used, among 
other things, by
+  // harness boot code to validate the integrity of the artifact.
+  string sha256 = 4;
 }
 
 // A collection of artifacts.
@@ -147,4 +151,3 @@ message CommitManifestResponse {
   // ArtifactRetrievalService.
   string retrieval_token = 1;
 }
-
diff --git 
a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java
 
b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java
index d5dbca0c23f..9bb7618db22 100644
--- 
a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java
+++ 
b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java
@@ -19,7 +19,8 @@
 package org.apache.beam.runners.core.construction;
 
 import com.google.auto.value.AutoValue;
-import com.google.common.io.BaseEncoding;
+import com.google.common.hash.Hasher;
+import com.google.common.hash.Hashing;
 import com.google.common.util.concurrent.ListeningExecutorService;
 import com.google.common.util.concurrent.MoreExecutors;
 import java.io.File;
@@ -27,7 +28,6 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
-import java.security.MessageDigest;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@@ -170,19 +170,20 @@ public ArtifactMetadata get() throws Exception {
               .build();
       
requestObserver.onNext(PutArtifactRequest.newBuilder().setMetadata(putMetadata).build());
 
-      MessageDigest md5Digest = MessageDigest.getInstance("MD5");
+      Hasher hasher = Hashing.sha256().newHasher();
       FileChannel channel = new FileInputStream(file.getFile()).getChannel();
       ByteBuffer readBuffer = ByteBuffer.allocate(bufferSize);
       while (!responseObserver.isTerminal() && channel.position() < 
channel.size()) {
         readBuffer.clear();
         channel.read(readBuffer);
         readBuffer.flip();
-        md5Digest.update(readBuffer);
+        ByteString chunk = ByteString.copyFrom(readBuffer);
+        // TODO: Use Guava 23.0's putBytes(ByteBuffer).
+        hasher.putBytes(chunk.toByteArray());
         readBuffer.rewind();
         PutArtifactRequest request =
             PutArtifactRequest.newBuilder()
-                .setData(
-                    
ArtifactChunk.newBuilder().setData(ByteString.copyFrom(readBuffer)).build())
+                .setData(ArtifactChunk.newBuilder().setData(chunk).build())
                 .build();
         requestObserver.onNext(request);
       }
@@ -192,7 +193,7 @@ public ArtifactMetadata get() throws Exception {
       if (responseObserver.err.get() != null) {
         throw new RuntimeException(responseObserver.err.get());
       }
-      return 
metadata.toBuilder().setMd5(BaseEncoding.base64().encode(md5Digest.digest())).build();
+      return metadata.toBuilder().setSha256(hasher.hash().toString()).build();
     }
 
     private class PutArtifactResponseObserver implements 
StreamObserver<PutArtifactResponse> {
diff --git 
a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java
 
b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java
index c982e62a330..c8dba9e3cb7 100644
--- 
a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java
+++ 
b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java
@@ -19,7 +19,6 @@
 
 import static com.google.common.base.Preconditions.checkArgument;
 
-import com.fasterxml.jackson.core.Base64Variants;
 import com.google.common.base.Strings;
 import com.google.common.hash.Funnels;
 import com.google.common.hash.Hasher;
@@ -101,10 +100,10 @@ private static String packageDirectoriesToStage(File 
directoryToStage, String tm
   }
 
   private static String calculateDirectoryContentHash(File directoryToStage) {
-    Hasher hasher = Hashing.md5().newHasher();
+    Hasher hasher = Hashing.sha256().newHasher();
     try (OutputStream hashStream = Funnels.asOutputStream(hasher)) {
       ZipFiles.zipDirectory(directoryToStage, hashStream);
-      return Base64Variants.MODIFIED_FOR_URL.encode(hasher.hash().asBytes());
+      return hasher.hash().toString();
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
diff --git 
a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java
 
b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java
index 7c4ea9e3a57..16ed4a82d0e 100644
--- 
a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java
+++ 
b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java
@@ -20,19 +20,17 @@
 
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
-import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertThat;
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
-import com.google.common.io.BaseEncoding;
+import com.google.common.hash.Hashing;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.charset.StandardCharsets;
-import java.security.MessageDigest;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
@@ -83,7 +81,7 @@ public void testStage() throws Exception {
     String stagingSessionToken = "token";
     File file = temp.newFile();
     byte[] content = "foo-bar-baz".getBytes(StandardCharsets.UTF_8);
-    byte[] contentMd5 = MessageDigest.getInstance("MD5").digest(content);
+    String contentSha256 = 
Hashing.sha256().newHasher().putBytes(content).hash().toString();
     try (FileChannel contentChannel = new FileOutputStream(file).getChannel()) 
{
       contentChannel.write(ByteBuffer.wrap(content));
     }
@@ -96,8 +94,8 @@ public void testStage() throws Exception {
 
     ArtifactMetadata staged = service.getManifest().getArtifact(0);
     assertThat(staged.getName(), equalTo(file.getName()));
-    byte[] manifestMd5 = BaseEncoding.base64().decode(staged.getMd5());
-    assertArrayEquals(contentMd5, manifestMd5);
+    String manifestSha256 = staged.getSha256();
+    assertThat(contentSha256, equalTo(manifestSha256));
 
     assertThat(service.getManifest().getArtifactCount(), equalTo(1));
     assertThat(staged, 
equalTo(Iterables.getOnlyElement(service.getStagedArtifacts().keySet())));
diff --git 
a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java
 
b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java
index eec3f7a04b5..1b99c33532e 100644
--- 
a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java
+++ 
b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java
@@ -21,11 +21,9 @@
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Preconditions.checkState;
 
-import com.google.common.io.BaseEncoding;
+import com.google.common.hash.Hashing;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
 import java.util.Collections;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
@@ -112,19 +110,17 @@ public void onError(Throwable t) {
     public void onCompleted() {
       if (writer != null) {
         writer.onCompleted();
-        try {
-          artifactBytes.put(
-              destination
-                  .toBuilder()
-                  .setMd5(
-                      BaseEncoding.base64()
-                          .encode(
-                              
MessageDigest.getInstance("MD5").digest(writer.stream.toByteArray())))
-                  .build(),
-              writer.stream.toByteArray());
-        } catch (NoSuchAlgorithmException e) {
-          throw new AssertionError("The Java Spec requires all JVMs to support 
MD5", e);
-        }
+        artifactBytes.put(
+            destination
+                .toBuilder()
+                .setSha256(
+                    Hashing.sha256()
+                        .newHasher()
+                        .putBytes(writer.stream.toByteArray())
+                        .hash()
+                        .toString())
+                .build(),
+            writer.stream.toByteArray());
       }
       responseObserver.onNext(PutArtifactResponse.getDefaultInstance());
       responseObserver.onCompleted();
diff --git 
a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java
 
b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java
index b4482d7e4fc..a55fc64ddd7 100644
--- 
a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java
+++ 
b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java
@@ -32,7 +32,6 @@
 import java.io.InputStream;
 import java.nio.channels.Channels;
 import java.nio.charset.StandardCharsets;
-import java.util.Base64;
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
@@ -132,7 +131,7 @@ public void getArtifact(
       ResourceId artifactResourceId =
           FileSystems.matchNewResource(location.getUri(), false /* is 
directory */);
       LOG.debug("Artifact {} located in {}", name, artifactResourceId);
-      Hasher hasher = Hashing.md5().newHasher();
+      Hasher hasher = Hashing.sha256().newHasher();
       byte[] data = new byte[ARTIFACT_CHUNK_SIZE_BYTES];
       try (InputStream stream = 
Channels.newInputStream(FileSystems.open(artifactResourceId))) {
         int len;
@@ -144,15 +143,15 @@ public void getArtifact(
                   .build());
         }
       }
-      if (metadata.getMd5() != null && !metadata.getMd5().isEmpty()) {
-        ByteString expected = 
ByteString.copyFrom(Base64.getDecoder().decode(metadata.getMd5()));
-        ByteString actual = ByteString.copyFrom(hasher.hash().asBytes());
+      if (metadata.getSha256() != null && !metadata.getSha256().isEmpty()) {
+        String expected = metadata.getSha256();
+        String actual = hasher.hash().toString();
         if (!actual.equals(expected)) {
           throw new StatusRuntimeException(
               Status.DATA_LOSS.withDescription(
                   String.format(
-                      "Artifact %s is corrupt: expected md5 %s, actual %s",
-                      name, expected.toString(), actual.toString())));
+                      "Artifact %s is corrupt: expected sha256 %s, actual %s",
+                      name, expected, actual)));
         }
       }
       responseObserver.onCompleted();
diff --git 
a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java
 
b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java
index 7a2d6345966..78cb2c575e5 100644
--- 
a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java
+++ 
b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java
@@ -28,7 +28,6 @@
 import java.nio.channels.WritableByteChannel;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
-import java.util.Base64;
 import java.util.Collections;
 import org.apache.beam.model.jobmanagement.v1.ArtifactApi.ArtifactMetadata;
 import 
org.apache.beam.model.jobmanagement.v1.ArtifactApi.CommitManifestRequest;
@@ -227,7 +226,7 @@ public void onNext(PutArtifactRequest putArtifactRequest) {
           LOG.debug(
               "Going to stage artifact {} to {}.", 
metadata.getMetadata().getName(), artifactId);
           artifactWritableByteChannel = FileSystems.create(artifactId, 
MimeTypes.BINARY);
-          hasher = Hashing.md5().newHasher();
+          hasher = Hashing.sha256().newHasher();
         } catch (Exception e) {
           String message =
               String.format(
@@ -290,16 +289,16 @@ public void onCompleted() {
           return;
         }
       }
-      String expectedMd5 = metadata.getMetadata().getMd5();
-      if (expectedMd5 != null && !expectedMd5.isEmpty()) {
-        String actualMd5 = 
Base64.getEncoder().encodeToString(hasher.hash().asBytes());
-        if (!actualMd5.equals(expectedMd5)) {
+      String expectedSha256 = metadata.getMetadata().getSha256();
+      if (expectedSha256 != null && !expectedSha256.isEmpty()) {
+        String actualSha256 = hasher.hash().toString();
+        if (!actualSha256.equals(expectedSha256)) {
           outboundObserver.onError(
               new StatusRuntimeException(
                   Status.INVALID_ARGUMENT.withDescription(
                       String.format(
-                          "Artifact %s is corrupt: expected md5 %s, but has 
md5 %s",
-                          metadata.getMetadata().getName(), expectedMd5, 
actualMd5))));
+                          "Artifact %s is corrupt: expected sah256 %s, but has 
sha256 %s",
+                          metadata.getMetadata().getName(), expectedSha256, 
actualSha256))));
           return;
         }
       }
diff --git 
a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java
 
b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java
index d7a0c57c028..10f2ad53d1e 100644
--- 
a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java
+++ 
b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java
@@ -31,7 +31,6 @@
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
-import java.util.Base64;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
@@ -242,7 +241,7 @@ public void putArtifactsMultipleFilesTest() throws 
Exception {
             .put("file10kb", 10 * DATA_1KB /*10 kb*/)
             .put("file100kb", 100 * DATA_1KB /*100 kb*/)
             .build();
-    Map<String, byte[]> md5 = Maps.newHashMap();
+    Map<String, String> hashes = Maps.newHashMap();
 
     final String text = "abcdefghinklmop\n";
     files.forEach(
@@ -255,7 +254,7 @@ public void putArtifactsMultipleFilesTest() throws 
Exception {
                         text, Double.valueOf(Math.ceil(size * 1.0 / 
text.length())).intValue())
                     .getBytes(StandardCharsets.UTF_8);
             Files.write(filePath, contents);
-            md5.put(fileName, Hashing.md5().hashBytes(contents).asBytes());
+            hashes.put(fileName, 
Hashing.sha256().hashBytes(contents).toString());
           } catch (IOException ignored) {
           }
         });
@@ -270,10 +269,7 @@ public void putArtifactsMultipleFilesTest() throws 
Exception {
           Paths.get(originalDir.toString(), 
fileName).toAbsolutePath().toString(),
           fileName);
       metadata.add(
-          ArtifactMetadata.newBuilder()
-              .setName(fileName)
-              .setMd5(Base64.getEncoder().encodeToString(md5.get(fileName)))
-              .build());
+          
ArtifactMetadata.newBuilder().setName(fileName).setSha256(hashes.get(fileName)).build());
     }
 
     String retrievalToken = commitManifest(stagingSessionToken, metadata);
diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py 
b/sdks/python/apache_beam/runners/portability/portable_stager.py
index 612d15c6286..d4ca7a2c2fe 100644
--- a/sdks/python/apache_beam/runners/portability/portable_stager.py
+++ b/sdks/python/apache_beam/runners/portability/portable_stager.py
@@ -20,7 +20,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import base64
 import hashlib
 import os
 
@@ -72,7 +71,7 @@ def stage_artifact(self, local_path_to_artifact, 
artifact_name):
     def artifact_request_generator():
       artifact_metadata = beam_artifact_api_pb2.ArtifactMetadata(
           name=artifact_name,
-          md5=_get_file_hash(local_path_to_artifact))
+          sha256=_get_file_hash(local_path_to_artifact))
       metadata = beam_artifact_api_pb2.PutArtifactMetadata(
           staging_session_token=self._staging_session_token,
           metadata=artifact_metadata)
@@ -100,11 +99,11 @@ def commit_manifest(self):
 
 
 def _get_file_hash(path):
-  hasher = hashlib.md5()
+  hasher = hashlib.sha256()
   with open(path, 'rb') as f:
     while True:
       chunk = f.read(1 << 21)
       if chunk:
         hasher.update(chunk)
       else:
-        return base64.b64encode(hasher.digest())
+        return hasher.hexdigest()


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 165843)
    Time Spent: 1h 20m  (was: 1h 10m)

> We should not be using md5 to validate artifact integrity.
> ----------------------------------------------------------
>
>                 Key: BEAM-4524
>                 URL: https://issues.apache.org/jira/browse/BEAM-4524
>             Project: Beam
>          Issue Type: Task
>          Components: beam-model
>            Reporter: Robert Bradshaw
>            Priority: Major
>          Time Spent: 1h 20m
>  Remaining Estimate: 0h
>
> https://github.com/apache/beam/blob/6f239498e676f471427e17abc4bc5cffba9887c5/model/job-management/src/main/proto/beam_artifact_api.proto#L63
> Something like sha256 would probably be sufficient. 
> https://en.wikipedia.org/wiki/MD5#Overview_of_security_issues



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to