[ https://issues.apache.org/jira/browse/BEAM-4524?focusedWorklogId=165843&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-165843 ]
ASF GitHub Bot logged work on BEAM-4524: ---------------------------------------- Author: ASF GitHub Bot Created on: 14/Nov/18 09:37 Start Date: 14/Nov/18 09:37 Worklog Time Spent: 10m Work Description: robertwb closed pull request #6583: [BEAM-4524] Use sha256 instead of insecure md5 for artifact checksums. URL: https://github.com/apache/beam/pull/6583 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/model/job-management/src/main/proto/beam_artifact_api.proto b/model/job-management/src/main/proto/beam_artifact_api.proto index b4972afaef4..e03631570b6 100644 --- a/model/job-management/src/main/proto/beam_artifact_api.proto +++ b/model/job-management/src/main/proto/beam_artifact_api.proto @@ -62,7 +62,11 @@ message ArtifactMetadata { // (Optional) The base64-encoded md5 checksum of the artifact. Used, among other things, by // harness boot code to validate the integrity of the artifact. - string md5 = 3; + string md5X = 3; + + // (Optional) The hex-encoded sha256 checksum of the artifact. Used, among other things, by + // harness boot code to validate the integrity of the artifact. + string sha256 = 4; } // A collection of artifacts. @@ -147,4 +151,3 @@ message CommitManifestResponse { // ArtifactRetrievalService. string retrieval_token = 1; } - diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java index d5dbca0c23f..9bb7618db22 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/ArtifactServiceStager.java @@ -19,7 +19,8 @@ package org.apache.beam.runners.core.construction; import com.google.auto.value.AutoValue; -import com.google.common.io.BaseEncoding; +import com.google.common.hash.Hasher; +import com.google.common.hash.Hashing; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import java.io.File; @@ -27,7 +28,6 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import java.security.MessageDigest; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -170,19 +170,20 @@ public ArtifactMetadata get() throws Exception { .build(); requestObserver.onNext(PutArtifactRequest.newBuilder().setMetadata(putMetadata).build()); - MessageDigest md5Digest = MessageDigest.getInstance("MD5"); + Hasher hasher = Hashing.sha256().newHasher(); FileChannel channel = new FileInputStream(file.getFile()).getChannel(); ByteBuffer readBuffer = ByteBuffer.allocate(bufferSize); while (!responseObserver.isTerminal() && channel.position() < channel.size()) { readBuffer.clear(); channel.read(readBuffer); readBuffer.flip(); - md5Digest.update(readBuffer); + ByteString chunk = ByteString.copyFrom(readBuffer); + // TODO: Use Guava 23.0's putBytes(ByteBuffer). + hasher.putBytes(chunk.toByteArray()); readBuffer.rewind(); PutArtifactRequest request = PutArtifactRequest.newBuilder() - .setData( - ArtifactChunk.newBuilder().setData(ByteString.copyFrom(readBuffer)).build()) + .setData(ArtifactChunk.newBuilder().setData(chunk).build()) .build(); requestObserver.onNext(request); } @@ -192,7 +193,7 @@ public ArtifactMetadata get() throws Exception { if (responseObserver.err.get() != null) { throw new RuntimeException(responseObserver.err.get()); } - return metadata.toBuilder().setMd5(BaseEncoding.base64().encode(md5Digest.digest())).build(); + return metadata.toBuilder().setSha256(hasher.hash().toString()).build(); } private class PutArtifactResponseObserver implements StreamObserver<PutArtifactResponse> { diff --git a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java index c982e62a330..c8dba9e3cb7 100644 --- a/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java +++ b/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/PipelineResources.java @@ -19,7 +19,6 @@ import static com.google.common.base.Preconditions.checkArgument; -import com.fasterxml.jackson.core.Base64Variants; import com.google.common.base.Strings; import com.google.common.hash.Funnels; import com.google.common.hash.Hasher; @@ -101,10 +100,10 @@ private static String packageDirectoriesToStage(File directoryToStage, String tm } private static String calculateDirectoryContentHash(File directoryToStage) { - Hasher hasher = Hashing.md5().newHasher(); + Hasher hasher = Hashing.sha256().newHasher(); try (OutputStream hashStream = Funnels.asOutputStream(hasher)) { ZipFiles.zipDirectory(directoryToStage, hashStream); - return Base64Variants.MODIFIED_FOR_URL.encode(hasher.hash().asBytes()); + return hasher.hash().toString(); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java index 7c4ea9e3a57..16ed4a82d0e 100644 --- a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java +++ b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/ArtifactServiceStagerTest.java @@ -20,19 +20,17 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertThat; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; -import com.google.common.io.BaseEncoding; +import com.google.common.hash.Hashing; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -83,7 +81,7 @@ public void testStage() throws Exception { String stagingSessionToken = "token"; File file = temp.newFile(); byte[] content = "foo-bar-baz".getBytes(StandardCharsets.UTF_8); - byte[] contentMd5 = MessageDigest.getInstance("MD5").digest(content); + String contentSha256 = Hashing.sha256().newHasher().putBytes(content).hash().toString(); try (FileChannel contentChannel = new FileOutputStream(file).getChannel()) { contentChannel.write(ByteBuffer.wrap(content)); } @@ -96,8 +94,8 @@ public void testStage() throws Exception { ArtifactMetadata staged = service.getManifest().getArtifact(0); assertThat(staged.getName(), equalTo(file.getName())); - byte[] manifestMd5 = BaseEncoding.base64().decode(staged.getMd5()); - assertArrayEquals(contentMd5, manifestMd5); + String manifestSha256 = staged.getSha256(); + assertThat(contentSha256, equalTo(manifestSha256)); assertThat(service.getManifest().getArtifactCount(), equalTo(1)); assertThat(staged, equalTo(Iterables.getOnlyElement(service.getStagedArtifacts().keySet()))); diff --git a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java index eec3f7a04b5..1b99c33532e 100644 --- a/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java +++ b/runners/core-construction-java/src/test/java/org/apache/beam/runners/core/construction/InMemoryArtifactStagerService.java @@ -21,11 +21,9 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; -import com.google.common.io.BaseEncoding; +import com.google.common.hash.Hashing; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.Collections; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -112,19 +110,17 @@ public void onError(Throwable t) { public void onCompleted() { if (writer != null) { writer.onCompleted(); - try { - artifactBytes.put( - destination - .toBuilder() - .setMd5( - BaseEncoding.base64() - .encode( - MessageDigest.getInstance("MD5").digest(writer.stream.toByteArray()))) - .build(), - writer.stream.toByteArray()); - } catch (NoSuchAlgorithmException e) { - throw new AssertionError("The Java Spec requires all JVMs to support MD5", e); - } + artifactBytes.put( + destination + .toBuilder() + .setSha256( + Hashing.sha256() + .newHasher() + .putBytes(writer.stream.toByteArray()) + .hash() + .toString()) + .build(), + writer.stream.toByteArray()); } responseObserver.onNext(PutArtifactResponse.getDefaultInstance()); responseObserver.onCompleted(); diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java index b4482d7e4fc..a55fc64ddd7 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactRetrievalService.java @@ -32,7 +32,6 @@ import java.io.InputStream; import java.nio.channels.Channels; import java.nio.charset.StandardCharsets; -import java.util.Base64; import java.util.List; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; @@ -132,7 +131,7 @@ public void getArtifact( ResourceId artifactResourceId = FileSystems.matchNewResource(location.getUri(), false /* is directory */); LOG.debug("Artifact {} located in {}", name, artifactResourceId); - Hasher hasher = Hashing.md5().newHasher(); + Hasher hasher = Hashing.sha256().newHasher(); byte[] data = new byte[ARTIFACT_CHUNK_SIZE_BYTES]; try (InputStream stream = Channels.newInputStream(FileSystems.open(artifactResourceId))) { int len; @@ -144,15 +143,15 @@ public void getArtifact( .build()); } } - if (metadata.getMd5() != null && !metadata.getMd5().isEmpty()) { - ByteString expected = ByteString.copyFrom(Base64.getDecoder().decode(metadata.getMd5())); - ByteString actual = ByteString.copyFrom(hasher.hash().asBytes()); + if (metadata.getSha256() != null && !metadata.getSha256().isEmpty()) { + String expected = metadata.getSha256(); + String actual = hasher.hash().toString(); if (!actual.equals(expected)) { throw new StatusRuntimeException( Status.DATA_LOSS.withDescription( String.format( - "Artifact %s is corrupt: expected md5 %s, actual %s", - name, expected.toString(), actual.toString()))); + "Artifact %s is corrupt: expected sha256 %s, actual %s", + name, expected, actual))); } } responseObserver.onCompleted(); diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java index 7a2d6345966..78cb2c575e5 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactStagingService.java @@ -28,7 +28,6 @@ import java.nio.channels.WritableByteChannel; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; -import java.util.Base64; import java.util.Collections; import org.apache.beam.model.jobmanagement.v1.ArtifactApi.ArtifactMetadata; import org.apache.beam.model.jobmanagement.v1.ArtifactApi.CommitManifestRequest; @@ -227,7 +226,7 @@ public void onNext(PutArtifactRequest putArtifactRequest) { LOG.debug( "Going to stage artifact {} to {}.", metadata.getMetadata().getName(), artifactId); artifactWritableByteChannel = FileSystems.create(artifactId, MimeTypes.BINARY); - hasher = Hashing.md5().newHasher(); + hasher = Hashing.sha256().newHasher(); } catch (Exception e) { String message = String.format( @@ -290,16 +289,16 @@ public void onCompleted() { return; } } - String expectedMd5 = metadata.getMetadata().getMd5(); - if (expectedMd5 != null && !expectedMd5.isEmpty()) { - String actualMd5 = Base64.getEncoder().encodeToString(hasher.hash().asBytes()); - if (!actualMd5.equals(expectedMd5)) { + String expectedSha256 = metadata.getMetadata().getSha256(); + if (expectedSha256 != null && !expectedSha256.isEmpty()) { + String actualSha256 = hasher.hash().toString(); + if (!actualSha256.equals(expectedSha256)) { outboundObserver.onError( new StatusRuntimeException( Status.INVALID_ARGUMENT.withDescription( String.format( - "Artifact %s is corrupt: expected md5 %s, but has md5 %s", - metadata.getMetadata().getName(), expectedMd5, actualMd5)))); + "Artifact %s is corrupt: expected sah256 %s, but has sha256 %s", + metadata.getMetadata().getName(), expectedSha256, actualSha256)))); return; } } diff --git a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java index d7a0c57c028..10f2ad53d1e 100644 --- a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java +++ b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/BeamFileSystemArtifactServicesTest.java @@ -31,7 +31,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.Base64; import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -242,7 +241,7 @@ public void putArtifactsMultipleFilesTest() throws Exception { .put("file10kb", 10 * DATA_1KB /*10 kb*/) .put("file100kb", 100 * DATA_1KB /*100 kb*/) .build(); - Map<String, byte[]> md5 = Maps.newHashMap(); + Map<String, String> hashes = Maps.newHashMap(); final String text = "abcdefghinklmop\n"; files.forEach( @@ -255,7 +254,7 @@ public void putArtifactsMultipleFilesTest() throws Exception { text, Double.valueOf(Math.ceil(size * 1.0 / text.length())).intValue()) .getBytes(StandardCharsets.UTF_8); Files.write(filePath, contents); - md5.put(fileName, Hashing.md5().hashBytes(contents).asBytes()); + hashes.put(fileName, Hashing.sha256().hashBytes(contents).toString()); } catch (IOException ignored) { } }); @@ -270,10 +269,7 @@ public void putArtifactsMultipleFilesTest() throws Exception { Paths.get(originalDir.toString(), fileName).toAbsolutePath().toString(), fileName); metadata.add( - ArtifactMetadata.newBuilder() - .setName(fileName) - .setMd5(Base64.getEncoder().encodeToString(md5.get(fileName))) - .build()); + ArtifactMetadata.newBuilder().setName(fileName).setSha256(hashes.get(fileName)).build()); } String retrievalToken = commitManifest(stagingSessionToken, metadata); diff --git a/sdks/python/apache_beam/runners/portability/portable_stager.py b/sdks/python/apache_beam/runners/portability/portable_stager.py index 612d15c6286..d4ca7a2c2fe 100644 --- a/sdks/python/apache_beam/runners/portability/portable_stager.py +++ b/sdks/python/apache_beam/runners/portability/portable_stager.py @@ -20,7 +20,6 @@ from __future__ import division from __future__ import print_function -import base64 import hashlib import os @@ -72,7 +71,7 @@ def stage_artifact(self, local_path_to_artifact, artifact_name): def artifact_request_generator(): artifact_metadata = beam_artifact_api_pb2.ArtifactMetadata( name=artifact_name, - md5=_get_file_hash(local_path_to_artifact)) + sha256=_get_file_hash(local_path_to_artifact)) metadata = beam_artifact_api_pb2.PutArtifactMetadata( staging_session_token=self._staging_session_token, metadata=artifact_metadata) @@ -100,11 +99,11 @@ def commit_manifest(self): def _get_file_hash(path): - hasher = hashlib.md5() + hasher = hashlib.sha256() with open(path, 'rb') as f: while True: chunk = f.read(1 << 21) if chunk: hasher.update(chunk) else: - return base64.b64encode(hasher.digest()) + return hasher.hexdigest() ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 165843) Time Spent: 1h 20m (was: 1h 10m) > We should not be using md5 to validate artifact integrity. > ---------------------------------------------------------- > > Key: BEAM-4524 > URL: https://issues.apache.org/jira/browse/BEAM-4524 > Project: Beam > Issue Type: Task > Components: beam-model > Reporter: Robert Bradshaw > Priority: Major > Time Spent: 1h 20m > Remaining Estimate: 0h > > https://github.com/apache/beam/blob/6f239498e676f471427e17abc4bc5cffba9887c5/model/job-management/src/main/proto/beam_artifact_api.proto#L63 > Something like sha256 would probably be sufficient. > https://en.wikipedia.org/wiki/MD5#Overview_of_security_issues -- This message was sent by Atlassian JIRA (v7.6.3#76005)