steveloughran commented on code in PR #5308: URL: https://github.com/apache/hadoop/pull/5308#discussion_r1100395250
########## hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java: ########## @@ -613,8 +623,12 @@ public static void compareFileLengthsAndChecksums(long srcLen, //At this point, src & dest lengths are same. if length==0, we skip checksum if ((srcLen != 0) && (!skipCrc)) { - if (!checksumsAreEqual(sourceFS, source, sourceChecksum, - targetFS, target, srcLen)) { + CopyMapper.ChecksumComparison + checksumComparison = checksumsAreEqual(sourceFS, source, sourceChecksum, + targetFS, target, srcLen); + // If Checksum comparison is false set it to false, else set to true. + boolean checksumResult = !checksumComparison.equals(CopyMapper.ChecksumComparison.FALSE); Review Comment: is this outcome right. as L632 should be reached for any outcome other than True. ########## hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java: ########## @@ -361,31 +373,60 @@ private boolean canSkip(FileSystem sourceFS, CopyListingFileStatus source, if (sameLength && source.getLen() == 0) { return true; } - // if both the source and target have the same length, then check if the - // config to use modification time is set to true, then use the - // modification time and checksum comparison to determine if the copy can - // be skipped else if not set then just use the checksum comparison to - // check copy skip. + // If the src and target file have same size and block size, we would + // check if the checkCrc flag is enabled or not. If enabled, and the + // modTime comparison is enabled then return true if target file is older + // than the source file, since this indicates that the target file is + // recently updated and the source is not changed more recently than the + // update, we can skip the copy else we would copy. + // If skipCrc flag is disabled, we would check the checksum comparison + // which is an enum representing 3 values, of which if the comparison + // returns NOT_COMPATIBLE, we'll try to check modtime again, else return + // the result of checksum comparison which are compatible(true or false). // // Note: Different object stores can have different checksum algorithms // resulting in no checksum comparison that results in return true // always, having the modification time enabled can help in these // scenarios to not incorrectly skip a copy. Refer: HADOOP-18596. + if (sameLength && sameBlockSize) { - if (useModTimeToUpdate) { - return - (source.getModificationTime() < target.getModificationTime()) && - (skipCrc || DistCpUtils.checksumsAreEqual(sourceFS, - source.getPath(), null, - targetFS, target.getPath(), source.getLen())); + if (skipCrc) { + return maybeUseModTimeToCompare(source, target); } else { - return skipCrc || DistCpUtils + ChecksumComparison checksumComparison = DistCpUtils .checksumsAreEqual(sourceFS, source.getPath(), null, targetFS, target.getPath(), source.getLen()); + LOG.debug("Result of checksum comparison between src {} and target " + + "{} : {}", source, target, checksumComparison); + if (checksumComparison.equals(ChecksumComparison.INCOMPATIBLE)) { + return maybeUseModTimeToCompare(source, target); + } + // if skipCrc is disabled and checksumComparison is compatible we + // need not check the mod time. + return checksumComparison.equals(ChecksumComparison.TRUE); } - } else { - return false; } + return false; + } + + /** + * If the mod time comparison is enabled, check the mod time else return + * false. + * Comparison: If the target file perceives to have greater mod time(older) + * than the source file, we can assume that there has been no new changes + * that occurred in the source file, hence we should return true to skip the + * copy of the file. + * @param source Source fileStatus. + * @param target Target fileStatus. + * @return boolean representing result of modTime check. + */ + private boolean maybeUseModTimeToCompare( + CopyListingFileStatus source, FileStatus target) { + if (useModTimeToUpdate) { + return source.getModificationTime() < target.getModificationTime(); Review Comment: should this be <= ? ########## hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java: ########## @@ -562,9 +562,11 @@ private void testCommitWithChecksumMismatch(boolean skipCrc) Path sourcePath = new Path(sourceBase + srcFilename); CopyListingFileStatus sourceCurrStatus = new CopyListingFileStatus(fs.getFileStatus(sourcePath)); - Assert.assertFalse(DistCpUtils.checksumsAreEqual( + Assert.assertFalse(!DistCpUtils.checksumsAreEqual( Review Comment: move to assertEquals here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-issues-h...@hadoop.apache.org