[
https://issues.apache.org/jira/browse/HIVE-25790?focusedWorklogId=823022&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-823022
]
ASF GitHub Bot logged work on HIVE-25790:
-----------------------------------------
Author: ASF GitHub Bot
Created on: 03/Nov/22 10:35
Start Date: 03/Nov/22 10:35
Worklog Time Spent: 10m
Work Description: deniskuzZ commented on code in PR #3582:
URL: https://github.com/apache/hive/pull/3582#discussion_r1012643637
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leave identical files only between source and destination. It gets source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leave identical files only between source and destination. It gets a source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not same as source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
Review Comment:
Is it possible for srcType and dstType to be different?
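For context, a minimal sketch of how such a mismatch can arise (hypothetical scenario; the paths and setup below are illustrative, not from the PR):
````
// Illustrative only: the same child name can be a FILE on the source side and
// a DIRECTORY on the destination side, e.g. when an earlier copy attempt was
// interrupted or the table layout changed between replication cycles.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TypeMismatchSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path src = new Path("/tmp/repl-src/delta_0000001");  // hypothetical paths
    Path dst = new Path("/tmp/repl-dst/delta_0000001");
    fs.create(src, true).close();  // source side holds a plain file
    fs.mkdirs(dst);                // destination has a directory of the same name
    // A copy that assumes matching types would fail here, so the FILE and
    // DIRECTORY branches above must delete the mismatched destination entry first.
  }
}
````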
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leave identical files only between source and destination. It gets source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leave identical files only between source and destination. It gets a source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not same as source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
+ }
+ break;
+ case DIRECTORY:
+ // If destination is a directory,
+ if (srcType == Type.DIRECTORY) {
+ // If both are directories, visit for children of both.
+ Set<String> bothChildNames = Stream.concat(
Review Comment:
Why do you need the concat? I think the idea was to remove target-side files that
are not identical to the source; you only need to compare the common set of names.
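A minimal sketch of what that suggestion could look like, assuming a single listStatus call per side; compareAndReconcile is a hypothetical helper, not from the PR:
````
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

void reconcileChildren(FileSystem srcFs, Path srcDir,
                       FileSystem dstFs, Path dstDir) throws IOException {
  // Index the source children by name with one listing.
  Map<String, FileStatus> srcByName = new HashMap<>();
  for (FileStatus s : srcFs.listStatus(srcDir)) {
    srcByName.put(s.getPath().getName(), s);
  }
  for (FileStatus d : dstFs.listStatus(dstDir)) {
    FileStatus s = srcByName.get(d.getPath().getName());
    if (s == null) {
      // Destination-only entry: it has no source counterpart, so it can never
      // be identical; delete it without any comparison.
      dstFs.delete(d.getPath(), true);
    } else {
      // Common entry: only these need the recursive identical-file check.
      compareAndReconcile(srcFs, s, dstFs, d);
    }
  }
}

// Hypothetical helper: recursive identical-file check for a common entry.
void compareAndReconcile(FileSystem srcFs, FileStatus src,
                         FileSystem dstFs, FileStatus dst) throws IOException {
  // ... recurse into directories, isSameFile() for files, delete on mismatch.
}
````
Source-only entries need no action at this point, since the subsequent copy recreates them at the destination.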
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leave identical files only between source and destination. It gets source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leave identical files only between source and destination. It gets a source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
+ // If destination is none, then no need to delete anything.
+ break;
+ case FILE:
+ // If destination is a file,
+ if (srcType == Type.FILE) {
+ // If source is a file, then delete the destination file if it is not same as source.
+ toDelete = !FileUtils.isSameFile(sourceFs, srcPath, destinationFs, dstPath);
+ LOG.debug("Same file?: {}", !toDelete);
+ } else {
+ // Otherwise, delete the destination file.
+ toDelete = true;
+ }
+ break;
+ case DIRECTORY:
+ // If destination is a directory,
+ if (srcType == Type.DIRECTORY) {
+ // If both are directories, visit for children of both.
+ Set<String> bothChildNames = Stream.concat(
+ Arrays.stream(sourceFs.listStatus(srcPath))
Review Comment:
You are listing the same paths multiple times, both here and in the getType calls.
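One hedged way to address this, sketched under the assumption that the caller already holds a FileStatus for each path: isDirectory() answers the type question without an extra round trip, and one listStatus per directory feeds both the type checks and the recursion.
````
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;

private void reconcile(FileSystem srcFs, FileStatus srcStatus,
                       FileSystem dstFs, FileStatus dstStatus) throws IOException {
  if (srcStatus.isDirectory() && dstStatus.isDirectory()) {
    // One listing per side; the child FileStatus objects are reused for the
    // next level of recursion instead of issuing getFileStatus per child.
    FileStatus[] srcChildren = srcFs.listStatus(srcStatus.getPath());
    FileStatus[] dstChildren = dstFs.listStatus(dstStatus.getPath());
    // ... match children by name and recurse with the statuses already in hand.
  }
  // FILE and mismatch cases are handled as in the switch above.
}
````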
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/repl/CopyUtils.java:
##########
@@ -492,20 +495,94 @@ private void doRegularCopyOnce(FileSystem sourceFs, List<Path> srcList,
throw new IOException(e);
}
} else {
- //Destination should be empty
+ // Destination should have identical files only
if (overWrite) {
- deleteSubDirs(destinationFs, destination);
+ leaveIdenticalFilesOnly(sourceFs, paths, destinationFs, destination);
}
copyFilesBetweenFS(sourceFs, paths, destinationFs, destination, false, true);
}
}
- private void deleteSubDirs(FileSystem fs, Path path) throws IOException {
- //Delete the root path instead of doing a listing
- //This is more optimised
- delete(fs, path, true);
- //Recreate just the Root folder
- mkdirs(fs, path);
+ /**
+ * Leave identical files only between source and destination. It gets source paths.
+ */
+ void leaveIdenticalFilesOnly(FileSystem sourceFs, Path[] srcPaths,
+ FileSystem destinationFs, Path destinationPath) throws IOException {
+ for (Path srcPath : srcPaths) {
+ leaveIdenticalFilesOnlyFileByFile(sourceFs, srcPath,
+ destinationFs, new Path(destinationPath, srcPath.getName()));
+ }
+ }
+
+ /**
+ * Leave identical files only between source and destination. It gets a source path.
+ */
+ private void leaveIdenticalFilesOnlyFileByFile(FileSystem sourceFs, Path srcPath,
+ FileSystem destinationFs, Path dstPath) throws IOException {
+ boolean toDelete = false;
+ Type dstType = getType(destinationFs, dstPath);
+ Type srcType = getType(sourceFs, srcPath);
+ LOG.debug("Source path: {}, type: {}", srcPath, srcType);
+ LOG.debug("Destination path: {}, type: {}", dstPath, dstType);
+ switch (dstType) {
+ case NONE:
Review Comment:
formatting
##########
common/src/java/org/apache/hadoop/hive/common/FileUtils.java:
##########
@@ -773,12 +816,57 @@ public static boolean copy(FileSystem srcFS, FileStatus srcStatus, FileSystem ds
return deleteSource ? srcFS.delete(src, true) : true;
}
+ /**
+ * Checks if the source and destination are the same file. It follows the same logic as
+ * https://hadoop.apache.org/docs/stable/hadoop-distcp/DistCp.html .
+ */
+ public static boolean isSameFile(FileSystem srcFS, Path src, FileSystem dstFS, Path dst)
+ throws IOException {
+ // When both file systems are HDFS, use strong check conditions;
+ // block size, length, checksum.
+ FileStatus srcStatus = srcFS.getFileStatus(src);
+ if (srcFS.getScheme().equals("hdfs") && dstFS.getScheme().equals("hdfs")) {
+ if (!dstFS.exists(dst)) {
+ return false;
+ }
+ FileStatus dstStatus = dstFS.getFileStatus(dst);
Review Comment:
Think about whether we can reduce FS listing and use a recursive iterator:
````
RemoteIterator<FileStatus> it = FileUtils.listStatusIterator(fs, path,
    FileUtils.HIDDEN_FILES_PATH_FILTER);
````
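For illustration, a usage sketch of that iterator; the listStatusIterator signature is taken from the comment above and the loop body is hypothetical. RemoteIterator streams entries lazily, so a large destination tree is walked without materializing a full FileStatus[] per directory.
````
RemoteIterator<FileStatus> it =
    FileUtils.listStatusIterator(fs, path, FileUtils.HIDDEN_FILES_PATH_FILTER);
while (it.hasNext()) {
  FileStatus status = it.next();  // fetched on demand, batch by batch
  // Compare this entry against its source counterpart, or delete it,
  // without an up-front listing of the whole directory.
}
````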
Issue Time Tracking
-------------------
Worklog Id: (was: 823022)
Time Spent: 1h (was: 50m)
> Make managed table copies handle updates (FileUtils)
> ----------------------------------------------------
>
> Key: HIVE-25790
> URL: https://issues.apache.org/jira/browse/HIVE-25790
> Project: Hive
> Issue Type: Sub-task
> Reporter: Haymant Mangla
> Assignee: Teddy Choi
> Priority: Major
> Labels: pull-request-available
> Time Spent: 1h
> Remaining Estimate: 0h
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)