garyli1019 commented on a change in pull request #1938:
URL: https://github.com/apache/hudi/pull/1938#discussion_r554291505
##########
File path:
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java
##########
@@ -470,4 +471,45 @@ private static HoodieBaseFile
refreshFileStatus(Configuration conf, HoodieBaseFi
}
}
+ /**
+ * Lists the file statuses affected by the given commits.
+ * @param basePath
+ * @param commitsToCheck
+ * @param timeline
+ * @return HashMap<partitionPath, HashMap<fileName, FileStatus>>
+ * @throws IOException
+ */
+ public static HashMap<String, HashMap<String, FileStatus>>
listStatusForAffectedPartitions(
+ Path basePath, List<HoodieInstant> commitsToCheck, HoodieTimeline
timeline) throws IOException {
+ // Extract files touched by these commits.
+ // TODO This might need to be done in parallel like listStatus parallelism
?
+ HashMap<String, HashMap<String, FileStatus>> partitionToFileStatusesMap =
new HashMap<>();
+ for (HoodieInstant commit: commitsToCheck) {
+ HoodieCommitMetadata commitMetadata =
HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(commit).get(),
+ HoodieCommitMetadata.class);
+ for (Map.Entry<String, List<HoodieWriteStat>> entry:
commitMetadata.getPartitionToWriteStats().entrySet()) {
+ if (!partitionToFileStatusesMap.containsKey(entry.getKey())) {
+ partitionToFileStatusesMap.put(entry.getKey(), new HashMap<>());
+ }
+ for (HoodieWriteStat stat : entry.getValue()) {
+ String relativeFilePath = stat.getPath();
+ Path fullPath = relativeFilePath != null ?
FSUtils.getPartitionPath(basePath, relativeFilePath) : null;
+ if (fullPath != null) {
+ if
(partitionToFileStatusesMap.get(entry.getKey()).containsKey(fullPath.getName()))
{
+ // If the filesystem supports append, update the FileStatus of the log
file if it is being appended.
Review comment:
fixed.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org