nsivabalan commented on a change in pull request #4114:
URL: https://github.com/apache/hudi/pull/4114#discussion_r757659563



##########
File path: 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -769,8 +773,12 @@ public DirectoryInfo(String relativePath, FileStatus[] 
fileStatus) {
         } else if 
(status.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE))
 {
           // Presence of partition meta file implies this is a HUDI partition
           this.isHoodiePartition = true;
-        } else if (FSUtils.isDataFile(status.getPath())) {
-          // Regular HUDI data file (base file or log file)
+        } else if (FSUtils.isBaseFile(status.getPath())) {
+          final String commitTime = 
FSUtils.getCommitTime(status.getPath().toString());
+          if 
(completedInstantsTimeline.containsOrBeforeTimelineStarts(commitTime)) {
+            filenameToSizeMap.put(status.getPath().getName(), status.getLen());
+          }

Review comment:
      Can you leave a comment here explaining why we do this?

##########
File path: 
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
##########
@@ -754,12 +758,12 @@ protected void bootstrapCommit(List<DirectoryInfo> 
partitionInfoList, String cre
     // Is this a hoodie partition
     private boolean isHoodiePartition = false;
 
-    public DirectoryInfo(String relativePath, FileStatus[] fileStatus) {
+    public DirectoryInfo(final HoodieTableMetaClient datasetMetaClient, String 
relativePath, FileStatus[] fileStatus) {
       this.relativePath = relativePath;
 
       // Pre-allocate with the maximum length possible
       filenameToSizeMap = new HashMap<>(fileStatus.length);
-
+      final HoodieTimeline completedInstantsTimeline = 
datasetMetaClient.getActiveTimeline().filterCompletedInstants();

Review comment:
      We can do the filtering outside and pass the resulting timeline in here.

##########
File path: 
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientMultiWriter.java
##########
@@ -351,10 +352,10 @@ public void 
testHoodieClientMultiWriterWithClustering(HoodieTableType tableType)
         numRecords, 200, 2);
     client2.commit(newCommitTime, result2);
     // Schedule and run clustering while previous writer for commit 003 is 
running
-    SparkRDDWriteClient client3 = getHoodieWriteClient(cfg);
+    SparkRDDWriteClient client3 = getHoodieWriteClient(cfg3);
     // schedule clustering
     Option<String> clusterInstant = 
client3.scheduleTableService(Option.empty(), TableServiceType.CLUSTER);
-    assertFalse(clusterInstant.isPresent());
+    assertTrue(clusterInstant.isPresent());

Review comment:
      You might have conflicts when you rebase — just watch out.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to