vinishjail97 commented on code in PR #17788:
URL: https://github.com/apache/hudi/pull/17788#discussion_r2729026048


##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,109 @@ public static String 
appendCommitTimeAndExternalFileMarker(String filePath, Stri
   public static boolean isExternallyCreatedFile(String fileName) {
     return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
   }
+
+  /**
+   * Extracts the file group prefix from an external file name.
+   * @param fileName The external file name
+   * @return Option containing the decoded file group prefix, or empty if not present
+   */
+  private static Option<String> getExternalFileGroupPrefix(String fileName) {
+    if (!isExternallyCreatedFile(fileName)) {
+      return Option.empty();
+    }
+    int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+    if (prefixMarkerIndex == -1) {
+      return Option.empty();
+    }
+    int start = prefixMarkerIndex + FILE_GROUP_PREFIX_MARKER.length();
+    int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+    return 
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start, 
end)));
+  }
+
+  /**
+   * Extracts the original file name from an external file name (without commit time and markers).
+   * For example, "data.parquet_123_hudiext" returns "data.parquet"
+   * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+   *
+   * @param fileName The external file name
+   * @return The original file name
+   */
+  private static String getOriginalFileName(String fileName) {
+    if (!isExternallyCreatedFile(fileName)) {
+      return fileName;
+    }
+    int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+    int markerEnd = prefixMarkerIndex != -1
+        ? prefixMarkerIndex
+        : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+    int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+    return fileName.substring(0, commitTimeStart);
+  }
+
+  /**
+   * Adjusts the parent path for external files with file group prefix.
+   * For files with file group prefix, the prefix represents subdirectories within the partition,
+   * so we need to remove the prefix portion to get the actual partition path.
+   * Supports arbitrary nesting depths (e.g., "bucket-0/subdir1/subdir2").
+   *
+   * @param parent the parent path
+   * @param fileName the file name to check
+   * @return the adjusted parent path
+   */
+  public static StoragePath getFullPathOfPartition(StoragePath parent, String 
fileName) {
+    return getExternalFileGroupPrefix(fileName)
+        .map(prefix -> new StoragePath(parent.toString().substring(0, 
parent.toString().length() - prefix.length() - 1)))
+        .orElse(parent);
+  }
+
+  /**
+   * Parses external file names to extract fileId and commit time.
+   * Handles both formats:
+   *   - With prefix: originalName_commitTime_fg%3D<prefix>_hudiext -> fileId = prefix/originalName
+   *   - Without prefix: originalName_commitTime_hudiext -> fileId = originalName
+   *
+   * @param fileName The external file name to parse
+   * @return String array of size 2: [fileId, commitTime]
+   */
+  public static String[] parseFileIdAndCommitTimeFromExternalFile(String 
fileName) {
+    String[] values = new String[2];
+    // Extract original file name
+    String originalName = getOriginalFileName(fileName);
+    // Extract file group prefix (if present)
+    Option<String> prefix = getExternalFileGroupPrefix(fileName);

Review Comment:
   The tests are passing, but I feel this method is more complex than it needs
   to be, given that it's only called for external files. Can it be simplified?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to