vinishjail97 commented on code in PR #17788:
URL: https://github.com/apache/hudi/pull/17788#discussion_r2729026048
##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,109 @@ public static String
appendCommitTimeAndExternalFileMarker(String filePath, Stri
public static boolean isExternallyCreatedFile(String fileName) {
// A file counts as externally created purely by its name ending with EXTERNAL_FILE_SUFFIX;
// nothing else about the name is inspected here.
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * The prefix is the text between the first occurrence of FILE_GROUP_PREFIX_MARKER and the
+ * last occurrence of EXTERNAL_FILE_SUFFIX, passed through
+ * PartitionPathEncodeUtils.unescapePathName before being returned.
+ * NOTE(review): indexOf picks the FIRST marker occurrence; if an original file name can
+ * itself contain the marker text, part of that name would end up in the prefix - confirm
+ * whether lastIndexOf is what is intended.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if the name does not
+ * end with EXTERNAL_FILE_SUFFIX or contains no FILE_GROUP_PREFIX_MARKER
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return Option.empty();
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER); // first occurrence
+ if (prefixMarkerIndex == -1) {
+ return Option.empty();
+ }
+ int start = prefixMarkerIndex + FILE_GROUP_PREFIX_MARKER.length(); // first char after the marker
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX); // exclusive end: where the trailing suffix begins
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * The name is cut at the last '_' found strictly before the first FILE_GROUP_PREFIX_MARKER
+ * (or, when there is no marker, before the last EXTERNAL_FILE_SUFFIX); that underscore is
+ * taken to be the separator in front of the commit time.
+ * NOTE(review): the examples above presumably rely on both the marker and the suffix
+ * starting with '_' - confirm against the constant definitions.
+ * NOTE(review): if no '_' precedes the marker/suffix, lastIndexOf returns -1 and
+ * substring(0, -1) throws StringIndexOutOfBoundsException; confirm callers never pass such names.
+ *
+ * @param fileName The external file name
+ * @return The original file name, or fileName unchanged when it is not an external file
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ int markerEnd = prefixMarkerIndex != -1
+ ? prefixMarkerIndex
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1); // index of the '_' in front of the commit time
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents subdirectories within the partition,
+ * so we need to remove the prefix portion to get the actual partition path.
+ * Supports arbitrary nesting depths (e.g., "bucket-0/subdir1/subdir2").
+ *
+ * The adjustment drops the last (prefix.length() + 1) characters of parent.toString();
+ * the extra character is presumably the path separator in front of the prefix.
+ * NOTE(review): the code does not check that the parent path actually ends with the prefix,
+ * and substring throws if the parent string is shorter than prefix.length() + 1 - confirm
+ * that callers only pass the directory that really contains the file.
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path, or parent itself when fileName carries no file group prefix
+ */
+ public static StoragePath getFullPathOfPartition(StoragePath parent, String
fileName) {
+ return getExternalFileGroupPrefix(fileName)
+ .map(prefix -> new StoragePath(parent.toString().substring(0,
parent.toString().length() - prefix.length() - 1)))
+ .orElse(parent);
+ }
+
+ /**
+ * Parses external file names to extract fileId and commit time.
+ * Handles both formats:
+ * - With prefix: originalName_commitTime_fg%3D<prefix>_hudiext -> fileId
= prefix/originalName
+ * - Without prefix: originalName_commitTime_hudiext -> fileId =
originalName
+ *
+ * @param fileName The external file name to parse
+ * @return String array of size 2: [fileId, commitTime]
+ */
+ public static String[] parseFileIdAndCommitTimeFromExternalFile(String
fileName) {
+ String[] values = new String[2];
+ // Extract original file name
+ String originalName = getOriginalFileName(fileName);
+ // Extract file group prefix (if present)
+ Option<String> prefix = getExternalFileGroupPrefix(fileName);
Review Comment:
The tests are passing, but I feel this method is more complex than it needs to be,
given that it's only called for external files. Could it be simplified?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]