nsivabalan commented on code in PR #17788:
URL: https://github.com/apache/hudi/pull/17788#discussion_r2723453047
##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,109 @@ public static String
appendCommitTimeAndExternalFileMarker(String filePath, Stri
public static boolean isExternallyCreatedFile(String fileName) {
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if not
present
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return Option.empty();
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ if (prefixMarkerIndex == -1) {
+ return Option.empty();
+ }
+ int start = prefixMarkerIndex + FILE_GROUP_PREFIX_MARKER.length();
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without
commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * @param fileName The external file name
+ * @return The original file name
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ int markerEnd = prefixMarkerIndex != -1
+ ? prefixMarkerIndex
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents subdirectories
within the partition,
+ * so we need to remove the prefix portion to get the actual partition path.
+ * Supports arbitrary nesting depths (e.g., "bucket-0/subdir1/subdir2").
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path
+ */
+ public static StoragePath getFullPathOfPartition(StoragePath parent, String
fileName) {
+ return getExternalFileGroupPrefix(fileName)
+ .map(prefix -> new StoragePath(parent.toString().substring(0,
parent.toString().length() - prefix.length() - 1)))
+ .orElse(parent);
+ }
+
+ /**
+ * Parses external file names to extract fileId and commit time.
+ * Handles both formats:
+ * - With prefix: originalName_commitTime_fg%3D<prefix>_hudiext -> fileId
= prefix/originalName
+ * - Without prefix: originalName_commitTime_hudiext -> fileId =
originalName
+ *
+ * @param fileName The external file name to parse
+ * @return String array of size 2: [fileId, commitTime]
+ */
+ public static String[] parseFileIdAndCommitTimeFromExternalFile(String
fileName) {
+ String[] values = new String[2];
+ // Extract original file name
+ String originalName = getOriginalFileName(fileName);
+ // Extract file group prefix (if present)
+ Option<String> prefix = getExternalFileGroupPrefix(fileName);
Review Comment:
I see we are doing two passes over the file name here (one in `getOriginalFileName` and one in `getExternalFileGroupPrefix`). Can we get it done in one pass?
```
public static String[]
parseFileIdAndCommitTimeFromExternalFileOnePass(String fileName) {
// return [fileId, commitTime]
String[] out = new String[2];
if (!isExternallyCreatedFile(fileName)) {
// Keep same behavior you want for non-external files
out[0] = fileName;
out[1] = ""; // or null, or throw, depending on your contract
return out;
}
// We assume external files end with EXTERNAL_FILE_SUFFIX (e.g. "_hudiext")
int suffixStart = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
if (suffixStart < 0) {
// Defensive: treat as non-external or throw
out[0] = fileName;
out[1] = "";
return out;
}
// Check if prefix marker exists and is before suffix
int fgIdx = fileName.indexOf(FILE_GROUP_PREFIX_MARKER); // e.g. "_fg%3D"
final int markerEnd = (fgIdx >= 0 && fgIdx < suffixStart) ? fgIdx :
suffixStart;
// commitTime is between last '_' before markerEnd and markerEnd
int commitUnderscore = fileName.lastIndexOf('_', markerEnd - 1);
if (commitUnderscore < 0) {
throw new IllegalArgumentException("Invalid external file name (no
commitTime): " + fileName);
}
String commitTime = fileName.substring(commitUnderscore + 1, markerEnd);
String originalName = fileName.substring(0, commitUnderscore);
// Optional prefix
if (fgIdx >= 0 && fgIdx < suffixStart) {
int prefixStart = fgIdx + FILE_GROUP_PREFIX_MARKER.length();
String encodedPrefix = fileName.substring(prefixStart, suffixStart);
String prefix = PartitionPathEncodeUtils.unescapePathName(encodedPrefix);
out[0] = prefix + "/" + originalName;
} else {
out[0] = originalName;
}
out[1] = commitTime;
return out;
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]