the-other-tim-brown commented on code in PR #17788:
URL: https://github.com/apache/hudi/pull/17788#discussion_r2718036255
##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,112 @@ public static String
appendCommitTimeAndExternalFileMarker(String filePath, Stri
public static boolean isExternallyCreatedFile(String fileName) {
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Checks if the external file name contains a file group prefix.
+ * @param fileName The file name
+ * @return True if the file has a file group prefix encoded in its name
+ */
+ private static boolean hasExternalFileGroupPrefix(String fileName) {
+ return isExternallyCreatedFile(fileName) &&
fileName.contains(FILE_GROUP_PREFIX_MARKER);
+ }
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if not
present
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!hasExternalFileGroupPrefix(fileName)) {
+ return Option.empty();
+ }
+ int start = fileName.indexOf(FILE_GROUP_PREFIX_MARKER) +
FILE_GROUP_PREFIX_MARKER.length();
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without
commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * @param fileName The external file name
+ * @return The original file name
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int markerEnd = hasExternalFileGroupPrefix(fileName)
+ ? fileName.indexOf(FILE_GROUP_PREFIX_MARKER)
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents a subdirectory
within the partition,
+ * so we need to go up one more level to get the actual partition path.
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path
+ */
+ public static StoragePath getActualParentPath(StoragePath parent, String
fileName) {
Review Comment:
A better name may be `getFullPathOfPartition`. Parent path is already a
defined concept so this may be confusing.
##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,112 @@ public static String
appendCommitTimeAndExternalFileMarker(String filePath, Stri
public static boolean isExternallyCreatedFile(String fileName) {
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Checks if the external file name contains a file group prefix.
+ * @param fileName The file name
+ * @return True if the file has a file group prefix encoded in its name
+ */
+ private static boolean hasExternalFileGroupPrefix(String fileName) {
+ return isExternallyCreatedFile(fileName) &&
fileName.contains(FILE_GROUP_PREFIX_MARKER);
+ }
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if not
present
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!hasExternalFileGroupPrefix(fileName)) {
+ return Option.empty();
+ }
+ int start = fileName.indexOf(FILE_GROUP_PREFIX_MARKER) +
FILE_GROUP_PREFIX_MARKER.length();
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without
commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * @param fileName The external file name
+ * @return The original file name
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int markerEnd = hasExternalFileGroupPrefix(fileName)
+ ? fileName.indexOf(FILE_GROUP_PREFIX_MARKER)
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents a subdirectory
within the partition,
+ * so we need to go up one more level to get the actual partition path.
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path
+ */
+ public static StoragePath getActualParentPath(StoragePath parent, String
fileName) {
+ if (hasExternalFileGroupPrefix(fileName)) {
+ return parent.getParent();
Review Comment:
Let's make this work with arbitrary nesting. The portion of the path before
the prefix is the partition path
##########
hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java:
##########
@@ -43,4 +92,112 @@ public static String
appendCommitTimeAndExternalFileMarker(String filePath, Stri
public static boolean isExternallyCreatedFile(String fileName) {
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Checks if the external file name contains a file group prefix.
+ * @param fileName The file name
+ * @return True if the file has a file group prefix encoded in its name
+ */
+ private static boolean hasExternalFileGroupPrefix(String fileName) {
+ return isExternallyCreatedFile(fileName) &&
fileName.contains(FILE_GROUP_PREFIX_MARKER);
+ }
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if not
present
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!hasExternalFileGroupPrefix(fileName)) {
+ return Option.empty();
+ }
+ int start = fileName.indexOf(FILE_GROUP_PREFIX_MARKER) +
FILE_GROUP_PREFIX_MARKER.length();
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without
commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * @param fileName The external file name
+ * @return The original file name
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int markerEnd = hasExternalFileGroupPrefix(fileName)
+ ? fileName.indexOf(FILE_GROUP_PREFIX_MARKER)
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents a subdirectory
within the partition,
+ * so we need to go up one more level to get the actual partition path.
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path
+ */
+ public static StoragePath getActualParentPath(StoragePath parent, String
fileName) {
+ if (hasExternalFileGroupPrefix(fileName)) {
+ return parent.getParent();
+ }
+ return parent;
+ }
+
+ /**
+ * Parses external file names to extract fileId and commit time.
+ * Handles both formats:
+ * - With prefix: originalName_commitTime_fg%3D<prefix>_hudiext -> fileId
= prefix/originalName
+ * - Without prefix: originalName_commitTime_hudiext -> fileId =
originalName
+ *
+ * @param fileName The external file name to parse
+ * @return String array of size 2: [fileId, commitTime]
+ */
+ public static String[] parseFileIdAndCommitTimeFromExternalFile(String
fileName) {
+ String[] values = new String[2];
+ // Extract original file name
+ String originalName = getOriginalFileName(fileName);
+ // Extract file group prefix (if present)
+ Option<String> prefix = getExternalFileGroupPrefix(fileName);
+ // Build fileId
+ values[0] = prefix.map(p -> p + "/" + originalName).orElse(originalName);
+ // Extract commitTime
+ int markerEnd = hasExternalFileGroupPrefix(fileName)
+ ? fileName.indexOf(FILE_GROUP_PREFIX_MARKER)
Review Comment:
This will scan the string twice. Can we check the `indexOf` and if it is
`-1` then use the `fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX)`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]