This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new d79360cb5f90 fix: Handle external file groups in ExternalFilePathUtil
(#17788)
d79360cb5f90 is described below
commit d79360cb5f9030c4fc32da2bcc9c748b34602d7a
Author: Vinish Reddy <[email protected]>
AuthorDate: Tue Jan 27 03:29:19 2026 +0530
fix: Handle external file groups in ExternalFilePathUtil (#17788)
---
.../functional/TestExternalPathHandling.java | 59 +++++-
.../apache/hudi/common/model/HoodieBaseFile.java | 35 +---
.../table/view/AbstractTableFileSystemView.java | 8 +-
.../hudi/common/util/ExternalFilePathUtil.java | 154 ++++++++++++++
.../hudi/common/util/TestExternalFilePathUtil.java | 233 +++++++++++++++++++++
5 files changed, 444 insertions(+), 45 deletions(-)
diff --git
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java
index 2eb39df74f01..ce94187aacbc 100644
---
a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java
+++
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestExternalPathHandling.java
@@ -113,13 +113,14 @@ public class TestExternalPathHandling extends
HoodieClientTestBase {
writeClient = getHoodieWriteClient(writeConfig);
writeClient.setOperationType(WriteOperationType.INSERT_OVERWRITE);
+ Option<String> prefix = fileIdAndNameGenerator.getPrefix();
String instantTime1 =
writeClient.startCommit(HoodieTimeline.REPLACE_COMMIT_ACTION, metaClient);
String partitionPath1 = partitions.get(0);
Pair<String, String> fileIdAndName1 = fileIdAndNameGenerator.generate(1,
instantTime1);
String fileId1 = fileIdAndName1.getLeft();
String fileName1 = fileIdAndName1.getRight();
- String filePath1 = getPath(partitionPath1, fileName1);
- WriteStatus writeStatus1 = createWriteStatus(instantTime1, partitionPath1,
filePath1, fileId1);
+ String filePath1 = getPath(partitionPath1, fileName1, prefix);
+ WriteStatus writeStatus1 = createWriteStatus(instantTime1, partitionPath1,
filePath1, fileId1, fileName1, prefix);
JavaRDD<WriteStatus> rdd1 =
createRdd(Collections.singletonList(writeStatus1));
metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(
INSTANT_GENERATOR.createNewInstant(HoodieInstant.State.REQUESTED,
HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime1), Option.empty());
@@ -132,8 +133,8 @@ public class TestExternalPathHandling extends
HoodieClientTestBase {
Pair<String, String> fileIdAndName2 = fileIdAndNameGenerator.generate(2,
instantTime2);
String fileId2 = fileIdAndName2.getLeft();
String fileName2 = fileIdAndName2.getRight();
- String filePath2 = getPath(partitionPath1, fileName2);
- WriteStatus newWriteStatus = createWriteStatus(instantTime2,
partitionPath1, filePath2, fileId2);
+ String filePath2 = getPath(partitionPath1, fileName2, prefix);
+ WriteStatus newWriteStatus = createWriteStatus(instantTime2,
partitionPath1, filePath2, fileId2, fileName2, prefix);
JavaRDD<WriteStatus> rdd2 =
createRdd(Collections.singletonList(newWriteStatus));
Map<String, List<String>> partitionToReplacedFileIds = new HashMap<>();
partitionToReplacedFileIds.put(partitionPath1,
Collections.singletonList(fileId1));
@@ -149,8 +150,8 @@ public class TestExternalPathHandling extends
HoodieClientTestBase {
Pair<String, String> fileIdAndName3 = fileIdAndNameGenerator.generate(3,
instantTime3);
String fileId3 = fileIdAndName3.getLeft();
String fileName3 = fileIdAndName3.getRight();
- String filePath3 = getPath(partitionPath2, fileName3);
- WriteStatus writeStatus3 = createWriteStatus(instantTime3, partitionPath2,
filePath3, fileId3);
+ String filePath3 = getPath(partitionPath2, fileName3, prefix);
+ WriteStatus writeStatus3 = createWriteStatus(instantTime3, partitionPath2,
filePath3, fileId3, fileName3, prefix);
JavaRDD<WriteStatus> rdd3 =
createRdd(Collections.singletonList(writeStatus3));
metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(
INSTANT_GENERATOR.createNewInstant(HoodieInstant.State.REQUESTED,
HoodieTimeline.REPLACE_COMMIT_ACTION, instantTime3), Option.empty());
@@ -202,21 +203,50 @@ public class TestExternalPathHandling extends
HoodieClientTestBase {
String fileId = fileName;
return Pair.of(fileId, fileName);
};
+ FileIdAndNameGenerator paimonExternal = new FileIdAndNameGenerator() {
+ private static final String PREFIX = "bucket-0";
+
+ @Override
+ public Pair<String, String> generate(int iteration, String instantTime) {
+ String fileName = String.format("file_%d.parquet", iteration);
+ String fileId = PREFIX + "/" + fileName;
+ return Pair.of(fileId, fileName);
+ }
+
+ @Override
+ public Option<String> getPrefix() {
+ return Option.of(PREFIX);
+ }
+ };
List<String> partitionedTable = Arrays.asList("americas/brazil",
"americas/argentina");
List<String> unpartitionedTable = Arrays.asList("", "");
- return Stream.of(Arguments.of(external, partitionedTable),
Arguments.of(external, unpartitionedTable));
+ return Stream.of(
+ Arguments.of(external, partitionedTable),
+ Arguments.of(external, unpartitionedTable),
+ Arguments.of(paimonExternal, partitionedTable),
+ Arguments.of(paimonExternal, unpartitionedTable));
}
private String getPath(String partitionPath, String fileName) {
+ return getPath(partitionPath, fileName, Option.empty());
+ }
+
+ private String getPath(String partitionPath, String fileName, Option<String>
prefix) {
+ String prefixPath = prefix.map(p -> p + "/").orElse("");
if (partitionPath.isEmpty()) {
- return fileName;
+ return prefixPath + fileName;
}
- return String.format("%s/%s", partitionPath, fileName);
+ return String.format("%s/%s%s", partitionPath, prefixPath, fileName);
}
@FunctionalInterface
private interface FileIdAndNameGenerator {
+ // Returns: Pair<fileId, fileName>
Pair<String, String> generate(int iteration, String instantTime);
+
+ default Option<String> getPrefix() {
+ return Option.empty();
+ }
}
private void assertFileGroupCorrectness(String instantTime, String
partitionPath, String filePath, String fileId, int expectedSize) {
@@ -271,13 +301,20 @@ public class TestExternalPathHandling extends
HoodieClientTestBase {
return jsc.parallelize(writeStatuses, 1);
}
- private WriteStatus createWriteStatus(String commitTime, String
partitionPath, String filePath, String fileId) {
+ private WriteStatus createWriteStatus(String commitTime, String
partitionPath, String filePath,
+ String fileId, String fileName, Option<String> prefix) {
WriteStatus writeStatus = new WriteStatus();
writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath);
HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
writeStat.setFileId(fileId);
-
writeStat.setPath(ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(filePath,
commitTime));
+ String markedFileName = prefix.isPresent()
+ ? ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(fileName,
commitTime, prefix.get())
+ : ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(filePath,
commitTime);
+ String fullMarkedPath = prefix.isPresent()
+ ? getPath(partitionPath, markedFileName, prefix)
+ : markedFileName;
+ writeStat.setPath(fullMarkedPath);
writeStat.setPartitionPath(partitionPath);
writeStat.setNumWrites(3);
writeStat.setNumDeletes(0);
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java
b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java
index f56310782708..5fcb9eb2407a 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieBaseFile.java
@@ -20,7 +20,6 @@ package org.apache.hudi.common.model;
import org.apache.hudi.common.util.ExternalFilePathUtil;
import org.apache.hudi.common.util.Option;
-import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
/**
@@ -78,7 +77,7 @@ public class HoodieBaseFile extends BaseFile {
public HoodieBaseFile(StoragePathInfo pathInfo, String fileId, String
commitTime,
BaseFile bootstrapBaseFile) {
- super(maybeHandleExternallyGeneratedFileName(pathInfo, fileId));
+
super(ExternalFilePathUtil.maybeHandleExternallyGeneratedFileName(pathInfo,
fileId));
this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile);
this.fileId = fileId;
this.commitTime = commitTime;
@@ -118,37 +117,7 @@ public class HoodieBaseFile extends BaseFile {
}
private static String[] handleExternallyGeneratedFile(String fileName) {
- String[] values = new String[2];
- // file name has format <originalFileName>_<commitTime>_hudiext and
originalFileName is used as fileId
- int lastUnderscore = fileName.lastIndexOf(UNDERSCORE);
- int secondToLastUnderscore = fileName.lastIndexOf(UNDERSCORE,
lastUnderscore - 1);
- values[0] = fileName.substring(0, secondToLastUnderscore);
- values[1] = fileName.substring(secondToLastUnderscore + 1, lastUnderscore);
- return values;
- }
-
- /**
- * If the file was created externally, the original file path will have a
'_[commitTime]_hudiext' suffix when stored in the metadata table. That suffix
needs to be removed from the FileStatus so
- * that the actual file can be found and read.
- *
- * @param pathInfo an input path info that may require updating
- * @param fileId the fileId for the file
- * @return the original file status if it was not externally created, or a
new FileStatus with the original file name if it was externally created
- */
- private static StoragePathInfo
maybeHandleExternallyGeneratedFileName(StoragePathInfo pathInfo,
- String
fileId) {
- if (pathInfo == null) {
- return null;
- }
- if
(ExternalFilePathUtil.isExternallyCreatedFile(pathInfo.getPath().getName())) {
- // fileId is the same as the original file name for externally created
files
- StoragePath parent = pathInfo.getPath().getParent();
- return new StoragePathInfo(
- new StoragePath(parent, fileId), pathInfo.getLength(),
pathInfo.isDirectory(),
- pathInfo.getBlockReplication(), pathInfo.getBlockSize(),
pathInfo.getModificationTime(), pathInfo.getLocations());
- } else {
- return pathInfo;
- }
+ return
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
}
public String getFileId() {
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java
b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java
index 36611771fe16..075a58bf7bd5 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java
@@ -38,6 +38,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.ClusteringUtils;
import org.apache.hudi.common.util.CompactionUtils;
+import org.apache.hudi.common.util.ExternalFilePathUtil;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.SamplingLogger;
@@ -174,7 +175,12 @@ public abstract class AbstractTableFileSystemView
implements SyncableFileSystemV
*/
public List<HoodieFileGroup> addFilesToView(List<StoragePathInfo> statuses) {
Map<String, List<StoragePathInfo>> statusesByPartitionPath =
statuses.stream()
- .collect(Collectors.groupingBy(fileStatus ->
FSUtils.getRelativePartitionPath(metaClient.getBasePath(),
fileStatus.getPath().getParent())));
+ .collect(Collectors.groupingBy(fileStatus -> {
+ String fileName = fileStatus.getPath().getName();
+ StoragePath parent = ExternalFilePathUtil.getFullPathOfPartition(
+ fileStatus.getPath().getParent(), fileName);
+ return FSUtils.getRelativePartitionPath(metaClient.getBasePath(),
parent);
+ }));
return statusesByPartitionPath.entrySet().stream().map(entry ->
addFilesToView(entry.getKey(), entry.getValue()))
.flatMap(List::stream).collect(Collectors.toList());
}
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java
b/hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java
index 223ae8abc42b..41a86078ee3a 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/util/ExternalFilePathUtil.java
@@ -18,12 +18,44 @@
package org.apache.hudi.common.util;
+import org.apache.hudi.storage.StoragePath;
+import org.apache.hudi.storage.StoragePathInfo;
+
/**
* Utility methods for handling externally created files.
+ * <p>
+ * External files are files created outside Hudi (by external formats) that
are later
+ * registered in a Hudi table. To distinguish these files from Hudi-generated
files and
+ * to include commit time information, they are marked with a special suffix.
+ * <p>
+ * Hudi supports two formats for external files:
+ * <p>
+ * <b>Format 1: Without file group prefix</b>
+ * <pre>{@code <originalFileName>_<commitTime>_hudiext}</pre>
+ * Example: {@code data.parquet_20240101000000_hudiext}
+ * <ul>
+ * <li>fileId: {@code data.parquet}</li>
+ * <li>commitTime: {@code 20240101000000}</li>
+ * </ul>
+ * <p>
+ * <b>Format 2: With file group prefix</b>
+ * <pre>{@code
<originalFileName>_<commitTime>_fg%3D<encodedPrefix>_hudiext}</pre>
+ * Example: {@code data.parquet_20240101000000_fg%3Dbucket-0_hudiext}
+ * <ul>
+ * <li>fileId: {@code bucket-0/data.parquet} (includes prefix path)</li>
+ * <li>commitTime: {@code 20240101000000}</li>
+ * <li>prefix: {@code bucket-0} (unescaped from the text between the {@code _fg%3D} marker and {@code _hudiext})</li>
+ * </ul>
+ * <p>
+ * The file group prefix format is used when external files are organized in
subdirectories
+ * within a partition (e.g., bucketed files). The prefix is URL-encoded to
avoid conflicts
+ * with special characters in directory names.
*/
public class ExternalFilePathUtil {
// Suffix acts as a marker when appended to a file path that the path was
created by an external system and not a Hudi writer.
private static final String EXTERNAL_FILE_SUFFIX = "_hudiext";
+ // Marker for file group prefix in external file names. URL-encoded "_fg="
to avoid conflicts with file names.
+ private static final String FILE_GROUP_PREFIX_MARKER = "_fg%3D";
/**
* Appends the commit time and external file marker to the file path. Hudi
relies on the commit time in the file name for properly generating views of the
files in a table.
@@ -35,6 +67,23 @@ public class ExternalFilePathUtil {
return filePath + "_" + commitTime + EXTERNAL_FILE_SUFFIX;
}
+ /**
+ * Appends the commit time, file group prefix, and external file marker to
the file name.
+ * Use this when the external file is located in a subdirectory within the
partition (e.g., bucket-0/file.parquet).
+ *
+ * @param fileName The original file name (without any path prefix)
+ * @param commitTime The time of the commit that added this file to the table
+ * @param externalFileGroupPrefix The prefix path where the file is located
(e.g., "bucket-0"). Can be null or empty.
+ * @return The file name with commit time and markers appended
+ */
+ public static String appendCommitTimeAndExternalFileMarker(String fileName,
String commitTime, String externalFileGroupPrefix) {
+ if (externalFileGroupPrefix == null || externalFileGroupPrefix.isEmpty()) {
+ return appendCommitTimeAndExternalFileMarker(fileName, commitTime);
+ }
+ String encodedPrefix =
PartitionPathEncodeUtils.escapePathName(externalFileGroupPrefix);
+ return fileName + "_" + commitTime + FILE_GROUP_PREFIX_MARKER +
encodedPrefix + EXTERNAL_FILE_SUFFIX;
+ }
+
/**
* Checks if the file name was created by an external system by checking for
the external file marker at the end of the file name.
* @param fileName The file name
@@ -43,4 +92,109 @@ public class ExternalFilePathUtil {
public static boolean isExternallyCreatedFile(String fileName) {
return fileName.endsWith(EXTERNAL_FILE_SUFFIX);
}
+
+ /**
+ * Extracts the file group prefix from an external file name.
+ * @param fileName The external file name
+ * @return Option containing the decoded file group prefix, or empty if not
present
+ */
+ private static Option<String> getExternalFileGroupPrefix(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return Option.empty();
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ if (prefixMarkerIndex == -1) {
+ return Option.empty();
+ }
+ int start = prefixMarkerIndex + FILE_GROUP_PREFIX_MARKER.length();
+ int end = fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ return
Option.of(PartitionPathEncodeUtils.unescapePathName(fileName.substring(start,
end)));
+ }
+
+ /**
+ * Extracts the original file name from an external file name (without
commit time and markers).
+ * For example, "data.parquet_123_hudiext" returns "data.parquet"
+ * And "data.parquet_123_fg%3Dbucket-0_hudiext" also returns "data.parquet"
+ *
+ * @param fileName The external file name
+ * @return The original file name
+ */
+ private static String getOriginalFileName(String fileName) {
+ if (!isExternallyCreatedFile(fileName)) {
+ return fileName;
+ }
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ int markerEnd = prefixMarkerIndex != -1
+ ? prefixMarkerIndex
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ return fileName.substring(0, commitTimeStart);
+ }
+
+ /**
+ * Adjusts the parent path for external files with file group prefix.
+ * For files with file group prefix, the prefix represents subdirectories
within the partition,
+ * so we need to remove the prefix portion to get the actual partition path.
+ * Supports arbitrary nesting depths (e.g., "bucket-0/subdir1/subdir2").
+ *
+ * @param parent the parent path
+ * @param fileName the file name to check
+ * @return the adjusted parent path
+ */
+ public static StoragePath getFullPathOfPartition(StoragePath parent, String
fileName) {
+ return getExternalFileGroupPrefix(fileName)
+ .map(prefix -> new StoragePath(parent.toString().substring(0,
parent.toString().length() - prefix.length() - 1)))
+ .orElse(parent);
+ }
+
+ /**
+ * Parses external file names to extract fileId and commit time.
+ * Handles both formats:
+ * - With prefix: originalName_commitTime_fg%3D<prefix>_hudiext -> fileId
= prefix/originalName
+ * - Without prefix: originalName_commitTime_hudiext -> fileId =
originalName
+ *
+ * @param fileName The external file name to parse
+ * @return String array of size 2: [fileId, commitTime]
+ */
+ public static String[] parseFileIdAndCommitTimeFromExternalFile(String
fileName) {
+ String[] values = new String[2];
+ // Extract original file name
+ String originalName = getOriginalFileName(fileName);
+ // Extract file group prefix (if present)
+ Option<String> prefix = getExternalFileGroupPrefix(fileName);
+ // Build fileId
+ values[0] = prefix.map(p -> p + "/" + originalName).orElse(originalName);
+ // Extract commitTime
+ int prefixMarkerIndex = fileName.indexOf(FILE_GROUP_PREFIX_MARKER);
+ int markerEnd = prefixMarkerIndex != -1
+ ? prefixMarkerIndex
+ : fileName.lastIndexOf(EXTERNAL_FILE_SUFFIX);
+ int commitTimeStart = fileName.lastIndexOf('_', markerEnd - 1);
+ values[1] = fileName.substring(commitTimeStart + 1, markerEnd);
+ return values;
+ }
+
+ /**
+ * If the file was created externally, the original file path will have a
'_[commitTime]_hudiext' (or '_[commitTime]_fg%3D[prefix]_hudiext') suffix when stored in the metadata table. That suffix
needs to be removed from the path info so
+ * that the actual file can be found and read.
+ *
+ * @param pathInfo an input path info that may require updating
+ * @param fileId the fileId for the file
+ * @return the original path info if it was not externally created, or a
new StoragePathInfo whose path is the partition path joined with the fileId if it was externally created
+ */
+ public static StoragePathInfo
maybeHandleExternallyGeneratedFileName(StoragePathInfo pathInfo,
+ String
fileId) {
+ if (pathInfo == null) {
+ return null;
+ }
+ if (isExternallyCreatedFile(pathInfo.getPath().getName())) {
+ String fileName = pathInfo.getPath().getName();
+ StoragePath parent =
getFullPathOfPartition(pathInfo.getPath().getParent(), fileName);
+ return new StoragePathInfo(
+ new StoragePath(parent, fileId), pathInfo.getLength(),
pathInfo.isDirectory(),
+ pathInfo.getBlockReplication(), pathInfo.getBlockSize(),
pathInfo.getModificationTime(), pathInfo.getLocations());
+ } else {
+ return pathInfo;
+ }
+ }
}
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/util/TestExternalFilePathUtil.java
b/hudi-common/src/test/java/org/apache/hudi/common/util/TestExternalFilePathUtil.java
new file mode 100644
index 000000000000..72ce3278013d
--- /dev/null
+++
b/hudi-common/src/test/java/org/apache/hudi/common/util/TestExternalFilePathUtil.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.util;
+
+import org.apache.hudi.storage.StoragePath;
+import org.apache.hudi.storage.StoragePathInfo;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestExternalFilePathUtil {
+
+ private static final String COMMIT_TIME = "20240101000000";
+ private static final String EXTERNAL_FILE_SUFFIX = "_hudiext";
+
+ @Test
+ public void testAppendCommitTimeAndExternalFileMarker() {
+ String filePath = "partition1/file1.parquet";
+ String result =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(filePath,
COMMIT_TIME);
+ assertEquals("partition1/file1.parquet_20240101000000_hudiext", result);
+ }
+
+ @Test
+ public void testAppendCommitTimeAndExternalFileMarkerWithPrefix() {
+ String fileName = "file1.parquet";
+ String prefix = "bucket-0";
+ String result =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(fileName,
COMMIT_TIME, prefix);
+ assertEquals("file1.parquet_20240101000000_fg%3Dbucket-0_hudiext", result);
+ }
+
+ @Test
+ public void testAppendCommitTimeAndExternalFileMarkerWithNullPrefix() {
+ String fileName = "file1.parquet";
+ String result =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(fileName,
COMMIT_TIME, null);
+ assertEquals("file1.parquet_20240101000000_hudiext", result);
+ }
+
+ @Test
+ public void testAppendCommitTimeAndExternalFileMarkerWithEmptyPrefix() {
+ String fileName = "file1.parquet";
+ String result =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(fileName,
COMMIT_TIME, "");
+ assertEquals("file1.parquet_20240101000000_hudiext", result);
+ }
+
+ @Test
+ public void testIsExternallyCreatedFile() {
+
assertTrue(ExternalFilePathUtil.isExternallyCreatedFile("file1.parquet_20240101000000_hudiext"));
+
assertTrue(ExternalFilePathUtil.isExternallyCreatedFile("file1.parquet_20240101000000_fg%3Dbucket-0_hudiext"));
+
assertFalse(ExternalFilePathUtil.isExternallyCreatedFile("file1_1-0-1_20240101000000.parquet"));
+ assertFalse(ExternalFilePathUtil.isExternallyCreatedFile("file1.parquet"));
+ }
+
+ @Test
+ public void testParseFileIdAndCommitTimeFromExternalFile_LegacyFormat() {
+ String fileName = "myfile.parquet_20240101000000_hudiext";
+ String[] result =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
+
+ assertNotNull(result);
+ assertEquals(2, result.length);
+ assertEquals("myfile.parquet", result[0]); // fileId
+ assertEquals("20240101000000", result[1]); // commitTime
+ }
+
+ @Test
+ public void testParseFileIdAndCommitTimeFromExternalFile_WithPrefix() {
+ String fileName = "data.parquet_20240101000000_fg%3Dbucket-0_hudiext";
+ String[] result =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
+
+ assertNotNull(result);
+ assertEquals(2, result.length);
+ assertEquals("bucket-0/data.parquet", result[0]); // fileId includes prefix
+ assertEquals("20240101000000", result[1]); // commitTime
+ }
+
+ @Test
+ public void
testParseFileIdAndCommitTimeFromExternalFile_FileWithUnderscores() {
+ String fileName = "my_data_file.parquet_20240101000000_hudiext";
+ String[] result =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
+
+ assertNotNull(result);
+ assertEquals(2, result.length);
+ assertEquals("my_data_file.parquet", result[0]);
+ assertEquals("20240101000000", result[1]);
+ }
+
+ @Test
+ public void
testParseFileIdAndCommitTimeFromExternalFile_FileWithMultipleDots() {
+ String fileName = "data.backup.parquet_20240101000000_hudiext";
+ String[] result =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
+
+ assertNotNull(result);
+ assertEquals(2, result.length);
+ assertEquals("data.backup.parquet", result[0]);
+ assertEquals("20240101000000", result[1]);
+ }
+
+ @Test
+ public void testParseFileIdAndCommitTimeFromExternalFile_NoExtension() {
+ String fileName = "datafile_20240101000000_hudiext";
+ String[] result =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(fileName);
+
+ assertNotNull(result);
+ assertEquals(2, result.length);
+ assertEquals("datafile", result[0]);
+ assertEquals("20240101000000", result[1]);
+ }
+
+ @Test
+ public void testGetFullPath_OfPartition_WithPrefix() {
+ StoragePath parent = new StoragePath("/table/partition1/bucket-0");
+ String fileName = "file1.parquet_20240101000000_fg%3Dbucket-0_hudiext";
+
+ StoragePath result = ExternalFilePathUtil.getFullPathOfPartition(parent,
fileName);
+ assertEquals(new StoragePath("/table/partition1"), result);
+ }
+
+ @Test
+ public void testGetFullPath_OfPartition_WithNestedPrefix_2Levels() {
+ StoragePath parent = new StoragePath("/table/partition1/bucket-0/subdir");
+ String fileName =
"file1.parquet_20240101000000_fg%3Dbucket-0%2Fsubdir_hudiext";
+
+ StoragePath result = ExternalFilePathUtil.getFullPathOfPartition(parent,
fileName);
+ assertEquals(new StoragePath("/table/partition1"), result);
+ }
+
+ @Test
+ public void testGetFullPath_OfPartition_WithNestedPrefix_3Levels() {
+ StoragePath parent = new
StoragePath("/table/partition1/bucket-0/subdir1/subdir2");
+ String fileName =
"file1.parquet_20240101000000_fg%3Dbucket-0%2Fsubdir1%2Fsubdir2_hudiext";
+
+ StoragePath result = ExternalFilePathUtil.getFullPathOfPartition(parent,
fileName);
+ assertEquals(new StoragePath("/table/partition1"), result);
+ }
+
+ @Test
+ public void testGetFullPath_OfPartition_WithoutPrefix() {
+ StoragePath parent = new StoragePath("/table/partition1");
+ String fileName = "file1.parquet_20240101000000_hudiext";
+
+ StoragePath result = ExternalFilePathUtil.getFullPathOfPartition(parent,
fileName);
+ assertEquals(new StoragePath("/table/partition1"), result);
+ }
+
+ @Test
+ public void testMaybeHandleExternallyGeneratedFileName_NullPathInfo() {
+ StoragePathInfo result =
ExternalFilePathUtil.maybeHandleExternallyGeneratedFileName(null, "fileId");
+ assertNull(result);
+ }
+
+ @Test
+ public void testMaybeHandleExternallyGeneratedFileName_NonExternalFile() {
+ StoragePath path = new
StoragePath("/table/partition1/file_1-0-1_20240101000000.parquet");
+ StoragePathInfo pathInfo = new StoragePathInfo(path, 1000, false, (short)
1, 128 * 1024 * 1024, 0);
+
+ StoragePathInfo result =
ExternalFilePathUtil.maybeHandleExternallyGeneratedFileName(pathInfo, "fileId");
+ assertEquals(pathInfo, result);
+ }
+
+ @Test
+ public void testMaybeHandleExternallyGeneratedFileName_ExternalFileLegacy() {
+ StoragePath originalPath = new
StoragePath("/table/partition1/file1.parquet_20240101000000_hudiext");
+ StoragePathInfo pathInfo = new StoragePathInfo(originalPath, 1000, false,
(short) 1, 128 * 1024 * 1024, 12345);
+ String fileId = "file1.parquet";
+
+ StoragePathInfo result =
ExternalFilePathUtil.maybeHandleExternallyGeneratedFileName(pathInfo, fileId);
+
+ assertNotNull(result);
+ assertEquals(new StoragePath("/table/partition1/file1.parquet"),
result.getPath());
+ assertEquals(1000, result.getLength());
+ assertEquals(12345, result.getModificationTime());
+ }
+
+ @Test
+ public void
testMaybeHandleExternallyGeneratedFileName_ExternalFileWithPrefix() {
+ StoragePath originalPath = new
StoragePath("/table/partition1/bucket-0/file1.parquet_20240101000000_fg%3Dbucket-0_hudiext");
+ StoragePathInfo pathInfo = new StoragePathInfo(originalPath, 1000, false,
(short) 1, 128 * 1024 * 1024, 12345);
+ String fileId = "bucket-0/file1.parquet";
+
+ StoragePathInfo result =
ExternalFilePathUtil.maybeHandleExternallyGeneratedFileName(pathInfo, fileId);
+
+ assertNotNull(result);
+ // The file group prefix (bucket-0) is stripped from the parent to get the partition path /table/partition1;
the fileId then re-adds the prefix, resulting in /table/partition1/bucket-0/file1.parquet
+ assertEquals(new StoragePath("/table/partition1/bucket-0/file1.parquet"),
result.getPath());
+ assertEquals(1000, result.getLength());
+ assertEquals(12345, result.getModificationTime());
+ }
+
+ @Test
+ public void testRoundTrip_LegacyFormat() {
+ String originalFile = "mydata.parquet";
+ String withMarker =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(originalFile,
COMMIT_TIME);
+
+ assertTrue(ExternalFilePathUtil.isExternallyCreatedFile(withMarker));
+
+ String[] parsed =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(withMarker);
+ assertEquals(originalFile, parsed[0]);
+ assertEquals(COMMIT_TIME, parsed[1]);
+ }
+
+ @Test
+ public void testRoundTrip_WithPrefix() {
+ String originalFile = "mydata.parquet";
+ String prefix = "bucket-0";
+ String withMarker =
ExternalFilePathUtil.appendCommitTimeAndExternalFileMarker(originalFile,
COMMIT_TIME, prefix);
+
+ assertTrue(ExternalFilePathUtil.isExternallyCreatedFile(withMarker));
+
+ String[] parsed =
ExternalFilePathUtil.parseFileIdAndCommitTimeFromExternalFile(withMarker);
+ assertEquals(prefix + "/" + originalFile, parsed[0]);
+ assertEquals(COMMIT_TIME, parsed[1]);
+ }
+}