This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 8b1a565cbfb [HUDI-9071] MDT validator can configure if log truncation applies (#12877)
8b1a565cbfb is described below
commit 8b1a565cbfb1b8b2368a29c218f27301d58b1cd8
Author: Davis-Zhang-Onehouse <[email protected]>
AuthorDate: Mon Feb 24 20:28:12 2025 -0800
[HUDI-9071] MDT validator can configure if log truncation applies (#12877)
bump up unit test coverage
---
.../org/apache/hudi/common/util/StringUtils.java | 4 +
.../apache/hudi/common/util/TestStringUtils.java | 4 +
.../utilities/HoodieMetadataTableValidator.java | 24 +--
.../TestHoodieMetadataTableValidator.java | 198 ++++++++++++++++++++-
4 files changed, 211 insertions(+), 19 deletions(-)
diff --git a/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java b/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
index 888a9ae6d74..3c5545a6bf7 100644
--- a/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
+++ b/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
@@ -383,6 +383,10 @@ public class StringUtils {
if (objectList == null || objectList.isEmpty()) {
return "";
}
+ // For non-positive value, we will not do any truncation.
+ if (lengthThreshold <= 0) {
+ return objectList.toString();
+ }
StringBuilder sb = new StringBuilder();
for (Object obj : objectList) {
diff --git a/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java b/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
index 77407a87145..9ae52f18efb 100644
--- a/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
+++ b/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
@@ -233,6 +233,8 @@ public class TestStringUtils {
toStringWithThreshold(Collections.singletonList(str1), 2));
assertEquals("string_...",
toStringWithThreshold(Collections.singletonList(str1), str1.length() - 3));
+ assertEquals("[string_value1]",
+ toStringWithThreshold(Collections.singletonList(str1), 0));
assertEquals(str1,
toStringWithThreshold(Collections.singletonList(str1), str1.length()));
assertEquals(str1,
@@ -253,6 +255,8 @@ public class TestStringUtils {
toStringWithThreshold(stringList, str1.length() + str2.length() + str3.length() - 3));
assertEquals("string_value1,string_value2,string_value3",
toStringWithThreshold(stringList, str1.length() + str2.length() + str3.length() + 2));
+ assertEquals("[string_value1, string_value2, string_value3]",
+ toStringWithThreshold(stringList, -1));
}
@Test
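For readers skimming the diff: a minimal sketch of the semantics the assertions above pin down, inferred from the tests rather than copied from the implementation (the class name ToStringThresholdDemo is illustrative, not part of the commit):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import static org.apache.hudi.common.util.StringUtils.toStringWithThreshold;

public class ToStringThresholdDemo {
  public static void main(String[] args) {
    // Positive threshold: elements are comma-joined and the result is cut off
    // with "..." once it would exceed the threshold (10 chars here).
    System.out.println(toStringWithThreshold(Collections.singletonList("string_value1"), 10)); // string_...
    List<String> parts = Arrays.asList("string_value1", "string_value2", "string_value3");
    // Threshold at least as long as the joined string: no truncation.
    System.out.println(toStringWithThreshold(parts, 100)); // string_value1,string_value2,string_value3
    // Non-positive threshold (the new early return): List.toString(), never truncated.
    System.out.println(toStringWithThreshold(parts, -1)); // [string_value1, string_value2, string_value3]
  }
}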
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
index 72e99e954b9..75bc9f40aa2 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
@@ -199,7 +199,6 @@ public class HoodieMetadataTableValidator implements Serializable {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(HoodieMetadataTableValidator.class);
- static final int LOG_DETAIL_MAX_LENGTH = 100_000;
// Spark context
private transient JavaSparkContext jsc;
@@ -403,6 +402,9 @@ public class HoodieMetadataTableValidator implements Serializable {
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
+ @Parameter(names = {"--log-detail-max-length"}, description = "Maximum
length for log details in validation messages. To avoid truncation, input a
non-positive value.")
+ public int logDetailMaxLength = 100_000;
+
@Override
public String toString() {
return "MetadataTableValidatorConfig {\n"
@@ -429,6 +431,7 @@ public class HoodieMetadataTableValidator implements Serializable {
+ " --spark-master " + sparkMaster + ", \n"
+ " --spark-memory " + sparkMemory + ", \n"
+ " --assumeDatePartitioning-memory " + assumeDatePartitioning +
", \n"
+ + " --log-detail-max-length " + logDetailMaxLength + ", \n"
+ " --props " + propsFilePath + ", \n"
+ " --hoodie-conf " + configs
+ "\n}";
@@ -465,6 +468,7 @@ public class HoodieMetadataTableValidator implements Serializable {
&& Objects.equals(sparkMaster, config.sparkMaster)
&& Objects.equals(sparkMemory, config.sparkMemory)
&& Objects.equals(assumeDatePartitioning, config.assumeDatePartitioning)
+ && Objects.equals(logDetailMaxLength, config.logDetailMaxLength)
&& Objects.equals(propsFilePath, config.propsFilePath)
&& Objects.equals(configs, config.configs);
}
@@ -476,7 +480,7 @@ public class HoodieMetadataTableValidator implements Serializable {
validateSecondaryIndex, validateRecordIndexCount,
validateRecordIndexContent, numRecordIndexErrorSamples,
viewStorageTypeForFSListing, viewStorageTypeForMetadata,
minValidateIntervalSeconds, parallelism, recordIndexParallelism, ignoreFailed,
- sparkMaster, sparkMemory, assumeDatePartitioning, propsFilePath, configs, help);
+ sparkMaster, sparkMemory, assumeDatePartitioning, logDetailMaxLength, propsFilePath, configs, help);
}
}
@@ -776,12 +780,12 @@ public class HoodieMetadataTableValidator implements Serializable {
if (misMatch.get()) {
String message = "Compare Partitions Failed! " + " Additional "
+ additionalFromFS.size() + " partitions from FS, but missing from
MDT : \""
- + toStringWithThreshold(additionalFromFS, LOG_DETAIL_MAX_LENGTH)
+ + toStringWithThreshold(additionalFromFS, cfg.logDetailMaxLength)
+ "\" and additional " + actualAdditionalPartitionsInMDT.size()
- + "partitions from MDT, but missing from FS listing : \""
- + toStringWithThreshold(actualAdditionalPartitionsInMDT, LOG_DETAIL_MAX_LENGTH)
+ + " partitions from MDT, but missing from FS listing : \""
+ + toStringWithThreshold(actualAdditionalPartitionsInMDT, cfg.logDetailMaxLength)
+ "\".\n All " + allPartitionPathsFromFS.size() + " partitions
from FS listing "
- + toStringWithThreshold(allPartitionPathsFromFS,
LOG_DETAIL_MAX_LENGTH);
+ + toStringWithThreshold(allPartitionPathsFromFS,
cfg.logDetailMaxLength);
LOG.error(message);
throw new HoodieValidationException(message);
}
@@ -1401,9 +1405,9 @@ public class HoodieMetadataTableValidator implements Serializable {
+ "MDT-based listing (%s): %s.",
label,
infoListFromFS.size(),
- toStringWithThreshold(infoListFromFS, LOG_DETAIL_MAX_LENGTH),
+ toStringWithThreshold(infoListFromFS, cfg.logDetailMaxLength),
infoListFromMetadataTable.size(),
- toStringWithThreshold(infoListFromMetadataTable, LOG_DETAIL_MAX_LENGTH));
+ toStringWithThreshold(infoListFromMetadataTable, cfg.logDetailMaxLength));
mismatch = true;
} else {
for (int i = 0; i < infoListFromMetadataTable.size(); i++) {
@@ -1440,9 +1444,9 @@ public class HoodieMetadataTableValidator implements Serializable {
+ "metadata table. File system-based listing (%s file slices):
%s; "
+ "MDT-based listing (%s file slices): %s.",
fileSliceListFromFS.size(),
- toStringWithThreshold(fileSliceListFromFS, LOG_DETAIL_MAX_LENGTH),
+ toStringWithThreshold(fileSliceListFromFS, cfg.logDetailMaxLength),
fileSliceListFromMetadataTable.size(),
- toStringWithThreshold(fileSliceListFromMetadataTable, LOG_DETAIL_MAX_LENGTH));
+ toStringWithThreshold(fileSliceListFromMetadataTable, cfg.logDetailMaxLength));
mismatch = true;
} else if (!fileSliceListFromMetadataTable.equals(fileSliceListFromFS)) {
// In-memory cache for the set of committed files of commits of interest
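In short, the fixed LOG_DETAIL_MAX_LENGTH constant is replaced by cfg.logDetailMaxLength at every call site. A hedged configuration sketch, using only names visible in this diff (the base path value is illustrative):

HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config();
config.basePath = "file:///tmp/hudi_table";  // illustrative path
config.logDetailMaxLength = 1_000;           // default is 100_000; CLI flag: --log-detail-max-length
// Any non-positive value disables truncation entirely, per the StringUtils change above.
// Detail lists in validation messages are then bounded via
// toStringWithThreshold(list, cfg.logDetailMaxLength).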
diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
index ee62611f1e4..582639928f5 100644
--- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
+++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
@@ -105,7 +105,6 @@ import static org.apache.hudi.common.testutils.RawTripTestPayload.recordToString
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema;
import static org.apache.hudi.common.util.StringUtils.toStringWithThreshold;
import static org.apache.hudi.common.util.TestStringUtils.generateRandomString;
-import static org.apache.hudi.utilities.HoodieMetadataTableValidator.LOG_DETAIL_MAX_LENGTH;
import static org.apache.spark.sql.types.DataTypes.IntegerType;
import static org.apache.spark.sql.types.DataTypes.StringType;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -123,6 +122,8 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
return Stream.of(-1, 1, 3, 4, 5).flatMap(i -> Stream.of(Arguments.of(i, true), Arguments.of(i, false)));
}
+ private final int logDetailMaxLength = new HoodieMetadataTableValidator.Config().logDetailMaxLength;
+
private static Stream<Arguments> viewStorageArgs() {
return Stream.of(
Arguments.of(null, null, false),
@@ -900,10 +901,10 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
List<T> listMdt, List<T> listFs, T newItem) {
assertEquals(
oversizeList,
- toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > LOG_DETAIL_MAX_LENGTH);
+ toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > logDetailMaxLength);
assertEquals(
oversizeList,
- toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > LOG_DETAIL_MAX_LENGTH);
+ toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > logDetailMaxLength);
// Equal case
assertDoesNotThrow(() -> validator.validate(listMdt, listFs, partition, label));
@@ -919,8 +920,8 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
+ "the metadata table. File system-based listing (%s): %s; "
+ "MDT-based listing (%s): %s.",
label, partition, basePath, label, listFs.size(),
- toStringWithThreshold(listFs, LOG_DETAIL_MAX_LENGTH),
- listMdt.size(), toStringWithThreshold(listMdt, LOG_DETAIL_MAX_LENGTH)),
+ toStringWithThreshold(listFs, logDetailMaxLength),
+ listMdt.size(), toStringWithThreshold(listMdt, logDetailMaxLength)),
exception.getMessage());
listFs.remove(listFs.size() - 1);
// Item mismatch
@@ -968,10 +969,10 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
TimelineUtils.generateInstantTime(true, timeGenerator)).getLeft());
assertEquals(
oversizeList,
- toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > LOG_DETAIL_MAX_LENGTH);
+ toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > logDetailMaxLength);
assertEquals(
oversizeList,
- toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > LOG_DETAIL_MAX_LENGTH);
+ toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > logDetailMaxLength);
Exception exception = assertThrows(
HoodieValidationException.class,
() -> validator.validateFileSlices(listMdt, listFs, partition, metaClient, label));
@@ -982,8 +983,8 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
+ "metadata table. File system-based listing (%s file slices):
%s; "
+ "MDT-based listing (%s file slices): %s.",
label, partition, basePath, listFs.size(),
- toStringWithThreshold(listFs, LOG_DETAIL_MAX_LENGTH),
- listMdt.size(), toStringWithThreshold(listMdt, LOG_DETAIL_MAX_LENGTH)),
+ toStringWithThreshold(listFs, logDetailMaxLength),
+ listMdt.size(), toStringWithThreshold(listMdt, logDetailMaxLength)),
exception.getMessage());
listFs.remove(listFs.size() - 1);
// Item mismatch
@@ -1364,4 +1365,183 @@ public class TestHoodieMetadataTableValidator extends HoodieSparkClientTestBase
throw new RuntimeException(e);
}
}
+
+ @Test
+ void testLogDetailMaxLength() {
+ Map<String, String> writeOptions = new HashMap<>();
+ writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table");
+ writeOptions.put("hoodie.table.name", "test_table");
+ writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), "MERGE_ON_READ");
+ writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key");
+ writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp");
+ writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition_path");
+
+ // Create a large dataset to generate long validation messages
+ Dataset<Row> inserts = makeInsertDf("000", 1000).cache();
+ inserts.write().format("hudi").options(writeOptions)
+ .option(DataSourceWriteOptions.OPERATION().key(), WriteOperationType.BULK_INSERT.value())
+ .mode(SaveMode.Overwrite)
+ .save(basePath);
+
+ // Test with default max length
+ HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config();
+ config.basePath = "file:" + basePath;
+ config.validateLatestFileSlices = true;
+ config.validateAllFileGroups = true;
+ MockHoodieMetadataTableValidator validator = new MockHoodieMetadataTableValidator(jsc, config);
+
+ // Generate two unequal lists to trigger validation error
+ TimeGenerator timeGenerator = TimeGenerators
+ .getTimeGenerator(HoodieTimeGeneratorConfig.defaultConfig(basePath),
+ HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()));
+ Pair<List<FileSlice>, List<FileSlice>> filelistPair = generateTwoEqualFileSliceList(500, timeGenerator);
+ List<FileSlice> listMdt = filelistPair.getLeft();
+ List<FileSlice> listFs = new ArrayList<>(filelistPair.getRight());
+ listFs.add(generateRandomFileSlice(TimelineUtils.generateInstantTime(true, timeGenerator),
+ TimelineUtils.generateInstantTime(true, timeGenerator),
+ TimelineUtils.generateInstantTime(true, timeGenerator)).getLeft());
+
+ // Verify default behavior (100,000 chars)
+ MockHoodieMetadataTableValidator finalValidator = validator;
+ Exception exception = assertThrows(
+ HoodieValidationException.class,
+ () -> finalValidator.validateFileSlices(listMdt, listFs, "partition",
metaClient, "test"));
+ // The message includes 3 parts: the truncated file slice list of MDT, the truncated file slice list of the file system, and other exception message strings.
+ assertTrue(exception.getMessage().length() <= 100_000 * 2 + 1000);
+
+ // Test with custom small max length
+ config.logDetailMaxLength = 1000;
+ validator = new MockHoodieMetadataTableValidator(jsc, config);
+ MockHoodieMetadataTableValidator finalValidator1 = validator;
+ exception = assertThrows(
+ HoodieValidationException.class,
+ () -> finalValidator1.validateFileSlices(listMdt, listFs, "partition",
metaClient, "test"));
+ // The message includes 3 parts: the truncated file slice list of MDT, the truncated file slice list of the file system, and other exception message strings.
+ assertTrue(exception.getMessage().length() <= 1000 * 2 + 1000);
+
+ // Test with custom large max length
+ config.logDetailMaxLength = 200_000;
+ validator = new MockHoodieMetadataTableValidator(jsc, config);
+ MockHoodieMetadataTableValidator finalValidator2 = validator;
+ exception = assertThrows(
+ HoodieValidationException.class,
+ () -> finalValidator2.validateFileSlices(listMdt, listFs, "partition",
metaClient, "test"));
+ // The message includes 3 parts: the truncated file slice list of MDT, the truncated file slice list of the file system, and other exception message strings.
+ assertTrue(exception.getMessage().length() <= 200_000 * 2 + 1000);
+ }
+
+ @Test
+ void testValidatePartitionsTruncation() throws IOException {
+ // Setup mock objects
+ HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config();
+ config.basePath = basePath;
+ config.logDetailMaxLength = 100; // Small length to force truncation
+
+ MockHoodieMetadataTableValidator validator = new MockHoodieMetadataTableValidator(jsc, config);
+ HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+ HoodieTableMetaClient metaClient = mock(HoodieTableMetaClient.class);
+ HoodieStorage fs = mock(HoodieStorage.class);
+
+ // Generate long partition lists that will exceed the truncation threshold
+ List<String> mdtPartitions = new ArrayList<>();
+ List<String> fsPartitions = new ArrayList<>();
+ for (int i = 0; i < 20; i++) {
+ mdtPartitions.add("partition_" + generateRandomString(20));
+ }
+ for (int i = 0; i < 15; i++) {
+ fsPartitions.add("partition_" + generateRandomString(20));
+ }
+
+ // Setup mocks
+ when(metaClient.getStorage()).thenReturn(fs);
+ for (String partition : mdtPartitions) {
+ when(fs.exists(new StoragePath(basePath + "/" +
partition))).thenReturn(true);
+ }
+
+ // Mock timeline
+ HoodieTimeline commitsTimeline = mock(HoodieTimeline.class);
+ HoodieTimeline completedTimeline = mock(HoodieTimeline.class);
+ when(metaClient.getCommitsTimeline()).thenReturn(commitsTimeline);
+ when(commitsTimeline.filterCompletedInstants()).thenReturn(completedTimeline);
+
+ // Setup validator with test data
+ validator.setMetadataPartitionsToReturn(mdtPartitions);
+ validator.setFsPartitionsToReturn(fsPartitions);
+
+ // Test validation with truncation
+ HoodieValidationException exception = assertThrows(HoodieValidationException.class, () -> {
+ validator.validatePartitions(engineContext, new StoragePath(basePath), metaClient);
+ });
+
+ // Verify truncation in error message
+ String errorMsg = exception.getMessage();
+ assertTrue(errorMsg.contains("...")); // Should contain truncation
indicator
+ assertTrue(errorMsg.length() <= config.logDetailMaxLength * 2 + 1000); // Account for both lists and additional message text
+
+ // Verify the error message contains the count of partitions
+ assertTrue(errorMsg.contains(String.format("Additional %d partitions from
FS, but missing from MDT : ",
+ fsPartitions.size())));
+ assertTrue(errorMsg.contains(String.format("additional %d partitions from
MDT, but missing from FS listing :",
+ mdtPartitions.size())));
+ }
+
+ @Test
+ void testValidateFileSlicesTruncation() {
+ // Setup mock objects
+ HoodieMetadataTableValidator.Config config = new HoodieMetadataTableValidator.Config();
+ config.basePath = basePath;
+ config.logDetailMaxLength = 100; // Small length to force truncation
+
+ MockHoodieMetadataTableValidator validator = new MockHoodieMetadataTableValidator(jsc, config);
+
+ // Generate large lists of file slices that will exceed the truncation threshold
+ String partition = "partition_" + generateRandomString(10);
+ List<FileSlice> mdtFileSlices = new ArrayList<>();
+ List<FileSlice> fsFileSlices = new ArrayList<>();
+
+ // Generate 20 file slices for MDT and 15 for FS to ensure they're different
+ TimeGenerator timeGenerator = TimeGenerators
+ .getTimeGenerator(HoodieTimeGeneratorConfig.defaultConfig(basePath),
+ HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()));
+ for (int i = 0; i < 20; i++) {
+ String fileId = UUID.randomUUID().toString();
+
+ String baseInstantTime = metaClient.createNewInstantTime();
+
+ // Create file slice with base file and log files
+ HoodieBaseFile baseFile = new HoodieBaseFile(FSUtils.makeBaseFileName(
+ baseInstantTime, "1-0-1", fileId, HoodieFileFormat.PARQUET.getFileExtension()));
+ List<HoodieLogFile> logFiles = Arrays.asList(
+ new HoodieLogFile(FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, baseInstantTime, 1, "1-0-1")),
+ new HoodieLogFile(FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, baseInstantTime, 2, "1-0-1"))
+ );
+
+ FileSlice slice = new FileSlice(new HoodieFileGroupId(partition, fileId), baseInstantTime);
+ slice.setBaseFile(baseFile);
+ logFiles.forEach(slice::addLogFile);
+ mdtFileSlices.add(slice);
+
+ // Add to FS list for first 15 entries
+ if (i < 15) {
+ fsFileSlices.add(new FileSlice(slice));
+ }
+ }
+
+ // Test validation with truncation
+ HoodieValidationException exception = assertThrows(
+ HoodieValidationException.class,
+ () -> validator.validateFileSlices(mdtFileSlices, fsFileSlices, partition, metaClient, "test"));
+
+ String errorMsg = exception.getMessage();
+
+ // Verify truncation behavior
+ assertTrue(errorMsg.contains("...")); // Should contain truncation
indicator
+ assertTrue(errorMsg.length() <= config.logDetailMaxLength * 2 + 1000); // Account for both lists and additional message text
+
+ // Verify error message contains file slice counts
+ assertTrue(errorMsg.contains(String.format("Number of file slices based on
the file system does not match that based on the metadata table. File
system-based listing (%d file slices)",
+ fsFileSlices.size())));
+ assertTrue(errorMsg.contains(String.format("MDT-based listing (%d file
slices)",
+ mdtFileSlices.size())));
+ }
}
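A note on the bound asserted throughout these tests: a validation message embeds two independently truncated detail lists (the MDT-side and FS-side listings), each contributing at most logDetailMaxLength characters, and the surrounding fixed message text is assumed to stay under roughly 1,000 characters, hence assertTrue(errorMsg.length() <= logDetailMaxLength * 2 + 1000). With the default cap this works out to 100_000 * 2 + 1_000 = 201_000 characters.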