This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 8b1a565cbfb [HUDI-9071] MDT validator can configure if log truncation 
applies (#12877)
8b1a565cbfb is described below

commit 8b1a565cbfb1b8b2368a29c218f27301d58b1cd8
Author: Davis-Zhang-Onehouse 
<[email protected]>
AuthorDate: Mon Feb 24 20:28:12 2025 -0800

    [HUDI-9071] MDT validator can configure if log truncation applies (#12877)
    
    bump up unit test coverage
---
 .../org/apache/hudi/common/util/StringUtils.java   |   4 +
 .../apache/hudi/common/util/TestStringUtils.java   |   4 +
 .../utilities/HoodieMetadataTableValidator.java    |  24 +--
 .../TestHoodieMetadataTableValidator.java          | 198 ++++++++++++++++++++-
 4 files changed, 211 insertions(+), 19 deletions(-)

diff --git a/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java 
b/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
index 888a9ae6d74..3c5545a6bf7 100644
--- a/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
+++ b/hudi-io/src/main/java/org/apache/hudi/common/util/StringUtils.java
@@ -383,6 +383,10 @@ public class StringUtils {
     if (objectList == null || objectList.isEmpty()) {
       return "";
     }
+    // For non-positive value, we will not do any truncation.
+    if (lengthThreshold <= 0) {
+      return objectList.toString();
+    }
     StringBuilder sb = new StringBuilder();
 
     for (Object obj : objectList) {
diff --git 
a/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java 
b/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
index 77407a87145..9ae52f18efb 100644
--- a/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
+++ b/hudi-io/src/test/java/org/apache/hudi/common/util/TestStringUtils.java
@@ -233,6 +233,8 @@ public class TestStringUtils {
         toStringWithThreshold(Collections.singletonList(str1), 2));
     assertEquals("string_...",
         toStringWithThreshold(Collections.singletonList(str1), str1.length() - 
3));
+    assertEquals("[string_value1]",
+        toStringWithThreshold(Collections.singletonList(str1), 0));
     assertEquals(str1,
         toStringWithThreshold(Collections.singletonList(str1), str1.length()));
     assertEquals(str1,
@@ -253,6 +255,8 @@ public class TestStringUtils {
         toStringWithThreshold(stringList, str1.length() + str2.length() + 
str3.length() - 3));
     assertEquals("string_value1,string_value2,string_value3",
         toStringWithThreshold(stringList, str1.length() + str2.length() + 
str3.length() + 2));
+    assertEquals("[string_value1, string_value2, string_value3]",
+        toStringWithThreshold(stringList, - 1));
   }
 
   @Test
diff --git 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
 
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
index 72e99e954b9..75bc9f40aa2 100644
--- 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
+++ 
b/hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
@@ -199,7 +199,6 @@ public class HoodieMetadataTableValidator implements 
Serializable {
 
   private static final long serialVersionUID = 1L;
   private static final Logger LOG = 
LoggerFactory.getLogger(HoodieMetadataTableValidator.class);
-  static final int LOG_DETAIL_MAX_LENGTH = 100_000;
 
   // Spark context
   private transient JavaSparkContext jsc;
@@ -403,6 +402,9 @@ public class HoodieMetadataTableValidator implements 
Serializable {
     @Parameter(names = {"--help", "-h"}, help = true)
     public Boolean help = false;
 
+    @Parameter(names = {"--log-detail-max-length"}, description = "Maximum 
length for log details in validation messages. To avoid truncation, input a 
non-positive value.")
+    public int logDetailMaxLength = 100_000;
+
     @Override
     public String toString() {
       return "MetadataTableValidatorConfig {\n"
@@ -429,6 +431,7 @@ public class HoodieMetadataTableValidator implements 
Serializable {
           + "   --spark-master " + sparkMaster + ", \n"
           + "   --spark-memory " + sparkMemory + ", \n"
           + "   --assumeDatePartitioning-memory " + assumeDatePartitioning + 
", \n"
+          + "   --log-detail-max-length " + logDetailMaxLength + ", \n"
           + "   --props " + propsFilePath + ", \n"
           + "   --hoodie-conf " + configs
           + "\n}";
@@ -465,6 +468,7 @@ public class HoodieMetadataTableValidator implements 
Serializable {
           && Objects.equals(sparkMaster, config.sparkMaster)
           && Objects.equals(sparkMemory, config.sparkMemory)
           && Objects.equals(assumeDatePartitioning, 
config.assumeDatePartitioning)
+          && Objects.equals(logDetailMaxLength, config.logDetailMaxLength)
           && Objects.equals(propsFilePath, config.propsFilePath)
           && Objects.equals(configs, config.configs);
     }
@@ -476,7 +480,7 @@ public class HoodieMetadataTableValidator implements 
Serializable {
           validateSecondaryIndex, validateRecordIndexCount, 
validateRecordIndexContent, numRecordIndexErrorSamples,
           viewStorageTypeForFSListing, viewStorageTypeForMetadata,
           minValidateIntervalSeconds, parallelism, recordIndexParallelism, 
ignoreFailed,
-          sparkMaster, sparkMemory, assumeDatePartitioning, propsFilePath, 
configs, help);
+          sparkMaster, sparkMemory, assumeDatePartitioning, 
logDetailMaxLength, propsFilePath, configs, help);
     }
   }
 
@@ -776,12 +780,12 @@ public class HoodieMetadataTableValidator implements 
Serializable {
       if (misMatch.get()) {
         String message = "Compare Partitions Failed! " + " Additional "
             + additionalFromFS.size() + " partitions from FS, but missing from 
MDT : \""
-            + toStringWithThreshold(additionalFromFS, LOG_DETAIL_MAX_LENGTH)
+            + toStringWithThreshold(additionalFromFS, cfg.logDetailMaxLength)
             + "\" and additional " + actualAdditionalPartitionsInMDT.size()
-            + "partitions from MDT, but missing from FS listing : \""
-            + toStringWithThreshold(actualAdditionalPartitionsInMDT, 
LOG_DETAIL_MAX_LENGTH)
+            + " partitions from MDT, but missing from FS listing : \""
+            + toStringWithThreshold(actualAdditionalPartitionsInMDT, 
cfg.logDetailMaxLength)
             + "\".\n All " + allPartitionPathsFromFS.size() + " partitions 
from FS listing "
-            + toStringWithThreshold(allPartitionPathsFromFS, 
LOG_DETAIL_MAX_LENGTH);
+            + toStringWithThreshold(allPartitionPathsFromFS, 
cfg.logDetailMaxLength);
         LOG.error(message);
         throw new HoodieValidationException(message);
       }
@@ -1401,9 +1405,9 @@ public class HoodieMetadataTableValidator implements 
Serializable {
               + "MDT-based listing (%s): %s.",
           label,
           infoListFromFS.size(),
-          toStringWithThreshold(infoListFromFS, LOG_DETAIL_MAX_LENGTH),
+          toStringWithThreshold(infoListFromFS, cfg.logDetailMaxLength),
           infoListFromMetadataTable.size(),
-          toStringWithThreshold(infoListFromMetadataTable, 
LOG_DETAIL_MAX_LENGTH));
+          toStringWithThreshold(infoListFromMetadataTable, 
cfg.logDetailMaxLength));
       mismatch = true;
     } else {
       for (int i = 0; i < infoListFromMetadataTable.size(); i++) {
@@ -1440,9 +1444,9 @@ public class HoodieMetadataTableValidator implements 
Serializable {
               + "metadata table. File system-based listing (%s file slices): 
%s; "
               + "MDT-based listing (%s file slices): %s.",
           fileSliceListFromFS.size(),
-          toStringWithThreshold(fileSliceListFromFS, LOG_DETAIL_MAX_LENGTH),
+          toStringWithThreshold(fileSliceListFromFS, cfg.logDetailMaxLength),
           fileSliceListFromMetadataTable.size(),
-          toStringWithThreshold(fileSliceListFromMetadataTable, 
LOG_DETAIL_MAX_LENGTH));
+          toStringWithThreshold(fileSliceListFromMetadataTable, 
cfg.logDetailMaxLength));
       mismatch = true;
     } else if (!fileSliceListFromMetadataTable.equals(fileSliceListFromFS)) {
       // In-memory cache for the set of committed files of commits of interest
diff --git 
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
index ee62611f1e4..582639928f5 100644
--- 
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
+++ 
b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieMetadataTableValidator.java
@@ -105,7 +105,6 @@ import static 
org.apache.hudi.common.testutils.RawTripTestPayload.recordToString
 import static org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema;
 import static org.apache.hudi.common.util.StringUtils.toStringWithThreshold;
 import static org.apache.hudi.common.util.TestStringUtils.generateRandomString;
-import static 
org.apache.hudi.utilities.HoodieMetadataTableValidator.LOG_DETAIL_MAX_LENGTH;
 import static org.apache.spark.sql.types.DataTypes.IntegerType;
 import static org.apache.spark.sql.types.DataTypes.StringType;
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -123,6 +122,8 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
     return Stream.of(-1, 1, 3, 4, 5).flatMap(i -> Stream.of(Arguments.of(i, 
true), Arguments.of(i, false)));
   }
 
+  private final int logDetailMaxLength = new 
HoodieMetadataTableValidator.Config().logDetailMaxLength;
+
   private static Stream<Arguments> viewStorageArgs() {
     return Stream.of(
         Arguments.of(null, null, false),
@@ -900,10 +901,10 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
                                         List<T> listMdt, List<T> listFs, T 
newItem) {
     assertEquals(
         oversizeList,
-        toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > 
LOG_DETAIL_MAX_LENGTH);
+        toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > 
logDetailMaxLength);
     assertEquals(
         oversizeList,
-        toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > 
LOG_DETAIL_MAX_LENGTH);
+        toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > 
logDetailMaxLength);
     // Equal case
     assertDoesNotThrow(() ->
         validator.validate(listMdt, listFs, partition, label));
@@ -919,8 +920,8 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
                 + "the metadata table. File system-based listing (%s): %s; "
                 + "MDT-based listing (%s): %s.",
             label, partition, basePath, label, listFs.size(),
-            toStringWithThreshold(listFs, LOG_DETAIL_MAX_LENGTH),
-            listMdt.size(), toStringWithThreshold(listMdt, 
LOG_DETAIL_MAX_LENGTH)),
+            toStringWithThreshold(listFs, logDetailMaxLength),
+            listMdt.size(), toStringWithThreshold(listMdt, 
logDetailMaxLength)),
         exception.getMessage());
     listFs.remove(listFs.size() - 1);
     // Item mismatch
@@ -968,10 +969,10 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
         TimelineUtils.generateInstantTime(true, timeGenerator)).getLeft());
     assertEquals(
         oversizeList,
-        toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > 
LOG_DETAIL_MAX_LENGTH);
+        toStringWithThreshold(listMdt, Integer.MAX_VALUE).length() > 
logDetailMaxLength);
     assertEquals(
         oversizeList,
-        toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > 
LOG_DETAIL_MAX_LENGTH);
+        toStringWithThreshold(listFs, Integer.MAX_VALUE).length() > 
logDetailMaxLength);
     Exception exception = assertThrows(
         HoodieValidationException.class,
         () -> validator.validateFileSlices(listMdt, listFs, partition, 
metaClient, label));
@@ -982,8 +983,8 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
                 + "metadata table. File system-based listing (%s file slices): 
%s; "
                 + "MDT-based listing (%s file slices): %s.",
             label, partition, basePath, listFs.size(),
-            toStringWithThreshold(listFs, LOG_DETAIL_MAX_LENGTH),
-            listMdt.size(), toStringWithThreshold(listMdt, 
LOG_DETAIL_MAX_LENGTH)),
+            toStringWithThreshold(listFs, logDetailMaxLength),
+            listMdt.size(), toStringWithThreshold(listMdt, 
logDetailMaxLength)),
         exception.getMessage());
     listFs.remove(listFs.size() - 1);
     // Item mismatch
@@ -1364,4 +1365,183 @@ public class TestHoodieMetadataTableValidator extends 
HoodieSparkClientTestBase
       throw new RuntimeException(e);
     }
   }
+
+  @Test
+  void testLogDetailMaxLength() {
+    Map<String, String> writeOptions = new HashMap<>();
+    writeOptions.put(DataSourceWriteOptions.TABLE_NAME().key(), "test_table");
+    writeOptions.put("hoodie.table.name", "test_table");
+    writeOptions.put(DataSourceWriteOptions.TABLE_TYPE().key(), 
"MERGE_ON_READ");
+    writeOptions.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), 
"_row_key");
+    writeOptions.put(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), 
"timestamp");
+    writeOptions.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), 
"partition_path");
+
+    // Create a large dataset to generate long validation messages
+    Dataset<Row> inserts = makeInsertDf("000", 1000).cache();
+    inserts.write().format("hudi").options(writeOptions)
+        .option(DataSourceWriteOptions.OPERATION().key(), 
WriteOperationType.BULK_INSERT.value())
+        .mode(SaveMode.Overwrite)
+        .save(basePath);
+
+    // Test with default max length
+    HoodieMetadataTableValidator.Config config = new 
HoodieMetadataTableValidator.Config();
+    config.basePath = "file:" + basePath;
+    config.validateLatestFileSlices = true;
+    config.validateAllFileGroups = true;
+    MockHoodieMetadataTableValidator validator = new 
MockHoodieMetadataTableValidator(jsc, config);
+
+    // Generate two unequal lists to trigger validation error
+    TimeGenerator timeGenerator = TimeGenerators
+        .getTimeGenerator(HoodieTimeGeneratorConfig.defaultConfig(basePath),
+            HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()));
+    Pair<List<FileSlice>, List<FileSlice>> filelistPair = 
generateTwoEqualFileSliceList(500, timeGenerator);
+    List<FileSlice> listMdt = filelistPair.getLeft();
+    List<FileSlice> listFs = new ArrayList<>(filelistPair.getRight());
+    listFs.add(generateRandomFileSlice(TimelineUtils.generateInstantTime(true, 
timeGenerator),
+        TimelineUtils.generateInstantTime(true, timeGenerator),
+        TimelineUtils.generateInstantTime(true, timeGenerator)).getLeft());
+
+    // Verify default behavior (100,000 chars)
+    MockHoodieMetadataTableValidator finalValidator = validator;
+    Exception exception = assertThrows(
+        HoodieValidationException.class,
+        () -> finalValidator.validateFileSlices(listMdt, listFs, "partition", 
metaClient, "test"));
+    // The message include 3 parts: Truncated file slice list of MDT, 
truncated file slice list of File system, other exception message strings.
+    assertTrue(exception.getMessage().length() <= 100_000 * 2 + 1000);
+
+    // Test with custom small max length
+    config.logDetailMaxLength = 1000;
+    validator = new MockHoodieMetadataTableValidator(jsc, config);
+    MockHoodieMetadataTableValidator finalValidator1 = validator;
+    exception = assertThrows(
+        HoodieValidationException.class,
+        () -> finalValidator1.validateFileSlices(listMdt, listFs, "partition", 
metaClient, "test"));
+    // The message include 3 parts: Truncated file slice list of MDT, 
truncated file slice list of File system, other exception message strings.
+    assertTrue(exception.getMessage().length() <= 1000 * 2 + 1000);
+
+    // Test with custom large max length
+    config.logDetailMaxLength = 200_000;
+    validator = new MockHoodieMetadataTableValidator(jsc, config);
+    MockHoodieMetadataTableValidator finalValidator2 = validator;
+    exception = assertThrows(
+        HoodieValidationException.class,
+        () -> finalValidator2.validateFileSlices(listMdt, listFs, "partition", 
metaClient, "test"));
+    // The message include 3 parts: Truncated file slice list of MDT, 
truncated file slice list of File system, other exception message strings.
+    assertTrue(exception.getMessage().length() <= 200_000 * 2 + 1000);
+  }
+
+  @Test
+  void testValidatePartitionsTruncation() throws IOException {
+    // Setup mock objects
+    HoodieMetadataTableValidator.Config config = new 
HoodieMetadataTableValidator.Config();
+    config.basePath = basePath;
+    config.logDetailMaxLength = 100; // Small length to force truncation
+    
+    MockHoodieMetadataTableValidator validator = new 
MockHoodieMetadataTableValidator(jsc, config);
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+    HoodieTableMetaClient metaClient = mock(HoodieTableMetaClient.class);
+    HoodieStorage fs = mock(HoodieStorage.class);
+    
+    // Generate long partition lists that will exceed the truncation threshold
+    List<String> mdtPartitions = new ArrayList<>();
+    List<String> fsPartitions = new ArrayList<>();
+    for (int i = 0; i < 20; i++) {
+      mdtPartitions.add("partition_" + generateRandomString(20));
+    }
+    for (int i = 0; i < 15; i++) {
+      fsPartitions.add("partition_" + generateRandomString(20));
+    }
+    
+    // Setup mocks
+    when(metaClient.getStorage()).thenReturn(fs);
+    for (String partition : mdtPartitions) {
+      when(fs.exists(new StoragePath(basePath + "/" + 
partition))).thenReturn(true);
+    }
+    
+    // Mock timeline
+    HoodieTimeline commitsTimeline = mock(HoodieTimeline.class);
+    HoodieTimeline completedTimeline = mock(HoodieTimeline.class);
+    when(metaClient.getCommitsTimeline()).thenReturn(commitsTimeline);
+    
when(commitsTimeline.filterCompletedInstants()).thenReturn(completedTimeline);
+    
+    // Setup validator with test data
+    validator.setMetadataPartitionsToReturn(mdtPartitions);
+    validator.setFsPartitionsToReturn(fsPartitions);
+    
+    // Test validation with truncation
+    HoodieValidationException exception = 
assertThrows(HoodieValidationException.class, () -> {
+      validator.validatePartitions(engineContext, new StoragePath(basePath), 
metaClient);
+    });
+    
+    // Verify truncation in error message
+    String errorMsg = exception.getMessage();
+    assertTrue(errorMsg.contains("..."));  // Should contain truncation 
indicator
+    assertTrue(errorMsg.length() <= config.logDetailMaxLength * 2 + 1000); // 
Account for both lists and additional message text
+    
+    // Verify the error message contains the count of partitions
+    assertTrue(errorMsg.contains(String.format("Additional %d partitions from 
FS, but missing from MDT : ",
+        fsPartitions.size())));
+    assertTrue(errorMsg.contains(String.format("additional %d partitions from 
MDT, but missing from FS listing :",
+        mdtPartitions.size())));
+  }
+
+  @Test
+  void testValidateFileSlicesTruncation() {
+    // Setup mock objects
+    HoodieMetadataTableValidator.Config config = new 
HoodieMetadataTableValidator.Config();
+    config.basePath = basePath;
+    config.logDetailMaxLength = 100; // Small length to force truncation
+    
+    MockHoodieMetadataTableValidator validator = new 
MockHoodieMetadataTableValidator(jsc, config);
+    
+    // Generate large lists of file slices that will exceed truncation 
threshold
+    String partition = "partition_" + generateRandomString(10);
+    List<FileSlice> mdtFileSlices = new ArrayList<>();
+    List<FileSlice> fsFileSlices = new ArrayList<>();
+    
+    // Generate 20 file slices for MDT and 15 for FS to ensure they're 
different
+    TimeGenerator timeGenerator = TimeGenerators
+        .getTimeGenerator(HoodieTimeGeneratorConfig.defaultConfig(basePath),
+            HadoopFSUtils.getStorageConf(jsc.hadoopConfiguration()));
+    for (int i = 0; i < 20; i++) {
+      String fileId = UUID.randomUUID().toString();
+
+      String baseInstantTime = metaClient.createNewInstantTime();
+
+      // Create file slice with base file and log files
+      HoodieBaseFile baseFile = new HoodieBaseFile(FSUtils.makeBaseFileName(
+          baseInstantTime, "1-0-1", fileId, 
HoodieFileFormat.PARQUET.getFileExtension()));
+      List<HoodieLogFile> logFiles = Arrays.asList(
+        new HoodieLogFile(FSUtils.makeLogFileName(fileId, 
HoodieLogFile.DELTA_EXTENSION, baseInstantTime, 1, "1-0-1")),
+        new HoodieLogFile(FSUtils.makeLogFileName(fileId, 
HoodieLogFile.DELTA_EXTENSION, baseInstantTime, 2, "1-0-1"))
+      );
+
+      FileSlice slice = new FileSlice(new HoodieFileGroupId(partition, 
fileId), baseInstantTime);
+      slice.setBaseFile(baseFile);
+      logFiles.forEach(slice::addLogFile);
+      mdtFileSlices.add(slice);
+
+      // Add to FS list for first 15 entries
+      if (i < 15) {
+        fsFileSlices.add(new FileSlice(slice));
+      }
+    }
+    
+    // Test validation with truncation
+    HoodieValidationException exception = assertThrows(
+        HoodieValidationException.class,
+        () -> validator.validateFileSlices(mdtFileSlices, fsFileSlices, 
partition, metaClient, "test"));
+    
+    String errorMsg = exception.getMessage();
+    
+    // Verify truncation behavior
+    assertTrue(errorMsg.contains("..."));  // Should contain truncation 
indicator
+    assertTrue(errorMsg.length() <= config.logDetailMaxLength * 2 + 1000); // 
Account for both lists and additional message text
+    
+    // Verify error message contains file slice counts
+    assertTrue(errorMsg.contains(String.format("Number of file slices based on 
the file system does not match that based on the metadata table. File 
system-based listing (%d file slices)",
+        fsFileSlices.size())));
+    assertTrue(errorMsg.contains(String.format("MDT-based listing (%d file 
slices)", 
+        mdtFileSlices.size())));
+  }
 }

Reply via email to the sender or the mailing list.