nsivabalan commented on code in PR #18353:
URL: https://github.com/apache/hudi/pull/18353#discussion_r2988704486
##########
hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestHoodieBackedMetadata.java:
##########
@@ -2031,6 +1941,138 @@ public void testFailedBootstrap() throws Exception {
}
}
+ /**
+ * Test that partitioned RLI initialization is deferred for fresh tables.
+ * Partitioned RLI should NOT be initialized on the first commit but should be initialized
+ * on the second commit with programmatically determined file group count (should be 1 for small tables).
+ */
+ @ParameterizedTest
+ @EnumSource(HoodieTableType.class)
+ public void
testPartitionedRecordIndexDeferredInitializationForFreshTable(HoodieTableType
tableType) throws Exception {
+ init(tableType);
+ HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+
+ // Config with partitioned record index enabled (not global)
+ HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false)
+ .withIndexConfig(HoodieIndexConfig.newBuilder()
+ .withIndexType(HoodieIndex.IndexType.RECORD_INDEX)
+ .build())
+ .withMetadataConfig(HoodieMetadataConfig.newBuilder()
+ .enable(true)
+ .withEnableRecordLevelIndex(true) // Partitioned RLI
+ .withPartitionedRecordIndexFileGroupCount(2,2)
+ .withDeferRliInitializationForFreshTable(true)
+ .build())
+ .build();
+
+ try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
writeConfig)) {
+ // First commit - Partitioned RLI should NOT be initialized yet for a
fresh table
+ String firstCommitTime = client.startCommit();
+ List<HoodieRecord> records = dataGen.generateInserts(firstCommitTime,
1000);
+ List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records,
2), firstCommitTime).collect();
+ assertNoWriteErrors(writeStatuses);
+ client.commit(firstCommitTime, jsc.parallelize(writeStatuses));
+
+ // Verify metadata table exists
+ metaClient = HoodieTableMetaClient.reload(metaClient);
+ assertTrue(metaClient.getTableConfig().isMetadataTableAvailable());
+
+ // Verify partitioned RLI partition is NOT initialized after first commit
+
assertFalse(metaClient.getTableConfig().isMetadataPartitionAvailable(RECORD_INDEX),
+ "Partitioned RLI should NOT be initialized on first commit for a
fresh table");
+
+ // Files partition should be initialized
+
assertTrue(metaClient.getTableConfig().isMetadataPartitionAvailable(MetadataPartitionType.FILES),
+ "Files partition should be initialized");
+
+ // Second commit - Partitioned RLI should NOW be initialized
+ String secondCommitTime = client.startCommit();
+ List<HoodieRecord> moreRecords =
dataGen.generateInserts(secondCommitTime, 500);
+ writeStatuses = client.insert(jsc.parallelize(moreRecords, 2),
secondCommitTime).collect();
+ assertNoWriteErrors(writeStatuses);
+ client.commit(secondCommitTime, jsc.parallelize(writeStatuses));
+
+ // Reload and verify partitioned RLI is now initialized
+ metaClient = HoodieTableMetaClient.reload(metaClient);
+
assertTrue(metaClient.getTableConfig().isMetadataPartitionAvailable(RECORD_INDEX),
+ "Partitioned RLI should be initialized after second commit");
+
+ // Verify file group count is 1 for small tables (1500 records total)
Review Comment:
will fix the comments.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]