Re: [PR] [HUDI-7712] Fixing RLI initialization to account for file slices instead of just base files while initializing [hudi]

via GitHub Tue, 14 May 2024 00:11:35 -0700


nsivabalan commented on code in PR #11153:
URL: https://github.com/apache/hudi/pull/11153#discussion_r1599492069



##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestRecordLevelIndex.scala:
##########
@@ -55,6 +59,76 @@ class TestRecordLevelIndex extends RecordLevelIndexTestBase {
       saveMode = SaveMode.Overwrite)
   }
 
+  @ParameterizedTest
+  @EnumSource(classOf[HoodieTableType])
+  def testRLIInitializationForMorGlobalIndex(tableType: HoodieTableType): Unit 
= {
+    val hudiOpts = commonOpts + (DataSourceWriteOptions.TABLE_TYPE.key -> 
tableType.name()) +
+      (HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key -> "1") 
+
+      (HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key -> "1") 
+
+      (HoodieIndexConfig.INDEX_TYPE.key -> "RECORD_INDEX") +
+      (HoodieIndexConfig.RECORD_INDEX_UPDATE_PARTITION_PATH_ENABLE.key -> 
"true") -
+      HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key
+
+    val dataGen1 = HoodieTestDataGenerator.createTestGeneratorFirstPartition()
+    val dataGen2 = HoodieTestDataGenerator.createTestGeneratorSecondPartition()
+
+    // batch1 inserts
+    val instantTime1 = getInstantTime()
+    val latestBatch = recordsToStrings(dataGen1.generateInserts(instantTime1, 
5)).asScala
+    var operation = INSERT_OPERATION_OPT_VAL
+    val latestBatchDf = 
spark.read.json(spark.sparkContext.parallelize(latestBatch, 1))
+    latestBatchDf.cache()
+    latestBatchDf.write.format("org.apache.hudi")
+      .options(hudiOpts)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+    val deletedDf1 = calculateMergedDf(latestBatchDf, operation, true)
+    deletedDf1.cache()
+
+    // batch2. upsert. update few records to 2nd partition from partition1 and 
insert a few to partition2.
+    val instantTime2 = getInstantTime()
+
+    val latestBatch2_1 = 
recordsToStrings(dataGen1.generateUniqueUpdates(instantTime2, 3)).asScala
+    val latestBatchDf2_1 = 
spark.read.json(spark.sparkContext.parallelize(latestBatch2_1, 1))
+    val latestBatchDf2_2 = latestBatchDf2_1.withColumn("partition", 
lit(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH))
+      .withColumn("partition_path", 
lit(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH))
+    val latestBatch2_3 = 
recordsToStrings(dataGen2.generateInserts(instantTime2, 2)).asScala
+    val latestBatchDf2_3 = 
spark.read.json(spark.sparkContext.parallelize(latestBatch2_3, 1))
+    val latestBatchDf2Final = latestBatchDf2_3.union(latestBatchDf2_2)
+    latestBatchDf2Final.cache()
+    latestBatchDf2Final.write.format("org.apache.hudi")
+      .options(hudiOpts)
+      .mode(SaveMode.Append)
+      .save(basePath)
+    operation = UPSERT_OPERATION_OPT_VAL
+    val deletedDf2 = calculateMergedDf(latestBatchDf2Final, operation, true)
+    deletedDf2.cache()
+
+    val hudiOpts2 = commonOpts + (DataSourceWriteOptions.TABLE_TYPE.key -> 
tableType.name()) +
+      (HoodieMetadataConfig.RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP.key -> "1") 
+
+      (HoodieMetadataConfig.RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP.key -> "1") 
+
+      (HoodieIndexConfig.INDEX_TYPE.key -> "RECORD_INDEX") +
+      (HoodieIndexConfig.RECORD_INDEX_UPDATE_PARTITION_PATH_ENABLE.key -> 
"true") +
+      (HoodieMetadataConfig.RECORD_INDEX_ENABLE_PROP.key -> "true")
+
+    val instantTime3 = getInstantTime()
+    // batch3. updates to partition2
+    val latestBatch3 = 
recordsToStrings(dataGen2.generateUniqueUpdates(instantTime3, 2)).asScala
+    val latestBatchDf3 = 
spark.read.json(spark.sparkContext.parallelize(latestBatch3, 1))
+    latestBatchDf3.cache()
+    latestBatchDf.write.format("org.apache.hudi")
+      .options(hudiOpts2)
+      .mode(SaveMode.Append)
+      .save(basePath)
+    val deletedDf3 = calculateMergedDf(latestBatchDf, operation, true)
+    deletedDf3.cache()
+    validateDataAndRecordIndices(hudiOpts, deletedDf3)
+  }
+
+  private def getInstantTime(): String = {

Review Comment:
   will use InProcessTimeGenerator 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-7712] Fixing RLI initialization to account for file slices instead of just base files while initializing [hudi]

Reply via email to