[GitHub] [hudi] nsivabalan commented on a diff in pull request #5342: [HUDI-3899] Drop index to delete pending index instants from timeline

GitBox Mon, 18 Apr 2022 07:41:22 -0700


nsivabalan commented on code in PR #5342:
URL: https://github.com/apache/hudi/pull/5342#discussion_r852164219



##########
hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java:
##########
@@ -132,29 +139,84 @@ public void testIndexerWithNotAllIndexesEnabled() {
     assertNoWriteErrors(statuses);
 
     // validate table config
-    
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-    
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
 
     // build indexer config which has only column_stats enabled (files is 
enabled by default)
     HoodieIndexer.Config config = new HoodieIndexer.Config();
     String propsPath = 
Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
     config.basePath = basePath;
     config.tableName = tableName;
     config.indexTypes = "COLUMN_STATS";
-    config.runningMode = "scheduleAndExecute";
+    config.runningMode = SCHEDULE_AND_EXECUTE;
     config.propsFilePath = propsPath;
     // start the indexer and validate column_stats index is also complete
     HoodieIndexer indexer = new HoodieIndexer(jsc, config);
     assertEquals(0, indexer.start(0));
 
     // validate table config
-    
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-    
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
-    
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
     // validate metadata partitions actually exist
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, 
context, FILES));
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, 
context, COLUMN_STATS));
-    assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath, 
context, BLOOM_FILTERS));
+    assertTrue(metadataPartitionExists(basePath, context, FILES));
+    assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+    assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+  }
+
+  @Test
+  public void testIndexerDropPartitionDeletesInstantFromTimeline() throws 
Exception {
+    initTestDataGenerator();
+    String tableName = "indexer_test";
+    HoodieWriteConfig.Builder writeConfigBuilder = 
getWriteConfigBuilder(basePath, tableName);
+    // enable files on the regular write client
+    HoodieMetadataConfig.Builder metadataConfigBuilder = 
getMetadataConfigBuilder(true, false).withMetadataIndexBloomFilter(true);
+    HoodieWriteConfig writeConfig = 
writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build();
+    // do one upsert with synchronous metadata update
+    SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, 
writeConfig);
+    String instant = "0001";
+    writeClient.startCommitWithTime(instant);
+    List<HoodieRecord> records = dataGen.generateInserts(instant, 100);
+    JavaRDD<WriteStatus> result = writeClient.upsert(jsc.parallelize(records, 
1), instant);
+    List<WriteStatus> statuses = result.collect();
+    assertNoWriteErrors(statuses);
+
+    // validate partitions built successfully
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+    assertTrue(metadataPartitionExists(basePath, context, FILES));
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+    assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+
+    // build indexer config which has only column_stats enabled (files is 
enabled by default)
+    HoodieIndexer.Config config = new HoodieIndexer.Config();
+    String propsPath = 
Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
+    config.basePath = basePath;
+    config.tableName = tableName;
+    config.indexTypes = "COLUMN_STATS";
+    config.runningMode = SCHEDULE;
+    config.propsFilePath = propsPath;
+
+    // schedule indexing and validate column_stats index is also initialized
+    HoodieIndexer indexer = new HoodieIndexer(jsc, config);
+    assertEquals(0, indexer.start(0));
+    Option<HoodieInstant> indexInstantInTimeline = 
metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+    assertTrue(indexInstantInTimeline.isPresent());
+    assertEquals(REQUESTED, indexInstantInTimeline.get().getState());
+    assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+    // drop column_stats and validate indexing.requested is also removed from 
the timeline
+    config.runningMode = DROP_INDEX;
+    indexer = new HoodieIndexer(jsc, config);
+    assertEquals(0, indexer.start(0));
+    indexInstantInTimeline = 
metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+    assertFalse(indexInstantInTimeline.isPresent());
+    assertFalse(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+    // check other partitions are intact
+    
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));

Review Comment:
   If we have any other tests for DropPartitions, can we add an assertion that, 
for a fully built-out MDT partition, the timeline files are not removed? I.e., 
if the index instants are completed, dropPartition should not touch the 
timeline files. 
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] nsivabalan commented on a diff in pull request #5342: [HUDI-3899] Drop index to delete pending index instants from timeline

Reply via email to