nsivabalan commented on code in PR #5342:
URL: https://github.com/apache/hudi/pull/5342#discussion_r852164219
##########
hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java:
##########
@@ -132,29 +139,84 @@ public void testIndexerWithNotAllIndexesEnabled() {
assertNoWriteErrors(statuses);
// validate table config
-
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
// build indexer config which has only column_stats enabled (files is
enabled by default)
HoodieIndexer.Config config = new HoodieIndexer.Config();
String propsPath =
Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
config.basePath = basePath;
config.tableName = tableName;
config.indexTypes = "COLUMN_STATS";
- config.runningMode = "scheduleAndExecute";
+ config.runningMode = SCHEDULE_AND_EXECUTE;
config.propsFilePath = propsPath;
// start the indexer and validate column_stats index is also complete
HoodieIndexer indexer = new HoodieIndexer(jsc, config);
assertEquals(0, indexer.start(0));
// validate table config
-
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
-
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
-
assertTrue(HoodieTableMetadataUtil.getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(COLUMN_STATS.getPartitionPath()));
// validate metadata partitions actually exist
- assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath,
context, FILES));
- assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath,
context, COLUMN_STATS));
- assertTrue(HoodieTableMetadataUtil.metadataPartitionExists(basePath,
context, BLOOM_FILTERS));
+ assertTrue(metadataPartitionExists(basePath, context, FILES));
+ assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+ assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+ }
+
+ @Test
+ public void testIndexerDropPartitionDeletesInstantFromTimeline() throws
Exception {
+ initTestDataGenerator();
+ String tableName = "indexer_test";
+ HoodieWriteConfig.Builder writeConfigBuilder =
getWriteConfigBuilder(basePath, tableName);
+ // enable files on the regular write client
+ HoodieMetadataConfig.Builder metadataConfigBuilder =
getMetadataConfigBuilder(true, false).withMetadataIndexBloomFilter(true);
+ HoodieWriteConfig writeConfig =
writeConfigBuilder.withMetadataConfig(metadataConfigBuilder.build()).build();
+ // do one upsert with synchronous metadata update
+ SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context,
writeConfig);
+ String instant = "0001";
+ writeClient.startCommitWithTime(instant);
+ List<HoodieRecord> records = dataGen.generateInserts(instant, 100);
+ JavaRDD<WriteStatus> result = writeClient.upsert(jsc.parallelize(records,
1), instant);
+ List<WriteStatus> statuses = result.collect();
+ assertNoWriteErrors(statuses);
+
+ // validate partitions built successfully
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
+ assertTrue(metadataPartitionExists(basePath, context, FILES));
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(BLOOM_FILTERS.getPartitionPath()));
+ assertTrue(metadataPartitionExists(basePath, context, BLOOM_FILTERS));
+
+ // build indexer config which has only column_stats enabled (files is
enabled by default)
+ HoodieIndexer.Config config = new HoodieIndexer.Config();
+ String propsPath =
Objects.requireNonNull(getClass().getClassLoader().getResource("delta-streamer-config/indexer.properties")).getPath();
+ config.basePath = basePath;
+ config.tableName = tableName;
+ config.indexTypes = "COLUMN_STATS";
+ config.runningMode = SCHEDULE;
+ config.propsFilePath = propsPath;
+
+ // schedule indexing and validate column_stats index is also initialized
+ HoodieIndexer indexer = new HoodieIndexer(jsc, config);
+ assertEquals(0, indexer.start(0));
+ Option<HoodieInstant> indexInstantInTimeline =
metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+ assertTrue(indexInstantInTimeline.isPresent());
+ assertEquals(REQUESTED, indexInstantInTimeline.get().getState());
+ assertTrue(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+ // drop column_stats and validate indexing.requested is also removed from
the timeline
+ config.runningMode = DROP_INDEX;
+ indexer = new HoodieIndexer(jsc, config);
+ assertEquals(0, indexer.start(0));
+ indexInstantInTimeline =
metaClient.reloadActiveTimeline().filterPendingIndexTimeline().lastInstant();
+ assertFalse(indexInstantInTimeline.isPresent());
+ assertFalse(metadataPartitionExists(basePath, context, COLUMN_STATS));
+
+ // check other partitions are intact
+
assertTrue(getCompletedMetadataPartitions(reload(metaClient).getTableConfig()).contains(FILES.getPartitionPath()));
Review Comment:
If we have any other tests for DropPartitions, can we add an assertion that,
for a fully built-out MDT partition, the timeline files are not removed? I.e.,
if the index instants are completed, dropPartition should not touch the
timeline files.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]