nsivabalan commented on code in PR #13229:
URL: https://github.com/apache/hudi/pull/13229#discussion_r2078585540
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java:
##########
@@ -514,23 +555,72 @@ public boolean purgePendingClustering(String
clusteringInstant) {
return false;
}
+ protected abstract HoodieWriteMetadata<O>
convertToOutputMetadata(HoodieWriteMetadata<T> writeMetadata);
+
+ private Map<String, List<String>>
getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan,
HoodieWriteMetadata<?> writeMetadata) {
+ Set<HoodieFileGroupId> newFilesWritten =
writeMetadata.getWriteStats().get().stream()
+ .map(s -> new HoodieFileGroupId(s.getPartitionPath(),
s.getFileId())).collect(Collectors.toSet());
+
+ return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
+ .filter(fg ->
"org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy"
+ .equals(config.getClusteringExecutionStrategyClass())
+ || !newFilesWritten.contains(fg))
+ .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath,
Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
+ }
+
/**
- * Delete expired partition by config.
- *
- * @param instantTime Instant Time for the action
- * @return HoodieWriteMetadata
+ * Check if any validators are configured and run those validations. If any
of the validations fail, throws HoodieValidationException.
*/
- public HoodieWriteMetadata<T> managePartitionTTL(String instantTime) {
- HoodieTable<?, I, ?, T> table = createTable(config,
context.getStorageConf());
- return table.managePartitionTTL(context, instantTime);
+ protected void runPrecommitValidationForClustering(HoodieWriteMetadata<O>
writeMetadata, HoodieTable table, String instantTime) {
+ if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
+ return;
+ }
+ throw new HoodieIOException("Precommit validation not implemented for all
engines yet");
}
- protected abstract HoodieWriteMetadata<O>
convertToOutputMetadata(HoodieWriteMetadata<T> writeMetadata);
+ private void commitClustering(HoodieWriteMetadata<O> clusteringWriteMetadata,
+ HoodieTable table,
+ String clusteringCommitTime) {
+ // triggering the dag for the first time for clustering
+ List<HoodieWriteStat> writeStats =
triggerWritesAndFetchWriteStats(clusteringWriteMetadata);
+ clusteringWriteMetadata.setWriteStats(writeStats);
+
+ HoodieClusteringPlan clusteringPlan =
ClusteringUtils.getPendingClusteringPlan(table.getMetaClient(),
clusteringCommitTime);
+
clusteringWriteMetadata.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(clusteringPlan,
clusteringWriteMetadata));
+ Option<Schema> schema;
+ try {
+ schema = new
TableSchemaResolver(table.getMetaClient()).getTableAvroSchemaIfPresent(false);
+ } catch (Exception ex) {
+ throw new HoodieSchemaException(ex);
+ }
+ // Create HoodieCommitMetadata w/ all required info except
HoodieWriteStats which will be populated later when dag is triggered.
+ HoodieCommitMetadata commitMetadata =
CommitUtils.buildMetadata(Collections.emptyList(),
clusteringWriteMetadata.getPartitionToReplaceFileIds(),
+ Option.empty(), WriteOperationType.CLUSTER, schema.get().toString(),
HoodieTimeline.CLUSTERING_ACTION);
+ clusteringWriteMetadata.setCommitMetadata(Option.of(commitMetadata));
+ HoodieReplaceCommitMetadata replaceCommitMetadata =
(HoodieReplaceCommitMetadata) clusteringWriteMetadata.getCommitMetadata().get();
+ for (HoodieWriteStat writeStat: writeStats) {
+ replaceCommitMetadata.addWriteStat(writeStat.getPartitionPath(),
writeStat);
+ }
+
clusteringWriteMetadata.setCommitMetadata(Option.of(replaceCommitMetadata));
+ runPrecommitValidationForClustering(clusteringWriteMetadata, table,
clusteringCommitTime);
Review Comment:
Looks like for compaction, we are not running this pre-commit validation;
it's happening only for clustering, and I don't want to fix that as part of this
patch.
I am open to taking it as a follow-up patch if you insist on unifying them.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]