satishkotha commented on a change in pull request #3833: URL: https://github.com/apache/hudi/pull/3833#discussion_r738038132
########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSingleFileSortPlanStrategy.java ########## @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.clustering.plan.strategy; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieClusteringConfig; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieSparkCopyOnWriteTable; +import org.apache.hudi.table.HoodieSparkMergeOnReadTable; + +import java.util.Properties; + +/** + * In this strategy, clustering group for each partition is built in the same way as {@link SparkSizeBasedClusteringPlanStrategy}. + * The difference is that {@link HoodieClusteringConfig#PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP} is set to a small value so that + * each file is considered as separate clustering group. This ensures that each clustering group has just one file group. 
+ * Since file could be huge as data will be sorted and clustered in the single file group, so + * {@link HoodieStorageConfig#PARQUET_MAX_FILE_SIZE} is set to a large value. + */ +public class SparkSingleFileSortPlanStrategy<T extends HoodieRecordPayload<T>> + extends SparkSizeBasedClusteringPlanStrategy<T> { + + public SparkSingleFileSortPlanStrategy(HoodieSparkCopyOnWriteTable<T> table, HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + public SparkSingleFileSortPlanStrategy(HoodieSparkMergeOnReadTable<T> table, HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + protected HoodieWriteConfig getWriteConfig() { + Properties props = super.getWriteConfig().getProps(); + props.put(HoodieClusteringConfig.PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP.key(), String.valueOf(0)); Review comment: instead of playing with these props, I think it is better to override buildClusteringGroupsForPartition method. We can basically map every FileSlice to ClusteringGroup. Makes better separation of code and easy to read code. ########## File path: hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieClientOnCopyOnWriteStorage.java ########## @@ -1370,6 +1372,19 @@ public void testClusteringWithSortColumns(boolean populateMetaFields, boolean pr testInsertAndClustering(clusteringConfig, populateMetaFields, true, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); } + @ParameterizedTest + @MethodSource("populateMetaFieldsAndPreserveMetadataParams") + public void testClusteringWithSortOneFilePerGroup(boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception { + // setup clustering config. 
+ HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10) + .withClusteringSortColumns("begin_lat,begin_lon") + .withClusteringPlanStrategyClass(SparkSingleFileSortPlanStrategy.class.getName()) + .withClusteringExecutionStrategyClass(SparkSingleFileSortExecutionStrategy.class.getName()) + .withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1) + .withPreserveHoodieCommitMetadata(preserveCommitMetadata).build(); + testInsertAndClustering(clusteringConfig, populateMetaFields, true, SqlQueryEqualityPreCommitValidator.class.getName(), COUNT_SQL_QUERY_FOR_VALIDATION, ""); Review comment: Can you also verify no new fileId is created after clustering? Let me know if this is already covered ########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/plan/strategy/SparkSizeBasedClusteringPlanStrategy.java ########## @@ -71,24 +71,30 @@ public SparkSizeBasedClusteringPlanStrategy(HoodieSparkMergeOnReadTable<T> table List<Pair<List<FileSlice>, Integer>> fileSliceGroups = new ArrayList<>(); List<FileSlice> currentGroup = new ArrayList<>(); long totalSizeSoFar = 0; + HoodieWriteConfig writeConfig = getWriteConfig(); for (FileSlice currentSlice : fileSlices) { // assume each filegroup size is ~= parquet.max.file.size - totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? currentSlice.getBaseFile().get().getFileSize() : getWriteConfig().getParquetMaxFileSize(); + totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize(); // check if max size is reached and create new group, if needed. 
- if (totalSizeSoFar >= getWriteConfig().getClusteringMaxBytesInGroup() && !currentGroup.isEmpty()) { - int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, getWriteConfig().getClusteringTargetFileMaxBytes()); + if (totalSizeSoFar >= writeConfig.getClusteringMaxBytesInGroup() && !currentGroup.isEmpty()) { + int numOutputGroups = getNumberOfOutputFileGroups(totalSizeSoFar, writeConfig.getClusteringTargetFileMaxBytes()); LOG.info("Adding one clustering group " + totalSizeSoFar + " max bytes: " - + getWriteConfig().getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); + + writeConfig.getClusteringMaxBytesInGroup() + " num input slices: " + currentGroup.size() + " output groups: " + numOutputGroups); fileSliceGroups.add(Pair.of(currentGroup, numOutputGroups)); currentGroup = new ArrayList<>(); totalSizeSoFar = 0; } currentGroup.add(currentSlice); + // totalSizeSoFar could be 0 when new group was created in the previous conditional block. + // reset to the size of current slice, otherwise the number of output file group will become 0 even though current slice is present. + if (totalSizeSoFar == 0) { Review comment: I'm not entirely following this. For normal size based strategy, this will not be correct IIUC. totalSizeSoFar is set to 0 only when a group quota is complete in conditional block. So for new group, we want to start from 0. Here you are changing it to the last file size. ########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSingleFileSortExecutionStrategy.java ########## @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.client.clustering.run.strategy; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieFileGroupId; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieStorageConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieClusteringException; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.action.commit.SparkBulkInsertHelper; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaRDD; + +import java.util.List; +import java.util.Map; +import java.util.Properties; + +/** + * This strategy is similar to {@link SparkSortAndSizeExecutionStrategy} with the difference being that + * there should be only one large file group per clustering group. 
+ */ +public class SparkSingleFileSortExecutionStrategy<T extends HoodieRecordPayload<T>> + extends MultipleSparkJobExecutionStrategy<T> { + + private static final Logger LOG = LogManager.getLogger(SparkSingleFileSortExecutionStrategy.class); + + public SparkSingleFileSortExecutionStrategy(HoodieTable table, + HoodieEngineContext engineContext, + HoodieWriteConfig writeConfig) { + super(table, engineContext, writeConfig); + } + + @Override + public JavaRDD<WriteStatus> performClusteringWithRecordsRDD(JavaRDD<HoodieRecord<T>> inputRecords, + int numOutputGroups, + String instantTime, + Map<String, String> strategyParams, + Schema schema, + List<HoodieFileGroupId> fileGroupIdList, + boolean preserveHoodieMetadata) { + if (numOutputGroups != 1 && fileGroupIdList.size() != 1) { + throw new HoodieClusteringException("Expect only one file group for strategy: " + getClass().getName()); + } + LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime); + Properties props = getWriteConfig().getProps(); + props.put(HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key(), String.valueOf(numOutputGroups)); + // We are calling another action executor - disable auto commit. Strategy is only expected to write data in new files. + props.put(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key(), Boolean.FALSE.toString()); + // Since clustering will write to single file group using HoodieUnboundedCreateHandle, set max file size to a large value. + props.put(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), String.valueOf(Long.MAX_VALUE)); + HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder().withProps(props).build(); + return (JavaRDD<WriteStatus>) SparkBulkInsertHelper.newInstance().bulkInsert(inputRecords, instantTime, getHoodieTable(), newConfig, Review comment: Have you tested this? I don't see fileId being used from fileGroupIdList argument. So wouldn't bulkInsert create new fileId? 
########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/BulkInsertMapFunction.java ########## @@ -60,8 +63,19 @@ public BulkInsertMapFunction(String instantTime, boolean areRecordsSorted, @Override public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> recordItr) { + // Use SingleFileHandleCreateFactory when clustering plan is SparkSingleFileSortPlanStrategy, + // and execution strategy is SparkSingleFileSortExecutionStrategy. + CreateHandleFactory writeHandleFactory; + if (config.getClusteringPlanStrategyClass().equals(SparkSingleFileSortPlanStrategy.class.getName()) + && config.getClusteringExecutionStrategyClass().equals(SparkSingleFileSortExecutionStrategy.class.getName())) { Review comment: this doesn't seem to be the correct abstraction to have clustering references in BulkInsertMapFunction. Maybe pass WriteHandleFactory to BulkInsertMapFunction. Default value is CreateHandleFactory. But SparkSingleFileSortExecutionStrategy can pass SingleFileHandleCreateFactory. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
