nsivabalan commented on a change in pull request #4459:
URL: https://github.com/apache/hudi/pull/4459#discussion_r819169413
##########
File path: hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieDropPartitionsTool.java
##########
@@ -0,0 +1,477 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities;
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hudi.DataSourceUtils;
+import org.apache.hudi.DataSourceWriteOptions;
+import org.apache.hudi.client.HoodieWriteResult;
+import org.apache.hudi.client.SparkRDDWriteClient;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
+import org.apache.hudi.common.model.HoodiePartitionMetadata;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.config.HoodieCompactionConfig;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.hive.HiveSyncTool;
+import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+import org.apache.hudi.table.HoodieSparkTable;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * A tool, run with spark-submit, to drop Hudi table partitions.
+ * <p>
+ * You can dry run this tool with the following command:
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode dry_run \
+ * --partitions partition1,partition2
+ * ```
+ *
+ * <p>
+ * You can specify the running mode of the tool through `--mode`.
+ * There are three modes of the {@link HoodieDropPartitionsTool}:
+ * - DELETE_PARTITIONS_LAZY ("delete_partitions_lazy"): masks/tombstones the given partitions and their corresponding data files, and lets the cleaner delete those files later.
+ *   You can also set --sync-hive-meta to sync the dropped partitions to Hive.
+ * <p>
+ * Example command:
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode delete_partitions_lazy \
+ * --partitions partition1,partition2
+ * ```
+ *
+ * <p>
+ * - DELETE_PARTITIONS_EAGER ("delete_partitions_eager"): masks/tombstones the given partitions and their corresponding data files, and also requests a clean action eagerly.
+ *   You can also set --sync-hive-meta to sync the dropped partitions to Hive.
+ * <p>
+ * Example command:
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode delete_partitions_eager \
+ * --partitions partition1,partition2
+ * ```
+ *
+ * <p>
+ * - DRY_RUN ("dry_run"): looks up and prints the table partitions and corresponding data files that would be deleted.
+ * <p>
+ * Example command:
+ * ```
+ * spark-submit \
+ * --class org.apache.hudi.utilities.HoodieDropPartitionsTool \
+ * --packages org.apache.spark:spark-avro_2.11:2.4.4 \
+ * --master local[*] \
+ * --driver-memory 1g \
+ * --executor-memory 1g \
+ * $HUDI_DIR/hudi/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.11.0-SNAPSHOT.jar \
+ * --base-path basePath \
+ * --table-name tableName \
+ * --mode dry_run \
+ * --partitions partition1,partition2
+ * ```
+ *
+ * You can also use --help to see more available configs.
+ */
+public class HoodieDropPartitionsTool implements Serializable {
+
+  private static final Logger LOG = LogManager.getLogger(HoodieDropPartitionsTool.class);
+  // Spark context
+  private final transient JavaSparkContext jsc;
+  // config
+  private final Config cfg;
+  // Properties with source, hoodie client, key generator etc.
+  private TypedProperties props;
+
+  private final HoodieTableMetaClient metaClient;
+
+  public HoodieDropPartitionsTool(JavaSparkContext jsc, Config cfg) {
+    this.jsc = jsc;
+    this.cfg = cfg;
+
+    this.props = cfg.propsFilePath == null
+        ? UtilHelpers.buildProperties(cfg.configs)
+        : readConfigFromFileSystem(jsc, cfg);
+    this.metaClient = HoodieTableMetaClient.builder()
+        .setConf(jsc.hadoopConfiguration()).setBasePath(cfg.basePath)
+        .setLoadActiveTimelineOnLoad(true)
+        .build();
+  }
+
+  /**
+   * Reads config from the file system.
+   *
+   * @param jsc {@link JavaSparkContext} instance.
+   * @param cfg {@link Config} instance.
+   * @return the {@link TypedProperties} instance.
+   */
+  private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) {
+    return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs)
+        .getProps(true);
+  }
+
+  public enum Mode {
+    // Mask/tombstone these partitions and their corresponding data files, and let the cleaner delete those files later.
+    DELETE_PARTITIONS_LAZY,

Review comment:
   I guess we can't support eager deletions; we always have to go via the cleaner, since we need to sync with the metadata table. Can we simplify this? Optionally, we can have dry-run as an argument in spark-submit; otherwise, this tool will have to call into the write client's delete-partitions API.
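For illustration, the write-client route suggested in this comment could look roughly like the sketch below. It reuses classes already imported in the file under review (`SparkRDDWriteClient`, `HoodieWriteResult`, `HoodieTimeline`, `Option`); the `DropPartitionsSketch` class name is made up here, and the exact `deletePartitions`/`commit` signatures are assumptions based on the 0.11-era Hudi write client, so treat this as a sketch rather than the tool's final implementation:

```java
// Hedged sketch of the suggestion above: always route partition drops through
// the write client, so physical file deletion is deferred to the cleaner and
// stays in sync with the metadata table. Signatures are assumptions based on
// the 0.11-era SparkRDDWriteClient API.
import java.util.List;

import org.apache.hudi.client.HoodieWriteResult;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;

public class DropPartitionsSketch {

  // Tombstones the given partitions with a replacecommit; the cleaner later
  // removes the underlying files and keeps the metadata table consistent.
  static void dropPartitions(JavaSparkContext jsc, HoodieWriteConfig config, List<String> partitions) {
    try (SparkRDDWriteClient<HoodieRecordPayload> client =
             new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), config)) {
      String instantTime = HoodieActiveTimeline.createNewInstantTime();
      client.startCommitWithTime(instantTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
      HoodieWriteResult result = client.deletePartitions(partitions, instantTime);
      // Committing the replacecommit makes the dropped file groups invisible to readers.
      client.commit(instantTime, result.getWriteStatuses(), Option.empty(),
          HoodieTimeline.REPLACE_COMMIT_ACTION, result.getPartitionToReplaceFileIds());
    }
  }
}
```

Routing the drop through a replacecommit like this keeps readers consistent immediately, while physical deletion is left to the cleaner, which also handles the metadata-table sync mentioned above; a dry-run argument would then simply skip this call and only print the affected partitions and files.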
