vinothchandar commented on a change in pull request #3330: URL: https://github.com/apache/hudi/pull/3330#discussion_r707370295
########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieOptimizeConfig.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Hoodie Configs for Data layout optimize. 
+ */ +public class HoodieOptimizeConfig extends HoodieConfig { + // Any Data layout optimize params can be saved with this prefix + public static final String DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.data.layout.optimize."; + public static final ConfigProperty<String> DATA_LAYOUT_STRATEGY = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("z-order") + .sinceVersion("0.10.0") + .withDocumentation("config to provide a way to optimize data layout for table, current only support z-order and hilbert"); + + public static final ConfigProperty<String> DATA_LAYOUT_BUILD_CURVE_STRATEGY = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.optimize.strategy") + .defaultValue("directly") + .sinceVersion("0.10.0") + .withDocumentation("Config to provide whether use directly/sample method to build curve optimize for data layout," + + "build curve_optimize by directly method is faster than by sample method, however sample method produce a better data layout." + + "now support two strategies: directly,sample"); + + public static final ConfigProperty<String> DATA_LAYOUT_CURVE_OPTIMIZE_SAMPLE_NUMBER = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.optimize.sample.number") + .defaultValue("200000") + .sinceVersion("0.10.0") + .withDocumentation("when set" + DATA_LAYOUT_BUILD_CURVE_STRATEGY.key() + " to sample method, sample number need to be set for it." + + " larger number means better layout result, but more memory consumer"); + + public static final ConfigProperty<String> DATA_LAYOUT_CURVE_OPTIMIZE_SORT_COLUMNS = ConfigProperty Review comment: can we reuse the sort columns from the clustering config? multiple configs for sort columns could be confusing? 
########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/clustering/run/strategy/SparkSortAndSizeExecutionStrategy.java ########## @@ -77,7 +79,11 @@ public SparkSortAndSizeExecutionStrategy(HoodieTable table, * Create BulkInsertPartitioner based on strategy params. */ protected Option<BulkInsertPartitioner<T>> getPartitioner(Map<String, String> strategyParams, Schema schema) { - if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) { + if (!getWriteConfig().getOptimizeSortColumns().isEmpty()) { Review comment: We probably need a nicer way to use optimize Sort curves within clustering. This can be a bit challenging to use. let me think more ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieOptimizeConfig.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Hoodie Configs for Data layout optimize. 
+ */ +public class HoodieOptimizeConfig extends HoodieConfig { Review comment: LayoutOptimize Config? To be explicit about what this is ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieOptimizeConfig.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Hoodie Configs for Data layout optimize. + */ +public class HoodieOptimizeConfig extends HoodieConfig { + // Any Data layout optimize params can be saved with this prefix + public static final String DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.data.layout.optimize."; Review comment: just hoodie.layout.optimize? ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/optimize/UnsafeAccess.java ########## @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.optimize; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import sun.misc.Unsafe; + +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.security.AccessController; +import java.security.PrivilegedAction; + +public class UnsafeAccess { Review comment: this item is still open. why would we need something like this? is it faster? could you please explain ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieOptimizeConfig.java ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Hoodie Configs for Data layout optimize. + */ +public class HoodieOptimizeConfig extends HoodieConfig { + // Any Data layout optimize params can be saved with this prefix + public static final String DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.data.layout.optimize."; + public static final ConfigProperty DATA_LAYOUT_STRATEGY = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("z-order") + .sinceVersion("0.10.0") + .withDocumentation("config to provide a way to optimize data layout for table, current only support z-order and hilbert"); + + public static final ConfigProperty DATA_LAYOUT_BUILD_CURVE_METHOD = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.optimize.method") + .defaultValue("directly") Review comment: just call it direct, instead of directly? ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieOptimizeConfig.java ########## @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.config; + +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Hoodie Configs for Data layout optimize. + */ +public class HoodieOptimizeConfig extends HoodieConfig { + // Any Data layout optimize params can be saved with this prefix + public static final String DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.data.layout.optimize."; + public static final ConfigProperty<String> DATA_LAYOUT_STRATEGY = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy") + .defaultValue("z-order") + .sinceVersion("0.10.0") + .withDocumentation("config to provide a way to optimize data layout for table, current only support z-order and hilbert"); + + public static final ConfigProperty<String> DATA_LAYOUT_BUILD_CURVE_STRATEGY = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.optimize.strategy") + .defaultValue("directly") + .sinceVersion("0.10.0") + .withDocumentation("Config to provide whether use directly/sample method to build curve optimize for data layout," + + "build curve_optimize by directly method is faster than by sample method, however sample method produce a better data layout." 
+ + "now support two strategies: directly,sample"); + + public static final ConfigProperty<String> DATA_LAYOUT_CURVE_OPTIMIZE_SAMPLE_NUMBER = ConfigProperty + .key(DATA_LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.optimize.sample.number") + .defaultValue("200000") + .sinceVersion("0.10.0") + .withDocumentation("when set" + DATA_LAYOUT_BUILD_CURVE_STRATEGY.key() + " to sample method, sample number need to be set for it." Review comment: Probably need a better explanation for this ########## File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/execution/bulkinsert/RDDSpatialCurveOptimizationSortPartitioner.java ########## @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.execution.bulkinsert; + +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.HoodieSparkUtils; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.BulkInsertPartitioner; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Zoptimize$; + +/** + * A partitioner that does spartial curve optimization sorting based on specified column values for each RDD partition. + * support z-curve optimization, hilbert will come soon. 
+ * @param <T> HoodieRecordPayload type + */ +public class RDDSpatialCurveOptimizationSortPartitioner<T extends HoodieRecordPayload> + implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> { + private final HoodieSparkEngineContext sparkEngineContext; + private final SerializableSchema serializableSchema; + private final HoodieWriteConfig config; + + public RDDSpatialCurveOptimizationSortPartitioner(HoodieSparkEngineContext sparkEngineContext, HoodieWriteConfig config, Schema schema) { + this.sparkEngineContext = sparkEngineContext; + this.config = config; + this.serializableSchema = new SerializableSchema(schema); + } + + @Override + public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) { + String payloadClass = config.getPayloadClass(); + // do sort + JavaRDD<GenericRecord> preparedRecord = prepareGenericRecord(records, outputSparkPartitions, serializableSchema.get()); + return preparedRecord.map(record -> { + String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); + HoodieKey hoodieKey = new HoodieKey(key, partition); + HoodieRecordPayload avroPayload = ReflectionUtils.loadPayload(payloadClass, + new Object[] {Option.of(record)}, Option.class); + HoodieRecord hoodieRecord = new HoodieRecord(hoodieKey, avroPayload); + return hoodieRecord; + }); + } + + private JavaRDD<GenericRecord> prepareGenericRecord(JavaRDD<HoodieRecord<T>> inputRecords, final int numOutputGroups, final Schema schema) { + SerializableSchema serializableSchema = new SerializableSchema(schema); + JavaRDD<GenericRecord> genericRecordJavaRDD = inputRecords.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get()); + Dataset<Row> originDF = AvroConversionUtils.createDataFrame(genericRecordJavaRDD.rdd(), schema.toString(), sparkEngineContext.getSqlContext().sparkSession()); + Dataset<Row> zDataFrame; + + 
if (config.getOptimizeBuildCurveOptimizeMethod().equals("sample")) { Review comment: We need an Enum to hold all the different Strategies - direct, sample etc. ########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java ########## @@ -241,6 +241,27 @@ private synchronized FileSystemViewManager getViewManager() { */ public abstract HoodieWriteMetadata<O> insertOverwriteTable(HoodieEngineContext context, String instantTime, I records); + /** + * Replaces all the existing records of the Hoodie table and optimize data layout, + * for the partition paths contained in input records. + * + * @param context HoodieEngineContext + * @param instantTime Instant time for the replace action + * @param records input records + * @return HoodieWriteMetadata + */ + public abstract HoodieWriteMetadata<O> optimize(HoodieEngineContext context, String instantTime, I records); + + /** + * update statistics info for current table. + * now only support OPTIMIZE operation, to do support other operation type. + * + * @param context HoodieEngineContext + * @param instantTime Instant time for the replace action + * @param isOptimizeOperation whether current operation is OPTIMIZE type + */ + public abstract void updateStatistics(HoodieEngineContext context, List<HoodieWriteStat> stats, String instantTime, Boolean isOptimizeOperation); Review comment: We can take this up as a follow up later -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
