[GitHub] [hudi] n3nash commented on a change in pull request #2263: [HUDI-1075] [WIP] Implement simple clustering strategies to create and run ClusteringPlan

GitBox Fri, 20 Nov 2020 22:24:56 -0800


n3nash commented on a change in pull request #2263:
URL: https://github.com/apache/hudi/pull/2263#discussion_r528080568




##########
File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/cluster/strategy/ScheduleClusteringStrategy.java
##########
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.action.cluster.strategy;
+
+import org.apache.hudi.avro.model.HoodieClusteringPlan;
+import org.apache.hudi.avro.model.HoodieSliceInfo;
+import org.apache.hudi.client.common.HoodieEngineContext;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.BaseFile;
+import org.apache.hudi.common.model.FileSlice;
+import org.apache.hudi.common.model.HoodieBaseFile;
+import org.apache.hudi.common.model.HoodieFileGroupId;
+import org.apache.hudi.common.model.HoodieLogFile;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.table.view.SyncableFileSystemView;
+import org.apache.hudi.common.util.ClusteringUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.table.HoodieTable;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Pluggable implementation for scheduling clustering and creating ClusteringPlan.
+ */
+public abstract class ScheduleClusteringStrategy<T extends 
HoodieRecordPayload,I,K,O> implements Serializable {
+  private static final Logger LOG = 
LogManager.getLogger(ScheduleClusteringStrategy.class);
+
+  public static final String TOTAL_IO_READ_MB = "TOTAL_IO_READ_MB";
+  public static final String TOTAL_IO_WRITE_MB = "TOTAL_IO_WRITE_MB";
+  public static final String TOTAL_IO_MB = "TOTAL_IO_MB";
+  public static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILES_SIZE";
+  public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";
+
+  public static final int CLUSTERING_PLAN_VERSION_1 = 1;
+
+  private final HoodieTable<T,I,K,O> hoodieTable;
+  private final HoodieEngineContext engineContext;
+  private final HoodieWriteConfig writeConfig;
+
+  public ScheduleClusteringStrategy(HoodieTable table, HoodieEngineContext 
engineContext, HoodieWriteConfig writeConfig) {
+    this.writeConfig = writeConfig;
+    this.hoodieTable = table;
+    this.engineContext = engineContext;
+  }
+
+  /**
+   * Generate metadata for grouping eligible files and create a plan. Note that data is not moved around
+   * as part of this step.
+   *
+   * If there is no data available to cluster, return None.
+   */
+  public abstract Option<HoodieClusteringPlan> generateClusteringPlan();
+
+  /**
+   * Return file slices eligible for clustering. FileIds in
+   * 1) pending clustering/compaction
+   * 2) Larger than clustering target file size
+   *
+   * are not eligible for clustering
+   */
+  protected List<FileSlice> getFileSlicesEligibleForClustering(String 
partition) {
+    Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering = 
((SyncableFileSystemView) 
getHoodieTable().getSliceView()).getPendingCompactionOperations()
+        .map(instantTimeOpPair -> 
instantTimeOpPair.getValue().getFileGroupId())
+        .collect(Collectors.toSet());
+    
fgIdsInPendingCompactionAndClustering.addAll(ClusteringUtils.getAllFileGroupsInPendingClusteringPlans(getHoodieTable().getMetaClient()).keySet());
+
+    return hoodieTable.getSliceView().getLatestFileSlices(partition)
+        // file ids already in clustering are not eligible
+        .filter(slice -> 
!fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()))
+        // files that have basefile size larger than clustering target file 
size are not eligible (Note that compaction can merge any updates)
+        .filter(slice -> 
slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < 
writeConfig.getClusteringTargetFileSize())
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Get parameters specific to strategy. These parameters are passed from 'schedule clustering' step to
+   * 'run clustering' step. 'run clustering' step is typically async. So these params help with passing any required
+   * context from schedule to run step.
+   */
+  protected abstract Map<String, String> getStrategyParams();
+
+  /**
+   * Returns any specific parameters to be stored as part of clustering metadata.
+   */
+  protected Map<String, String> getExtraMetadata() {
+    return Collections.emptyMap();
+  }
+
+  /**
+   * Version to support future changes for plan.
+   */
+  protected int getPlanVersion() {
+    return CLUSTERING_PLAN_VERSION_1;
+  }
+
+  /**
+   * Transform {@link FileSlice} to {@link HoodieSliceInfo}.
+   */
+  protected List<HoodieSliceInfo> getFileSliceInfo(List<FileSlice> slices) {
+    return slices.stream().map(slice -> new HoodieSliceInfo().newBuilder()
+        .setPartitionPath(slice.getPartitionPath())
+        .setFileId(slice.getFileId())
+        .setDataFilePath(slice.getBaseFile().map(BaseFile::getPath).orElse(""))

Review comment:
       Could we use StringUtils.EMPTY or create a constant instead of the "" literal?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] n3nash commented on a change in pull request #2263: [HUDI-1075] [WIP] Implement simple clustering strategies to create and run ClusteringPlan

Reply via email to