[GitHub] [incubator-pinot] snleee commented on a change in pull request #6975: Merge rollup executor enhancement

GitBox Fri, 28 May 2021 14:02:18 -0700


snleee commented on a change in pull request #6975:
URL: https://github.com/apache/incubator-pinot/pull/6975#discussion_r641760045




##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
  * TODO:
  *   1. Add the support for roll-up
  *   2. Add the support for time split to provide backfill support for merged 
segments
- *   3. Change the way to decide the number of output segments (explicit 
numPartition config -> maxNumRowsPerSegment)
+ *   3. Add merge/rollup name prefixes for generated segments
+ *   4. Add the support for realtime table
  */
 public class MergeRollupTaskExecutor extends 
BaseMultipleSegmentsConversionExecutor {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+  private static final String INPUT_SEGMENTS_DIR = "input_segments";
+  private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
 
   @Override
   protected List<SegmentConversionResult> convert(PinotTaskConfig 
pinotTaskConfig, List<File> originalIndexDirs,
       File workingDir)
       throws Exception {
+    String taskType = pinotTaskConfig.getTaskType();
     Map<String, String> configs = pinotTaskConfig.getConfigs();
+    LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+    long startMillis = System.currentTimeMillis();
+
     String mergeTypeString = 
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
     // TODO: add the support for rollup
     Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
 
-    MergeType mergeType = MergeType.fromString(mergeTypeString);
-    Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only 
'CONCATENATE' mode is currently supported.");
-
-    String mergedSegmentName = 
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
     String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
     TableConfig tableConfig = getTableConfig(tableNameWithType);
+    Schema schema = getSchema(tableNameWithType);
+    Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+    Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+        MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+    String numRecordsPerSegment = 
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+    SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+        new 
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+    // Partition config from tableConfig
+    if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+      Map<String, ColumnPartitionConfig> columnPartitionMap =
+          
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+      PartitionerConfig partitionerConfig = 
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+      
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+    }
 
-    MergeRollupSegmentConverter rollupSegmentConverter =
-        new 
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-            
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
-            .setTableConfig(tableConfig).build();
+    // Aggregations using configured Collector
+    List<String> sortedColumns = 
tableConfig.getIndexingConfig().getSortedColumn();
+    CollectorConfig collectorConfig =
+        getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns, 
sortedColumns);
+    Preconditions.checkState(collectorConfig.getCollectorType() == 
CollectorFactory.CollectorType.CONCAT,
+        "Only 'CONCAT' mode is currently supported.");
+    segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
 
-    List<File> resultFiles = rollupSegmentConverter.convert();
+    // Segment config
+    if (numRecordsPerSegment != null) {
+      SegmentConfig segmentConfig = getSegmentConfig(numRecordsPerSegment);
+      segmentProcessorConfigBuilder.setSegmentConfig(segmentConfig);
+    }
+
+    SegmentProcessorConfig segmentProcessorConfig = 
segmentProcessorConfigBuilder.build();
+
+    File inputSegmentsDir = new File(workingDir, INPUT_SEGMENTS_DIR);
+    Preconditions.checkState(inputSegmentsDir.mkdirs(), "Failed to create 
input directory: %s for task: %s",
+        inputSegmentsDir.getAbsolutePath(), taskType);
+    for (File indexDir : originalIndexDirs) {
+      FileUtils.copyDirectoryToDirectory(indexDir, inputSegmentsDir);
+    }
+    File outputSegmentsDir = new File(workingDir, OUTPUT_SEGMENTS_DIR);
+    Preconditions.checkState(outputSegmentsDir.mkdirs(), "Failed to create 
output directory: %s for task: %s",
+        outputSegmentsDir.getAbsolutePath(), taskType);
+
+    SegmentProcessorFramework segmentProcessorFramework =
+        new SegmentProcessorFramework(inputSegmentsDir, 
segmentProcessorConfig, outputSegmentsDir);
+    try {
+      segmentProcessorFramework.processSegments();
+    } finally {
+      segmentProcessorFramework.cleanup();
+    }
+
+    long endMillis = System.currentTimeMillis();
+    LOGGER.info("Finished task: {} with configs: {}. Total time: {}ms", 
taskType, configs, (endMillis - startMillis));
     List<SegmentConversionResult> results = new ArrayList<>();
-    for (File file : resultFiles) {
+    for (File file : outputSegmentsDir.listFiles()) {
       String outputSegmentName = file.getName();
       results.add(new 
SegmentConversionResult.Builder().setFile(file).setSegmentName(outputSegmentName)
           .setTableNameWithType(tableNameWithType).build());
     }
     return results;
   }
+
+  @Override
+  protected SegmentZKMetadataCustomMapModifier 
getSegmentZKMetadataCustomMapModifier(PinotTaskConfig pinotTaskConfig) {
+    return new 
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.UPDATE,
 Collections
+        .singletonMap(MinionConstants.MergeRollupTask.TASK_TYPE + 
MinionConstants.TASK_BUCKET_GRANULARITY_SUFFIX,
+            
pinotTaskConfig.getConfigs().get(MinionConstants.MergeRollupTask.GRANULARITY_KEY)));

Review comment:
       Do we have a guarantee that granularity values are always upper case 
(e.g. `DAILY`, `HOURLY`)? 
   
   I think that the following inputs should all work: `hourly`, `Hourly`, 
`HOURLY`, etc.

##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
  * TODO:
  *   1. Add the support for roll-up
  *   2. Add the support for time split to provide backfill support for merged 
segments
- *   3. Change the way to decide the number of output segments (explicit 
numPartition config -> maxNumRowsPerSegment)
+ *   3. Add merge/rollup name prefixes for generated segments
+ *   4. Add the support for realtime table
  */
 public class MergeRollupTaskExecutor extends 
BaseMultipleSegmentsConversionExecutor {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+  private static final String INPUT_SEGMENTS_DIR = "input_segments";
+  private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
 
   @Override
   protected List<SegmentConversionResult> convert(PinotTaskConfig 
pinotTaskConfig, List<File> originalIndexDirs,
       File workingDir)
       throws Exception {
+    String taskType = pinotTaskConfig.getTaskType();
     Map<String, String> configs = pinotTaskConfig.getConfigs();
+    LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+    long startMillis = System.currentTimeMillis();
+
     String mergeTypeString = 
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
     // TODO: add the support for rollup
     Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
 
-    MergeType mergeType = MergeType.fromString(mergeTypeString);
-    Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only 
'CONCATENATE' mode is currently supported.");
-
-    String mergedSegmentName = 
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
     String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
     TableConfig tableConfig = getTableConfig(tableNameWithType);
+    Schema schema = getSchema(tableNameWithType);
+    Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+    Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+        MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+    String numRecordsPerSegment = 
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+    SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+        new 
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+    // Partition config from tableConfig
+    if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+      Map<String, ColumnPartitionConfig> columnPartitionMap =
+          
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+      PartitionerConfig partitionerConfig = 
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+      
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+    }
 
-    MergeRollupSegmentConverter rollupSegmentConverter =
-        new 
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-            
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
-            .setTableConfig(tableConfig).build();
+    // Aggregations using configured Collector
+    List<String> sortedColumns = 
tableConfig.getIndexingConfig().getSortedColumn();
+    CollectorConfig collectorConfig =
+        getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns, 
sortedColumns);
+    Preconditions.checkState(collectorConfig.getCollectorType() == 
CollectorFactory.CollectorType.CONCAT,
+        "Only 'CONCAT' mode is currently supported.");
+    segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
 
-    List<File> resultFiles = rollupSegmentConverter.convert();
+    // Segment config
+    if (numRecordsPerSegment != null) {

Review comment:
       Let's parse the string to an integer here instead of passing the string 
value to `getSegmentConfig()`. Also, we can use `1000000` as the default value.

##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
  * TODO:
  *   1. Add the support for roll-up
  *   2. Add the support for time split to provide backfill support for merged 
segments
- *   3. Change the way to decide the number of output segments (explicit 
numPartition config -> maxNumRowsPerSegment)
+ *   3. Add merge/rollup name prefixes for generated segments
+ *   4. Add the support for realtime table
  */
 public class MergeRollupTaskExecutor extends 
BaseMultipleSegmentsConversionExecutor {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+  private static final String INPUT_SEGMENTS_DIR = "input_segments";
+  private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
 
   @Override
   protected List<SegmentConversionResult> convert(PinotTaskConfig 
pinotTaskConfig, List<File> originalIndexDirs,
       File workingDir)
       throws Exception {
+    String taskType = pinotTaskConfig.getTaskType();
     Map<String, String> configs = pinotTaskConfig.getConfigs();
+    LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+    long startMillis = System.currentTimeMillis();
+
     String mergeTypeString = 
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
     // TODO: add the support for rollup
     Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
 
-    MergeType mergeType = MergeType.fromString(mergeTypeString);
-    Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only 
'CONCATENATE' mode is currently supported.");
-
-    String mergedSegmentName = 
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
     String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
     TableConfig tableConfig = getTableConfig(tableNameWithType);
+    Schema schema = getSchema(tableNameWithType);
+    Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+    Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+        MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+    String numRecordsPerSegment = 
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+    SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+        new 
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+    // Partition config from tableConfig
+    if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+      Map<String, ColumnPartitionConfig> columnPartitionMap =
+          
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+      PartitionerConfig partitionerConfig = 
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+      
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+    }
 
-    MergeRollupSegmentConverter rollupSegmentConverter =
-        new 
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-            
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
-            .setTableConfig(tableConfig).build();
+    // Aggregations using configured Collector
+    List<String> sortedColumns = 
tableConfig.getIndexingConfig().getSortedColumn();
+    CollectorConfig collectorConfig =
+        getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns, 
sortedColumns);
+    Preconditions.checkState(collectorConfig.getCollectorType() == 
CollectorFactory.CollectorType.CONCAT,
+        "Only 'CONCAT' mode is currently supported.");
+    segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
 
-    List<File> resultFiles = rollupSegmentConverter.convert();
+    // Segment config
+    if (numRecordsPerSegment != null) {
+      SegmentConfig segmentConfig = getSegmentConfig(numRecordsPerSegment);
+      segmentProcessorConfigBuilder.setSegmentConfig(segmentConfig);

Review comment:
       What happens if we don't set `SegmentConfig` ?

##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
  * TODO:
  *   1. Add the support for roll-up
  *   2. Add the support for time split to provide backfill support for merged 
segments
- *   3. Change the way to decide the number of output segments (explicit 
numPartition config -> maxNumRowsPerSegment)
+ *   3. Add merge/rollup name prefixes for generated segments
+ *   4. Add the support for realtime table
  */
 public class MergeRollupTaskExecutor extends 
BaseMultipleSegmentsConversionExecutor {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+  private static final String INPUT_SEGMENTS_DIR = "input_segments";
+  private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
 
   @Override
   protected List<SegmentConversionResult> convert(PinotTaskConfig 
pinotTaskConfig, List<File> originalIndexDirs,

Review comment:
       Instead of directly using `SegmentProcessorFramework`, can we wrap this 
in a separate class, something similar to `MergeRollupSegmentConverter`?
   
   In that case, we can reuse `MergeRollupSegmentConverter` in the 
`SegmentMergeCommand`.
   

##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/purge/PurgeTaskExecutor.java
##########
@@ -68,7 +68,7 @@ protected SegmentConversionResult convert(PinotTaskConfig 
pinotTaskConfig, File
 
   @Override
   protected SegmentZKMetadataCustomMapModifier 
getSegmentZKMetadataCustomMapModifier() {
-    return new 
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.REPLACE,
 Collections
+    return new 
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.UPDATE,
 Collections

Review comment:
       Is there an existing test for the segment upload API? Can you check if 
there's a test for `SegmentZKMetadataCustomMapModifier`? If so, we can add a 
test for this change (`REPLACE -> UPDATE`).

##########
File path: 
pinot-common/src/main/java/org/apache/pinot/common/utils/FileUploadDownloadClient.java
##########
@@ -296,6 +309,21 @@ public static URI getUploadSegmentURI(URI controllerURI)
     return getURI(controllerURI.getScheme(), controllerURI.getHost(), 
controllerURI.getPort(), SEGMENT_PATH);
   }
 
+  public static URI getStartReplaceSegmentsURI(URI controllerURI, String 
rawTableName, String tableType)
+      throws URISyntaxException {
+    return getURI(controllerURI.getScheme(), controllerURI.getHost(), 
controllerURI.getPort(),

Review comment:
       I remember that using `getURI()` somehow didn't work, so I ended up using 
`new URI(..)`. Can you double-check this against 
https://github.com/apache/incubator-pinot/pull/6094?
   
   ```
   http://localhost:18998/segments/mytable/startReplaceSegments%3Ftype=OFFLINE 
<- getURI(),
   http://localhost:18998/segments/mytable/startReplaceSegments?type=OFFLINE  
<- new URI(..)
   ``` 

##########
File path: 
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
  * TODO:
  *   1. Add the support for roll-up
  *   2. Add the support for time split to provide backfill support for merged 
segments
- *   3. Change the way to decide the number of output segments (explicit 
numPartition config -> maxNumRowsPerSegment)
+ *   3. Add merge/rollup name prefixes for generated segments
+ *   4. Add the support for realtime table
  */
 public class MergeRollupTaskExecutor extends 
BaseMultipleSegmentsConversionExecutor {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+  private static final String INPUT_SEGMENTS_DIR = "input_segments";
+  private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
 
   @Override
   protected List<SegmentConversionResult> convert(PinotTaskConfig 
pinotTaskConfig, List<File> originalIndexDirs,
       File workingDir)
       throws Exception {
+    String taskType = pinotTaskConfig.getTaskType();
     Map<String, String> configs = pinotTaskConfig.getConfigs();
+    LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+    long startMillis = System.currentTimeMillis();
+
     String mergeTypeString = 
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
     // TODO: add the support for rollup
     Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
 
-    MergeType mergeType = MergeType.fromString(mergeTypeString);
-    Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only 
'CONCATENATE' mode is currently supported.");
-
-    String mergedSegmentName = 
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
     String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
     TableConfig tableConfig = getTableConfig(tableNameWithType);
+    Schema schema = getSchema(tableNameWithType);
+    Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+    Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+        MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+    String numRecordsPerSegment = 
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+    SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+        new 
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+    // Partition config from tableConfig
+    if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+      Map<String, ColumnPartitionConfig> columnPartitionMap =
+          
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+      PartitionerConfig partitionerConfig = 
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);

Review comment:
       Why do we read the partition config from the table config and set it as 
the partitioner config for the segment processing framework?
   
   We should handle this in the task scheduler. The scheduler should only pick 
segments from the same partition when custom partitioning is enabled.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [incubator-pinot] snleee commented on a change in pull request #6975: Merge rollup executor enhancement

Reply via email to