snleee commented on a change in pull request #6975:
URL: https://github.com/apache/incubator-pinot/pull/6975#discussion_r641760045
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
* TODO:
* 1. Add the support for roll-up
* 2. Add the support for time split to provide backfill support for merged
segments
- * 3. Change the way to decide the number of output segments (explicit
numPartition config -> maxNumRowsPerSegment)
+ * 3. Add merge/rollup name prefixes for generated segments
+ * 4. Add the support for realtime table
*/
public class MergeRollupTaskExecutor extends
BaseMultipleSegmentsConversionExecutor {
private static final Logger LOGGER =
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+ private static final String INPUT_SEGMENTS_DIR = "input_segments";
+ private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
@Override
protected List<SegmentConversionResult> convert(PinotTaskConfig
pinotTaskConfig, List<File> originalIndexDirs,
File workingDir)
throws Exception {
+ String taskType = pinotTaskConfig.getTaskType();
Map<String, String> configs = pinotTaskConfig.getConfigs();
+ LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+ long startMillis = System.currentTimeMillis();
+
String mergeTypeString =
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
// TODO: add the support for rollup
Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
- MergeType mergeType = MergeType.fromString(mergeTypeString);
- Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only
'CONCATENATE' mode is currently supported.");
-
- String mergedSegmentName =
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
TableConfig tableConfig = getTableConfig(tableNameWithType);
+ Schema schema = getSchema(tableNameWithType);
+ Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+ Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+ MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+ String numRecordsPerSegment =
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+ SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+ new
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+ // Partition config from tableConfig
+ if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+ Map<String, ColumnPartitionConfig> columnPartitionMap =
+
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+ PartitionerConfig partitionerConfig =
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+ }
- MergeRollupSegmentConverter rollupSegmentConverter =
- new
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
- .setTableConfig(tableConfig).build();
+ // Aggregations using configured Collector
+ List<String> sortedColumns =
tableConfig.getIndexingConfig().getSortedColumn();
+ CollectorConfig collectorConfig =
+ getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns,
sortedColumns);
+ Preconditions.checkState(collectorConfig.getCollectorType() ==
CollectorFactory.CollectorType.CONCAT,
+ "Only 'CONCAT' mode is currently supported.");
+ segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
- List<File> resultFiles = rollupSegmentConverter.convert();
+ // Segment config
+ if (numRecordsPerSegment != null) {
+ SegmentConfig segmentConfig = getSegmentConfig(numRecordsPerSegment);
+ segmentProcessorConfigBuilder.setSegmentConfig(segmentConfig);
+ }
+
+ SegmentProcessorConfig segmentProcessorConfig =
segmentProcessorConfigBuilder.build();
+
+ File inputSegmentsDir = new File(workingDir, INPUT_SEGMENTS_DIR);
+ Preconditions.checkState(inputSegmentsDir.mkdirs(), "Failed to create
input directory: %s for task: %s",
+ inputSegmentsDir.getAbsolutePath(), taskType);
+ for (File indexDir : originalIndexDirs) {
+ FileUtils.copyDirectoryToDirectory(indexDir, inputSegmentsDir);
+ }
+ File outputSegmentsDir = new File(workingDir, OUTPUT_SEGMENTS_DIR);
+ Preconditions.checkState(outputSegmentsDir.mkdirs(), "Failed to create
output directory: %s for task: %s",
+ outputSegmentsDir.getAbsolutePath(), taskType);
+
+ SegmentProcessorFramework segmentProcessorFramework =
+ new SegmentProcessorFramework(inputSegmentsDir,
segmentProcessorConfig, outputSegmentsDir);
+ try {
+ segmentProcessorFramework.processSegments();
+ } finally {
+ segmentProcessorFramework.cleanup();
+ }
+
+ long endMillis = System.currentTimeMillis();
+ LOGGER.info("Finished task: {} with configs: {}. Total time: {}ms",
taskType, configs, (endMillis - startMillis));
List<SegmentConversionResult> results = new ArrayList<>();
- for (File file : resultFiles) {
+ for (File file : outputSegmentsDir.listFiles()) {
String outputSegmentName = file.getName();
results.add(new
SegmentConversionResult.Builder().setFile(file).setSegmentName(outputSegmentName)
.setTableNameWithType(tableNameWithType).build());
}
return results;
}
+
+ @Override
+ protected SegmentZKMetadataCustomMapModifier
getSegmentZKMetadataCustomMapModifier(PinotTaskConfig pinotTaskConfig) {
+ return new
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.UPDATE,
Collections
+ .singletonMap(MinionConstants.MergeRollupTask.TASK_TYPE +
MinionConstants.TASK_BUCKET_GRANULARITY_SUFFIX,
+
pinotTaskConfig.getConfigs().get(MinionConstants.MergeRollupTask.GRANULARITY_KEY)));
Review comment:
Do we have a guarantee that granularity values are always upper case
(e.g. `DAILY`, `HOURLY`)?
I think that the following inputs should all work: `hourly`, `Hourly`,
`HOURLY`, etc.
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
* TODO:
* 1. Add the support for roll-up
* 2. Add the support for time split to provide backfill support for merged
segments
- * 3. Change the way to decide the number of output segments (explicit
numPartition config -> maxNumRowsPerSegment)
+ * 3. Add merge/rollup name prefixes for generated segments
+ * 4. Add the support for realtime table
*/
public class MergeRollupTaskExecutor extends
BaseMultipleSegmentsConversionExecutor {
private static final Logger LOGGER =
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+ private static final String INPUT_SEGMENTS_DIR = "input_segments";
+ private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
@Override
protected List<SegmentConversionResult> convert(PinotTaskConfig
pinotTaskConfig, List<File> originalIndexDirs,
File workingDir)
throws Exception {
+ String taskType = pinotTaskConfig.getTaskType();
Map<String, String> configs = pinotTaskConfig.getConfigs();
+ LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+ long startMillis = System.currentTimeMillis();
+
String mergeTypeString =
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
// TODO: add the support for rollup
Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
- MergeType mergeType = MergeType.fromString(mergeTypeString);
- Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only
'CONCATENATE' mode is currently supported.");
-
- String mergedSegmentName =
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
TableConfig tableConfig = getTableConfig(tableNameWithType);
+ Schema schema = getSchema(tableNameWithType);
+ Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+ Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+ MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+ String numRecordsPerSegment =
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+ SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+ new
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+ // Partition config from tableConfig
+ if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+ Map<String, ColumnPartitionConfig> columnPartitionMap =
+
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+ PartitionerConfig partitionerConfig =
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+ }
- MergeRollupSegmentConverter rollupSegmentConverter =
- new
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
- .setTableConfig(tableConfig).build();
+ // Aggregations using configured Collector
+ List<String> sortedColumns =
tableConfig.getIndexingConfig().getSortedColumn();
+ CollectorConfig collectorConfig =
+ getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns,
sortedColumns);
+ Preconditions.checkState(collectorConfig.getCollectorType() ==
CollectorFactory.CollectorType.CONCAT,
+ "Only 'CONCAT' mode is currently supported.");
+ segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
- List<File> resultFiles = rollupSegmentConverter.convert();
+ // Segment config
+ if (numRecordsPerSegment != null) {
Review comment:
Let's parse the string to an integer here instead of passing the string value
to `getSegmentConfig()`. Also, we can use `1000000` as the default value.
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
* TODO:
* 1. Add the support for roll-up
* 2. Add the support for time split to provide backfill support for merged
segments
- * 3. Change the way to decide the number of output segments (explicit
numPartition config -> maxNumRowsPerSegment)
+ * 3. Add merge/rollup name prefixes for generated segments
+ * 4. Add the support for realtime table
*/
public class MergeRollupTaskExecutor extends
BaseMultipleSegmentsConversionExecutor {
private static final Logger LOGGER =
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+ private static final String INPUT_SEGMENTS_DIR = "input_segments";
+ private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
@Override
protected List<SegmentConversionResult> convert(PinotTaskConfig
pinotTaskConfig, List<File> originalIndexDirs,
File workingDir)
throws Exception {
+ String taskType = pinotTaskConfig.getTaskType();
Map<String, String> configs = pinotTaskConfig.getConfigs();
+ LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+ long startMillis = System.currentTimeMillis();
+
String mergeTypeString =
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
// TODO: add the support for rollup
Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
- MergeType mergeType = MergeType.fromString(mergeTypeString);
- Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only
'CONCATENATE' mode is currently supported.");
-
- String mergedSegmentName =
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
TableConfig tableConfig = getTableConfig(tableNameWithType);
+ Schema schema = getSchema(tableNameWithType);
+ Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+ Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+ MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+ String numRecordsPerSegment =
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+ SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+ new
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+ // Partition config from tableConfig
+ if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+ Map<String, ColumnPartitionConfig> columnPartitionMap =
+
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+ PartitionerConfig partitionerConfig =
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
+
segmentProcessorConfigBuilder.setPartitionerConfigs(Lists.newArrayList(partitionerConfig));
+ }
- MergeRollupSegmentConverter rollupSegmentConverter =
- new
MergeRollupSegmentConverter.Builder().setMergeType(mergeType).setTableName(tableNameWithType)
-
.setSegmentName(mergedSegmentName).setInputIndexDirs(originalIndexDirs).setWorkingDir(workingDir)
- .setTableConfig(tableConfig).build();
+ // Aggregations using configured Collector
+ List<String> sortedColumns =
tableConfig.getIndexingConfig().getSortedColumn();
+ CollectorConfig collectorConfig =
+ getCollectorConfig(mergeTypeString, aggregatorConfigs, schemaColumns,
sortedColumns);
+ Preconditions.checkState(collectorConfig.getCollectorType() ==
CollectorFactory.CollectorType.CONCAT,
+ "Only 'CONCAT' mode is currently supported.");
+ segmentProcessorConfigBuilder.setCollectorConfig(collectorConfig);
- List<File> resultFiles = rollupSegmentConverter.convert();
+ // Segment config
+ if (numRecordsPerSegment != null) {
+ SegmentConfig segmentConfig = getSegmentConfig(numRecordsPerSegment);
+ segmentProcessorConfigBuilder.setSegmentConfig(segmentConfig);
Review comment:
What happens if we don't set `SegmentConfig`?
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
* TODO:
* 1. Add the support for roll-up
* 2. Add the support for time split to provide backfill support for merged
segments
- * 3. Change the way to decide the number of output segments (explicit
numPartition config -> maxNumRowsPerSegment)
+ * 3. Add merge/rollup name prefixes for generated segments
+ * 4. Add the support for realtime table
*/
public class MergeRollupTaskExecutor extends
BaseMultipleSegmentsConversionExecutor {
private static final Logger LOGGER =
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+ private static final String INPUT_SEGMENTS_DIR = "input_segments";
+ private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
@Override
protected List<SegmentConversionResult> convert(PinotTaskConfig
pinotTaskConfig, List<File> originalIndexDirs,
Review comment:
Instead of directly using `SegmentProcessorFramework`, can we wrap this
in a separate class, something similar to `MergeRollupSegmentConverter`?
In that case, we can reuse `MergeRollupSegmentConverter` in the
`SegmentMergeCommand`.
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/purge/PurgeTaskExecutor.java
##########
@@ -68,7 +68,7 @@ protected SegmentConversionResult convert(PinotTaskConfig
pinotTaskConfig, File
@Override
protected SegmentZKMetadataCustomMapModifier
getSegmentZKMetadataCustomMapModifier() {
- return new
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.REPLACE,
Collections
+ return new
SegmentZKMetadataCustomMapModifier(SegmentZKMetadataCustomMapModifier.ModifyMode.UPDATE,
Collections
Review comment:
Is there an existing test for the segment upload API? Can you check if
there's a test for `SegmentZKMetadataCustomMapModifier`? If so, we can add a
test for this change (`REPLACE -> UPDATE`).
##########
File path:
pinot-common/src/main/java/org/apache/pinot/common/utils/FileUploadDownloadClient.java
##########
@@ -296,6 +309,21 @@ public static URI getUploadSegmentURI(URI controllerURI)
return getURI(controllerURI.getScheme(), controllerURI.getHost(),
controllerURI.getPort(), SEGMENT_PATH);
}
+ public static URI getStartReplaceSegmentsURI(URI controllerURI, String
rawTableName, String tableType)
+ throws URISyntaxException {
+ return getURI(controllerURI.getScheme(), controllerURI.getHost(),
controllerURI.getPort(),
Review comment:
I remember that using `getURI()` somehow didn't work, so I ended up using
`new URI`. Can you double check on this? See
https://github.com/apache/incubator-pinot/pull/6094
```
http://localhost:18998/segments/mytable/startReplaceSegments%3Ftype=OFFLINE
<- getURI(),
http://localhost:18998/segments/mytable/startReplaceSegments?type=OFFLINE
<- new URI(..)
```
##########
File path:
pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java
##########
@@ -40,40 +53,133 @@
* TODO:
* 1. Add the support for roll-up
* 2. Add the support for time split to provide backfill support for merged
segments
- * 3. Change the way to decide the number of output segments (explicit
numPartition config -> maxNumRowsPerSegment)
+ * 3. Add merge/rollup name prefixes for generated segments
+ * 4. Add the support for realtime table
*/
public class MergeRollupTaskExecutor extends
BaseMultipleSegmentsConversionExecutor {
private static final Logger LOGGER =
LoggerFactory.getLogger(MergeRollupTaskExecutor.class);
+ private static final String INPUT_SEGMENTS_DIR = "input_segments";
+ private static final String OUTPUT_SEGMENTS_DIR = "output_segments";
@Override
protected List<SegmentConversionResult> convert(PinotTaskConfig
pinotTaskConfig, List<File> originalIndexDirs,
File workingDir)
throws Exception {
+ String taskType = pinotTaskConfig.getTaskType();
Map<String, String> configs = pinotTaskConfig.getConfigs();
+ LOGGER.info("Starting task: {} with configs: {}", taskType, configs);
+ long startMillis = System.currentTimeMillis();
+
String mergeTypeString =
configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY);
// TODO: add the support for rollup
Preconditions.checkNotNull(mergeTypeString, "MergeType cannot be null");
- MergeType mergeType = MergeType.fromString(mergeTypeString);
- Preconditions.checkState(mergeType == MergeType.CONCATENATE, "Only
'CONCATENATE' mode is currently supported.");
-
- String mergedSegmentName =
configs.get(MinionConstants.MergeRollupTask.MERGED_SEGMENT_NAME_KEY);
String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY);
-
TableConfig tableConfig = getTableConfig(tableNameWithType);
+ Schema schema = getSchema(tableNameWithType);
+ Set<String> schemaColumns = schema.getPhysicalColumnNames();
+
+ Map<String, ValueAggregatorFactory.ValueAggregatorType> aggregatorConfigs =
+ MergeRollupTaskUtils.getRollupAggregationTypeMap(configs);
+ String numRecordsPerSegment =
configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT);
+
+ SegmentProcessorConfig.Builder segmentProcessorConfigBuilder =
+ new
SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema);
+
+ // Partition config from tableConfig
+ if (tableConfig.getIndexingConfig().getSegmentPartitionConfig() != null) {
+ Map<String, ColumnPartitionConfig> columnPartitionMap =
+
tableConfig.getIndexingConfig().getSegmentPartitionConfig().getColumnPartitionMap();
+ PartitionerConfig partitionerConfig =
getPartitionerConfig(columnPartitionMap, tableNameWithType, schemaColumns);
Review comment:
Why do we read the partition config from the table config and set it as
the partitioner config for the segment processing framework?
We should handle this in the task scheduler. The scheduler should only pick
segments from the same partition when custom partitioning is enabled.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]