snleee commented on a change in pull request #7180: URL: https://github.com/apache/incubator-pinot/pull/7180#discussion_r673334463
########## File path: pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/MergeTaskUtils.java ########## @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.plugin.minion.tasks; + +import com.google.common.base.Preconditions; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.pinot.core.common.MinionConstants.MergeTask; +import org.apache.pinot.core.segment.processing.framework.MergeType; +import org.apache.pinot.core.segment.processing.framework.SegmentConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerFactory; +import org.apache.pinot.core.segment.processing.timehandler.TimeHandler; +import org.apache.pinot.core.segment.processing.timehandler.TimeHandlerConfig; +import org.apache.pinot.segment.spi.AggregationFunctionType; +import org.apache.pinot.spi.config.table.ColumnPartitionConfig; +import org.apache.pinot.spi.config.table.SegmentPartitionConfig; +import 
org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.DateTimeFieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.TimeUtils; + + +/** + * Common utils for segment merge tasks. + */ +public class MergeTaskUtils { + private MergeTaskUtils() { + } + + private static final int AGGREGATION_TYPE_KEY_SUFFIX_LENGTH = MergeTask.AGGREGATION_TYPE_KEY_SUFFIX.length(); + + /** + * Creates the time handler config based on the given table config, schema and task config. Returns {@code null} if + * the table does not have a time column. + */ + @Nullable + public static TimeHandlerConfig getTimeHandlerConfig(TableConfig tableConfig, Schema schema, + Map<String, String> taskConfig) { + String timeColumn = tableConfig.getValidationConfig().getTimeColumnName(); + if (timeColumn == null) { + return null; + } + DateTimeFieldSpec fieldSpec = schema.getSpecForTimeColumn(timeColumn); + Preconditions + .checkState(fieldSpec != null, "No valid spec found for time column: %s in schema for table: %s", timeColumn, + tableConfig.getTableName()); + + TimeHandlerConfig.Builder timeHandlerConfigBuilder = new TimeHandlerConfig.Builder(TimeHandler.Type.EPOCH); + + String windowStartMs = taskConfig.get(MergeTask.WINDOW_START_MS_KEY); Review comment: I think that we need to resolve this part. It looks like we need to pass the `start & end timestamp` of the current window because we are updating the `custom config in SegmentZKMetadata` on the executor side. However, for the merge & rollup case, the executor should not set those values on the `TimeHandlerConfig` because we don't want to filter out the rows. ########## File path: pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/MergeTaskUtilsTest.java ########## @@ -0,0 +1,170 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.plugin.minion.tasks; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.common.MinionConstants.MergeTask; +import org.apache.pinot.core.segment.processing.framework.MergeType; +import org.apache.pinot.core.segment.processing.framework.SegmentConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerFactory; +import org.apache.pinot.core.segment.processing.timehandler.TimeHandlerConfig; +import org.apache.pinot.segment.spi.AggregationFunctionType; +import org.apache.pinot.spi.config.table.ColumnPartitionConfig; +import org.apache.pinot.spi.config.table.SegmentPartitionConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class MergeTaskUtilsTest { + + @Test + public void testGetTimeHandlerConfig() { + TableConfig tableConfig = + new 
TableConfigBuilder(TableType.OFFLINE).setTableName("myTable").setTimeColumnName("millisSinceEpoch").build(); + Schema schema = new Schema.SchemaBuilder() + .addDateTime("millisSinceEpoch", DataType.LONG, "1:MILLISECONDS:EPOCH", "1:MILLISECONDS").build(); + Map<String, String> taskConfig = new HashMap<>(); + long expectedWindowStartMs = 1625097600000L; + long expectedWindowEndMs = 1625184000000L; + taskConfig.put(MergeTask.WINDOW_START_MS_KEY, Long.toString(expectedWindowStartMs)); + taskConfig.put(MergeTask.WINDOW_END_MS_KEY, Long.toString(expectedWindowEndMs)); + long expectedRoundBucketMs = 6 * 3600 * 1000; + taskConfig.put(MergeTask.ROUND_BUCKET_TIME_PERIOD_KEY, "6h"); + long expectedPartitionBucketMs = 24 * 3600 * 1000; + taskConfig.put(MergeTask.PARTITION_BUCKET_TIME_PERIOD_KEY, "1d"); + + TimeHandlerConfig timeHandlerConfig = MergeTaskUtils.getTimeHandlerConfig(tableConfig, schema, taskConfig); + assertNotNull(timeHandlerConfig); + assertEquals(timeHandlerConfig.getStartTimeMs(), expectedWindowStartMs); + assertEquals(timeHandlerConfig.getEndTimeMs(), expectedWindowEndMs); + assertEquals(timeHandlerConfig.getRoundBucketMs(), expectedRoundBucketMs); + assertEquals(timeHandlerConfig.getPartitionBucketMs(), expectedPartitionBucketMs); + + // No time column in table config + TableConfig tableConfigWithoutTimeColumn = + new TableConfigBuilder(TableType.OFFLINE).setTableName("myTable").build(); + assertNull(MergeTaskUtils.getTimeHandlerConfig(tableConfigWithoutTimeColumn, schema, taskConfig)); + + // Time column does not exist in schema + Schema schemaWithoutTimeColumn = new Schema.SchemaBuilder().build(); + try { + MergeTaskUtils.getTimeHandlerConfig(tableConfig, schemaWithoutTimeColumn, taskConfig); + fail(); + } catch (IllegalStateException e) { + // Expected + } + } + + @Test + public void testGetPartitionerConfigs() { + TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName("myTable") + .setSegmentPartitionConfig( + new 
SegmentPartitionConfig(Collections.singletonMap("memberId", new ColumnPartitionConfig("murmur", 10)))) + .build(); + Schema schema = new Schema.SchemaBuilder().addSingleValueDimension("memberId", DataType.LONG).build(); + Map<String, String> taskConfig = Collections.emptyMap(); + + List<PartitionerConfig> partitionerConfigs = MergeTaskUtils.getPartitionerConfigs(tableConfig, schema, taskConfig); + assertEquals(partitionerConfigs.size(), 1); + PartitionerConfig partitionerConfig = partitionerConfigs.get(0); + assertEquals(partitionerConfig.getPartitionerType(), PartitionerFactory.PartitionerType.TABLE_PARTITION_CONFIG); + assertEquals(partitionerConfig.getColumnName(), "memberId"); + ColumnPartitionConfig columnPartitionConfig = partitionerConfig.getColumnPartitionConfig(); + assertEquals(columnPartitionConfig.getFunctionName(), "murmur"); + assertEquals(columnPartitionConfig.getNumPartitions(), 10); + + // No partition column in table config + TableConfig tableConfigWithoutPartitionColumn = + new TableConfigBuilder(TableType.OFFLINE).setTableName("myTable").build(); + assertTrue(MergeTaskUtils.getPartitionerConfigs(tableConfigWithoutPartitionColumn, schema, taskConfig).isEmpty()); + + // Partition column does not exist in schema + Schema schemaWithoutPartitionColumn = new Schema.SchemaBuilder().build(); + try { + MergeTaskUtils.getPartitionerConfigs(tableConfig, schemaWithoutPartitionColumn, taskConfig); + fail(); + } catch (IllegalStateException e) { + // Expected + } + } + + @Test + public void testGetMergeType() { + assertEquals(MergeTaskUtils.getMergeType(Collections.singletonMap(MergeTask.MERGE_TYPE_KEY, "concat")), + MergeType.CONCAT); + assertEquals(MergeTaskUtils.getMergeType(Collections.singletonMap(MergeTask.MERGE_TYPE_KEY, "Rollup")), + MergeType.ROLLUP); + assertEquals(MergeTaskUtils.getMergeType(Collections.singletonMap(MergeTask.MERGE_TYPE_KEY, "DeDuP")), + MergeType.DEDUP); + assertNull(MergeTaskUtils.getMergeType(Collections.emptyMap())); + + try { 
+ MergeTaskUtils.getMergeType(Collections.singletonMap(MergeTask.MERGE_TYPE_KEY, "unsupported")); + fail(); + } catch (IllegalArgumentException e) { + // Expected + } + } + + @Test + public void testGetAggregationTypes() { + Map<String, String> taskConfig = new HashMap<>(); + taskConfig.put("colA.aggregationType", "sum"); + taskConfig.put("colB.aggregationType", "Min"); + taskConfig.put("colC.aggregationType", "MaX"); + + Map<String, AggregationFunctionType> aggregationTypes = MergeTaskUtils.getAggregationTypes(taskConfig); + assertEquals(aggregationTypes.size(), 3); + assertEquals(aggregationTypes.get("colA"), AggregationFunctionType.SUM); + assertEquals(aggregationTypes.get("colB"), AggregationFunctionType.MIN); + assertEquals(aggregationTypes.get("colC"), AggregationFunctionType.MAX); + + taskConfig.put("colD.aggregationType", "unsupported"); + try { + MergeTaskUtils.getAggregationTypes(taskConfig); + fail(); + } catch (IllegalArgumentException e) { + // Expected + } + } + + @Test + public void testGetSegmentConfig() { + Map<String, String> taskConfig = new HashMap<>(); + taskConfig.put(MergeTask.MAX_NUM_RECORDS_PER_SEGMENT_KEY, "10000"); + taskConfig.put(MergeTask.SEGMENT_NAME_PREFIX_KEY, "myPrefix"); + SegmentConfig segmentConfig = MergeTaskUtils.getSegmentConfig(taskConfig); + assertEquals(segmentConfig.getMaxNumRecordsPerSegment(), 10000); + assertEquals(segmentConfig.getSegmentNamePrefix(), "myPrefix"); + + segmentConfig = MergeTaskUtils.getSegmentConfig(Collections.emptyMap()); + assertEquals(segmentConfig.getMaxNumRecordsPerSegment(), SegmentConfig.DEFAULT_MAX_NUM_RECORDS_PER_SEGMENT); + assertNull(segmentConfig.getSegmentNamePrefix()); + } +} Review comment: add a line ########## File path: pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/test/java/org/apache/pinot/plugin/minion/tasks/MergeTaskUtilsTest.java ########## @@ -0,0 +1,170 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.plugin.minion.tasks; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.common.MinionConstants.MergeTask; +import org.apache.pinot.core.segment.processing.framework.MergeType; +import org.apache.pinot.core.segment.processing.framework.SegmentConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerConfig; +import org.apache.pinot.core.segment.processing.partitioner.PartitionerFactory; +import org.apache.pinot.core.segment.processing.timehandler.TimeHandlerConfig; +import org.apache.pinot.segment.spi.AggregationFunctionType; +import org.apache.pinot.spi.config.table.ColumnPartitionConfig; +import org.apache.pinot.spi.config.table.SegmentPartitionConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; + + +public class MergeTaskUtilsTest { + + @Test + public void testGetTimeHandlerConfig() { Review comment: Can you add the case where the 
table has a non-epoch-based time column? ########## File path: pinot-plugins/pinot-minion-tasks/pinot-minion-builtin-tasks/src/main/java/org/apache/pinot/plugin/minion/tasks/merge_rollup/MergeRollupTaskExecutor.java ########## @@ -64,27 +57,29 @@ LOGGER.info("Starting task: {} with configs: {}", taskType, configs); long startMillis = System.currentTimeMillis(); - Preconditions.checkState( - MergeType.CONCAT.name().equalsIgnoreCase(configs.get(MinionConstants.MergeRollupTask.MERGE_TYPE_KEY)), - "Only 'CONCAT' mode is currently supported."); - String tableNameWithType = configs.get(MinionConstants.TABLE_NAME_KEY); TableConfig tableConfig = getTableConfig(tableNameWithType); Schema schema = getSchema(tableNameWithType); - Map<String, AggregationFunctionType> aggregationTypes = MergeRollupTaskUtils.getRollupAggregationTypes(configs); - String numRecordsPerSegmentString = configs.get(MinionConstants.MergeRollupTask.MAX_NUM_RECORDS_PER_SEGMENT); - SegmentProcessorConfig.Builder segmentProcessorConfigBuilder = - new SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema) - .setMergeType(MergeType.CONCAT); - if (!aggregationTypes.isEmpty()) { - segmentProcessorConfigBuilder.setAggregationTypes(aggregationTypes); - } - if (numRecordsPerSegmentString != null) { - segmentProcessorConfigBuilder.setSegmentConfig( - new SegmentConfig.Builder().setMaxNumRecordsPerSegment(Integer.parseInt(numRecordsPerSegmentString)).build()); - } + new SegmentProcessorConfig.Builder().setTableConfig(tableConfig).setSchema(schema); + + // Time handler config + segmentProcessorConfigBuilder + .setTimeHandlerConfig(MergeTaskUtils.getTimeHandlerConfig(tableConfig, schema, configs)); + + // Partitioner config Review comment: For handling the custom partitioning, are we going to depend on the segment framework or the task scheduler? One way to handle this is to merge segments from the same partition. 
Another way is to do the partitioning based on the underlying partitioning column value in the framework. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
