Jackie-Jiang commented on a change in pull request #6004: URL: https://github.com/apache/incubator-pinot/pull/6004#discussion_r496925731
########## File path: pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/DistinctCountThetaSketchAggregationFunction.java ########## @@ -19,136 +19,144 @@ package org.apache.pinot.core.query.aggregation.function; import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.calcite.sql.parser.SqlParseException; -import org.apache.commons.collections.MapUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.datasketches.Util; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.theta.AnotB; import org.apache.datasketches.theta.Intersection; import org.apache.datasketches.theta.SetOperationBuilder; import org.apache.datasketches.theta.Sketch; import org.apache.datasketches.theta.Union; +import org.apache.datasketches.theta.UpdateSketch; +import org.apache.datasketches.theta.UpdateSketchBuilder; import org.apache.pinot.common.function.AggregationFunctionType; -import org.apache.pinot.common.utils.DataSchema; +import org.apache.pinot.common.utils.DataSchema.ColumnDataType; import org.apache.pinot.core.common.BlockValSet; import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator; import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider; import org.apache.pinot.core.query.aggregation.AggregationResultHolder; import org.apache.pinot.core.query.aggregation.ObjectAggregationResultHolder; -import org.apache.pinot.core.query.aggregation.function.RawThetaSketchAggregationFunction.Parameters; import org.apache.pinot.core.query.aggregation.groupby.GroupByResultHolder; import org.apache.pinot.core.query.aggregation.groupby.ObjectGroupByResultHolder; import org.apache.pinot.core.query.request.context.ExpressionContext; import org.apache.pinot.core.query.request.context.FilterContext; +import org.apache.pinot.core.query.request.context.FunctionContext; import org.apache.pinot.core.query.request.context.predicate.Predicate; import org.apache.pinot.core.query.request.context.utils.QueryContextConverterUtils; -import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.sql.parsers.CalciteSqlParser; /** - * Implementation of {@link AggregationFunction} to perform the distinct count aggregation using - * Theta Sketches. - * <p>TODO: For performance concern, use {@code List<Sketch>} as the intermediate result. + * The {@code DistinctCountThetaSketchAggregationFunction} can be used in 2 modes: + * <ul> + * <li> + * Simple union without post-aggregation (1 or 2 arguments): main expression to aggregate on, optional theta-sketch + * parameters + * <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col) + * </li> + * <li> + * Union with post-aggregation (at least 4 arguments): main expression to aggregate on, theta-sketch parameters, + * filter(s), post-aggregation expression + * <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col, '', 'dimName=''gender'' AND dimValue=''male''', + * 'dimName=''course'' AND dimValue=''math''', 'SET_INTERSECT($1,$2)') + * </li> + * </ul> + * Currently there is only 1 parameter for the function: + * <ul> + * <li> + * nominalEntries: The nominal entries used to create the sketch. (Default 4096) + * </li> + * </ul> + * <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col, 'nominalEntries=8192') */ -public class DistinctCountThetaSketchAggregationFunction implements AggregationFunction<Map<String, Sketch>, Long> { - - public enum MergeFunction { - SET_UNION, SET_INTERSECT, SET_DIFF; - - public static final ImmutableList<String> STRING_VALUES = - ImmutableList.of(SET_UNION.name(), SET_INTERSECT.name(), SET_DIFF.name()); - - public static final String CSV_VALUES = String.join(",", STRING_VALUES); - - public static boolean isValid(String name) { - return SET_UNION.name().equalsIgnoreCase(name) || SET_INTERSECT.name().equalsIgnoreCase(name) || SET_DIFF.name() - .equalsIgnoreCase(name); - } - } - - private static final Pattern ARGUMENT_SUBSTITUTION = Pattern.compile("\\$(\\d+)"); - - private final ExpressionContext _thetaSketchColumn; - private final SetOperationBuilder _setOperationBuilder; +@SuppressWarnings({"rawtypes", "unchecked"}) +public class DistinctCountThetaSketchAggregationFunction implements AggregationFunction<List<Sketch>, Long> { + private static final String SET_UNION = "SET_UNION"; Review comment: Using string has these 2 benefits over enum: - Save the extra parsing of the enum - Simplify the handling of invalid operations ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org