[GitHub] [incubator-pinot] Jackie-Jiang commented on a change in pull request #6004: Enhance DistinctCountThetaSketchAggregationFunction

GitBox Tue, 29 Sep 2020 10:46:24 -0700


Jackie-Jiang commented on a change in pull request #6004:
URL: https://github.com/apache/incubator-pinot/pull/6004#discussion_r496925731




##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/DistinctCountThetaSketchAggregationFunction.java
##########
@@ -19,136 +19,144 @@
 package org.apache.pinot.core.query.aggregation.function;
 
 import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import org.apache.calcite.sql.parser.SqlParseException;
-import org.apache.commons.collections.MapUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.datasketches.Util;
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.theta.AnotB;
 import org.apache.datasketches.theta.Intersection;
 import org.apache.datasketches.theta.SetOperationBuilder;
 import org.apache.datasketches.theta.Sketch;
 import org.apache.datasketches.theta.Union;
+import org.apache.datasketches.theta.UpdateSketch;
+import org.apache.datasketches.theta.UpdateSketchBuilder;
 import org.apache.pinot.common.function.AggregationFunctionType;
-import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.common.utils.DataSchema.ColumnDataType;
 import org.apache.pinot.core.common.BlockValSet;
 import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
 import 
org.apache.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider;
 import org.apache.pinot.core.query.aggregation.AggregationResultHolder;
 import org.apache.pinot.core.query.aggregation.ObjectAggregationResultHolder;
-import 
org.apache.pinot.core.query.aggregation.function.RawThetaSketchAggregationFunction.Parameters;
 import org.apache.pinot.core.query.aggregation.groupby.GroupByResultHolder;
 import 
org.apache.pinot.core.query.aggregation.groupby.ObjectGroupByResultHolder;
 import org.apache.pinot.core.query.request.context.ExpressionContext;
 import org.apache.pinot.core.query.request.context.FilterContext;
+import org.apache.pinot.core.query.request.context.FunctionContext;
 import org.apache.pinot.core.query.request.context.predicate.Predicate;
 import 
org.apache.pinot.core.query.request.context.utils.QueryContextConverterUtils;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.sql.parsers.CalciteSqlParser;
 
 
 /**
- * Implementation of {@link AggregationFunction} to perform the distinct count 
aggregation using
- * Theta Sketches.
- * <p>TODO: For performance concern, use {@code List<Sketch>} as the 
intermediate result.
+ * The {@code DistinctCountThetaSketchAggregationFunction} can be used in 2 
modes:
+ * <ul>
+ *   <li>
+ *     Simple union without post-aggregation (1 or 2 arguments): main 
expression to aggregate on, optional theta-sketch
+ *     parameters
+ *     <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col)
+ *   </li>
+ *   <li>
+ *     Union with post-aggregation (at least 4 arguments): main expression to 
aggregate on, theta-sketch parameters,
+ *     filter(s), post-aggregation expression
+ *     <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col, '', 'dimName=''gender'' AND 
dimValue=''male''',
+ *     'dimName=''course'' AND dimValue=''math''', 'SET_INTERSECT($1,$2)')
+ *   </li>
+ * </ul>
+ * Currently there is only 1 parameter for the function:
+ * <ul>
+ *   <li>
+ *     nominalEntries: The nominal entries used to create the sketch. (Default 
4096)
+ *   </li>
+ * </ul>
+ * <p>E.g. DISTINCT_COUNT_THETA_SKETCH(col, 'nominalEntries=8192')
  */
-public class DistinctCountThetaSketchAggregationFunction implements 
AggregationFunction<Map<String, Sketch>, Long> {
-
-  public enum MergeFunction {
-    SET_UNION, SET_INTERSECT, SET_DIFF;
-
-    public static final ImmutableList<String> STRING_VALUES =
-        ImmutableList.of(SET_UNION.name(), SET_INTERSECT.name(), 
SET_DIFF.name());
-
-    public static final String CSV_VALUES = String.join(",", STRING_VALUES);
-
-    public static boolean isValid(String name) {
-      return SET_UNION.name().equalsIgnoreCase(name) || 
SET_INTERSECT.name().equalsIgnoreCase(name) || SET_DIFF.name()
-          .equalsIgnoreCase(name);
-    }
-  }
-
-  private static final Pattern ARGUMENT_SUBSTITUTION = 
Pattern.compile("\\$(\\d+)");
-
-  private final ExpressionContext _thetaSketchColumn;
-  private final SetOperationBuilder _setOperationBuilder;
+@SuppressWarnings({"rawtypes", "unchecked"})
+public class DistinctCountThetaSketchAggregationFunction implements 
AggregationFunction<List<Sketch>, Long> {
+  private static final String SET_UNION = "SET_UNION";

Review comment:
       Using string has these 2 benefits over enum:
   - Save the extra parsing of the enum
   - Simplify the handling of invalid operations




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

[GitHub] [incubator-pinot] Jackie-Jiang commented on a change in pull request #6004: Enhance DistinctCountThetaSketchAggregationFunction

Reply via email to