This is an automated email from the ASF dual-hosted git repository.
rongr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new c68b3be360 [multistage] support more agg functions currently
implementable (#11208)
c68b3be360 is described below
commit c68b3be360fafac1c38f3322ed5fa64201d08daf
Author: Rong Rong <[email protected]>
AuthorDate: Tue Aug 1 09:09:51 2023 -0700
[multistage] support more agg functions currently implementable (#11208)
* support the rest of the agg functions
* support others trivial ones such as DISTINCT_SUM/AVG, MIN_MAX_RANGE,
etc
* add DISTINCT_COUNT variances
* add PERCENTILE_* variances
* added TODO comments for the rest of the functions
---------
Co-authored-by: Rong Rong <[email protected]>
---
.../query/planner/logical/LiteralHintUtils.java | 4 +-
.../src/test/resources/queries/CountDistinct.json | 103 ++++++++++++--
.../src/test/resources/queries/UDFAggregates.json | 152 ++++++++++++++++++++-
.../pinot/segment/spi/AggregationFunctionType.java | 68 ++++++---
4 files changed, 294 insertions(+), 33 deletions(-)
diff --git
a/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java
b/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java
index 21441b6d43..8f3fa8e2e6 100644
---
a/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java
+++
b/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java
@@ -46,13 +46,13 @@ public class LiteralHintUtils {
e.getValue().getDataType().name(), e.getValue().getValue()));
}
// semi-colon is used to separate between encoded literals
- return "{" + StringUtils.join(literalStrings, ";") + "}";
+ return "{" + StringUtils.join(literalStrings, ";:;") + "}";
}
public static Map<Integer, Map<Integer, Literal>>
hintStringToLiteralMap(String literalString) {
Map<Integer, Map<Integer, Literal>> aggCallToLiteralArgsMap = new
HashMap<>();
if (StringUtils.isNotEmpty(literalString) && !"{}".equals(literalString)) {
- String[] literalStringArr = literalString.substring(1,
literalString.length() - 1).split(";");
+ String[] literalStringArr = literalString.substring(1,
literalString.length() - 1).split(";:;");
for (String literalStr : literalStringArr) {
String[] literalStrParts = literalStr.split("\\|", 4);
int aggIdx = Integer.parseInt(literalStrParts[0]);
diff --git a/pinot-query-runtime/src/test/resources/queries/CountDistinct.json
b/pinot-query-runtime/src/test/resources/queries/CountDistinct.json
index f7f8ecd5d4..9c5cea93f3 100644
--- a/pinot-query-runtime/src/test/resources/queries/CountDistinct.json
+++ b/pinot-query-runtime/src/test/resources/queries/CountDistinct.json
@@ -104,12 +104,55 @@
"sql": "SELECT l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM
{tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
"outputs": [["b", 6], ["a", 6]]
},
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNT(val) FROM {tbl1} GROUP BY groupingCol",
+ "outputs": [["b", 2], ["a", 2]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNT(l.val), DISTINCTCOUNT(r.val) FROM {tbl1} l JOIN
{tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 2, 3], ["a", 2, 3]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r
ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 6], ["a", 6]]
+ },
+ {
+ "comments": "table aren't actually partitioned by val thus all
segments can produce duplicate results, thus [[6]]",
+ "sql": "SELECT SEGMENT_PARTITIONED_DISTINCT_COUNT(val) FROM {tbl1}",
+ "outputs": [[6]]
+ },
+ {
+ "comments": "table aren't actually partitioned by val thus all
segments can produce duplicate results, thus [[b, 4], [a, 4]]",
+ "sql": "SELECT groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(val)
FROM {tbl1} GROUP BY groupingCol",
+ "outputs": [["b", 4], ["a", 4]]
+ },
+ {
+ "sql": "SELECT l.groupingCol,
SEGMENT_PARTITIONED_DISTINCT_COUNT(l.val),
SEGMENT_PARTITIONED_DISTINCT_COUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON
l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 2, 3], ["a", 2, 3]]
+ },
+ {
+ "sql": "SELECT l.groupingCol,
SEGMENT_PARTITIONED_DISTINCT_COUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN
{tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 6], ["a", 6]]
+ },
+ {
+ "comments": "table aren't actually partitioned by val thus all
segments can produce duplicate results, thus [[b, 4], [a, 4]]",
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(val) FROM {tbl1} GROUP BY
groupingCol",
+ "outputs": [["b", 4], ["a", 4]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(l.val),
SEGMENT_PARTITIONED_DISTINCT_COUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON
l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 2, 3], ["a", 2, 3]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(CONCAT(l.val, r.val)) FROM
{tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 6], ["a", 6]]
+ },
{
"sql": "SELECT DISTINCTCOUNTHLL(val) FROM {tbl1}",
"outputs": [[3]]
},
{
- "sql": "SELECT groupingCol, DISTINCTCOUNTHLL(val) FROM {tbl1} GROUP BY
groupingCol",
+ "sql": "SELECT groupingCol, DISTINCTCOUNTHLL(val, 8) FROM {tbl1} GROUP
BY groupingCol",
"outputs": [["b", 2], ["a", 2]]
},
{
@@ -117,32 +160,76 @@
"outputs": [["b", 2, 3], ["a", 2, 3]]
},
{
- "sql": "SELECT l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val))
FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY
l.groupingCol",
+ "sql": "SELECT l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val),
8) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY
l.groupingCol",
"outputs": [["b", 6], ["a", 6]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNT(val) FROM {tbl1} GROUP BY groupingCol",
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNTHLL(val, 8) FROM {tbl1} GROUP BY groupingCol",
"outputs": [["b", 2], ["a", 2]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNT(l.val), DISTINCTCOUNT(r.val) FROM {tbl1} l JOIN
{tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTHLL(l.val), DISTINCTCOUNTHLL(r.val) FROM {tbl1} l
JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
"outputs": [["b", 2, 3], ["a", 2, 3]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r
ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val), 8) FROM {tbl1} l JOIN
{tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
"outputs": [["b", 6], ["a", 6]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNTHLL(val) FROM {tbl1} GROUP BY groupingCol",
+ "sql": "SELECT DISTINCTCOUNTSMARTHLL(val, 'hllLog2m=2') FROM {tbl1}",
+ "outputs": [[3]]
+ },
+ {
+ "sql": "SELECT groupingCol, DISTINCTCOUNTSMARTHLL(val,
'hllConversionThreshold=10;hllLog2m=8') FROM {tbl1} GROUP BY groupingCol",
"outputs": [["b", 2], ["a", 2]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTHLL(l.val), DISTINCTCOUNTHLL(r.val) FROM {tbl1} l
JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "sql": "SELECT l.groupingCol, DISTINCTCOUNTSMARTHLL(l.val),
DISTINCTCOUNTSMARTHLL(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol =
r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 2, 3], ["a", 2, 3]]
+ },
+ {
+ "sql": "SELECT l.groupingCol, DISTINCTCOUNTSMARTHLL(CONCAT(l.val,
r.val), 'threshold=10;hllLog2m=2') FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol
= r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["b", 6], ["a", 6]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNTSMARTHLL(val, 'hllConversionThreshold=10;hllLog2m=8')
FROM {tbl1} GROUP BY groupingCol",
+ "outputs": [["b", 2], ["a", 2]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTSMARTHLL(l.val), DISTINCTCOUNTSMARTHLL(r.val) FROM
{tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
"outputs": [["b", 2, 3], ["a", 2, 3]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2}
r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTSMARTHLL(CONCAT(l.val, r.val),
'threshold=10;hllLog2m=2') FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol =
r.groupingCol GROUP BY l.groupingCol",
"outputs": [["b", 6], ["a", 6]]
+ },
+ {
+ "sql": "SELECT DISTINCTCOUNTRAWHLL(val, 2) FROM {tbl1}",
+ "outputs": [["000000020000000400000460"]]
+ },
+ {
+ "sql": "SELECT groupingCol, DISTINCTCOUNTRAWHLL(val) FROM {tbl1} GROUP
BY groupingCol",
+ "outputs": [["a",
"00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"],
["b",
"00000008000000ac000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ },
+ {
+ "sql": "SELECT l.groupingCol, DISTINCTCOUNTRAWHLL(l.val, 2),
DISTINCTCOUNTRAWHLL(r.val, 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol =
r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["a", "000000020000000400000060",
"000000020000000400010060"], ["b", "000000020000000400000420",
"000000020000000400000480"]]
+ },
+ {
+ "sql": "SELECT l.groupingCol, DISTINCTCOUNTRAWHLL(CONCAT(l.val,
r.val), 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP
BY l.groupingCol",
+ "outputs": [["a", "000000020000000400028842"], ["b",
"000000020000000400008405"]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
groupingCol, DISTINCTCOUNTRAWHLL(val) FROM {tbl1} GROUP BY groupingCol",
+ "outputs": [["a",
"00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"],
["b",
"00000008000000ac000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTRAWHLL(l.val, 2), DISTINCTCOUNTRAWHLL(r.val, 2)
FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY
l.groupingCol",
+ "outputs": [["a", "000000020000000400000060",
"000000020000000400010060"], ["b", "000000020000000400000420",
"000000020000000400000480"]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
l.groupingCol, DISTINCTCOUNTRAWHLL(CONCAT(l.val, r.val), 2) FROM {tbl1} l JOIN
{tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol",
+ "outputs": [["a", "000000020000000400028842"], ["b",
"000000020000000400008405"]]
}
]
}
diff --git a/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json
b/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json
index 1bc39d0953..57bc0d62f1 100644
--- a/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json
+++ b/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json
@@ -48,6 +48,82 @@
"sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, LASTWITHTIME(float_col, timestamp_col, 'FLOAT'),
LASTWITHTIME(double_col, timestamp_col, 'DOUBLE'), last_with_time(long_col,
CAST(long_col AS TIMESTAMP), 'LONG') FROM {tbl} GROUP BY string_col",
"outputs": [["a", 400, 400, 2], ["b", 1, 1, 100], ["c", 1.01, 1.01,
175]]
},
+ {
+ "sql": "SELECT SUMPRECISION(decimal_col) FROM {tbl}",
+ "outputs":
[["10000000000100000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ },
+ {
+ "sql": "SELECT string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP
BY string_col",
+ "outputs": [["a"
,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col",
+ "outputs": [["a"
,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ },
+ {
+ "sql": "select min_max_range(int_col), minMaxRange(long_col),
MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM {tbl}",
+ "outputs": [[173, 173, 399.0, 399.0]]
+ },
+ {
+ "sql": "select bool_col, min_max_range(int_col),
minMaxRange(long_col), MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM
{tbl} GROUP BY bool_col",
+ "outputs": [[true, 173, 173, 398.25, 398.25], [false, 147, 147, 99.0,
99.0]]
+ },
+ {
+ "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
bool_col, min_max_range(int_col), minMaxRange(long_col),
MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM {tbl} GROUP BY bool_col",
+ "outputs": [[true, 173, 173, 398.25, 398.25], [false, 147, 147, 99.0,
99.0]]
+ },
+ {
+ "sql": "select distinct_sum(int_col), distinctSum(long_col),
DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM {tbl}",
+ "outputs": [[531, 531, 805.26, 805.26]]
+ },
+ {
+ "sql": "select string_col, distinct_sum(int_col),
distinctSum(long_col), DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM
{tbl} GROUP BY string_col",
+ "outputs": [["a", 2, 2, 700.0, 700.0], ["b", 103, 103, 101.0, 101.0],
["c", 426, 426, 4.26, 4.26]]
+ },
+ {
+ "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, distinct_sum(int_col), distinctSum(long_col),
DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM {tbl} GROUP BY
string_col",
+ "outputs": [["a", 2, 2, 700.0, 700.0], ["b", 103, 103, 101.0, 101.0],
["c", 426, 426, 4.26, 4.26]]
+ },
+ {
+ "sql": "select distinct_avg(int_col), distinctAvg(long_col),
DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM {tbl}",
+ "outputs": [[88.5, 88.5, 115.03714285, 115.03714285]]
+ },
+ {
+ "sql": "select string_col, distinct_avg(int_col),
distinctAvg(long_col), DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM
{tbl} GROUP BY string_col",
+ "outputs": [["a", 2.0, 2.0, 350.0, 350.0], ["b", 51.5, 51.5, 50.5,
50.5], ["c", 142.0, 142.0, 1.42, 1.42]]
+ },
+ {
+ "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, distinct_avg(int_col), distinctAvg(long_col),
DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM {tbl} GROUP BY
string_col",
+ "outputs": [["a", 2.0, 2.0, 350.0, 350.0], ["b", 51.5, 51.5, 50.5,
50.5], ["c", 142.0, 142.0, 1.42, 1.42]]
+ }
+ ]
+ },
+ "pinot_percentile_agg_udf_tests": {
+ "tables": {
+ "tbl": {
+ "schema": [
+ {"name": "int_col", "type": "INT"},
+ {"name": "long_col", "type": "LONG"},
+ {"name": "float_col", "type": "FLOAT"},
+ {"name": "double_col", "type": "DOUBLE"},
+ {"name": "string_col", "type": "STRING"},
+ {"name": "bool_col", "type": "BOOLEAN"},
+ {"name": "decimal_col", "type": "BIG_DECIMAL"},
+ {"name": "timestamp_col", "type": "TIMESTAMP"},
+ {"name": "bytes_col", "type": "BYTES"}
+ ],
+ "inputs": [
+ [2, 2, 300, 300, "a", true, "1E-307", "2020-01-01 03:10:12",
"DEADBEEF"],
+ [2, 2, 400, 400, "a", true, "1E+307", "2020-01-01 03:39:12",
"FDEADBEE"],
+ [3, 3, 100, 100, "b", false, "1E-308", "2020-01-01 08:32:12",
"EFDEADBE"],
+ [100, 100, 1, 1, "b", false, "1E+308", "2020-02-06 03:32:12",
"FDEADBEE"],
+ [101, 101, 1.01, 1.01, "c", false, "1E-317", "2020-05-01 03:32:12",
"DEADBEEF"],
+ [150, 150, 1.5, 1.5, "c", false, "1E+317", "2020-02-03 03:32:12",
"EFDEADBE"],
+ [175, 175, 1.75, 1.75, "c", true, "1E+328", "2020-01-02 03:32:12",
"EFDEADBE"]
+ ]
+ }
+ },
+ "queries": [
{
"sql": "SELECT PERCENTILE(float_col, 50), PERCENTILE(double_col, 5),
PERCENTILE(int_col, 75), PERCENTILE(long_col, 75) FROM {tbl}",
"outputs": [[1.75, 1, 150, 150]]
@@ -61,17 +137,69 @@
"outputs": [["a", 400, 300, 2, 2], ["b", 100, 1, 100, 100], ["c", 1.5,
1.01, 175, 175]]
},
{
- "sql": "SELECT SUMPRECISION(decimal_col) FROM {tbl}",
- "outputs":
[["10000000000100000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ "sql": "SELECT PERCENTILE_EST(float_col, 50),
PERCENTILE_EST(double_col, 5), PERCENTILE_EST(int_col, 75),
PERCENTILE_EST(long_col, 75) FROM {tbl}",
+ "outputs": [[1.0, 1.0, 150, 150]]
},
{
- "sql": "SELECT string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP
BY string_col",
- "outputs": [["a"
,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ "sql": "SELECT bool_col, PERCENTILE_EST(float_col, 50),
PERCENTILE_EST(double_col, 5), PERCENTILE_EST(int_col, 75),
PERCENTILE_EST(long_col, 75) FROM {tbl} GROUP BY bool_col",
+ "outputs": [[false, 1.0, 1.0, 150, 150], [true, 300, 1.0, 175, 175]]
},
{
- "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col",
- "outputs": [["a"
,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
[...]
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, PERCENTILE_EST(float_col, 50), PERCENTILE_EST(double_col, 5),
PERCENTILE_EST(int_col, 75), PERCENTILE_EST(long_col, 75) FROM {tbl} GROUP BY
string_col",
+ "outputs": [["a", 400, 300, 2, 2], ["b", 100, 1, 100, 100], ["c", 1.0,
1.0, 175, 175]]
+ },
+ {
+ "sql": "SELECT PERCENTILE_TDIGEST(float_col, 50),
PERCENTILE_TDIGEST(double_col, 5), PERCENTILE_TDIGEST(int_col, 75),
PERCENTILE_TDIGEST(long_col, 75) FROM {tbl}",
+ "outputs": [[1.75, 1.0, 137, 137]]
+ },
+ {
+ "sql": "SELECT bool_col, PERCENTILE_TDIGEST(float_col, 50),
PERCENTILE_TDIGEST(double_col, 5), PERCENTILE_TDIGEST(int_col, 75),
PERCENTILE_TDIGEST(long_col, 75) FROM {tbl} GROUP BY bool_col",
+ "outputs": [[false, 1.255, 1.0, 125, 125], [true, 300, 1.75, 131, 131]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, PERCENTILE_TDIGEST(float_col, 50), PERCENTILE_TDIGEST(double_col,
5), PERCENTILE_TDIGEST(int_col, 75), PERCENTILE_TDIGEST(long_col, 75) FROM
{tbl} GROUP BY string_col",
+ "outputs": [["a", 350, 300, 2, 2], ["b", 50.5, 1, 100, 100], ["c",
1.5, 1.01, 168, 168]]
+ },
+ {
+ "sql": "SELECT PERCENTILE_KLL(float_col, 50),
PERCENTILE_KLL(double_col, 5), PERCENTILE_KLL(int_col, 75),
PERCENTILE_KLL(long_col, 75) FROM {tbl}",
+ "outputs": [[1.75, 1, 150, 150]]
},
+ {
+ "sql": "SELECT bool_col, PERCENTILE_KLL(float_col, 50),
PERCENTILE_KLL(double_col, 5), PERCENTILE_KLL(int_col, 75),
PERCENTILE_KLL(long_col, 75) FROM {tbl} GROUP BY bool_col",
+ "outputs": [[false, 1.01, 1, 101, 101], [true, 300, 1.75, 175, 175]]
+ },
+ {
+ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, PERCENTILE_KLL(float_col, 50), PERCENTILE_KLL(double_col, 5),
PERCENTILE_KLL(int_col, 75), PERCENTILE_KLL(long_col, 75) FROM {tbl} GROUP BY
string_col",
+ "outputs": [["a", 300, 300, 2, 2], ["b", 1, 1, 100, 100], ["c", 1.5,
1.01, 175, 175]]
+ }
+ ]
+ },
+ "pinot_sketch_agg_udf_tests": {
+ "tables": {
+ "tbl": {
+ "schema": [
+ {"name": "int_col", "type": "INT"},
+ {"name": "long_col", "type": "LONG"},
+ {"name": "float_col", "type": "FLOAT"},
+ {"name": "double_col", "type": "DOUBLE"},
+ {"name": "string_col", "type": "STRING"},
+ {"name": "bool_col", "type": "BOOLEAN"},
+ {"name": "decimal_col", "type": "BIG_DECIMAL"},
+ {"name": "timestamp_col", "type": "TIMESTAMP"},
+ {"name": "bytes_col", "type": "BYTES"}
+ ],
+ "inputs": [
+ [2, 2, 300, 300, "a", true, "1E-307", "2020-01-01 03:10:12",
"DEADBEEF"],
+ [2, 2, 400, 400, "a", true, "1E+307", "2020-01-01 03:39:12",
"FDEADBEE"],
+ [3, 3, 100, 100, "b", false, "1E-308", "2020-01-01 08:32:12",
"EFDEADBE"],
+ [100, 100, 1, 1, "b", false, "1E+308", "2020-02-06 03:32:12",
"FDEADBEE"],
+ [101, 101, 1.01, 1.01, "c", false, "1E-317", "2020-05-01 03:32:12",
"DEADBEEF"],
+ [150, 150, 1.5, 1.5, "c", false, "1E+317", "2020-02-03 03:32:12",
"EFDEADBE"],
+ [175, 175, 1.75, 1.75, "c", true, "1E+328", "2020-01-02
03:32:12","EFDEADBE"]
+ ]
+ }
+ },
+ "queries": [
{
"sql": "select
GET_THETA_SKETCH_ESTIMATE(DISTINCT_COUNT_RAW_THETA_SKETCH(string_col,
'nominalEntries=16')),
GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_DIFF(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col,
''), DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''))),
GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_UNION(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col,
''), DISTINCT_COUNT_RAW_THETA_SKETCH(long_col, ''))),
GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_INTERSECT(DISTINCT_COUNT_RAW_THETA_SKETCH(double_col,
''), [...]
"outputs": [[3, 0, 6, 6]]
@@ -83,6 +211,18 @@
{
"sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
bool_col, GET_THETA_SKETCH_ESTIMATE(DISTINCT_COUNT_RAW_THETA_SKETCH(string_col,
'nominalEntries=16')),
GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_DIFF(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col,
''), DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''))),
GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_UNION(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col,
''), DISTINCT_COUNT_RAW_THETA_SKETCH(long_col, ''))),
GET_THETA_SKETCH_ESTIMATE(THETA_S [...]
"outputs": [[true, 2, 0, 2, 3], [false, 2, 0, 4, 3]]
+ },
+ {
+ "sql": "select distinct_count_theta_sketch(int_col),
distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col),
DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl}",
+ "outputs": [[6, 6, 7, 7]]
+ },
+ {
+ "sql": "select string_col, distinct_count_theta_sketch(int_col),
distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col),
DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl} GROUP BY string_col",
+ "outputs": [["a", 1, 1, 2, 2], ["b", 2, 2, 2, 2], ["c", 3, 3, 3, 3]]
+ },
+ {
+ "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */
string_col, distinct_count_theta_sketch(int_col),
distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col),
DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl} GROUP BY string_col",
+ "outputs": [["a", 1, 1, 2, 2], ["b", 2, 2, 2, 2], ["c", 3, 3, 3, 3]]
}
]
}
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java
index 5f7ed4a748..77a9199727 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java
@@ -63,6 +63,7 @@ public enum AggregationFunctionType {
ReturnTypes.AGG_SUM_EMPTY_IS_ZERO,
ReturnTypes.explicit(SqlTypeName.DOUBLE)),
SUMPRECISION("sumPrecision", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.ANY, ReturnTypes.explicit(SqlTypeName.DECIMAL),
ReturnTypes.explicit(SqlTypeName.OTHER)),
+ // NO NEEDED in v2, AVG is compiled as SUM/COUNT
AVG("avg"),
MODE("mode"),
@@ -72,7 +73,8 @@ public enum AggregationFunctionType {
LASTWITHTIME("lastWithTime", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.TIMESTAMP, SqlTypeFamily.CHARACTER)),
ReturnTypes.ARG0, ReturnTypes.explicit(SqlTypeName.OTHER)),
- MINMAXRANGE("minMaxRange"),
+ MINMAXRANGE("minMaxRange", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.NUMERIC,
+ OperandTypes.NUMERIC, ReturnTypes.ARG0,
ReturnTypes.explicit(SqlTypeName.OTHER)),
/**
* for all distinct count family functions:
* (1) distinct_count only supports single argument;
@@ -81,41 +83,69 @@ public enum AggregationFunctionType {
DISTINCTCOUNT("distinctCount", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.ANY,
ReturnTypes.BIGINT,
ReturnTypes.explicit(SqlTypeName.OTHER)),
+ // TODO: support bitmap and segment partition in V2
DISTINCTCOUNTBITMAP("distinctCountBitmap"),
- SEGMENTPARTITIONEDDISTINCTCOUNT("segmentPartitionedDistinctCount"),
- DISTINCTCOUNTHLL("distinctCountHLL", ImmutableList.of("DISTINCT_COUNT_HLL"),
SqlKind.OTHER_FUNCTION,
+ SEGMENTPARTITIONEDDISTINCTCOUNT("segmentPartitionedDistinctCount", null,
SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.ANY,
ReturnTypes.BIGINT,
- ReturnTypes.explicit(SqlTypeName.OTHER)),
- DISTINCTCOUNTRAWHLL("distinctCountRawHLL"),
- DISTINCTCOUNTSMARTHLL("distinctCountSmartHLL"),
+ ReturnTypes.BIGINT),
+ DISTINCTCOUNTHLL("distinctCountHLL", ImmutableList.of("DISTINCT_COUNT_HLL"),
SqlKind.OTHER_FUNCTION,
+ SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.NUMERIC), ordinal -> ordinal > 0),
+ ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)),
+ DISTINCTCOUNTRAWHLL("distinctCountRawHLL",
ImmutableList.of("DISTINCT_COUNT_RAW_HLL"), SqlKind.OTHER_FUNCTION,
+ SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.INTEGER), ordinal -> ordinal > 0),
+ ReturnTypes.VARCHAR_2000, ReturnTypes.explicit(SqlTypeName.OTHER)),
+ DISTINCTCOUNTSMARTHLL("distinctCountSmartHLL",
ImmutableList.of("DISTINCT_COUNT_SMART_HLL"), SqlKind.OTHER_FUNCTION,
+ SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0),
+ ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)),
+ // DEPRECATED in v2
FASTHLL("fastHLL"),
- DISTINCTCOUNTTHETASKETCH("distinctCountThetaSketch",
ImmutableList.of("DISTINCT_COUNT_THETA_SKETCH"),
+ DISTINCTCOUNTTHETASKETCH("distinctCountThetaSketch", null,
SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0),
ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)),
- DISTINCTCOUNTRAWTHETASKETCH("distinctCountRawThetaSketch",
ImmutableList.of("DISTINCT_COUNT_RAW_THETA_SKETCH"),
+ DISTINCTCOUNTRAWTHETASKETCH("distinctCountRawThetaSketch", null,
SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY,
SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0),
ReturnTypes.VARCHAR_2000, ReturnTypes.explicit(SqlTypeName.OTHER)),
- DISTINCTSUM("distinctSum"),
- DISTINCTAVG("distinctAvg"),
+ DISTINCTSUM("distinctSum", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.NUMERIC,
+ OperandTypes.NUMERIC, ReturnTypes.AGG_SUM,
ReturnTypes.explicit(SqlTypeName.OTHER)),
+ DISTINCTAVG("distinctAvg", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.NUMERIC,
+ OperandTypes.NUMERIC, ReturnTypes.explicit(SqlTypeName.DOUBLE),
ReturnTypes.explicit(SqlTypeName.OTHER)),
PERCENTILE("percentile", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0,
ReturnTypes.explicit(SqlTypeName.OTHER)),
- PERCENTILEEST("percentileEst"),
- PERCENTILERAWEST("percentileRawEst"),
- PERCENTILETDIGEST("percentileTDigest"),
- PERCENTILERAWTDIGEST("percentileRawTDigest"),
+ PERCENTILEEST("percentileEst", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ PERCENTILERAWEST("percentileRawEst", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ PERCENTILETDIGEST("percentileTDigest", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ PERCENTILERAWTDIGEST("percentileRawTDigest", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ // DEPRECATED in v2
PERCENTILESMARTTDIGEST("percentileSmartTDigest"),
- PERCENTILEKLL("percentileKLL"),
- PERCENTILERAWKLL("percentileRawKLL"),
+ PERCENTILEKLL("percentileKLL", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ PERCENTILERAWKLL("percentileRawKLL", null, SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
+ OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000,
+ ReturnTypes.explicit(SqlTypeName.OTHER)),
+ // DEPRECATED in v2
IDSET("idSet"),
+ // TODO: support histogram requires solving ARRAY constructor and
multi-function signature without optional ordinal
HISTOGRAM("histogram"),
- // TODO: support underscore separated version of the stats functions
+ // TODO: support underscore separated version of the stats functions,
resolving conflict in SqlStdOptTable
// currently Pinot is missing generated agg functions impl from Calcite's
AggregateReduceFunctionsRule
COVARPOP("covarPop", Collections.emptyList(), SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC,
SqlTypeFamily.NUMERIC)), ReturnTypes.DOUBLE,
@@ -137,6 +167,7 @@ public enum AggregationFunctionType {
OperandTypes.NUMERIC, ReturnTypes.DOUBLE,
ReturnTypes.explicit(SqlTypeName.OTHER)),
FOURTHMOMENT("fourthMoment"),
+ // TODO: revisit support for Tuple sketches in V2
// DataSketches Tuple Sketch support
DISTINCTCOUNTTUPLESKETCH("distinctCountTupleSketch"),
@@ -146,6 +177,7 @@ public enum AggregationFunctionType {
SUMVALUESINTEGERSUMTUPLESKETCH("sumValuesIntegerSumTupleSketch"),
AVGVALUEINTEGERSUMTUPLESKETCH("avgValueIntegerSumTupleSketch"),
+ // TODO: revisit support for Geo-spatial agg in V2
// Geo aggregation functions
STUNION("STUnion", ImmutableList.of("ST_UNION"), SqlKind.OTHER_FUNCTION,
SqlFunctionCategory.USER_DEFINED_FUNCTION,
OperandTypes.BINARY, ReturnTypes.explicit(SqlTypeName.VARBINARY),
ReturnTypes.explicit(SqlTypeName.OTHER)),
@@ -209,6 +241,7 @@ public enum AggregationFunctionType {
OperandTypes.BOOLEAN, ReturnTypes.BOOLEAN,
ReturnTypes.explicit(SqlTypeName.INTEGER)),
// argMin and argMax
+ // TODO: argmin/argmax syntax not conformed with Calcite. we need to migrate.
ARGMIN("argMin"),
ARGMAX("argMax"),
PARENTARGMIN(CommonConstants.RewriterConstants.PARENT_AGGREGATION_NAME_PREFIX +
ARGMIN.getName()),
@@ -217,6 +250,7 @@ public enum AggregationFunctionType {
CHILDARGMAX(CommonConstants.RewriterConstants.CHILD_AGGREGATION_NAME_PREFIX
+ ARGMAX.getName()),
// funnel aggregate functions
+ // TODO: revisit support for funnel count in V2
FUNNELCOUNT("funnelCount");
private static final Set<String> NAMES =
Arrays.stream(values()).flatMap(func -> Stream.of(func.name(),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]