This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new c7b59b38ef8 [fix](hist) Fix unstable result of aggregrate function
hist #38608 (#38893)
c7b59b38ef8 is described below
commit c7b59b38ef83dba21beb557cb99b76c8d90e8048
Author: zhiqiang <[email protected]>
AuthorDate: Tue Aug 6 08:52:03 2024 +0800
[fix](hist) Fix unstable result of aggregrate function hist #38608 (#38893)
cherry pick from #38608
---
.../aggregate_function_histogram.h | 41 ++++++----
.../test_aggregate_all_functions2.out | 72 ++++++++++++++++
.../test_aggregate_all_functions2.groovy | 95 ++++++++++++++++++++++
3 files changed, 193 insertions(+), 15 deletions(-)
diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h
b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
index cae2a88daf0..8fcd133b055 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
@@ -28,6 +28,8 @@
#include <utility>
#include <vector>
+#include "common/exception.h"
+#include "common/status.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/aggregate_functions/aggregate_function_simple_factory.h"
#include "vec/columns/column.h"
@@ -57,12 +59,10 @@ template <typename T>
struct AggregateFunctionHistogramData {
using ColVecType =
std::conditional_t<IsDecimalNumber<T>,
ColumnDecimal<Decimal128V2>, ColumnVector<T>>;
+ const static size_t DEFAULT_BUCKET_NUM = 128;
+ const static size_t BUCKET_NUM_INIT_VALUE = 0;
- void set_parameters(int input_max_num_buckets) {
- if (input_max_num_buckets > 0) {
- max_num_buckets = (size_t)input_max_num_buckets;
- }
- }
+ void set_parameters(size_t input_max_num_buckets) { max_num_buckets =
input_max_num_buckets; }
void reset() { ordered_map.clear(); }
@@ -86,6 +86,8 @@ struct AggregateFunctionHistogramData {
}
void merge(const AggregateFunctionHistogramData& rhs) {
+ // if rhs.max_num_buckets == 0, it means the input block for
serialization is all null
+ // we should discard this data, because histogram only fouce on the
not-null data
if (!rhs.max_num_buckets) {
return;
}
@@ -104,7 +106,6 @@ struct AggregateFunctionHistogramData {
void write(BufferWritable& buf) const {
write_binary(max_num_buckets, buf);
-
size_t element_number = (size_t)ordered_map.size();
write_binary(element_number, buf);
@@ -148,7 +149,13 @@ struct AggregateFunctionHistogramData {
std::string get(const DataTypePtr& data_type) const {
std::vector<Bucket<T>> buckets;
rapidjson::StringBuffer buffer;
- build_histogram(buckets, ordered_map, max_num_buckets);
+ // NOTE: We need an extral branch for to handle max_num_buckets == 0,
+ // when target column is nullable, and input block is all null,
+ // set_parameters will not be called because of the short-circuit in
+ // AggregateFunctionNullVariadicInline, so max_num_buckets will be 0
in this situation.
+ build_histogram(
+ buckets, ordered_map,
+ max_num_buckets == BUCKET_NUM_INIT_VALUE ? DEFAULT_BUCKET_NUM
: max_num_buckets);
histogram_to_json(buffer, buckets, data_type);
return std::string(buffer.GetString());
}
@@ -162,7 +169,7 @@ struct AggregateFunctionHistogramData {
}
private:
- size_t max_num_buckets = 128;
+ size_t max_num_buckets = BUCKET_NUM_INIT_VALUE;
std::map<T, size_t> ordered_map;
};
@@ -186,13 +193,17 @@ public:
void add(AggregateDataPtr __restrict place, const IColumn** columns,
ssize_t row_num,
Arena* arena) const override {
- if (columns[0]->is_null_at(row_num)) {
- return;
- }
-
- if (has_input_param) {
- this->data(place).set_parameters(
- assert_cast<const
ColumnInt32*>(columns[1])->get_element(row_num));
+ if constexpr (has_input_param) {
+ Int32 input_max_num_buckets =
+ assert_cast<const
ColumnInt32*>(columns[1])->get_element(row_num);
+ if (input_max_num_buckets <= 0) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Invalid max_num_buckets {}, row_num
{}",
+ input_max_num_buckets, row_num);
+ }
+ this->data(place).set_parameters(input_max_num_buckets);
+ } else {
+ this->data(place).set_parameters(Data::DEFAULT_BUCKET_NUM);
}
if constexpr (std::is_same_v<T, std::string>) {
diff --git
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
index 0d83f2238a9..683461806d6 100644
---
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
+++
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
@@ -118,3 +118,75 @@
-- !select_minmax4 --
243
+-- !select_histogram_k0 --
+{"num_buckets":2,"buckets":[{"lower":"0","upper":"0","ndv":1,"count":7,"pre_sum":0},{"lower":"1","upper":"1","ndv":1,"count":8,"pre_sum":7}]}
+
+-- !select_histogram_k1 --
+{"num_buckets":1,"buckets":[{"lower":"1","upper":"15","ndv":15,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"-32767","upper":"1985","ndv":9,"count":11,"pre_sum":0},{"lower":"1986","upper":"32767","ndv":5,"count":10,"pre_sum":11}]}
+
+-- !select_histogram_k3 --
+{"num_buckets":3,"buckets":[{"lower":"-2147483647","upper":"1001","ndv":3,"count":5,"pre_sum":0},{"lower":"1002","upper":"3021","ndv":2,"count":5,"pre_sum":5},{"lower":"5014","upper":"2147483647","ndv":3,"count":5,"pre_sum":10}]}
+
+-- !select_histogram_k4 --
+{"num_buckets":4,"buckets":[{"lower":"-9223372036854775807","upper":"123456","ndv":4,"count":5,"pre_sum":0},{"lower":"7210457","upper":"11011903","ndv":3,"count":5,"pre_sum":5},{"lower":"11011905","upper":"11011920","ndv":2,"count":3,"pre_sum":10},{"lower":"9223372036854775807","upper":"9223372036854775807","ndv":1,"count":2,"pre_sum":13}]}
+
+-- !select_histogram_k5 --
+{"num_buckets":5,"buckets":[{"lower":"-654.654","upper":"-0.123","ndv":3,"count":3,"pre_sum":0},{"lower":"0.000","upper":"0.666","ndv":2,"count":3,"pre_sum":3},{"lower":"3.141","upper":"123.123","ndv":3,"count":3,"pre_sum":6},{"lower":"243.325","upper":"1243.500","ndv":2,"count":3,"pre_sum":9},{"lower":"24453.325","upper":"604587.000","ndv":3,"count":3,"pre_sum":12}]}
+
+-- !select_histogram_k6 --
+{"num_buckets":2,"buckets":[{"lower":"false","upper":"false","ndv":1,"count":8,"pre_sum":0},{"lower":"true","upper":"true","ndv":1,"count":7,"pre_sum":8}]}
+
+-- !select_histogram_k7 --
+{"num_buckets":7,"buckets":[{"lower":"","upper":"du3lnvl","ndv":3,"count":3,"pre_sum":0},{"lower":"jiw3n4","upper":"lifsno","ndv":2,"count":2,"pre_sum":3},{"lower":"wangjuoo4","upper":"wangjuoo5","ndv":2,"count":3,"pre_sum":5},{"lower":"wangynnsf","upper":"wenlsfnl","ndv":2,"count":3,"pre_sum":8},{"lower":"yanavnd","upper":"yanavnd","ndv":1,"count":1,"pre_sum":11},{"lower":"yanvjldjlll","upper":"yanvjldjlll","ndv":1,"count":1,"pre_sum":12},{"lower":"yunlj8@nk","upper":"yunlj8@nk","ndv":1
[...]
+
+-- !select_histogram_k10 --
+{"num_buckets":10,"buckets":[{"lower":"1901-12-31","upper":"1901-12-31","ndv":1,"count":1,"pre_sum":0},{"lower":"1988-03-21","upper":"1988-03-21","ndv":1,"count":1,"pre_sum":1},{"lower":"1989-03-21","upper":"1989-03-21","ndv":1,"count":2,"pre_sum":2},{"lower":"1991-08-11","upper":"1991-08-11","ndv":1,"count":2,"pre_sum":4},{"lower":"2012-03-14","upper":"2012-03-14","ndv":1,"count":1,"pre_sum":6},{"lower":"2014-11-11","upper":"2014-11-11","ndv":1,"count":1,"pre_sum":7},{"lower":"2015-01-0
[...]
+
+-- !select_histogram_k11 --
+{"num_buckets":9,"buckets":[{"lower":"1901-01-01 00:00:00","upper":"1901-01-01
00:00:00","ndv":1,"count":1,"pre_sum":0},{"lower":"1989-03-21
13:00:00","upper":"1989-03-21
13:00:00","ndv":1,"count":2,"pre_sum":1},{"lower":"1989-03-21
13:11:00","upper":"1989-03-21
13:11:00","ndv":1,"count":2,"pre_sum":3},{"lower":"2000-01-01
00:00:00","upper":"2000-01-01
00:00:00","ndv":1,"count":1,"pre_sum":5},{"lower":"2013-04-02
15:16:52","upper":"2013-04-02 15:16:52","ndv":1,"count":2,"pre_sum":6},{"lo
[...]
+
+-- !select_histogram_k12 --
+{"num_buckets":1,"buckets":[{"lower":"string12345","upper":"string12345","ndv":1,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k13 --
+{"num_buckets":13,"buckets":[{"lower":"-170141183460469231731687303715884105727","upper":"-20220101","ndv":2,"count":2,"pre_sum":0},{"lower":"-11011903","upper":"-2022","ndv":2,"count":2,"pre_sum":2},{"lower":"0","upper":"0","ndv":1,"count":1,"pre_sum":4},{"lower":"11011903","upper":"11011903","ndv":1,"count":1,"pre_sum":5},{"lower":"20220101","upper":"20220101","ndv":1,"count":1,"pre_sum":6},{"lower":"20220102","upper":"20220102","ndv":1,"count":1,"pre_sum":7},{"lower":"20220104","upper
[...]
+
+-- !select_histogram_k0_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k1_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"10","upper":"12","ndv":3,"count":3,"pre_sum":0},{"lower":"13","upper":"15","ndv":3,"count":3,"pre_sum":3}]}
+
+-- !select_histogram_k3_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k4_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k5_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k6_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k7_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k10_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k11_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k12_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k13_all_null --
+{"num_buckets":0,"buckets":[]}
+
diff --git
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
index f119b6637dd..126c2b63f46 100644
---
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
+++
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
@@ -113,4 +113,99 @@ suite("test_aggregate_all_functions2") {
qt_select_minmax2 """ select max_by(datekey,hour) from metric_table; """
qt_select_minmax3 """ select bitmap_to_string(max_by(device_id,hour)) from
metric_table; """
qt_select_minmax4 """ select bitmap_to_string(min_by(device_id,hour)) from
metric_table; """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ qt_select_histogram_k0 """SELECT histogram(k0) FROM baseall"""
+ qt_select_histogram_k1 """SELECT histogram(k1, 1) FROM baseall"""
+ qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+ qt_select_histogram_k3 """SELECT histogram(k3, 3) FROM baseall"""
+ qt_select_histogram_k4 """SELECT histogram(k4, 4) FROM baseall"""
+ qt_select_histogram_k5 """SELECT histogram(k5, 5) FROM baseall"""
+ qt_select_histogram_k6 """SELECT histogram(k6, 6) FROM baseall"""
+ qt_select_histogram_k7 """SELECT histogram(k7, 7) FROM baseall"""
+ // the test case for double and float is removed, becase the result is not
stable since we have
+ // 0 and -0 in column k8, both of them are valid but we can not make both
of them stand in out file.
+// qt_select_histogram_k8 """SELECT histogram(k8, 8) FROM baseall"""
+// qt_select_histogram_k9 """SELECT histogram(k9, 9) FROM baseall"""
+ qt_select_histogram_k10 """SELECT histogram(k10, 10) FROM baseall"""
+ qt_select_histogram_k11 """SELECT histogram(k11, 11) FROM baseall"""
+ qt_select_histogram_k12 """SELECT histogram(k12, 12) FROM baseall"""
+ qt_select_histogram_k13 """SELECT histogram(k13, 13) FROM baseall"""
+
+ sql """
+ TRUNCATE TABLE baseall;
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+
+ sql """
+ INSERT INTO baseall values
+ (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL)
+ """
+ qt_select_histogram_k0_all_null """SELECT histogram(k0) FROM baseall"""
+ qt_select_histogram_k1_all_null """SELECT histogram(k1, 1) FROM baseall"""
+ qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+ qt_select_histogram_k3_all_null """SELECT histogram(k3, 3) FROM baseall"""
+ qt_select_histogram_k4_all_null """SELECT histogram(k4, 4) FROM baseall"""
+ qt_select_histogram_k5_all_null """SELECT histogram(k5, 5) FROM baseall"""
+ qt_select_histogram_k6_all_null """SELECT histogram(k6, 6) FROM baseall"""
+ qt_select_histogram_k7_all_null """SELECT histogram(k7, 7) FROM baseall"""
+// qt_select_histogram_k8_all_null """SELECT histogram(k8, 8) FROM
baseall"""
+// qt_select_histogram_k9_all_null """SELECT histogram(k9, 9) FROM
baseall"""
+ qt_select_histogram_k10_all_null """SELECT histogram(k10, 10) FROM
baseall"""
+ qt_select_histogram_k11_all_null """SELECT histogram(k11, 11) FROM
baseall"""
+ qt_select_histogram_k12_all_null """SELECT histogram(k12, 12) FROM
baseall"""
+ qt_select_histogram_k13_all_null """SELECT histogram(k13, 13) FROM
baseall"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]