This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new c7b59b38ef8 [fix](hist) Fix unstable result of aggregrate function 
hist #38608 (#38893)
c7b59b38ef8 is described below

commit c7b59b38ef83dba21beb557cb99b76c8d90e8048
Author: zhiqiang <[email protected]>
AuthorDate: Tue Aug 6 08:52:03 2024 +0800

    [fix](hist) Fix unstable result of aggregrate function hist #38608 (#38893)
    
    cherry pick from #38608
---
 .../aggregate_function_histogram.h                 | 41 ++++++----
 .../test_aggregate_all_functions2.out              | 72 ++++++++++++++++
 .../test_aggregate_all_functions2.groovy           | 95 ++++++++++++++++++++++
 3 files changed, 193 insertions(+), 15 deletions(-)

diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h 
b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
index cae2a88daf0..8fcd133b055 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
@@ -28,6 +28,8 @@
 #include <utility>
 #include <vector>
 
+#include "common/exception.h"
+#include "common/status.h"
 #include "vec/aggregate_functions/aggregate_function.h"
 #include "vec/aggregate_functions/aggregate_function_simple_factory.h"
 #include "vec/columns/column.h"
@@ -57,12 +59,10 @@ template <typename T>
 struct AggregateFunctionHistogramData {
     using ColVecType =
             std::conditional_t<IsDecimalNumber<T>, 
ColumnDecimal<Decimal128V2>, ColumnVector<T>>;
+    const static size_t DEFAULT_BUCKET_NUM = 128;
+    const static size_t BUCKET_NUM_INIT_VALUE = 0;
 
-    void set_parameters(int input_max_num_buckets) {
-        if (input_max_num_buckets > 0) {
-            max_num_buckets = (size_t)input_max_num_buckets;
-        }
-    }
+    void set_parameters(size_t input_max_num_buckets) { max_num_buckets = 
input_max_num_buckets; }
 
     void reset() { ordered_map.clear(); }
 
@@ -86,6 +86,8 @@ struct AggregateFunctionHistogramData {
     }
 
     void merge(const AggregateFunctionHistogramData& rhs) {
+        // if rhs.max_num_buckets == 0, it means the input block for 
serialization is all null
+        // we should discard this data, because histogram only fouce on the 
not-null data
         if (!rhs.max_num_buckets) {
             return;
         }
@@ -104,7 +106,6 @@ struct AggregateFunctionHistogramData {
 
     void write(BufferWritable& buf) const {
         write_binary(max_num_buckets, buf);
-
         size_t element_number = (size_t)ordered_map.size();
         write_binary(element_number, buf);
 
@@ -148,7 +149,13 @@ struct AggregateFunctionHistogramData {
     std::string get(const DataTypePtr& data_type) const {
         std::vector<Bucket<T>> buckets;
         rapidjson::StringBuffer buffer;
-        build_histogram(buckets, ordered_map, max_num_buckets);
+        // NOTE: We need an extral branch for to handle max_num_buckets == 0,
+        // when target column is nullable, and input block is all null,
+        // set_parameters will not be called because of the short-circuit in
+        // AggregateFunctionNullVariadicInline, so max_num_buckets will be 0 
in this situation.
+        build_histogram(
+                buckets, ordered_map,
+                max_num_buckets == BUCKET_NUM_INIT_VALUE ? DEFAULT_BUCKET_NUM 
: max_num_buckets);
         histogram_to_json(buffer, buckets, data_type);
         return std::string(buffer.GetString());
     }
@@ -162,7 +169,7 @@ struct AggregateFunctionHistogramData {
     }
 
 private:
-    size_t max_num_buckets = 128;
+    size_t max_num_buckets = BUCKET_NUM_INIT_VALUE;
     std::map<T, size_t> ordered_map;
 };
 
@@ -186,13 +193,17 @@ public:
 
     void add(AggregateDataPtr __restrict place, const IColumn** columns, 
ssize_t row_num,
              Arena* arena) const override {
-        if (columns[0]->is_null_at(row_num)) {
-            return;
-        }
-
-        if (has_input_param) {
-            this->data(place).set_parameters(
-                    assert_cast<const 
ColumnInt32*>(columns[1])->get_element(row_num));
+        if constexpr (has_input_param) {
+            Int32 input_max_num_buckets =
+                    assert_cast<const 
ColumnInt32*>(columns[1])->get_element(row_num);
+            if (input_max_num_buckets <= 0) {
+                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                       "Invalid max_num_buckets {}, row_num 
{}",
+                                       input_max_num_buckets, row_num);
+            }
+            this->data(place).set_parameters(input_max_num_buckets);
+        } else {
+            this->data(place).set_parameters(Data::DEFAULT_BUCKET_NUM);
         }
 
         if constexpr (std::is_same_v<T, std::string>) {
diff --git 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
index 0d83f2238a9..683461806d6 100644
--- 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
+++ 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
@@ -118,3 +118,75 @@
 -- !select_minmax4 --
 243
 
+-- !select_histogram_k0 --
+{"num_buckets":2,"buckets":[{"lower":"0","upper":"0","ndv":1,"count":7,"pre_sum":0},{"lower":"1","upper":"1","ndv":1,"count":8,"pre_sum":7}]}
+
+-- !select_histogram_k1 --
+{"num_buckets":1,"buckets":[{"lower":"1","upper":"15","ndv":15,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"-32767","upper":"1985","ndv":9,"count":11,"pre_sum":0},{"lower":"1986","upper":"32767","ndv":5,"count":10,"pre_sum":11}]}
+
+-- !select_histogram_k3 --
+{"num_buckets":3,"buckets":[{"lower":"-2147483647","upper":"1001","ndv":3,"count":5,"pre_sum":0},{"lower":"1002","upper":"3021","ndv":2,"count":5,"pre_sum":5},{"lower":"5014","upper":"2147483647","ndv":3,"count":5,"pre_sum":10}]}
+
+-- !select_histogram_k4 --
+{"num_buckets":4,"buckets":[{"lower":"-9223372036854775807","upper":"123456","ndv":4,"count":5,"pre_sum":0},{"lower":"7210457","upper":"11011903","ndv":3,"count":5,"pre_sum":5},{"lower":"11011905","upper":"11011920","ndv":2,"count":3,"pre_sum":10},{"lower":"9223372036854775807","upper":"9223372036854775807","ndv":1,"count":2,"pre_sum":13}]}
+
+-- !select_histogram_k5 --
+{"num_buckets":5,"buckets":[{"lower":"-654.654","upper":"-0.123","ndv":3,"count":3,"pre_sum":0},{"lower":"0.000","upper":"0.666","ndv":2,"count":3,"pre_sum":3},{"lower":"3.141","upper":"123.123","ndv":3,"count":3,"pre_sum":6},{"lower":"243.325","upper":"1243.500","ndv":2,"count":3,"pre_sum":9},{"lower":"24453.325","upper":"604587.000","ndv":3,"count":3,"pre_sum":12}]}
+
+-- !select_histogram_k6 --
+{"num_buckets":2,"buckets":[{"lower":"false","upper":"false","ndv":1,"count":8,"pre_sum":0},{"lower":"true","upper":"true","ndv":1,"count":7,"pre_sum":8}]}
+
+-- !select_histogram_k7 --
+{"num_buckets":7,"buckets":[{"lower":"","upper":"du3lnvl","ndv":3,"count":3,"pre_sum":0},{"lower":"jiw3n4","upper":"lifsno","ndv":2,"count":2,"pre_sum":3},{"lower":"wangjuoo4","upper":"wangjuoo5","ndv":2,"count":3,"pre_sum":5},{"lower":"wangynnsf","upper":"wenlsfnl","ndv":2,"count":3,"pre_sum":8},{"lower":"yanavnd","upper":"yanavnd","ndv":1,"count":1,"pre_sum":11},{"lower":"yanvjldjlll","upper":"yanvjldjlll","ndv":1,"count":1,"pre_sum":12},{"lower":"yunlj8@nk","upper":"yunlj8@nk","ndv":1
 [...]
+
+-- !select_histogram_k10 --
+{"num_buckets":10,"buckets":[{"lower":"1901-12-31","upper":"1901-12-31","ndv":1,"count":1,"pre_sum":0},{"lower":"1988-03-21","upper":"1988-03-21","ndv":1,"count":1,"pre_sum":1},{"lower":"1989-03-21","upper":"1989-03-21","ndv":1,"count":2,"pre_sum":2},{"lower":"1991-08-11","upper":"1991-08-11","ndv":1,"count":2,"pre_sum":4},{"lower":"2012-03-14","upper":"2012-03-14","ndv":1,"count":1,"pre_sum":6},{"lower":"2014-11-11","upper":"2014-11-11","ndv":1,"count":1,"pre_sum":7},{"lower":"2015-01-0
 [...]
+
+-- !select_histogram_k11 --
+{"num_buckets":9,"buckets":[{"lower":"1901-01-01 00:00:00","upper":"1901-01-01 
00:00:00","ndv":1,"count":1,"pre_sum":0},{"lower":"1989-03-21 
13:00:00","upper":"1989-03-21 
13:00:00","ndv":1,"count":2,"pre_sum":1},{"lower":"1989-03-21 
13:11:00","upper":"1989-03-21 
13:11:00","ndv":1,"count":2,"pre_sum":3},{"lower":"2000-01-01 
00:00:00","upper":"2000-01-01 
00:00:00","ndv":1,"count":1,"pre_sum":5},{"lower":"2013-04-02 
15:16:52","upper":"2013-04-02 15:16:52","ndv":1,"count":2,"pre_sum":6},{"lo 
[...]
+
+-- !select_histogram_k12 --
+{"num_buckets":1,"buckets":[{"lower":"string12345","upper":"string12345","ndv":1,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k13 --
+{"num_buckets":13,"buckets":[{"lower":"-170141183460469231731687303715884105727","upper":"-20220101","ndv":2,"count":2,"pre_sum":0},{"lower":"-11011903","upper":"-2022","ndv":2,"count":2,"pre_sum":2},{"lower":"0","upper":"0","ndv":1,"count":1,"pre_sum":4},{"lower":"11011903","upper":"11011903","ndv":1,"count":1,"pre_sum":5},{"lower":"20220101","upper":"20220101","ndv":1,"count":1,"pre_sum":6},{"lower":"20220102","upper":"20220102","ndv":1,"count":1,"pre_sum":7},{"lower":"20220104","upper
 [...]
+
+-- !select_histogram_k0_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k1_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"10","upper":"12","ndv":3,"count":3,"pre_sum":0},{"lower":"13","upper":"15","ndv":3,"count":3,"pre_sum":3}]}
+
+-- !select_histogram_k3_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k4_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k5_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k6_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k7_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k10_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k11_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k12_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k13_all_null --
+{"num_buckets":0,"buckets":[]}
+
diff --git 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
index f119b6637dd..126c2b63f46 100644
--- 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
@@ -113,4 +113,99 @@ suite("test_aggregate_all_functions2") {
     qt_select_minmax2 """ select max_by(datekey,hour) from metric_table; """
     qt_select_minmax3 """ select bitmap_to_string(max_by(device_id,hour)) from 
metric_table; """
     qt_select_minmax4 """ select bitmap_to_string(min_by(device_id,hour)) from 
metric_table; """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    qt_select_histogram_k0 """SELECT histogram(k0) FROM baseall"""
+    qt_select_histogram_k1 """SELECT histogram(k1, 1) FROM baseall"""
+    qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+    qt_select_histogram_k3 """SELECT histogram(k3, 3) FROM baseall"""
+    qt_select_histogram_k4 """SELECT histogram(k4, 4) FROM baseall"""
+    qt_select_histogram_k5 """SELECT histogram(k5, 5) FROM baseall"""
+    qt_select_histogram_k6 """SELECT histogram(k6, 6) FROM baseall"""
+    qt_select_histogram_k7 """SELECT histogram(k7, 7) FROM baseall"""
+    // the test case for double and float is removed, becase the result is not 
stable since we have
+    // 0 and -0 in column k8, both of them are valid but we can not make both 
of them stand in out file.
+//     qt_select_histogram_k8 """SELECT histogram(k8, 8) FROM baseall"""
+//     qt_select_histogram_k9 """SELECT histogram(k9, 9) FROM baseall"""
+    qt_select_histogram_k10 """SELECT histogram(k10, 10) FROM baseall"""
+    qt_select_histogram_k11 """SELECT histogram(k11, 11) FROM baseall"""
+    qt_select_histogram_k12 """SELECT histogram(k12, 12) FROM baseall"""
+    qt_select_histogram_k13 """SELECT histogram(k13, 13) FROM baseall"""
+
+    sql """
+    TRUNCATE TABLE baseall;
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+    qt_select_histogram_k0_all_null """SELECT histogram(k0) FROM baseall"""
+    qt_select_histogram_k1_all_null """SELECT histogram(k1, 1) FROM baseall"""
+    qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+    qt_select_histogram_k3_all_null """SELECT histogram(k3, 3) FROM baseall"""
+    qt_select_histogram_k4_all_null """SELECT histogram(k4, 4) FROM baseall"""
+    qt_select_histogram_k5_all_null """SELECT histogram(k5, 5) FROM baseall"""
+    qt_select_histogram_k6_all_null """SELECT histogram(k6, 6) FROM baseall"""
+    qt_select_histogram_k7_all_null """SELECT histogram(k7, 7) FROM baseall"""
+//     qt_select_histogram_k8_all_null """SELECT histogram(k8, 8) FROM 
baseall"""
+//     qt_select_histogram_k9_all_null """SELECT histogram(k9, 9) FROM 
baseall"""
+    qt_select_histogram_k10_all_null """SELECT histogram(k10, 10) FROM 
baseall"""
+    qt_select_histogram_k11_all_null """SELECT histogram(k11, 11) FROM 
baseall"""
+    qt_select_histogram_k12_all_null """SELECT histogram(k12, 12) FROM 
baseall"""
+    qt_select_histogram_k13_all_null """SELECT histogram(k13, 13) FROM 
baseall"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to