This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit a83bf6c8bebd5a724e6955cc892206a8d1b2f7f6
Author: zhiqiang <[email protected]>
AuthorDate: Mon Aug 5 12:06:14 2024 +0800

    [fix](hist) Fix unstable result of aggregrate function hist (#38608)
    
    * Target
    
    Fix unstable result of hist function when involving null value.
    
    * Reproduce
    
    test result of
    
`regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy`
    is unstable, sql `SELECT histogram(k7, 5) FROM baseall` will sometimes
    acts like the second argument is not passed in.
    
    * Root reason
    
    We have short-circuit in AggregateFunctionNullVariadicInline, when this
    row is NULL, the value will not be added by the nested function.
    Implementation of histogram relies on its add method to get its seconds
    argument, when we have an all null value block, histogram will not get
    its seconds arg even if sql is like `select(k7, 5)`, so a max_bucket_num
    with default value 128 is serialized. When we do merging, and happens to
    deserialize the above block at last, the max_bucket_num in merge stage
    will be assigned to 128, and this leads to the wrong result.
    
    * Fix by
    
    Init value of max_bucket_num is assigned to 0, when we do merging, we
    will discard this aggregated data if its max_bucket_num is 0.
---
 .../aggregate_function_histogram.h                 | 41 ++++++----
 .../test_aggregate_all_functions2.out              | 72 ++++++++++++++++
 .../test_aggregate_all_functions2.groovy           | 95 ++++++++++++++++++++++
 3 files changed, 193 insertions(+), 15 deletions(-)

diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h 
b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
index cae2a88daf0..8fcd133b055 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
@@ -28,6 +28,8 @@
 #include <utility>
 #include <vector>
 
+#include "common/exception.h"
+#include "common/status.h"
 #include "vec/aggregate_functions/aggregate_function.h"
 #include "vec/aggregate_functions/aggregate_function_simple_factory.h"
 #include "vec/columns/column.h"
@@ -57,12 +59,10 @@ template <typename T>
 struct AggregateFunctionHistogramData {
     using ColVecType =
             std::conditional_t<IsDecimalNumber<T>, 
ColumnDecimal<Decimal128V2>, ColumnVector<T>>;
+    const static size_t DEFAULT_BUCKET_NUM = 128;
+    const static size_t BUCKET_NUM_INIT_VALUE = 0;
 
-    void set_parameters(int input_max_num_buckets) {
-        if (input_max_num_buckets > 0) {
-            max_num_buckets = (size_t)input_max_num_buckets;
-        }
-    }
+    void set_parameters(size_t input_max_num_buckets) { max_num_buckets = 
input_max_num_buckets; }
 
     void reset() { ordered_map.clear(); }
 
@@ -86,6 +86,8 @@ struct AggregateFunctionHistogramData {
     }
 
     void merge(const AggregateFunctionHistogramData& rhs) {
+        // if rhs.max_num_buckets == 0, it means the input block for 
serialization is all null
+        // we should discard this data, because histogram only fouce on the 
not-null data
         if (!rhs.max_num_buckets) {
             return;
         }
@@ -104,7 +106,6 @@ struct AggregateFunctionHistogramData {
 
     void write(BufferWritable& buf) const {
         write_binary(max_num_buckets, buf);
-
         size_t element_number = (size_t)ordered_map.size();
         write_binary(element_number, buf);
 
@@ -148,7 +149,13 @@ struct AggregateFunctionHistogramData {
     std::string get(const DataTypePtr& data_type) const {
         std::vector<Bucket<T>> buckets;
         rapidjson::StringBuffer buffer;
-        build_histogram(buckets, ordered_map, max_num_buckets);
+        // NOTE: We need an extral branch for to handle max_num_buckets == 0,
+        // when target column is nullable, and input block is all null,
+        // set_parameters will not be called because of the short-circuit in
+        // AggregateFunctionNullVariadicInline, so max_num_buckets will be 0 
in this situation.
+        build_histogram(
+                buckets, ordered_map,
+                max_num_buckets == BUCKET_NUM_INIT_VALUE ? DEFAULT_BUCKET_NUM 
: max_num_buckets);
         histogram_to_json(buffer, buckets, data_type);
         return std::string(buffer.GetString());
     }
@@ -162,7 +169,7 @@ struct AggregateFunctionHistogramData {
     }
 
 private:
-    size_t max_num_buckets = 128;
+    size_t max_num_buckets = BUCKET_NUM_INIT_VALUE;
     std::map<T, size_t> ordered_map;
 };
 
@@ -186,13 +193,17 @@ public:
 
     void add(AggregateDataPtr __restrict place, const IColumn** columns, 
ssize_t row_num,
              Arena* arena) const override {
-        if (columns[0]->is_null_at(row_num)) {
-            return;
-        }
-
-        if (has_input_param) {
-            this->data(place).set_parameters(
-                    assert_cast<const 
ColumnInt32*>(columns[1])->get_element(row_num));
+        if constexpr (has_input_param) {
+            Int32 input_max_num_buckets =
+                    assert_cast<const 
ColumnInt32*>(columns[1])->get_element(row_num);
+            if (input_max_num_buckets <= 0) {
+                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                       "Invalid max_num_buckets {}, row_num 
{}",
+                                       input_max_num_buckets, row_num);
+            }
+            this->data(place).set_parameters(input_max_num_buckets);
+        } else {
+            this->data(place).set_parameters(Data::DEFAULT_BUCKET_NUM);
         }
 
         if constexpr (std::is_same_v<T, std::string>) {
diff --git 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
index f41a52f0ae0..454d1b49353 100644
--- 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
+++ 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.out
@@ -118,3 +118,75 @@
 -- !select_minmax4 --
 243
 
+-- !select_histogram_k0 --
+{"num_buckets":2,"buckets":[{"lower":"0","upper":"0","ndv":1,"count":7,"pre_sum":0},{"lower":"1","upper":"1","ndv":1,"count":8,"pre_sum":7}]}
+
+-- !select_histogram_k1 --
+{"num_buckets":1,"buckets":[{"lower":"1","upper":"15","ndv":15,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"-32767","upper":"1985","ndv":9,"count":11,"pre_sum":0},{"lower":"1986","upper":"32767","ndv":5,"count":10,"pre_sum":11}]}
+
+-- !select_histogram_k3 --
+{"num_buckets":3,"buckets":[{"lower":"-2147483647","upper":"1001","ndv":3,"count":5,"pre_sum":0},{"lower":"1002","upper":"3021","ndv":2,"count":5,"pre_sum":5},{"lower":"5014","upper":"2147483647","ndv":3,"count":5,"pre_sum":10}]}
+
+-- !select_histogram_k4 --
+{"num_buckets":4,"buckets":[{"lower":"-9223372036854775807","upper":"123456","ndv":4,"count":5,"pre_sum":0},{"lower":"7210457","upper":"11011903","ndv":3,"count":5,"pre_sum":5},{"lower":"11011905","upper":"11011920","ndv":2,"count":3,"pre_sum":10},{"lower":"9223372036854775807","upper":"9223372036854775807","ndv":1,"count":2,"pre_sum":13}]}
+
+-- !select_histogram_k5 --
+{"num_buckets":5,"buckets":[{"lower":"-654.654","upper":"-0.123","ndv":3,"count":3,"pre_sum":0},{"lower":"0.000","upper":"0.666","ndv":2,"count":3,"pre_sum":3},{"lower":"3.141","upper":"123.123","ndv":3,"count":3,"pre_sum":6},{"lower":"243.325","upper":"1243.500","ndv":2,"count":3,"pre_sum":9},{"lower":"24453.325","upper":"604587.000","ndv":3,"count":3,"pre_sum":12}]}
+
+-- !select_histogram_k6 --
+{"num_buckets":2,"buckets":[{"lower":"false","upper":"false","ndv":1,"count":8,"pre_sum":0},{"lower":"true","upper":"true","ndv":1,"count":7,"pre_sum":8}]}
+
+-- !select_histogram_k7 --
+{"num_buckets":7,"buckets":[{"lower":"","upper":"du3lnvl","ndv":3,"count":3,"pre_sum":0},{"lower":"jiw3n4","upper":"lifsno","ndv":2,"count":2,"pre_sum":3},{"lower":"wangjuoo4","upper":"wangjuoo5","ndv":2,"count":3,"pre_sum":5},{"lower":"wangynnsf","upper":"wenlsfnl","ndv":2,"count":3,"pre_sum":8},{"lower":"yanavnd","upper":"yanavnd","ndv":1,"count":1,"pre_sum":11},{"lower":"yanvjldjlll","upper":"yanvjldjlll","ndv":1,"count":1,"pre_sum":12},{"lower":"yunlj8@nk","upper":"yunlj8@nk","ndv":1
 [...]
+
+-- !select_histogram_k10 --
+{"num_buckets":10,"buckets":[{"lower":"1901-12-31","upper":"1901-12-31","ndv":1,"count":1,"pre_sum":0},{"lower":"1988-03-21","upper":"1988-03-21","ndv":1,"count":1,"pre_sum":1},{"lower":"1989-03-21","upper":"1989-03-21","ndv":1,"count":2,"pre_sum":2},{"lower":"1991-08-11","upper":"1991-08-11","ndv":1,"count":2,"pre_sum":4},{"lower":"2012-03-14","upper":"2012-03-14","ndv":1,"count":1,"pre_sum":6},{"lower":"2014-11-11","upper":"2014-11-11","ndv":1,"count":1,"pre_sum":7},{"lower":"2015-01-0
 [...]
+
+-- !select_histogram_k11 --
+{"num_buckets":9,"buckets":[{"lower":"1901-01-01 00:00:00","upper":"1901-01-01 
00:00:00","ndv":1,"count":1,"pre_sum":0},{"lower":"1989-03-21 
13:00:00","upper":"1989-03-21 
13:00:00","ndv":1,"count":2,"pre_sum":1},{"lower":"1989-03-21 
13:11:00","upper":"1989-03-21 
13:11:00","ndv":1,"count":2,"pre_sum":3},{"lower":"2000-01-01 
00:00:00","upper":"2000-01-01 
00:00:00","ndv":1,"count":1,"pre_sum":5},{"lower":"2013-04-02 
15:16:52","upper":"2013-04-02 15:16:52","ndv":1,"count":2,"pre_sum":6},{"lo 
[...]
+
+-- !select_histogram_k12 --
+{"num_buckets":1,"buckets":[{"lower":"string12345","upper":"string12345","ndv":1,"count":15,"pre_sum":0}]}
+
+-- !select_histogram_k13 --
+{"num_buckets":13,"buckets":[{"lower":"-170141183460469231731687303715884105727","upper":"-20220101","ndv":2,"count":2,"pre_sum":0},{"lower":"-11011903","upper":"-2022","ndv":2,"count":2,"pre_sum":2},{"lower":"0","upper":"0","ndv":1,"count":1,"pre_sum":4},{"lower":"11011903","upper":"11011903","ndv":1,"count":1,"pre_sum":5},{"lower":"20220101","upper":"20220101","ndv":1,"count":1,"pre_sum":6},{"lower":"20220102","upper":"20220102","ndv":1,"count":1,"pre_sum":7},{"lower":"20220104","upper
 [...]
+
+-- !select_histogram_k0_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k1_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k2 --
+{"num_buckets":2,"buckets":[{"lower":"10","upper":"12","ndv":3,"count":3,"pre_sum":0},{"lower":"13","upper":"15","ndv":3,"count":3,"pre_sum":3}]}
+
+-- !select_histogram_k3_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k4_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k5_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k6_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k7_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k10_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k11_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k12_all_null --
+{"num_buckets":0,"buckets":[]}
+
+-- !select_histogram_k13_all_null --
+{"num_buckets":0,"buckets":[]}
+
diff --git 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
index f119b6637dd..126c2b63f46 100644
--- 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions2.groovy
@@ -113,4 +113,99 @@ suite("test_aggregate_all_functions2") {
     qt_select_minmax2 """ select max_by(datekey,hour) from metric_table; """
     qt_select_minmax3 """ select bitmap_to_string(max_by(device_id,hour)) from 
metric_table; """
     qt_select_minmax4 """ select bitmap_to_string(min_by(device_id,hour)) from 
metric_table; """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    qt_select_histogram_k0 """SELECT histogram(k0) FROM baseall"""
+    qt_select_histogram_k1 """SELECT histogram(k1, 1) FROM baseall"""
+    qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+    qt_select_histogram_k3 """SELECT histogram(k3, 3) FROM baseall"""
+    qt_select_histogram_k4 """SELECT histogram(k4, 4) FROM baseall"""
+    qt_select_histogram_k5 """SELECT histogram(k5, 5) FROM baseall"""
+    qt_select_histogram_k6 """SELECT histogram(k6, 6) FROM baseall"""
+    qt_select_histogram_k7 """SELECT histogram(k7, 7) FROM baseall"""
+    // the test case for double and float is removed, becase the result is not 
stable since we have
+    // 0 and -0 in column k8, both of them are valid but we can not make both 
of them stand in out file.
+//     qt_select_histogram_k8 """SELECT histogram(k8, 8) FROM baseall"""
+//     qt_select_histogram_k9 """SELECT histogram(k9, 9) FROM baseall"""
+    qt_select_histogram_k10 """SELECT histogram(k10, 10) FROM baseall"""
+    qt_select_histogram_k11 """SELECT histogram(k11, 11) FROM baseall"""
+    qt_select_histogram_k12 """SELECT histogram(k12, 12) FROM baseall"""
+    qt_select_histogram_k13 """SELECT histogram(k13, 13) FROM baseall"""
+
+    sql """
+    TRUNCATE TABLE baseall;
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 11, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 12, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 13, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 14, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+
+    sql """
+    INSERT INTO baseall values
+            (NULL, NULL, 15, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL)
+    """
+    qt_select_histogram_k0_all_null """SELECT histogram(k0) FROM baseall"""
+    qt_select_histogram_k1_all_null """SELECT histogram(k1, 1) FROM baseall"""
+    qt_select_histogram_k2 """SELECT histogram(k2, 2) FROM baseall"""
+    qt_select_histogram_k3_all_null """SELECT histogram(k3, 3) FROM baseall"""
+    qt_select_histogram_k4_all_null """SELECT histogram(k4, 4) FROM baseall"""
+    qt_select_histogram_k5_all_null """SELECT histogram(k5, 5) FROM baseall"""
+    qt_select_histogram_k6_all_null """SELECT histogram(k6, 6) FROM baseall"""
+    qt_select_histogram_k7_all_null """SELECT histogram(k7, 7) FROM baseall"""
+//     qt_select_histogram_k8_all_null """SELECT histogram(k8, 8) FROM 
baseall"""
+//     qt_select_histogram_k9_all_null """SELECT histogram(k9, 9) FROM 
baseall"""
+    qt_select_histogram_k10_all_null """SELECT histogram(k10, 10) FROM 
baseall"""
+    qt_select_histogram_k11_all_null """SELECT histogram(k11, 11) FROM 
baseall"""
+    qt_select_histogram_k12_all_null """SELECT histogram(k12, 12) FROM 
baseall"""
+    qt_select_histogram_k13_all_null """SELECT histogram(k13, 13) FROM 
baseall"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to