This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 76ad599fd7 [enhancement](histogram) optimise aggregate function histogram (#15317)
76ad599fd7 is described below

commit 76ad599fd7411e490c479629516756e1c821ffe4
Author: ElvinWei <[email protected]>
AuthorDate: Sat Jan 7 00:50:32 2023 +0800

    [enhancement](histogram) optimise aggregate function histogram (#15317)
    
    This PR mainly optimizes the histogram aggregation function (see https://github.com/apache/doris/pull/14910), including the following:
    1. Support input parameters `sample_rate` and `max_bucket_num`
    2. Add UT and regression test
    3. Add documentation
    4. Optimize function implementation logic
    
    Parameter description:
    - `sample_rate`: Optional. The proportion of sample data used to generate the histogram. The default is 0.2.
    - `max_bucket_num`: Optional. Limit the number of histogram buckets. The default value is 128.
    
    ---
    
    Example:
    
    ```
    MySQL [test]> SELECT histogram(c_float) FROM histogram_test;
    +-------------------------------------------------------------------------------------------------------------------------------------+
    | histogram(`c_float`)                                                                                                                |
    +-------------------------------------------------------------------------------------------------------------------------------------+
    | {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} |
    +-------------------------------------------------------------------------------------------------------------------------------------+

    MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test;
    +-------------------------------------------------------------------------------------------------------------------------------------+
    | histogram(`c_string`)                                                                                                               |
    +-------------------------------------------------------------------------------------------------------------------------------------+
    | {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} |
    +-------------------------------------------------------------------------------------------------------------------------------------+
    ```
    
    Query result description:
    
    ```
    {
        "sample_rate": 0.2,
        "max_bucket_num": 128,
        "bucket_num": 3,
        "buckets": [
            {
                "lower": "0.1",
                "upper": "0.2",
                "count": 2,
                "pre_sum": 0,
                "ndv": 2
            },
            {
                "lower": "0.8",
                "upper": "0.9",
                "count": 2,
                "pre_sum": 2,
                "ndv": 2
            },
            {
                "lower": "1.0",
                "upper": "1.0",
                "count": 2,
                "pre_sum": 4,
                "ndv": 1
            }
        ]
    }
    ```
    
    Field description:
    - sample_rate: Rate of sampling
    - max_bucket_num: Limit on the maximum number of buckets
    - bucket_num: The actual number of buckets
    - buckets: All buckets
        - lower: Lower bound of the bucket
        - upper: Upper bound of the bucket
        - count: The number of elements contained in the bucket
        - pre_sum: The total number of elements in the preceding buckets
        - ndv: The number of distinct values in the bucket
    
    > Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in the preceding buckets (pre_sum). In the example above, that is 2 + 4 = 6.
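    
    For illustration, the equal-height bucketing rule can be sketched standalone. This is a simplified reconstruction of the logic added in `histogram_helpers.hpp` below (names and types are illustrative stand-ins, not the actual BE classes):
    
    ```
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>
    
    // Illustrative stand-in for the BE's Bucket<T>.
    struct Bucket {
        int64_t lower = 0;
        int64_t upper = 0;
        uint64_t count = 1;
        uint64_t pre_sum = 0;
        uint64_t ndv = 1;
    };
    
    // Equal-height bucketing over a sorted value -> occurrence-count map:
    // equal values always share a bucket; a bucket absorbs new values until it
    // holds num_per_bucket elements; then a new bucket starts whose pre_sum is
    // the total element count of all previous buckets.
    std::vector<Bucket> build_buckets(const std::map<int64_t, uint64_t>& ordered,
                                      uint64_t num_per_bucket) {
        std::vector<Bucket> buckets;
        for (const auto& [value, occurrences] : ordered) {
            for (uint64_t i = 0; i < occurrences; ++i) {
                if (buckets.empty()) {
                    buckets.push_back({value, value});
                    continue;
                }
                Bucket& b = buckets.back();
                if (b.upper == value) {
                    b.count++;  // duplicate value stays in the same bucket
                } else if (b.count < num_per_bucket) {
                    b.count++;
                    b.ndv++;
                    b.upper = value;  // bucket still has room, extend it
                } else {
                    // bucket is full: open a new one, carrying the running total
                    buckets.push_back({value, value, 1, b.pre_sum + b.count, 1});
                }
            }
        }
        return buckets;
    }
    
    int main() {
        // 6 elements, 2 per bucket -> 3 buckets with pre_sum 0, 2, 4,
        // mirroring the bucket layout in the JSON example above.
        std::map<int64_t, uint64_t> data = {{1, 1}, {2, 1}, {8, 1}, {9, 1}, {10, 2}};
        for (const auto& b : build_buckets(data, 2)) {
            std::cout << "lower=" << b.lower << " upper=" << b.upper << " count=" << b.count
                      << " pre_sum=" << b.pre_sum << " ndv=" << b.ndv << "\n";
        }
        return 0;
    }
    ```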
---
 .../aggregate_function_histogram.cpp               |  67 ++--
 .../aggregate_function_histogram.h                 | 354 ++++++---------------
 be/src/vec/utils/histogram_helpers.hpp             | 270 ++++++++++++++++
 .../vec/aggregate_functions/agg_histogram_test.cpp | 153 +++++++--
 .../sql-functions/aggregate-functions/histogram.md |  86 ++---
 .../sql-functions/aggregate-functions/histogram.md |  87 ++---
 .../java/org/apache/doris/catalog/FunctionSet.java |   2 +
 .../test_aggregate_histogram.out                   |  14 +
 .../test_aggregate_histogram.groovy                | 214 +++++++++++++
 9 files changed, 868 insertions(+), 379 deletions(-)

diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp
index b400b37edc..5ab7dd6035 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp
+++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp
@@ -17,12 +17,24 @@
 
 #include "vec/aggregate_functions/aggregate_function_histogram.h"
 
+#include "vec/aggregate_functions/helpers.h"
+#include "vec/core/types.h"
+
 namespace doris::vectorized {
 
 template <typename T>
 AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_types) {
-    return AggregateFunctionPtr(
-            new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T>(argument_types));
+    bool has_input_param = (argument_types.size() == 3);
+
+    if (has_input_param) {
+        return AggregateFunctionPtr(
+                new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, true>(
+                        argument_types));
+    } else {
+        return AggregateFunctionPtr(
+                new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, false>(
+                        argument_types));
+    }
 }
 
 AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name,
@@ -31,34 +43,37 @@ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name
                                                          const bool result_is_nullable) {
     WhichDataType type(argument_types[0]);
 
-    if (type.is_uint8()) {
-        return create_agg_function_histogram<UInt8>(argument_types);
-    } else if (type.is_int8()) {
-        return create_agg_function_histogram<Int8>(argument_types);
-    } else if (type.is_int16()) {
-        return create_agg_function_histogram<Int16>(argument_types);
-    } else if (type.is_int32()) {
-        return create_agg_function_histogram<Int32>(argument_types);
-    } else if (type.is_int64()) {
+    LOG(INFO) << fmt::format("supported input type {} for aggregate function {}",
+                             argument_types[0]->get_name(), name);
+
+#define DISPATCH(TYPE) \
+    if (type.idx == TypeIndex::TYPE) return create_agg_function_histogram<TYPE>(argument_types);
+    FOR_NUMERIC_TYPES(DISPATCH)
+#undef DISPATCH
+
+    if (type.idx == TypeIndex::String) {
+        return create_agg_function_histogram<String>(argument_types);
+    }
+    if (type.idx == TypeIndex::DateTime || type.idx == TypeIndex::Date) {
         return create_agg_function_histogram<Int64>(argument_types);
-    } else if (type.is_int128()) {
-        return create_agg_function_histogram<Int128>(argument_types);
-    } else if (type.is_float32()) {
-        return create_agg_function_histogram<Float32>(argument_types);
-    } else if (type.is_float64()) {
-        return create_agg_function_histogram<Float64>(argument_types);
-    } else if (type.is_decimal32()) {
+    }
+    if (type.idx == TypeIndex::DateV2) {
+        return create_agg_function_histogram<UInt32>(argument_types);
+    }
+    if (type.idx == TypeIndex::DateTimeV2) {
+        return create_agg_function_histogram<UInt64>(argument_types);
+    }
+    if (type.idx == TypeIndex::Decimal32) {
         return create_agg_function_histogram<Decimal32>(argument_types);
-    } else if (type.is_decimal64()) {
+    }
+    if (type.idx == TypeIndex::Decimal64) {
         return create_agg_function_histogram<Decimal64>(argument_types);
-    } else if (type.is_decimal128()) {
+    }
+    if (type.idx == TypeIndex::Decimal128) {
         return create_agg_function_histogram<Decimal128>(argument_types);
-    } else if (type.is_date()) {
-        return create_agg_function_histogram<Int64>(argument_types);
-    } else if (type.is_date_time()) {
-        return create_agg_function_histogram<Int64>(argument_types);
-    } else if (type.is_string()) {
-        return create_agg_function_histogram<StringRef>(argument_types);
+    }
+    if (type.idx == TypeIndex::Decimal128I) {
+        return create_agg_function_histogram<Decimal128I>(argument_types);
     }
 
     LOG(WARNING) << fmt::format("unsupported input type {} for aggregate function {}",
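The `FOR_NUMERIC_TYPES(DISPATCH)` block above collapses the old `if`/`else if` chain into an X-macro that stamps out one early-return branch per numeric type. A minimal self-contained sketch of that pattern (the type list and helper names here are illustrative, not Doris's actual definitions):

```
#include <iostream>
#include <typeinfo>

// Illustrative type list; Doris's FOR_NUMERIC_TYPES covers its own set of types.
struct Int32 {};
struct Int64 {};
struct Float64 {};

#define FOR_NUMERIC_TYPES(M) \
    M(Int32)                 \
    M(Int64)                 \
    M(Float64)

enum class TypeIndex { Int32, Int64, Float64, String };

template <typename T>
const char* create_agg_function() {
    return typeid(T).name();  // stand-in for constructing the aggregate function
}

const char* dispatch(TypeIndex idx) {
// Each DISPATCH expansion becomes one early-return branch.
#define DISPATCH(TYPE) \
    if (idx == TypeIndex::TYPE) return create_agg_function<TYPE>();
    FOR_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    return "unsupported";
}

int main() {
    std::cout << dispatch(TypeIndex::Int64) << "\n";  // prints the type name for Int64
}
```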
diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
index f92edd7090..5f036a398d 100644
--- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h
+++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h
@@ -17,299 +17,140 @@
 
 #pragma once
 
-#include <rapidjson/document.h>
-#include <rapidjson/prettywriter.h>
-#include <rapidjson/stringbuffer.h>
-
-#include <cmath>
-#include <iostream>
-
-#include "runtime/datetime_value.h"
 #include "vec/aggregate_functions/aggregate_function.h"
 #include "vec/aggregate_functions/aggregate_function_simple_factory.h"
-#include "vec/io/io_helper.h"
+#include "vec/utils/histogram_helpers.hpp"
 
 namespace doris::vectorized {
 
-// TODO: support input parameters and statistics of sampling
-const int64_t MAX_BUCKET_SIZE = 128;
-const float_t SAMPLE_RATE = 1.0;
-
 template <typename T>
-struct Bucket {
-public:
-    Bucket() = default;
-    Bucket(T value, size_t pre_sum)
-            : lower(value), upper(value), count(1), pre_sum(pre_sum), ndv(1) {}
-    Bucket(T lower, T upper, size_t count, size_t pre_sum, size_t ndv)
-            : lower(lower), upper(upper), count(count), pre_sum(pre_sum), ndv(ndv) {}
-
-    T lower;
-    T upper;
-    int64_t count;
-    int64_t pre_sum;
-    int64_t ndv;
-};
+struct AggregateFunctionHistogramData {
+    using ColVecType =
+            std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<Decimal128>, ColumnVector<T>>;
 
-struct AggregateFunctionHistogramBase {
-public:
-    AggregateFunctionHistogramBase() = default;
-
-    template <typename T>
-    static std::vector<Bucket<T>> build_bucket_from_data(const std::vector<T>& sorted_data,
-                                                         int64_t max_bucket_size) {
-        std::vector<Bucket<T>> buckets;
-
-        if (sorted_data.size() > 0) {
-            int64_t data_size = sorted_data.size();
-            int num_per_bucket = (int64_t)std::ceil((Float64)data_size / max_bucket_size);
-
-            for (int i = 0; i < data_size; ++i) {
-                T v = sorted_data[i];
-                if (buckets.empty()) {
-                    Bucket<T> bucket(v, 0);
-                    buckets.emplace_back(bucket);
-                } else {
-                    Bucket<T>* bucket = &buckets.back();
-                    T upper = bucket->upper;
-                    if (upper == v) {
-                        bucket->count++;
-                    } else if (bucket->count < num_per_bucket) {
-                        bucket->count++;
-                        bucket->ndv++;
-                        bucket->upper = v;
-                    } else {
-                        int64_t pre_sum = bucket->pre_sum + bucket->count;
-                        Bucket<T> new_bucket(v, pre_sum);
-                        buckets.emplace_back(new_bucket);
-                    }
-                }
-            }
+    void set_parameters(double input_sample_rate, size_t input_max_bucket_num) {
+        if (input_sample_rate > 0 && input_sample_rate <= 1) {
+            sample_rate = input_sample_rate;
         }
-
-        return buckets;
-    }
-
-    template <typename T>
-    static std::string build_json_from_bucket(const std::vector<Bucket<T>>& buckets,
-                                              const DataTypePtr& data_type, int64_t max_bucket_size,
-                                              int64_t sample_rate) {
-        rapidjson::Document doc;
-        doc.SetObject();
-        rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
-
-        rapidjson::Value max_bucket_size_val(max_bucket_size);
-        doc.AddMember("max_bucket_size", max_bucket_size_val, allocator);
-
-        rapidjson::Value sample_rate_val(sample_rate);
-        doc.AddMember("sample_rate", sample_rate_val, allocator);
-
-        // buckets
-        rapidjson::Value bucket_arr(rapidjson::kArrayType);
-
-        if (!buckets.empty()) {
-            int size = buckets.size();
-            rapidjson::Value bucket_size_val(size);
-            doc.AddMember("bucket_size", bucket_size_val, allocator);
-
-            WhichDataType type(data_type);
-            if (type.is_int() || type.is_float() || type.is_decimal() || type.is_string()) {
-                for (int i = 0; i < size; ++i) {
-                    std::string lower_str = numerical_to_string(buckets[i].lower);
-                    std::string upper_str = numerical_to_string(buckets[i].upper);
-                    to_bucket_json(allocator, bucket_arr, lower_str, upper_str,
-                                   (int64_t)(buckets[i].count), (int64_t)(buckets[i].pre_sum),
-                                   (int64_t)(buckets[i].ndv));
-                }
-            } else if (type.is_date_or_datetime()) {
-                for (int i = 0; i < size; ++i) {
-                    std::string lower_str = to_date_string(buckets[i].lower);
-                    std::string upper_str = to_date_string(buckets[i].upper);
-                    to_bucket_json(allocator, bucket_arr, lower_str, upper_str,
-                                   (int64_t)(buckets[i].count), (int64_t)(buckets[i].pre_sum),
-                                   (int64_t)(buckets[i].ndv));
-                }
-            } else {
-                rapidjson::Value bucket_size_zero(0);
-                doc.AddMember("bucket_size", bucket_size_zero, allocator);
-                LOG(WARNING) << fmt::format("unable to convert histogram data of type {}",
-                                            data_type->get_name());
-            }
+        if (input_max_bucket_num > 0) {
+            max_bucket_num = (uint32_t)input_max_bucket_num;
         }
-
-        doc.AddMember("buckets", bucket_arr, allocator);
-
-        rapidjson::StringBuffer sb;
-        rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
-        doc.Accept(writer);
-
-        return std::string(sb.GetString());
     }
 
-    static void to_bucket_json(rapidjson::Document::AllocatorType& allocator,
-                               rapidjson::Value& bucket_arr, std::string lower, std::string upper,
-                               int64 count, int64 pre_sum, int64 ndv) {
-        rapidjson::Value bucket(rapidjson::kObjectType);
-
-        rapidjson::Value lower_val(lower.c_str(), allocator);
-        bucket.AddMember("lower", lower_val, allocator);
+    void reset() { ordered_map.clear(); }
 
-        rapidjson::Value upper_val(upper.c_str(), allocator);
-        bucket.AddMember("upper", upper_val, allocator);
-
-        rapidjson::Value count_val(count);
-        bucket.AddMember("count", count_val, allocator);
-
-        rapidjson::Value pre_sum_val(pre_sum);
-        bucket.AddMember("pre_sum", pre_sum_val, allocator);
-
-        rapidjson::Value ndv_val(ndv);
-        bucket.AddMember("ndv", ndv_val, allocator);
-
-        bucket_arr.PushBack(bucket, allocator);
-    }
-
-private:
-    template <typename T>
-    static std::string numerical_to_string(T input) {
-        fmt::memory_buffer buffer;
-        fmt::format_to(buffer, "{}", input);
-        return std::string(buffer.data(), buffer.size());
-    }
-
-    template <typename T>
-    static std::string to_date_string(T input) {
-        auto* date_int = reinterpret_cast<vectorized::Int64*>(&input);
-        auto date_value = binary_cast<vectorized::Int64, vectorized::VecDateTimeValue>(*date_int);
-        char buf[32] = {};
-        date_value.to_string(buf);
-        return std::string(buf, strlen(buf));
-    }
-};
-
-template <typename T>
-struct AggregateFunctionHistogramData : public AggregateFunctionHistogramBase {
-    using ElementType = T;
-    using ColVecType = ColumnVectorOrDecimal<ElementType>;
-    PaddedPODArray<ElementType> data;
-
-    void add(const IColumn& column, size_t row_num) {
-        const auto& vec = assert_cast<const ColVecType&>(column).get_data();
-        data.push_back(vec[row_num]);
-    }
-
-    void write(BufferWritable& buf) const {
-        write_var_uint(data.size(), buf);
-        buf.write(data.raw_data(), data.size() * sizeof(ElementType));
+    void add(const StringRef& value, const UInt64& number = 1) {
+        std::string data = value.to_string();
+        auto it = ordered_map.find(data);
+        if (it != ordered_map.end()) {
+            it->second = it->second + number;
+        } else {
+            ordered_map.insert({data, number});
+        }
     }
 
-    void read(BufferReadable& buf) {
-        UInt64 rows = 0;
-        read_var_uint(rows, buf);
-        data.resize(rows);
-        buf.read(reinterpret_cast<char*>(data.data()), rows * sizeof(ElementType));
+    void add(const T& value, const UInt64& number = 1) {
+        auto it = ordered_map.find(value);
+        if (it != ordered_map.end()) {
+            it->second = it->second + number;
+        } else {
+            ordered_map.insert({value, number});
+        }
     }
 
     void merge(const AggregateFunctionHistogramData& rhs) {
-        data.insert(rhs.data.begin(), rhs.data.end());
-    }
-
-    void insert_result_into(IColumn& to) const {
-        auto& vec = assert_cast<ColVecType&>(to).get_data();
-        size_t old_size = vec.size();
-        vec.resize(old_size + data.size());
-        memcpy(vec.data() + old_size, data.data(), data.size() * sizeof(ElementType));
-    }
-
-    std::string get(const DataTypePtr& data_type) const {
-        std::vector<ElementType> vec_data;
-
-        for (size_t i = 0; i < data.size(); ++i) {
-            [[maybe_unused]] ElementType d = data[i];
-            vec_data.push_back(d);
+        if (!rhs.sample_rate) {
+            return;
         }
 
-        std::sort(vec_data.begin(), vec_data.end());
-        auto buckets = build_bucket_from_data<ElementType>(vec_data, MAX_BUCKET_SIZE);
-        auto result_str = build_json_from_bucket<ElementType>(buckets, data_type, MAX_BUCKET_SIZE,
-                                                              SAMPLE_RATE);
-
-        return result_str;
+        for (auto rhs_it : rhs.ordered_map) {
+            auto lhs_it = ordered_map.find(rhs_it.first);
+            if (lhs_it != ordered_map.end()) {
+                lhs_it->second += rhs_it.second;
+            } else {
+                ordered_map.insert({rhs_it.first, rhs_it.second});
+            }
+        }
     }
 
-    void reset() { data.clear(); }
-};
-
-template <>
struct AggregateFunctionHistogramData<StringRef> : public AggregateFunctionHistogramBase {
-    using ElementType = StringRef;
-    using ColVecType = ColumnString;
-    MutableColumnPtr data;
-
-    AggregateFunctionHistogramData<ElementType>() { data = ColVecType::create(); }
-
-    void add(const IColumn& column, size_t row_num) { data->insert_from(column, row_num); }
-
     void write(BufferWritable& buf) const {
-        auto& col = assert_cast<ColVecType&>(*data);
+        write_binary(sample_rate, buf);
+        write_binary(max_bucket_num, buf);
 
-        write_var_uint(col.size(), buf);
-        buf.write(col.get_offsets().raw_data(), col.size() * sizeof(IColumn::Offset));
+        uint64_t element_number = (uint64_t)ordered_map.size();
+        write_binary(element_number, buf);
 
-        write_var_uint(col.get_chars().size(), buf);
-        buf.write(col.get_chars().raw_data(), col.get_chars().size());
-    }
+        auto pair_vector = map_to_vector();
 
-    void read(BufferReadable& buf) {
-        auto& col = assert_cast<ColVecType&>(*data);
-        UInt64 offs_size = 0;
-        read_var_uint(offs_size, buf);
-        col.get_offsets().resize(offs_size);
-        buf.read(reinterpret_cast<char*>(col.get_offsets().data()),
-                 offs_size * sizeof(IColumn::Offset));
-
-        UInt64 chars_size = 0;
-        read_var_uint(chars_size, buf);
-        col.get_chars().resize(chars_size);
-        buf.read(reinterpret_cast<char*>(col.get_chars().data()), chars_size);
+        for (auto i = 0; i < element_number; i++) {
+            auto element = pair_vector[i];
+            write_binary(element.second, buf);
+            write_binary(element.first, buf);
+        }
     }
 
-    void merge(const AggregateFunctionHistogramData& rhs) {
-        data->insert_range_from(*rhs.data, 0, rhs.data->size());
+    void read(BufferReadable& buf) {
+        read_binary(sample_rate, buf);
+        read_binary(max_bucket_num, buf);
+
+        uint64_t element_number = 0;
+        read_binary(element_number, buf);
+
+        ordered_map.clear();
+        std::pair<T, uint64_t> element;
+        for (auto i = 0; i < element_number; i++) {
+            read_binary(element.first, buf);
+            read_binary(element.second, buf);
+            ordered_map.insert(element);
+        }
     }
 
     void insert_result_into(IColumn& to) const {
-        auto& to_str = assert_cast<ColVecType&>(to);
-        to_str.insert_range_from(*data, 0, data->size());
+        auto pair_vector = map_to_vector();
+        for (auto i = 0; i < pair_vector.size(); i++) {
+            const auto& element = pair_vector[i];
+            if constexpr (std::is_same_v<T, std::string>) {
+                static_cast<ColumnString&>(to).insert_data(element.second.c_str(),
+                                                           element.second.length());
+            } else {
+                static_cast<ColVecType&>(to).get_data().push_back(element.second);
+            }
+        }
     }
 
     std::string get(const DataTypePtr& data_type) const {
-        std::vector<std::string> str_data;
-        auto* res_column = reinterpret_cast<ColVecType*>(data.get());
+        std::vector<Bucket<T>> buckets;
+        rapidjson::StringBuffer buffer;
+        build_bucket_from_data(buckets, ordered_map, sample_rate, max_bucket_num);
+        build_json_from_bucket(buffer, buckets, data_type, sample_rate, max_bucket_num);
+        return std::string(buffer.GetString());
+    }
 
-        for (int i = 0; i < res_column->size(); ++i) {
-            [[maybe_unused]] ElementType c = res_column->get_data_at(i);
-            str_data.push_back(c.to_string());
+    std::vector<std::pair<uint64_t, T>> map_to_vector() const {
+        std::vector<std::pair<uint64_t, T>> pair_vector;
+        for (auto it : ordered_map) {
+            pair_vector.emplace_back(it.second, it.first);
         }
-
-        std::sort(str_data.begin(), str_data.end());
-        const auto buckets = build_bucket_from_data<std::string>(str_data, MAX_BUCKET_SIZE);
-        auto result_str = build_json_from_bucket<std::string>(buckets, data_type, MAX_BUCKET_SIZE,
-                                                              SAMPLE_RATE);
-
-        return result_str;
+        return pair_vector;
     }
 
-    void reset() { data->clear(); }
+private:
+    double sample_rate = 0.2;
+    uint32_t max_bucket_num = 128;
+    std::map<T, uint64_t> ordered_map;
 };
 
-template <typename Data, typename T>
+template <typename Data, typename T, bool has_input_param>
 class AggregateFunctionHistogram final
-        : public IAggregateFunctionDataHelper<Data, AggregateFunctionHistogram<Data, T>> {
+        : public IAggregateFunctionDataHelper<
+                  Data, AggregateFunctionHistogram<Data, T, has_input_param>> {
 public:
+    using ColVecType = ColumnVectorOrDecimal<T>;
+
     AggregateFunctionHistogram() = default;
     AggregateFunctionHistogram(const DataTypes& argument_types_)
-            : IAggregateFunctionDataHelper<Data, AggregateFunctionHistogram<Data, T>>(
+            : IAggregateFunctionDataHelper<Data,
+                                           AggregateFunctionHistogram<Data, T, has_input_param>>(
                       argument_types_, {}),
               _argument_type(argument_types_[0]) {}
 
@@ -323,7 +164,18 @@ public:
             return;
         }
 
-        this->data(place).add(*columns[0], row_num);
+        if (has_input_param) {
+            this->data(place).set_parameters(
+                    static_cast<const ColumnFloat64*>(columns[1])->get_element(row_num),
+                    static_cast<const ColumnInt32*>(columns[2])->get_element(row_num));
+        }
+
+        if constexpr (std::is_same_v<T, std::string>) {
+            this->data(place).add(
+                    static_cast<const ColumnString&>(*columns[0]).get_data_at(row_num));
+        } else {
+            this->data(place).add(static_cast<const ColVecType&>(*columns[0]).get_data()[row_num]);
+        }
     }
 
     void reset(AggregateDataPtr place) const override { this->data(place).reset(); }
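The new `AggregateFunctionHistogramData` above stores an ordered value -> count map, so `merge()` reduces to summing counts per key. A minimal sketch of that merge semantics with simplified types (`std::string` keys stand in for the templated `T`):

```
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Merging two per-instance value -> count maps: counts for shared keys are
// summed, keys unique to either side are kept (simplified from merge() above).
void merge(std::map<std::string, uint64_t>& lhs, const std::map<std::string, uint64_t>& rhs) {
    for (const auto& [value, count] : rhs) {
        lhs[value] += count;  // operator[] default-initializes missing keys to 0
    }
}

int main() {
    std::map<std::string, uint64_t> a = {{"str1", 2}, {"str2", 1}};
    std::map<std::string, uint64_t> b = {{"str2", 3}, {"str7", 1}};
    merge(a, b);
    for (const auto& [value, count] : a) {
        std::cout << value << " -> " << count << "\n";  // str1 -> 2, str2 -> 4, str7 -> 1
    }
}
```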
diff --git a/be/src/vec/utils/histogram_helpers.hpp b/be/src/vec/utils/histogram_helpers.hpp
new file mode 100644
index 0000000000..206948d733
--- /dev/null
+++ b/be/src/vec/utils/histogram_helpers.hpp
@@ -0,0 +1,270 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <rapidjson/document.h>
+#include <rapidjson/prettywriter.h>
+#include <rapidjson/stringbuffer.h>
+
+#include <boost/dynamic_bitset.hpp>
+
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/io/io_helper.h"
+
+namespace doris::vectorized {
+
+template <typename T>
+struct Bucket {
+public:
+    Bucket() = default;
+    Bucket(T value, size_t pre_sum)
+            : lower(value), upper(value), count(1), pre_sum(pre_sum), ndv(1) {}
+    Bucket(T lower, T upper, size_t count, size_t pre_sum, size_t ndv)
+            : lower(lower), upper(upper), count(count), pre_sum(pre_sum), ndv(ndv) {}
+
+    T lower;
+    T upper;
+    uint64_t count;
+    uint64_t pre_sum;
+    uint64_t ndv;
+};
+
+template <typename T>
+bool build_bucket_from_data(std::vector<Bucket<T>>& buckets,
+                            const std::map<T, uint64_t>& ordered_map, double sample_rate,
+                            uint32_t max_bucket_num) {
+    if (ordered_map.size() == 0) {
+        return false;
+    }
+
+    uint64_t element_number = 0;
+    for (auto it : ordered_map) {
+        element_number += it.second;
+    }
+
+    auto sample_number = (uint64_t)std::ceil(element_number * sample_rate);
+    auto num_per_bucket = (uint64_t)std::ceil((Float64)sample_number / max_bucket_num);
+
+    LOG(INFO) << fmt::format(
+            "histogram bucket info: element number {}, sample number:{}, num 
per bucket:{}",
+            element_number, sample_number, num_per_bucket);
+
+    if (sample_rate == 1) {
+        for (auto it : ordered_map) {
+            for (auto i = it.second; i > 0; --i) {
+                auto v = it.first;
+                value_to_bucket(buckets, v, num_per_bucket);
+            }
+        }
+        return true;
+    }
+
+    // if sampling is required (0<sample_rate<1),
+    // we need to build the sampled data index
+    boost::dynamic_bitset<> sample_index(element_number);
+
+    // use a same seed value so that we get
+    // same result each time we run this function
+    srand(element_number * sample_rate * max_bucket_num);
+
+    while (sample_index.count() < sample_number) {
+        uint64_t num = (rand() % (element_number));
+        sample_index[num] = true;
+    }
+
+    uint64_t element_cnt = 0;
+    uint64_t sample_cnt = 0;
+    bool break_flag = false;
+
+    for (auto it : ordered_map) {
+        if (break_flag) {
+            break;
+        }
+        for (auto i = it.second; i > 0; --i) {
+            if (sample_cnt >= sample_number) {
+                break_flag = true;
+                break;
+            }
+            if (sample_index[element_cnt]) {
+                sample_cnt += 1;
+                auto v = it.first;
+                value_to_bucket(buckets, v, num_per_bucket);
+            }
+            element_cnt += 1;
+        }
+    }
+
+    return true;
+}
+
+bool inline bucket_to_json(rapidjson::Document::AllocatorType& allocator,
+                           rapidjson::Value& bucket_arr, std::string lower, std::string upper,
+                           int64 count, int64 pre_sum, int64 ndv) {
+    rapidjson::Value bucket(rapidjson::kObjectType);
+
+    rapidjson::Value lower_val(lower.c_str(), allocator);
+    bucket.AddMember("lower", lower_val, allocator);
+
+    rapidjson::Value upper_val(upper.c_str(), allocator);
+    bucket.AddMember("upper", upper_val, allocator);
+
+    rapidjson::Value count_val(count);
+    bucket.AddMember("count", count_val, allocator);
+
+    rapidjson::Value pre_sum_val(pre_sum);
+    bucket.AddMember("pre_sum", pre_sum_val, allocator);
+
+    rapidjson::Value ndv_val(ndv);
+    bucket.AddMember("ndv", ndv_val, allocator);
+
+    bucket_arr.PushBack(bucket, allocator);
+
+    return bucket_arr.Size() > 0;
+}
+
+template <typename T>
+bool value_to_bucket(std::vector<Bucket<T>>& buckets, T v, size_t num_per_bucket) {
+    if (buckets.empty()) {
+        Bucket<T> bucket(v, 0);
+        buckets.emplace_back(bucket);
+    } else {
+        Bucket<T>* bucket = &buckets.back();
+        T upper = bucket->upper;
+        if (upper == v) {
+            bucket->count++;
+        } else if (bucket->count < num_per_bucket) {
+            bucket->count++;
+            bucket->ndv++;
+            bucket->upper = v;
+        } else {
+            uint64_t pre_sum = bucket->pre_sum + bucket->count;
+            Bucket<T> new_bucket(v, pre_sum);
+            buckets.emplace_back(new_bucket);
+        }
+    }
+
+    return buckets.size() > 0;
+}
+
+template <typename T>
+bool value_to_string(std::stringstream& ss, T input, const DataTypePtr& data_type) {
+    fmt::memory_buffer _insert_stmt_buffer;
+    switch (data_type->get_type_id()) {
+    case TypeIndex::Int8:
+    case TypeIndex::UInt8:
+    case TypeIndex::Int16:
+    case TypeIndex::UInt16:
+    case TypeIndex::Int32:
+    case TypeIndex::UInt32:
+    case TypeIndex::Int64:
+    case TypeIndex::UInt64:
+    case TypeIndex::Int128:
+    case TypeIndex::UInt128:
+    case TypeIndex::Float32:
+    case TypeIndex::Float64:
+    case TypeIndex::String: {
+        fmt::memory_buffer buffer;
+        fmt::format_to(buffer, "{}", input);
+        ss << std::string(buffer.data(), buffer.size());
+        break;
+    }
+    case TypeIndex::Decimal32: {
+        auto scale = get_decimal_scale(*data_type);
+        auto decimal_val = reinterpret_cast<const Decimal32*>(&input);
+        write_text(*decimal_val, scale, ss);
+        break;
+    }
+    case TypeIndex::Decimal64: {
+        auto scale = get_decimal_scale(*data_type);
+        auto decimal_val = reinterpret_cast<const Decimal64*>(&input);
+        write_text(*decimal_val, scale, ss);
+        break;
+    }
+    case TypeIndex::Decimal128:
+    case TypeIndex::Decimal128I: {
+        auto scale = get_decimal_scale(*data_type);
+        auto decimal_val = reinterpret_cast<const Decimal128*>(&input);
+        write_text(*decimal_val, scale, ss);
+        break;
+    }
+    case TypeIndex::Date:
+    case TypeIndex::DateTime: {
+        auto* date_int = reinterpret_cast<Int64*>(&input);
+        auto date_value = binary_cast<Int64, VecDateTimeValue>(*date_int);
+        char buf[32] = {};
+        date_value.to_string(buf);
+        ss << std::string(buf, strlen(buf));
+        break;
+    }
+    case TypeIndex::DateV2: {
+        auto* value = (DateV2Value<DateV2ValueType>*)(&input);
+        ss << *value;
+        break;
+    }
+    case TypeIndex::DateTimeV2: {
+        auto* value = (DateV2Value<DateTimeV2ValueType>*)(&input);
+        ss << *value;
+        break;
+    }
+    default:
+        LOG(WARNING) << fmt::format("unable to convert histogram data of type {}",
+                                    data_type->get_name());
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T>
+bool build_json_from_bucket(rapidjson::StringBuffer& buffer, const std::vector<Bucket<T>>& buckets,
+                            const DataTypePtr& data_type, double sample_rate,
+                            uint32_t max_bucket_num) {
+    rapidjson::Document doc;
+    doc.SetObject();
+    rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
+
+    rapidjson::Value sample_rate_val(sample_rate);
+    doc.AddMember("sample_rate", sample_rate_val, allocator);
+
+    rapidjson::Value max_bucket_num_val(max_bucket_num);
+    doc.AddMember("max_bucket_num", max_bucket_num_val, allocator);
+
+    int bucket_num = buckets.size();
+    rapidjson::Value bucket_num_val(bucket_num);
+    doc.AddMember("bucket_num", bucket_num_val, allocator);
+
+    rapidjson::Value bucket_arr(rapidjson::kArrayType);
+    for (auto i = 0; i < bucket_num; ++i) {
+        std::stringstream ss1;
+        std::stringstream ss2;
+        value_to_string(ss1, buckets[i].lower, data_type);
+        value_to_string(ss2, buckets[i].upper, data_type);
+        std::string lower_str = ss1.str();
+        std::string upper_str = ss2.str();
+        bucket_to_json(allocator, bucket_arr, lower_str, upper_str, (uint64_t)(buckets[i].count),
+                       (uint64_t)(buckets[i].pre_sum), (uint64_t)(buckets[i].ndv));
+    }
+    doc.AddMember("buckets", bucket_arr, allocator);
+
+    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+    doc.Accept(writer);
+
+    return buffer.GetSize() > 0;
+}
+
+} // namespace  doris::vectorized
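A condensed sketch of the deterministic sampling idea in `build_bucket_from_data` above: seeding the PRNG from the inputs makes the sampled index set reproducible across runs (`std::vector<bool>` stands in for `boost::dynamic_bitset`, and the seed formula is simplified):

```
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

// Picks ceil(element_number * sample_rate) distinct positions; the fixed seed
// guarantees the same "random" sample on every invocation.
std::vector<bool> sample_indexes(uint64_t element_number, double sample_rate) {
    auto sample_number = (uint64_t)std::ceil(element_number * sample_rate);
    std::vector<bool> index(element_number, false);
    srand((unsigned)(element_number * sample_rate));  // input-derived seed, not time-based
    uint64_t chosen = 0;
    while (chosen < sample_number) {
        uint64_t pos = (uint64_t)rand() % element_number;
        if (!index[pos]) {
            index[pos] = true;
            ++chosen;
        }
    }
    return index;
}

int main() {
    // Selects ceil(10 * 0.5) = 5 of 10 positions; the bit pattern is identical each run.
    for (bool selected : sample_indexes(10, 0.5)) {
        std::cout << (selected ? '1' : '0');
    }
    std::cout << "\n";
}
```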
diff --git a/be/test/vec/aggregate_functions/agg_histogram_test.cpp b/be/test/vec/aggregate_functions/agg_histogram_test.cpp
index d2d41d5f51..545e118568 100644
--- a/be/test/vec/aggregate_functions/agg_histogram_test.cpp
+++ b/be/test/vec/aggregate_functions/agg_histogram_test.cpp
@@ -22,7 +22,6 @@
 #include "vec/aggregate_functions/aggregate_function.h"
 #include "vec/aggregate_functions/aggregate_function_histogram.h"
 #include "vec/aggregate_functions/aggregate_function_simple_factory.h"
-#include "vec/columns/column_vector.h"
 #include "vec/data_types/data_type.h"
 #include "vec/data_types/data_type_date.h"
 #include "vec/data_types/data_type_date_time.h"
@@ -45,30 +44,67 @@ public:
 
     template <typename DataType>
     void agg_histogram_add_elements(AggregateFunctionPtr agg_function, AggregateDataPtr place,
-                                    size_t input_nums) {
+                                    size_t input_rows, double sample_rate, size_t max_bucket_num) {
         using FieldType = typename DataType::FieldType;
         auto type = std::make_shared<DataType>();
-        auto input_col = type->create_column();
-        for (size_t i = 0; i < input_nums; ++i) {
+
+        if (sample_rate == 0 || max_bucket_num == 0) {
+            auto input_col = type->create_column();
+            for (size_t i = 0; i < input_rows; ++i) {
+                if constexpr (std::is_same_v<DataType, DataTypeString>) {
+                    auto item = std::string("item") + std::to_string(i);
+                    input_col->insert_data(item.c_str(), item.size());
+                } else {
+                    auto item = FieldType(static_cast<uint64_t>(i));
+                    input_col->insert_data(reinterpret_cast<const char*>(&item), 0);
+                }
+            }
+
+            EXPECT_EQ(input_col->size(), input_rows);
+
+            const IColumn* column[1] = {input_col.get()};
+            for (int i = 0; i < input_col->size(); i++) {
+                agg_function->add(place, column, i, &_agg_arena_pool);
+            }
+
+            return;
+        }
+
+        MutableColumns columns(3);
+        columns[0] = type->create_column();
+        columns[1] = ColumnFloat64::create();
+        columns[2] = ColumnInt32::create();
+
+        for (size_t i = 0; i < input_rows; ++i) {
             if constexpr (std::is_same_v<DataType, DataTypeString>) {
                 auto item = std::string("item") + std::to_string(i);
-                input_col->insert_data(item.c_str(), item.size());
+                columns[0]->insert_data(item.c_str(), item.size());
             } else {
                 auto item = FieldType(static_cast<uint64_t>(i));
-                input_col->insert_data(reinterpret_cast<const char*>(&item), 0);
+                columns[0]->insert_data(reinterpret_cast<const char*>(&item), 0);
             }
+            columns[1]->insert_data(reinterpret_cast<char*>(&sample_rate), sizeof(sample_rate));
+            columns[2]->insert_data(reinterpret_cast<char*>(&max_bucket_num),
+                                    sizeof(max_bucket_num));
         }
-        EXPECT_EQ(input_col->size(), input_nums);
 
-        const IColumn* column[1] = {input_col.get()};
-        for (int i = 0; i < input_col->size(); i++) {
+        EXPECT_EQ(columns[0]->size(), input_rows);
+
+        const IColumn* column[3] = {columns[0].get(), columns[1].get(), columns[2].get()};
+        for (int i = 0; i < input_rows; i++) {
             agg_function->add(place, column, i, &_agg_arena_pool);
         }
     }
 
     template <typename DataType>
-    void test_agg_histogram(size_t input_nums = 0) {
-        DataTypes data_types = {(DataTypePtr)std::make_shared<DataType>()};
+    void test_agg_histogram(size_t input_rows = 0, double sample_rate = 0,
+                            size_t max_bucket_num = 0) {
+        DataTypes data_types1 = {(DataTypePtr)std::make_shared<DataType>()};
+        DataTypes data_types3 = {(DataTypePtr)std::make_shared<DataType>(),
+                                 std::make_shared<DataTypeFloat64>(),
+                                 std::make_shared<DataTypeInt32>()};
+
+        auto data_types = (sample_rate == 0 || max_bucket_num == 0) ? data_types1 : data_types3;
         LOG(INFO) << "test_agg_histogram for type"
                   << "(" << data_types[0]->get_name() << ")";
 
@@ -80,8 +116,8 @@ public:
         std::unique_ptr<char[]> memory(new char[agg_function->size_of_data()]);
         AggregateDataPtr place = memory.get();
         agg_function->create(place);
-
-        agg_histogram_add_elements<DataType>(agg_function, place, input_nums);
+        agg_histogram_add_elements<DataType>(agg_function, place, input_rows, sample_rate,
+                                             max_bucket_num);
 
         ColumnString buf;
         VectorBufferWriter buf_writer(buf);
@@ -93,22 +129,67 @@ public:
         std::unique_ptr<char[]> memory2(new char[agg_function->size_of_data()]);
         AggregateDataPtr place2 = memory2.get();
         agg_function->create(place2);
-
-        agg_histogram_add_elements<DataType>(agg_function, place2, input_nums);
-
+        agg_histogram_add_elements<DataType>(agg_function, place2, input_rows, sample_rate,
+                                             max_bucket_num);
         agg_function->merge(place, place2, &_agg_arena_pool);
-        auto column_result = ColumnString::create();
-        agg_function->insert_result_into(place, *column_result);
-        EXPECT_EQ(column_result->size(), 1);
-        EXPECT_TRUE(column_result->get_offsets()[0] >= 1);
+
+        auto column_result1 = ColumnString::create();
+        agg_function->insert_result_into(place, *column_result1);
+        EXPECT_EQ(column_result1->size(), 1);
+        EXPECT_TRUE(column_result1->get_offsets()[0] >= 1);
 
         auto column_result2 = ColumnString::create();
         agg_function->insert_result_into(place2, *column_result2);
         EXPECT_EQ(column_result2->size(), 1);
         EXPECT_TRUE(column_result2->get_offsets()[0] >= 1);
 
-        LOG(INFO) << column_result->get_offsets()[0];
-        LOG(INFO) << column_result2->get_offsets()[0];
+        LOG(INFO) << column_result1->get_data_at(0).to_string();
+        LOG(INFO) << column_result2->get_data_at(0).to_string();
+
+        // test empty data
+        if (input_rows == 0 && sample_rate == 0 && max_bucket_num == 0) {
+            std::string expect_empty_result =
+                    "{\"sample_rate\":0.2,\"max_bucket_num\":128,\"bucket_num\":0,\"buckets\":[]"
+                    "}";
+            std::string empty_result1 = column_result1->get_data_at(0).to_string();
+            std::string empty_result2 = column_result2->get_data_at(0).to_string();
+            EXPECT_EQ(empty_result1, expect_empty_result);
+            EXPECT_EQ(empty_result2, expect_empty_result);
+        }
+
+        // test with data
+        if (input_rows == 1000 && sample_rate == 0.5 && max_bucket_num == 5) {
+            if constexpr (std::is_same_v<DataType, DataTypeInt32>) {
+                std::string expect_result1 =
+                        "{\"sample_rate\":0.5,\"max_bucket_num\":5,\"bucket_num\":5,\"buckets\":["
+                        "{\"lower\":\"0\",\"upper\":\"189\",\"count\":200,\"pre_sum\":0,\"ndv\":"
+                        "151},"
+                        "{\"lower\":\"190\",\"upper\":\"380\",\"count\":200,\"pre_sum\":200,"
+                        "\"ndv\":149},"
+                        "{\"lower\":\"382\",\"upper\":\"582\",\"count\":200,\"pre_sum\":400,"
+                        "\"ndv\":150},"
+                        "{\"lower\":\"586\",\"upper\":\"796\",\"count\":200,\"pre_sum\":600,"
+                        "\"ndv\":157},"
+                        "{\"lower\":\"797\",\"upper\":\"999\",\"count\":200,\"pre_sum\":800,"
+                        "\"ndv\":147}]}";
+                std::string expect_result2 =
+                        "{\"sample_rate\":0.5,\"max_bucket_num\":5,\"bucket_num\":5,\"buckets\":["
+                        "{\"lower\":\"0\",\"upper\":\"207\",\"count\":100,\"pre_sum\":0,\"ndv\":"
+                        "100},"
+                        "{\"lower\":\"209\",\"upper\":\"410\",\"count\":100,\"pre_sum\":100,"
+                        "\"ndv\":100},"
+                        "{\"lower\":\"412\",\"upper\":\"599\",\"count\":100,\"pre_sum\":200,"
+                        "\"ndv\":100},"
+                        "{\"lower\":\"600\",\"upper\":\"797\",\"count\":100,\"pre_sum\":300,"
+                        "\"ndv\":100},"
+                        "{\"lower\":\"799\",\"upper\":\"998\",\"count\":100,\"pre_sum\":400,"
+                        "\"ndv\":100}]}";
+                std::string result1 = column_result1->get_data_at(0).to_string();
+                std::string result2 = column_result2->get_data_at(0).to_string();
+                EXPECT_EQ(result1, expect_result1);
+                EXPECT_EQ(result2, expect_result2);
+            }
+        }
 
         agg_function->destroy(place);
         agg_function->destroy(place2);
@@ -125,18 +206,34 @@ TEST_F(VAggHistogramTest, test_empty) {
     test_agg_histogram<DataTypeInt64>();
     test_agg_histogram<DataTypeInt128>();
 
-    test_agg_histogram<DataTypeDecimal<Decimal128>>();
+    test_agg_histogram<DataTypeFloat32>();
+    test_agg_histogram<DataTypeFloat64>();
+
     test_agg_histogram<DataTypeDate>();
+    test_agg_histogram<DataTypeDateTime>();
     test_agg_histogram<DataTypeString>();
+    test_agg_histogram<DataTypeDecimal<Decimal128>>();
 }
 
 TEST_F(VAggHistogramTest, test_with_data) {
-    test_agg_histogram<DataTypeInt32>(8);
-    test_agg_histogram<DataTypeInt128>(10);
+    // rows 1000, sample rate 0.5, max bucket size 5
+    test_agg_histogram<DataTypeString>(1000, 0.5, 5);
+
+    test_agg_histogram<DataTypeInt8>(100, 0.5, 5);
+    test_agg_histogram<DataTypeInt16>(100, 0.5, 5);
+    test_agg_histogram<DataTypeInt32>(100, 0.5, 5);
+    test_agg_histogram<DataTypeInt64>(100, 0.5, 5);
+    test_agg_histogram<DataTypeInt128>(100, 0.5, 5);
+    test_agg_histogram<DataTypeFloat32>(100, 0.5, 5);
+    test_agg_histogram<DataTypeFloat64>(100, 0.5, 5);
+
+    test_agg_histogram<DataTypeDate>(100, 0.5, 5);
+    test_agg_histogram<DataTypeDateV2>(100, 0.5, 5);
+
+    test_agg_histogram<DataTypeDateTime>(100, 0.5, 5);
+    test_agg_histogram<DataTypeDateTimeV2>(100, 0.5, 5);
 
-    test_agg_histogram<DataTypeDecimal<Decimal128>>(12);
-    test_agg_histogram<DataTypeDateTime>(14);
-    test_agg_histogram<DataTypeString>(10);
+    test_agg_histogram<DataTypeDecimal<Decimal128>>(100, 0.5, 5);
 }
 
 } // namespace doris::vectorized
diff --git a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
index 63c522c500..6cc96340dc 100644
--- a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
+++ b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
@@ -32,67 +32,79 @@ under the License.
 
 The histogram function is used to describe the distribution of the data. It uses an "equal height" bucketing strategy, and divides the data into buckets according to the value of the data. It describes each bucket with some simple data, such as the number of values that fall in the bucket. It is mainly used by the optimizer to estimate the range query.
 
+The result of the function is an empty string or a JSON string.
+
+Parameter description:
+- sample_rate: Optional. The proportion of sample data used to generate the histogram. The default is 0.2.
+- max_bucket_num: Optional. Limit the number of histogram buckets. The default value is 128.
+
 ### notice
 
-```
-Only supported in vectorized engine
-```
+> Only supported in vectorized engine
 
 ### example
 
 ```
-MySQL [test]> select histogram(login_time) from dev_table;
-+------------------------------------------------------------------------------------------------------------------------------+
-| histogram(`login_time`)                                                                                                      |
-+------------------------------------------------------------------------------------------------------------------------------+
-| {"bucket_size":5,"buckets":[{"lower":"2022-09-21 17:30:29","upper":"2022-09-21 22:30:29","count":9,"pre_sum":0,"ndv":1},...]}|
-+------------------------------------------------------------------------------------------------------------------------------+
+MySQL [test]> SELECT histogram(c_float) FROM histogram_test;
++-------------------------------------------------------------------------------------------------------------------------------------+
+| histogram(`c_float`)                                                                                                                |
++-------------------------------------------------------------------------------------------------------------------------------------+
+| {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} |
++-------------------------------------------------------------------------------------------------------------------------------------+
+
+MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test;
++-------------------------------------------------------------------------------------------------------------------------------------+
+| histogram(`c_string`)                                                                                                               |
++-------------------------------------------------------------------------------------------------------------------------------------+
+| {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} |
++-------------------------------------------------------------------------------------------------------------------------------------+
 ```
+
 Query result description:
 
 ```
 {
-    "bucket_size": 5, 
+    "sample_rate": 0.2, 
+    "max_bucket_num": 128, 
+    "bucket_num": 3, 
     "buckets": [
         {
-            "lower": "2022-09-21 17:30:29", 
-            "upper": "2022-09-21 22:30:29", 
-            "count": 9, 
+            "lower": "0.1", 
+            "upper": "0.2", 
+            "count": 2, 
             "pre_sum": 0, 
-            "ndv": 1
+            "ndv": 2
         }, 
         {
-            "lower": "2022-09-22 17:30:29", 
-            "upper": "2022-09-22 22:30:29", 
-            "count": 10, 
-            "pre_sum": 9, 
-            "ndv": 1
+            "lower": "0.8", 
+            "upper": "0.9", 
+            "count": 2, 
+            "pre_sum": 2, 
+            "ndv": 2
         }, 
         {
-            "lower": "2022-09-23 17:30:29", 
-            "upper": "2022-09-23 22:30:29", 
-            "count": 9, 
-            "pre_sum": 19, 
-            "ndv": 1
-        }, 
-        {
-            "lower": "2022-09-24 17:30:29", 
-            "upper": "2022-09-24 22:30:29", 
-            "count": 9, 
-            "pre_sum": 28, 
-            "ndv": 1
-        }, 
-        {
-            "lower": "2022-09-25 17:30:29", 
-            "upper": "2022-09-25 22:30:29", 
-            "count": 9, 
-            "pre_sum": 37, 
+            "lower": "1.0", 
+            "upper": "1.0", 
+            "count": 2, 
+            "pre_sum": 4, 
             "ndv": 1
         }
     ]
 }
 ```
 
+Field description:
+- sample_rate: Rate of sampling
+- max_bucket_num: Limit on the maximum number of buckets
+- bucket_num: The actual number of buckets
+- buckets: All buckets
+    - lower: Lower bound of the bucket
+    - upper: Upper bound of the bucket
+    - count: The number of elements contained in the bucket
+    - pre_sum: The total number of elements in the preceding buckets
+    - ndv: The number of distinct values in the bucket
+
+> Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in the preceding buckets (pre_sum).
 
 ### keywords
 
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
index dbd1cd8445..55085ee26d 100644
--- a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md
@@ -28,11 +28,15 @@ under the License.
 ### description
 #### Syntax
 
-仅支持向量
+`histogram(expr[, DOUBLE sample_rate, INT max_bucket_num])`
 
-`histogram(expr)`
+histogram(直方图)函数用于描述数据分布情况,它使用“等高”的分桶策略,并按照数据的值大小进行分桶,并用一些简单的数据来描述每个桶,比如落在桶里的值的个数。主要用于优化器进行区间查询的估算。
 
-histogram(直方图)函数用于描述数据分布情况,它使用“等高”的分桶策略,并按照数据的值大小进行分桶,并用一些简单的数据来描述每个桶,比如落在桶里的值的个数。主要用于优化器进行区间查询的估算。
+函数结果返回空或者 Json 字符串。
+
+参数说明:
+- sample_rate:可选项。用于生成直方图的抽样数据比例,默认值 0.2。
+- max_bucket_num:可选项。用于限制直方图桶(bucket)的数量,默认值 128。
 
 ### notice
 
@@ -43,58 +47,67 @@ histogram(直方图)函数用于描述数据分布情况,它使用“等高”
 ### example
 
 ```
-MySQL [test]> select histogram(login_time) from dev_table;
-+------------------------------------------------------------------------------------------------------------------------------+
-| histogram(`login_time`)                                                                                                      |
-+------------------------------------------------------------------------------------------------------------------------------+
-| {"bucket_size":5,"buckets":[{"lower":"2022-09-21 17:30:29","upper":"2022-09-21 22:30:29","count":9,"pre_sum":0,"ndv":1},...]}|
-+------------------------------------------------------------------------------------------------------------------------------+
+MySQL [test]> SELECT histogram(c_float) FROM histogram_test;
++-------------------------------------------------------------------------------------------------------------------------------------+
+| histogram(`c_float`)                                                                                                                |
++-------------------------------------------------------------------------------------------------------------------------------------+
+| {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} |
++-------------------------------------------------------------------------------------------------------------------------------------+
+
+MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test;
++-------------------------------------------------------------------------------------------------------------------------------------+
+| histogram(`c_string`)                                                                                                               |
++-------------------------------------------------------------------------------------------------------------------------------------+
+| {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} |
++-------------------------------------------------------------------------------------------------------------------------------------+
 ```
+
 查询结果说明:
 
 ```
 {
-    "bucket_size": 5, 
+    "sample_rate": 0.2, 
+    "max_bucket_num": 128, 
+    "bucket_num": 3, 
     "buckets": [
         {
-            "lower": "2022-09-21 17:30:29", 
-            "upper": "2022-09-21 22:30:29", 
-            "count": 9, 
+            "lower": "0.1", 
+            "upper": "0.2", 
+            "count": 2, 
             "pre_sum": 0, 
-            "ndv": 1
+            "ndv": 2
         }, 
         {
-            "lower": "2022-09-22 17:30:29", 
-            "upper": "2022-09-22 22:30:29", 
-            "count": 10, 
-            "pre_sum": 9, 
-            "ndv": 1
+            "lower": "0.8", 
+            "upper": "0.9", 
+            "count": 2, 
+            "pre_sum": 2, 
+            "ndv": 2
         }, 
         {
-            "lower": "2022-09-23 17:30:29", 
-            "upper": "2022-09-23 22:30:29", 
-            "count": 9, 
-            "pre_sum": 19, 
-            "ndv": 1
-        }, 
-        {
-            "lower": "2022-09-24 17:30:29", 
-            "upper": "2022-09-24 22:30:29", 
-            "count": 9, 
-            "pre_sum": 28, 
-            "ndv": 1
-        }, 
-        {
-            "lower": "2022-09-25 17:30:29", 
-            "upper": "2022-09-25 22:30:29", 
-            "count": 9, 
-            "pre_sum": 37, 
+            "lower": "1.0", 
+            "upper": "1.0", 
+            "count": 2, 
+            "pre_sum": 4, 
             "ndv": 1
         }
     ]
 }
 ```
 
+字段说明:
+- sample_rate:抽样数据比例
+- max_bucket_num:用户限制的最大桶数量
+- bucket_num:实际的桶数量
+- buckets:直方图所包含的桶
+  - lower:桶的下界
+  - upper:桶的上界
+  - count:桶内包含的元素数量
+  - pre_sum:前面桶的元素总量
+  - ndv:桶内不同值的个数
+
+> 直方图总的元素数量 = 最后一个桶的元素数量(count)+ 前面桶的元素总量(pre_sum)。
+
 ### keywords
 
 HISTOGRAM
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java
index 1d93829d83..1b437e3e92 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java
@@ -2609,6 +2609,8 @@ public class FunctionSet<T> {
                                     new ArrayType(t), t,
                                     "", "", "", "", "", true, false, true, 
true));
             addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, 
Lists.newArrayList(t), Type.VARCHAR, t,
+                    "", "", "", "", "", true, false, true, true));
+            addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, 
Lists.newArrayList(t, Type.DOUBLE, Type.INT), Type.VARCHAR, t,
                                     "", "", "", "", "", true, false, true, 
true));
         }
 
diff --git a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out
new file mode 100644
index 0000000000..b7a75da277
--- /dev/null
+++ b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out
@@ -0,0 +1,14 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select --
+{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":6,"pre_sum":0,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":6,"pre_sum":0,"ndv":4}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"32","count":6,"pre_sum":0,"ndv":5}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"4444444444444","count":6, [...]
+
+-- !select --
+1	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":3,"pre_sum":0,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"11","count":3,"pre_sum":0,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"21","count":3,"pre_sum":0,"ndv":3}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"1444444444444","upper":"4444444444444","count" [...]
+2	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":3,"pre_sum":0,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":3,"pre_sum":0,"ndv":3}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"20","upper":"32","count":3,"pre_sum":0,"ndv":3}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"3444444444444","count": [...]
+
+-- !select --
+1	{"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"0","upper":"0","count":4,"pre_sum":0,"ndv":1},{"lower":"1","upper":"1","count":2,"pre_sum":4,"ndv":1}]}	{"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"10","upper":"11","count":4,"pre_sum":0,"ndv":2},{"lower":"21","upper":"31","count":2,"pre_sum":4,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"12","upper":"20","count":3,"pre_sum":0,"ndv":2},{"lower" [...]
+
+-- !select --
+1	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":6,"pre_sum":0,"ndv":2}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":6,"pre_sum":0,"ndv":4}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"32","count":6,"pre_sum":0,"ndv":5}]}	{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"4444444444444","count": [...]
+
diff --git a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy
new file mode 100644
index 0000000000..dcd7ea0709
--- /dev/null
+++ b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_aggregate_histogram") {
+    sql "set enable_vectorized_engine = true"
+
+    def tableName = "histogram_test"
+    def tableCTAS1 = "histogram_test_ctas1"
+    def tableCTAS2 = "histogram_test_ctas2"
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql "DROP TABLE IF EXISTS ${tableCTAS1}"
+    sql "DROP TABLE IF EXISTS ${tableCTAS2}"
+
+    sql """
+        CREATE TABLE IF NOT EXISTS ${tableName} (
+            c_id INT,
+            c_bool BOOLEAN,
+            c_tinyint TINYINT,
+            c_smallint SMALLINT,
+            c_int INT,
+            c_bigint BIGINT,
+            c_largeint LARGEINT,
+            c_float FLOAT,
+            c_double DOUBLE,
+            c_decimal DECIMAL(9, 2),
+            c_decimalv3 DECIMALV3(9, 2),
+            c_char CHAR,
+            c_varchar VARCHAR(10),
+            c_string STRING,
+            c_date DATE,
+            c_datev2 DATEV2,
+            c_date_time DATETIME,
+            c_date_timev2 DATETIMEV2(6),
+            c_string_not_null VARCHAR(10) NOT NULL
+           )
+        DISTRIBUTED BY HASH(c_id) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (1, false, 10, 20, 30, 4444444444444, 55555555555, 0.1, 0.222, 3333.33, 4444.44, 'c', 'varchar1', 'string1',
+            '2022-12-01', '2022-12-01', '2022-12-01 22:23:23', '2022-12-01 22:23:24.999999', 'not null')
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (1, false, 11, 21, 33, 4444444444444, 55555555555, 0.1, 0.222, 3333.33, 4444.44, 'c', 'varchar1', 'string1',
+            '2022-12-01', '2022-12-01', '2022-12-01 22:23:23', '2022-12-01 22:23:24.999999', 'not null')
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (1, true, 11, 12, 13, 1444444444444, 1555555555, 1.1, 1.222, 13333.33, 14444.44, 'd', 'varchar2', 'string2',
+            '2022-12-02', '2022-12-02', '2022-12-02 22:23:23', '2022-12-02 22:23:24.999999', 'not null')
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (2, false, 21, 22, 23, 2444444444444, 255555555, 2.1, 2.222, 23333.33, 24444.44, 'f', 'varchar3', 'string3',
+            '2022-12-03', '2022-12-03', '2022-12-03 22:23:23', '2022-12-03 22:23:24.999999', 'not null')
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (2, true, 31, 32, 33, 3444444444444, 3555555555, 3.1, 3.222, 33333.33, 34444.44, 'l', 'varchar3', 'string3',
+            '2022-12-03', '2022-12-03', '2022-12-03 22:23:23', '2022-12-03 22:23:24.999999', 'not null')
+    """
+
+    sql """
+        INSERT INTO ${tableName} values 
+            (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL, 'not null'),
+            (2, false, 10, 20, 30, 944444444444, 9555555555, 9.1, 9.222, 93333.33, 94444.44, 'p', 'varchar9', 'string9',
+            '2022-12-09', '2022-12-09', '2022-12-09 22:23:23', '2022-12-09 22:23:24.999999', 'not null')
+    """
+
+    // Test without GROUP BY
+    qt_select """
+        SELECT
+            histogram(c_bool, 1.0, 1), 
+            histogram(c_tinyint, 1.0, 1), 
+            histogram(c_smallint, 1.0, 1), 
+            histogram(c_bigint, 1.0, 1), 
+            histogram(c_largeint, 1.0, 1), 
+            histogram(c_float, 1.0, 1), 
+            histogram(c_double, 1.0, 1), 
+            histogram(c_decimal, 1.0, 1), 
+            histogram(c_decimalv3, 1.0, 1), 
+            histogram(c_char, 1.0, 1), 
+            histogram(c_varchar, 1.0, 1), 
+            histogram(c_string, 1.0, 1), 
+            histogram(c_date, 1.0, 1), 
+            histogram(c_datev2, 1.0, 1), 
+            histogram(c_date_time, 1.0, 1), 
+            histogram(c_date_timev2, 1.0, 1), 
+            histogram(c_string_not_null, 1.0, 1)
+        FROM
+            ${tableName}
+    """
+
+    // Test with GROUP BY
+    qt_select """
+        SELECT
+            c_id, 
+            histogram(c_bool, 1.0, 1), 
+            histogram(c_tinyint, 1.0, 1), 
+            histogram(c_smallint, 1.0, 1), 
+            histogram(c_bigint, 1.0, 1), 
+            histogram(c_largeint, 1.0, 1), 
+            histogram(c_float, 1.0, 1), 
+            histogram(c_double, 1.0, 1), 
+            histogram(c_decimal, 1.0, 1), 
+            histogram(c_decimalv3, 1.0, 1), 
+            histogram(c_char, 1.0, 1), 
+            histogram(c_varchar, 1.0, 1), 
+            histogram(c_string, 1.0, 1), 
+            histogram(c_date, 1.0, 1), 
+            histogram(c_datev2, 1.0, 1), 
+            histogram(c_date_time, 1.0, 1), 
+            histogram(c_date_timev2, 1.0, 1), 
+            histogram(c_string_not_null, 1.0, 1)
+        FROM
+            ${tableName}
+        GROUP BY
+            c_id
+        ORDER BY
+            c_id
+    """
+
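+    // Persist histogram results into new tables via CTAS; the tables are read back at the end of the suite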
+    sql """
+        CREATE TABLE ${tableCTAS1} PROPERTIES("replication_num" = "1") AS
+        SELECT
+            1, 
+            histogram(c_bool, 1.0, 2), 
+            histogram(c_tinyint, 1.0, 2), 
+            histogram(c_smallint, 1.0, 2), 
+            histogram(c_bigint, 1.0, 2), 
+            histogram(c_largeint, 1.0, 2), 
+            histogram(c_float, 1.0, 2), 
+            histogram(c_double, 1.0, 2), 
+            histogram(c_decimal, 1.0, 2), 
+            histogram(c_decimalv3, 1.0, 2), 
+            histogram(c_char, 1.0, 2), 
+            histogram(c_varchar, 1.0, 2), 
+            histogram(c_string, 1.0, 2), 
+            histogram(c_date, 1.0, 2), 
+            histogram(c_datev2, 1.0, 2), 
+            histogram(c_date_time, 1.0, 2), 
+            histogram(c_date_timev2, 1.0, 2), 
+            histogram(c_string_not_null, 1.0, 2)
+        FROM
+            ${tableName}
+    """
+
+    sql """
+        CREATE TABLE ${tableCTAS2} PROPERTIES("replication_num" = "1") AS
+        SELECT
+            1, 
+            histogram(c_bool, 1.0, 1), 
+            histogram(c_tinyint, 1.0, 1), 
+            histogram(c_smallint, 1.0, 1), 
+            histogram(c_bigint, 1.0, 1), 
+            histogram(c_largeint, 1.0, 1), 
+            histogram(c_float, 1.0, 1), 
+            histogram(c_double, 1.0, 1), 
+            histogram(c_decimal, 1.0, 1), 
+            histogram(c_decimalv3, 1.0, 1), 
+            histogram(c_char, 1.0, 1), 
+            histogram(c_varchar, 1.0, 1), 
+            histogram(c_string, 1.0, 1), 
+            histogram(c_date, 1.0, 1), 
+            histogram(c_datev2, 1.0, 1), 
+            histogram(c_date_time, 1.0, 1), 
+            histogram(c_date_timev2, 1.0, 1), 
+            histogram(c_string_not_null, 1.0, 1)
+        FROM
+            ${tableName}
+    """
+
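+    // Read back the CTAS tables to verify the persisted histogram strings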
+    qt_select "SELECT * from ${tableCTAS1}"
+    qt_select "SELECT * from ${tableCTAS2}"
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql "DROP TABLE IF EXISTS ${tableCTAS1}"
+    sql "DROP TABLE IF EXISTS ${tableCTAS2}"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
