This is an automated email from the ASF dual-hosted git repository.

zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4a17152  Add tdigest compression param for percentile_approx function 
(#1939)
4a17152 is described below

commit 4a17152f40797299e31f50cd1d616865ac181e21
Author: shengyunyao <shunyu...@126.com>
AuthorDate: Fri Oct 11 18:56:59 2019 +0800

    Add tdigest compression param for percentile_approx function (#1939)
---
 be/src/exprs/aggregate_functions.cpp               | 192 ++++++++++++---------
 be/src/exprs/aggregate_functions.h                 |  12 +-
 .../aggregate-functions/percentile_approx.md       |  12 +-
 .../aggregate-functions/percentile_approx_EN.md    |  11 +-
 .../org/apache/doris/analysis/AnalyticWindow.java  |   6 +-
 .../main/java/org/apache/doris/analysis/Expr.java  |   4 +-
 .../apache/doris/analysis/FunctionCallExpr.java    |  20 ++-
 .../java/org/apache/doris/catalog/FunctionSet.java |  81 +++++----
 8 files changed, 199 insertions(+), 139 deletions(-)

diff --git a/be/src/exprs/aggregate_functions.cpp 
b/be/src/exprs/aggregate_functions.cpp
index 75de6da..6ae307f 100644
--- a/be/src/exprs/aggregate_functions.cpp
+++ b/be/src/exprs/aggregate_functions.cpp
@@ -179,12 +179,12 @@ void AggregateFunctions::count_remove(
 
 struct PercentileApproxState {
 public:
-    PercentileApproxState() : digest(new TDigest()){
-    }
+    PercentileApproxState() : digest(new TDigest()) {}
+    PercentileApproxState(double compression) : digest(new 
TDigest(compression)) {}
     ~PercentileApproxState() {
         delete digest;
     }
-    
+
     TDigest *digest = nullptr;
     double targetQuantile = -1.0;
 };
@@ -192,6 +192,15 @@ public:
 void AggregateFunctions::percentile_approx_init(FunctionContext* ctx, 
StringVal* dst) {
     dst->is_null = false;
     dst->len = sizeof(PercentileApproxState);
+    const AnyVal* digest_compression = ctx->get_constant_arg(2);
+    if (digest_compression != nullptr) {
+        double compression = reinterpret_cast<const 
DoubleVal*>(digest_compression)->val;
+        if (compression > 0 && compression < 10000) {
+            dst->ptr = (uint8_t*) new PercentileApproxState(compression);
+            return;
+        }
+    }
+
     dst->ptr = (uint8_t*) new PercentileApproxState();
 };
 
@@ -208,6 +217,20 @@ void 
AggregateFunctions::percentile_approx_update(FunctionContext* ctx, const T&
     percentile->targetQuantile = quantile.val;
 }
 
+template<typename T>
+void AggregateFunctions::percentile_approx_update(FunctionContext* ctx, const 
T& src, const DoubleVal& quantile,
+        const DoubleVal& digest_compression, StringVal* dst) {
+    if (src.is_null) {
+        return;
+    }
+    DCHECK(dst->ptr != NULL);
+    DCHECK_EQ(sizeof(PercentileApproxState), dst->len);
+
+    PercentileApproxState* percentile = 
reinterpret_cast<PercentileApproxState*>(dst->ptr);
+    percentile->digest->add(src.val);
+    percentile->targetQuantile = quantile.val;
+}
+
 StringVal AggregateFunctions::percentile_approx_serialize(FunctionContext* 
ctx, const StringVal& src) {
     DCHECK(!src.is_null);
 
@@ -1184,12 +1207,12 @@ StringVal 
AggregateFunctions::hll_finalize(FunctionContext* ctx, const StringVal
     memcpy(result_str.ptr, out_str.c_str(), result_str.len);
     return result_str;
 }
-    
+
 void AggregateFunctions::hll_union_agg_init(FunctionContext* ctx, HllVal* dst) 
{
     dst->init(ctx);
 }
 
-void AggregateFunctions::hll_union_agg_update(FunctionContext* ctx, 
+void AggregateFunctions::hll_union_agg_update(FunctionContext* ctx,
                                               const HllVal& src, HllVal* dst) {
     if (src.is_null) {
         return;
@@ -1218,11 +1241,11 @@ doris_udf::BigIntVal 
AggregateFunctions::hll_union_agg_finalize(doris_udf::Funct
 
 int64_t AggregateFunctions::hll_algorithm(uint8_t *pdata, int data_len) {
     DCHECK_EQ(data_len, HLL_REGISTERS_COUNT);
-    
+
     const int num_streams = HLL_REGISTERS_COUNT;
     // Empirical constants for the algorithm.
     float alpha = 0;
-    
+
     if (num_streams == 16) {
         alpha = 0.673f;
     } else if (num_streams == 32) {
@@ -1232,18 +1255,18 @@ int64_t AggregateFunctions::hll_algorithm(uint8_t 
*pdata, int data_len) {
     } else {
         alpha = 0.7213f / (1 + 1.079f / num_streams);
     }
-    
+
     float harmonic_mean = 0;
     int num_zero_registers = 0;
-    
+
     for (int i = 0; i < data_len; ++i) {
         harmonic_mean += powf(2.0f, -pdata[i]);
-        
+
         if (pdata[i] == 0) {
             ++num_zero_registers;
         }
     }
-    
+
     harmonic_mean = 1.0f / harmonic_mean;
     double estimate = alpha * num_streams * num_streams * harmonic_mean;
     // according to HerperLogLog current correction, if E is cardinal
@@ -1382,7 +1405,7 @@ public:
     BigIntVal count_finalize() {
         return BigIntVal(_set.size());
     }
-  
+
     // sum for double, decimal
     DoubleVal sum_finalize_double() {
         double sum = 0;
@@ -1392,7 +1415,7 @@ public:
         return DoubleVal(sum);
     }
 
-    // sum for largeint 
+    // sum for largeint
     LargeIntVal sum_finalize_largeint() {
         __int128 sum = 0;
         for (auto& value : _set) {
@@ -1416,16 +1439,16 @@ public:
 
 private:
 
-    class NumericHashHelper {   
+    class NumericHashHelper {
     public:
         size_t operator()(const T& obj) const {
-            size_t result = AnyValUtil::hash64_murmur(obj, 
HashUtil::MURMUR_SEED);   
+            size_t result = AnyValUtil::hash64_murmur(obj, 
HashUtil::MURMUR_SEED);
             return result;
         }
     };
 
     std::unordered_set<T, NumericHashHelper> _set;
-    // Because Anyval does not provide the hash function, in order 
+    // Because Anyval does not provide the hash function, in order
     // to adopt the type different from the template, the pointer is used
     // HybirdSetBase* _set;
     // _type is serialized into buffer by one byte
@@ -1460,7 +1483,7 @@ public:
         int total_serialized_set_length = 1;
         HybirdSetBase::IteratorBase* iterator = _set.begin();
         while (iterator->has_next()) {
-            const StringValue* value = 
+            const StringValue* value =
                         reinterpret_cast<const 
StringValue*>(iterator->get_value());
             total_serialized_set_length += STRING_LENGTH_RECORD_LENGTH + 
value->len;
             iterator->next();
@@ -1485,7 +1508,7 @@ public:
         }
         return result;
     }
-    
+
     void unserialize(StringVal& src) {
         uint8_t* reader = src.ptr;
         // skip type ,no used now
@@ -1502,12 +1525,12 @@ public:
         }
         DCHECK(reader == end);
     }
-   
-    // merge set 
+
+    // merge set
     void merge(MultiDistinctStringCountState& state) {
         _set.insert(&(state._set));
     }
-    
+
     BigIntVal finalize() {
         return BigIntVal(_set.size());
     }
@@ -1515,17 +1538,17 @@ public:
     FunctionContext::Type set_type() {
         return _type;
     }
-  
-    static const int STRING_LENGTH_RECORD_LENGTH = 4; 
+
+    static const int STRING_LENGTH_RECORD_LENGTH = 4;
 private:
- 
+
     StringValueSet _set;
     // _type is serialized into buffer by one byte
     FunctionContext::Type _type;
 };
 
 // multi distinct state for decimal
-// serialize order type:int_len:frac_len:sign:int_len ... 
+// serialize order type:int_len:frac_len:sign:int_len ...
 class MultiDistinctDecimalState {
 public:
 
@@ -1548,8 +1571,8 @@ public:
 
     // type:one byte  value:sizeof(T)
     StringVal serialize(FunctionContext* ctx) {
-        const int serialized_set_length = sizeof(uint8_t) 
-                   + (DECIMAL_INT_LEN_BYTE_SIZE 
+        const int serialized_set_length = sizeof(uint8_t)
+                   + (DECIMAL_INT_LEN_BYTE_SIZE
                      + DECIMAL_FRAC_BYTE_SIZE
                      + DECIMAL_SIGN_BYTE_SIZE
                      + DECIMAL_BUFFER_BYTE_SIZE) * _set.size();
@@ -1567,9 +1590,9 @@ public:
             writer += DECIMAL_SIGN_BYTE_SIZE;
             memcpy(writer, value._buffer, DECIMAL_BUFFER_BYTE_SIZE);
             writer += DECIMAL_BUFFER_BYTE_SIZE;
-        }    
+        }
         return result;
-    }    
+    }
 
     void unserialize(StringVal& src) {
         const uint8_t* reader = src.ptr;
@@ -1590,9 +1613,9 @@ public:
             memcpy(value._buffer, reader, DECIMAL_BUFFER_BYTE_SIZE);
             reader += DECIMAL_BUFFER_BYTE_SIZE;
             _set.insert(value);
-        }    
-    }    
- 
+        }
+    }
+
     FunctionContext::Type set_type() {
         return _type;
     }
@@ -1600,12 +1623,12 @@ public:
     // merge set
     void merge(MultiDistinctDecimalState& state) {
         _set.insert(state._set.begin(), state._set.end());
-    }    
+    }
 
     // count
     BigIntVal count_finalize() {
         return BigIntVal(_set.size());
-    }   
+    }
 
     DecimalVal sum_finalize() {
         DecimalValue sum;
@@ -1613,15 +1636,15 @@ public:
              sum += value;
         }
         DecimalVal result;
-        sum.to_decimal_val(&result); 
+        sum.to_decimal_val(&result);
         return result;
     }
 
 private:
 
-    const int DECIMAL_INT_LEN_BYTE_SIZE = 1; 
-    const int DECIMAL_FRAC_BYTE_SIZE = 1; 
-    const int DECIMAL_SIGN_BYTE_SIZE = 1; 
+    const int DECIMAL_INT_LEN_BYTE_SIZE = 1;
+    const int DECIMAL_FRAC_BYTE_SIZE = 1;
+    const int DECIMAL_SIGN_BYTE_SIZE = 1;
     const int DECIMAL_BUFFER_BYTE_SIZE = 36;
 
     std::unordered_set<DecimalValue> _set;
@@ -1650,7 +1673,7 @@ public:
 
     // type:one byte  value:sizeof(T)
     StringVal serialize(FunctionContext* ctx) {
-        const int serialized_set_length = sizeof(uint8_t) 
+        const int serialized_set_length = sizeof(uint8_t)
             + DECIMAL_BYTE_SIZE * _set.size();
         StringVal result(ctx, serialized_set_length);
         uint8_t* writer = result.ptr;
@@ -1661,9 +1684,9 @@ public:
             __int128 v = value.value();
             memcpy(writer, &v, DECIMAL_BYTE_SIZE);
             writer += DECIMAL_BYTE_SIZE;
-        }    
+        }
         return result;
-    }    
+    }
 
     void unserialize(StringVal& src) {
         const uint8_t* reader = src.ptr;
@@ -1678,9 +1701,9 @@ public:
             DecimalV2Value value(v);
             reader += DECIMAL_BYTE_SIZE;
             _set.insert(value);
-        }    
-    }    
- 
+        }
+    }
+
     FunctionContext::Type set_type() {
         return _type;
     }
@@ -1688,12 +1711,12 @@ public:
     // merge set
     void merge(MultiDistinctDecimalV2State& state) {
         _set.insert(state._set.begin(), state._set.end());
-    }    
+    }
 
     // count
     BigIntVal count_finalize() {
         return BigIntVal(_set.size());
-    }   
+    }
 
     DecimalV2Val sum_finalize() {
         DecimalV2Value sum;
@@ -1701,13 +1724,13 @@ public:
              sum += value;
         }
         DecimalV2Val result;
-        sum.to_decimal_val(&result); 
+        sum.to_decimal_val(&result);
         return result;
     }
 
 private:
     const int DECIMAL_BYTE_SIZE = 16;
-    
+
     std::unordered_set<DecimalV2Value> _set;
     FunctionContext::Type _type;
 };
@@ -1716,7 +1739,7 @@ private:
 // serialize order type:packed_time:type:packed_time:type ...
 class MultiDistinctCountDateState {
 public:
-    
+
     static void create(StringVal* dst) {
         dst->is_null = false;
         const int state_size = sizeof(MultiDistinctCountDateState);
@@ -1725,18 +1748,18 @@ public:
         dst->len = state_size;
         dst->ptr = (uint8_t*)state;
     }
-    
+
     static void destory(const StringVal& dst) {
         delete (MultiDistinctCountDateState*)dst.ptr;
     }
-    
+
     void update(DateTimeVal& t) {
         _set.insert(t);
     }
-    
+
     // type:one byte  value:sizeof(T)
     StringVal serialize(FunctionContext* ctx) {
-        const int serialized_set_length = sizeof(uint8_t) + 
+        const int serialized_set_length = sizeof(uint8_t) +
                    (DATETIME_PACKED_TIME_BYTE_SIZE + DATETIME_TYPE_BYTE_SIZE) 
* _set.size();
         StringVal result(ctx, serialized_set_length);
         uint8_t* writer = result.ptr;
@@ -1754,7 +1777,7 @@ public:
         }
         return result;
     }
-    
+
     void unserialize(StringVal& src) {
         const uint8_t* reader = src.ptr;
         // type
@@ -1774,47 +1797,47 @@ public:
             _set.insert(value);
         }
     }
-    
+
     // merge set
     void merge(MultiDistinctCountDateState& state) {
         _set.insert(state._set.begin(), state._set.end());
     }
-    
+
     // count
     BigIntVal count_finalize() {
         return BigIntVal(_set.size());
     }
-   
+
     FunctionContext::Type set_type() {
         return _type;
     }
- 
+
 private:
-   
-    class DateTimeHashHelper {    
+
+    class DateTimeHashHelper {
     public:
         size_t operator()(const DateTimeVal& obj) const {
-            size_t result = AnyValUtil::hash64_murmur(obj, 
HashUtil::MURMUR_SEED);   
+            size_t result = AnyValUtil::hash64_murmur(obj, 
HashUtil::MURMUR_SEED);
             return result;
         }
-    }; 
- 
+    };
+
     const int DATETIME_PACKED_TIME_BYTE_SIZE = 8;
     const int DATETIME_TYPE_BYTE_SIZE = 4;
 
     std::unordered_set<DateTimeVal, DateTimeHashHelper> _set;
     FunctionContext::Type _type;
 };
-    
+
 template <typename T>
 void AggregateFunctions::count_or_sum_distinct_numeric_init(FunctionContext* 
ctx, StringVal* dst) {
     MultiDistinctNumericState<T>::create(dst);
 }
-    
+
 void AggregateFunctions::count_distinct_string_init(FunctionContext* ctx, 
StringVal* dst) {
     MultiDistinctStringCountState::create(dst);
 }
-    
+
 void AggregateFunctions::count_or_sum_distinct_decimal_init(FunctionContext* 
ctx, StringVal* dst) {
     MultiDistinctDecimalState::create(dst);
 }
@@ -1822,7 +1845,7 @@ void 
AggregateFunctions::count_or_sum_distinct_decimal_init(FunctionContext* ctx
 void AggregateFunctions::count_or_sum_distinct_decimalv2_init(FunctionContext* 
ctx, StringVal* dst) {
     MultiDistinctDecimalV2State::create(dst);
 }
-    
+
 void AggregateFunctions::count_distinct_date_init(FunctionContext* ctx, 
StringVal* dst) {
     MultiDistinctCountDateState::create(dst);
 }
@@ -1835,7 +1858,7 @@ void 
AggregateFunctions::count_or_sum_distinct_numeric_update(FunctionContext* c
     MultiDistinctNumericState<T>* state = 
reinterpret_cast<MultiDistinctNumericState<T>*>(dst->ptr);
     state->update(src);
 }
-    
+
 void AggregateFunctions::count_distinct_string_update(FunctionContext* ctx, 
StringVal& src,
                            StringVal* dst) {
     DCHECK(!dst->is_null);
@@ -1844,7 +1867,7 @@ void 
AggregateFunctions::count_distinct_string_update(FunctionContext* ctx, Stri
     StringValue sv = StringValue::from_string_val(src);
     state->update(&sv);
 }
-    
+
 void AggregateFunctions::count_or_sum_distinct_decimal_update(FunctionContext* 
ctx, DecimalVal& src,
                                                               StringVal* dst) {
     DCHECK(!dst->is_null);
@@ -1860,7 +1883,7 @@ void 
AggregateFunctions::count_or_sum_distinct_decimalv2_update(FunctionContext*
     MultiDistinctDecimalV2State* state = 
reinterpret_cast<MultiDistinctDecimalV2State*>(dst->ptr);
     state->update(src);
 }
- 
+
 void AggregateFunctions::count_distinct_date_update(FunctionContext* ctx, 
DateTimeVal& src,
                                                               StringVal* dst) {
     DCHECK(!dst->is_null);
@@ -1877,14 +1900,14 @@ void 
AggregateFunctions::count_or_sum_distinct_numeric_merge(FunctionContext* ct
    MultiDistinctNumericState<T>* dst_state = 
reinterpret_cast<MultiDistinctNumericState<T>*>(dst->ptr);
    // unserialize src
    StringVal src_state_val;
-   MultiDistinctNumericState<T>::create(&src_state_val); 
+   MultiDistinctNumericState<T>::create(&src_state_val);
    MultiDistinctNumericState<T>* src_state = 
reinterpret_cast<MultiDistinctNumericState<T>*>(src_state_val.ptr);
    src_state->unserialize(src);
    DCHECK(dst_state->set_type() == src_state->set_type());
    dst_state->merge(*src_state);
    MultiDistinctNumericState<T>::destory(src_state_val);
 }
-    
+
 void AggregateFunctions::count_distinct_string_merge(FunctionContext* ctx, 
StringVal& src,
                           StringVal* dst) {
     DCHECK(!dst->is_null);
@@ -1897,7 +1920,7 @@ void 
AggregateFunctions::count_distinct_string_merge(FunctionContext* ctx, Strin
     src_state->unserialize(src);
     DCHECK(dst_state->set_type() == src_state->set_type());
     dst_state->merge(*src_state);
-    MultiDistinctStringCountState::destory(src_state_val); 
+    MultiDistinctStringCountState::destory(src_state_val);
 }
 
 
@@ -1930,7 +1953,7 @@ void 
AggregateFunctions::count_or_sum_distinct_decimalv2_merge(FunctionContext*
     dst_state->merge(*src_state);
     MultiDistinctDecimalV2State::destory(src_state_val);
 }
-    
+
 void AggregateFunctions::count_distinct_date_merge(FunctionContext* ctx, 
StringVal& src,
                                                              StringVal* dst) {
     DCHECK(!dst->is_null);
@@ -1945,7 +1968,7 @@ void 
AggregateFunctions::count_distinct_date_merge(FunctionContext* ctx, StringV
     dst_state->merge(*src_state);
     MultiDistinctCountDateState::destory(src_state_val);
 }
-    
+
 template <typename T>
 StringVal 
AggregateFunctions::count_or_sum_distinct_numeric_serialize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
@@ -1955,7 +1978,7 @@ StringVal 
AggregateFunctions::count_or_sum_distinct_numeric_serialize(FunctionCo
     MultiDistinctNumericState<T>::destory(state_sv);
     return result;
 }
-    
+
 StringVal AggregateFunctions::count_distinct_string_serialize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     MultiDistinctStringCountState* state = 
reinterpret_cast<MultiDistinctStringCountState*>(state_sv.ptr);
@@ -1973,7 +1996,7 @@ StringVal 
AggregateFunctions::count_or_sum_distinct_decimal_serialize(FunctionCo
     MultiDistinctDecimalState::destory(state_sv);
     return result;
 }
-    
+
 StringVal 
AggregateFunctions::count_or_sum_distinct_decimalv2_serialize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     MultiDistinctDecimalV2State* state = 
reinterpret_cast<MultiDistinctDecimalV2State*>(state_sv.ptr);
@@ -1991,7 +2014,7 @@ StringVal 
AggregateFunctions::count_distinct_date_serialize(FunctionContext* ctx
     MultiDistinctCountDateState::destory(state_sv);
     return result;
 }
-    
+
 template <typename T>
 BigIntVal 
AggregateFunctions::count_or_sum_distinct_numeric_finalize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
@@ -2000,7 +2023,7 @@ BigIntVal 
AggregateFunctions::count_or_sum_distinct_numeric_finalize(FunctionCon
     MultiDistinctNumericState<T>::destory(state_sv);
     return result;
 }
-    
+
 BigIntVal AggregateFunctions::count_distinct_string_finalize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     MultiDistinctStringCountState* state = 
reinterpret_cast<MultiDistinctStringCountState*>(state_sv.ptr);
@@ -2051,7 +2074,7 @@ BigIntVal 
AggregateFunctions::count_distinct_decimalv2_finalize(FunctionContext*
     MultiDistinctDecimalV2State::destory(state_sv);
     return result;
 }
-    
+
 DecimalVal AggregateFunctions::sum_distinct_decimal_finalize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     MultiDistinctDecimalState* state = 
reinterpret_cast<MultiDistinctDecimalState*>(state_sv.ptr);
@@ -2067,7 +2090,7 @@ DecimalV2Val 
AggregateFunctions::sum_distinct_decimalv2_finalize(FunctionContext
     MultiDistinctDecimalV2State::destory(state_sv);
     return result;
 }
-    
+
 BigIntVal AggregateFunctions::count_distinct_date_finalize(FunctionContext* 
ctx, const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     MultiDistinctCountDateState* state = 
reinterpret_cast<MultiDistinctCountDateState*>(state_sv.ptr);
@@ -2075,7 +2098,7 @@ BigIntVal 
AggregateFunctions::count_distinct_date_finalize(FunctionContext* ctx,
     MultiDistinctCountDateState::destory(state_sv);
     return result;
 }
-    
+
 // An implementation of a simple single pass variance algorithm. A standard 
UDA must
 // be single pass (i.e. does not scan the table more than once), so the most 
canonical
 // two pass approach is not practical.
@@ -2157,7 +2180,7 @@ DoubleVal 
AggregateFunctions::knuth_var_pop_finalize(FunctionContext* ctx,
     return DoubleVal(variance);
 }
 
-DoubleVal AggregateFunctions::knuth_stddev_finalize(FunctionContext* ctx, 
+DoubleVal AggregateFunctions::knuth_stddev_finalize(FunctionContext* ctx,
                                                   const StringVal& state_sv) {
     DCHECK(!state_sv.is_null);
     DCHECK_EQ(state_sv.len, sizeof(KnuthVarianceState));
@@ -2792,4 +2815,7 @@ template void 
AggregateFunctions::offset_fn_update<DecimalV2Val>(
 
 template void 
AggregateFunctions::percentile_approx_update<doris_udf::DoubleVal>(
     FunctionContext* ctx, const doris_udf::DoubleVal&, const 
doris_udf::DoubleVal&, doris_udf::StringVal*);
+
+template void 
AggregateFunctions::percentile_approx_update<doris_udf::DoubleVal>(
+    FunctionContext* ctx, const doris_udf::DoubleVal&, const 
doris_udf::DoubleVal&, const doris_udf::DoubleVal&, doris_udf::StringVal*);
 }
diff --git a/be/src/exprs/aggregate_functions.h 
b/be/src/exprs/aggregate_functions.h
index 0e49334..d9a0590 100644
--- a/be/src/exprs/aggregate_functions.h
+++ b/be/src/exprs/aggregate_functions.h
@@ -71,6 +71,10 @@ public:
     template <typename T>
     static void percentile_approx_update(FunctionContext* ctx, const T& src, 
const DoubleVal& quantile, StringVal* dst);
 
+    template <typename T>
+    static void percentile_approx_update(FunctionContext* ctx, const T& src, 
const DoubleVal& quantile,
+            const DoubleVal& digest_compression, StringVal* dst);
+
     static void percentile_approx_merge(FunctionContext* ctx, const StringVal& 
src, StringVal* dst);
 
     static DoubleVal percentile_approx_finalize(FunctionContext* ctx, const 
StringVal& src);
@@ -200,13 +204,13 @@ dst);
     template <typename T>
     static BigIntVal count_or_sum_distinct_numeric_finalize(FunctionContext* 
ctx, const StringVal& state_sv);
 
-    // count distinct in multi distinct for string 
+    // count distinct in multi distinct for string
     static void count_distinct_string_init(doris_udf::FunctionContext* ctx, 
doris_udf::StringVal* dst);
     static void count_distinct_string_update(FunctionContext* ctx, StringVal& 
src, StringVal* dst);
     static void count_distinct_string_merge(FunctionContext* ctx, StringVal& 
src, StringVal* dst);
     static StringVal count_distinct_string_serialize(FunctionContext* ctx, 
const StringVal& state_sv);
     static BigIntVal count_distinct_string_finalize(FunctionContext* ctx, 
const StringVal& state_sv);
- 
+
     // count distinct in multi distinct for decimal
     static void count_or_sum_distinct_decimal_init(doris_udf::FunctionContext* 
ctx, doris_udf::StringVal* dst);
     static void 
count_or_sum_distinct_decimalv2_init(doris_udf::FunctionContext* ctx, 
doris_udf::StringVal* dst);
@@ -227,13 +231,13 @@ dst);
     static void count_distinct_date_merge(FunctionContext* ctx, StringVal& 
src, StringVal* dst);
     static StringVal count_distinct_date_serialize(FunctionContext* ctx, const 
StringVal& state_sv);
     static BigIntVal count_distinct_date_finalize(FunctionContext* ctx, const 
StringVal& state_sv);
- 
+
     template <typename T>
     static BigIntVal sum_distinct_bigint_finalize(FunctionContext* ctx, const 
StringVal& state_sv);
     template <typename T>
     static LargeIntVal sum_distinct_largeint_finalize(FunctionContext* ctx, 
const StringVal& state_sv);
     template <typename T>
-    static DoubleVal sum_distinct_double_finalize(FunctionContext* ctx, const 
StringVal& state_sv); 
+    static DoubleVal sum_distinct_double_finalize(FunctionContext* ctx, const 
StringVal& state_sv);
 
     /// Knuth's variance algorithm, more numerically stable than canonical 
stddev
     /// algorithms; reference implementation:
diff --git 
a/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/percentile_approx.md
 
b/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/percentile_approx.md
index f47ce1c..5d91110 100755
--- 
a/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/percentile_approx.md
+++ 
b/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/percentile_approx.md
@@ -2,11 +2,14 @@
 ## description
 ### Syntax
 
-`PERCENTILE_APPROX(expr, DOUBLE p)`
+`PERCENTILE_APPROX(expr, DOUBLE p[, DOUBLE compression])`
 
 
 返回第p个百分位点的近似值,p的值介于0到1之间
 
+compression参数是可选项,可设置范围是(0, 10000),值越大,精度越高,内存消耗越大,计算耗时越长。
+compression参数未指定或设置的值在(0, 10000)范围外,以10000的默认值运行
+
 该函数使用固定大小的内存,因此对于高基数的列可以使用更少的内存,可用于计算tp99等统计值
 
 ## example
@@ -17,5 +20,12 @@ MySQL > select `table`, percentile_approx(cost_time,0.99) 
from log_statis group
 +----------+--------------------------------------+
 | test     |                                54.22 |
 +----------+--------------------------------------+
+
+MySQL > select `table`, percentile_approx(cost_time,0.99, 100) from log_statis 
group by `table`;
++----------+-------------------------------------------+
+| table    | percentile_approx(`cost_time`, 0.99, 100) |
++----------+-------------------------------------------+
+| test     |                                     54.21 |
++----------+-------------------------------------------+
 ##keyword
 PERCENTILE_APPROX,PERCENTILE,APPROX
diff --git 
a/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/percentile_approx_EN.md
 
b/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/percentile_approx_EN.md
index e8bec54..d4e3dbb 100644
--- 
a/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/percentile_approx_EN.md
+++ 
b/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/percentile_approx_EN.md
@@ -2,11 +2,12 @@
 ## Description
 ### Syntax
 
-`PERCENTILE_APPROX(expr, DOUBLE p)`
-
+`PERCENTILE_APPROX(expr, DOUBLE p[, DOUBLE compression])`
 
 Return the approximation of the point p, where the value of P is between 0 and 
1.
 
+The compression parameter is optional and can be set to a value in the range 
(0, 10000). The larger the compression, the more precise the result, but the 
more memory and time the computation costs. If it is not set, or is set outside 
this range, PERCENTILE_APPROX runs with the default compression of 10000.
+
 This function uses fixed size memory, so less memory can be used for columns 
with high cardinality, and can be used to calculate statistics such as tp99.
 
 ## example
@@ -17,5 +18,11 @@ MySQL > select `table`, percentile_approx(cost_time,0.99) 
from log_statis group
 +----------+--------------------------------------+
 | test     |                                54.22 |
 +----------+--------------------------------------+
+MySQL > select `table`, percentile_approx(cost_time,0.99, 100) from log_statis 
group by `table`;
++----------+-------------------------------------------+
+| table    | percentile_approx(`cost_time`, 0.99, 100) |
++----------+-------------------------------------------+
+| test     |                                     54.21 |
++----------+-------------------------------------------+
 ##keyword
 PERCENTILE_APPROX,PERCENTILE,APPROX
diff --git a/fe/src/main/java/org/apache/doris/analysis/AnalyticWindow.java 
b/fe/src/main/java/org/apache/doris/analysis/AnalyticWindow.java
index f1a921e..1a6d220 100644
--- a/fe/src/main/java/org/apache/doris/analysis/AnalyticWindow.java
+++ b/fe/src/main/java/org/apache/doris/analysis/AnalyticWindow.java
@@ -350,7 +350,7 @@ public class AnalyticWindow {
 
         if (e.isConstant() && e.getType().isNumericType()) {
             try {
-                val=e.getConstFromExpr(e);
+                val = Expr.getConstFromExpr(e);
 //                val = TColumnValueUtil.getNumericVal(
 //                        FeSupport.EvalConstExpr(e, 
analyzer.getQueryGlobals()));
 
@@ -400,8 +400,8 @@ public class AnalyticWindow {
         try {
 //            TColumnValue val1 = FeSupport.EvalConstExpr(e1, 
analyzer.getQueryGlobals());
 //            TColumnValue val2 = FeSupport.EvalConstExpr(e2, 
analyzer.getQueryGlobals());
-            double left = e1.getConstFromExpr(e1);
-            double right = e2.getConstFromExpr(e2);
+            double left = Expr.getConstFromExpr(e1);
+            double right = Expr.getConstFromExpr(e2);
 
             if (left > right) {
                 throw new AnalysisException(
diff --git a/fe/src/main/java/org/apache/doris/analysis/Expr.java 
b/fe/src/main/java/org/apache/doris/analysis/Expr.java
index 9acc131..c89f447 100644
--- a/fe/src/main/java/org/apache/doris/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/doris/analysis/Expr.java
@@ -632,7 +632,7 @@ abstract public class Expr extends TreeNode<Expr> 
implements ParseNode, Cloneabl
      * If smap is null, this function is equivalent to clone().
      * If preserveRootType is true, the resulting expr tree will be cast if 
necessary to
      * the type of 'this'.
-     * 
+     *
      * @throws AnalysisException
      */
     public Expr substitute(ExprSubstitutionMap smap, Analyzer analyzer, 
boolean preserveRootType)
@@ -1322,7 +1322,7 @@ abstract public class Expr extends TreeNode<Expr> 
implements ParseNode, Cloneabl
         return this;
     }
 
-    public double getConstFromExpr(Expr e) throws AnalysisException{
+    public static double getConstFromExpr(Expr e) throws AnalysisException{
         Preconditions.checkState(e.isConstant());
         double value = 0;
         if( e instanceof LiteralExpr){
diff --git a/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java 
b/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java
index dbdc79c..d152012 100644
--- a/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java
+++ b/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java
@@ -167,7 +167,7 @@ public class FunctionCallExpr extends Expr {
             fn = null;
         }
     }
-    
+
     @Override
     public boolean equals(Object obj) {
         if (!super.equals(obj)) {
@@ -278,7 +278,7 @@ public class FunctionCallExpr extends Expr {
                 throw new AnalysisException(
                         "COUNT must have DISTINCT for multiple arguments: " + 
this.toSql());
             }
-            
+
             for (int i = 0; i < children.size(); i++) {
                 if (children.get(i).type.isHllType()) {
                     throw new AnalysisException(
@@ -319,7 +319,7 @@ public class FunctionCallExpr extends Expr {
                 if (arg0.type.isHllType()) {
                     throw new AnalysisException(
                             "group_concat requires second parameter can't be 
of type HLL: " + this.toSql());
-                } 
+                }
             }
             return;
         }
@@ -369,7 +369,7 @@ public class FunctionCallExpr extends Expr {
                     "SUM_DISTINCT requires a numeric parameter: " + 
this.toSql());
         }
 
-        if ((fnName.getFunction().equalsIgnoreCase("min") 
+        if ((fnName.getFunction().equalsIgnoreCase("min")
                 || fnName.getFunction().equalsIgnoreCase("max")
                 || fnName.getFunction().equalsIgnoreCase("DISTINCT_PC")
                 || fnName.getFunction().equalsIgnoreCase("DISTINCT_PCSA")
@@ -422,13 +422,19 @@ public class FunctionCallExpr extends Expr {
         }
 
         if (fnName.getFunction().equalsIgnoreCase("percentile_approx")) {
-            if (children.size() != 2) {
-                throw new AnalysisException("percentile_approx(expr, DOUBLE) 
requires two parameters");
+            if (children.size() != 2 && children.size() != 3) {
+                throw new AnalysisException("percentile_approx(expr, DOUBLE [, 
B]) requires two or three parameters");
             }
             if (!getChild(1).isConstant()) {
                 throw new AnalysisException("percentile_approx requires second 
parameter must be a constant : "
                         + this.toSql());
             }
+            if (children.size() == 3) {
+                if (!getChild(2).isConstant()) {
+                    throw new AnalysisException("percentile_approx requires 
the third parameter must be a constant : "
+                            + this.toSql());
+                }
+            }
         }
         return;
     }
@@ -559,7 +565,7 @@ public class FunctionCallExpr extends Expr {
             LOG.warn("fn {} not exists", fnName.getFunction());
             throw new 
AnalysisException(getFunctionNotFoundError(collectChildReturnTypes()));
         }
-        
+
         if (fn.getFunctionName().getFunction().equals("time_diff")) {
             fn.getReturnType().getPrimitiveType().setTimeType();
             return;
diff --git a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java 
b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java
index adb35c4..03ca91e 100644
--- a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java
+++ b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java
@@ -127,7 +127,7 @@ public class FunctionSet {
                     
"3maxIN9doris_udf11LargeIntValEEEvPNS2_15FunctionContextERKT_PS6_")
                .build();
 
-    private static final Map<Type, Type> MULTI_DISTINCT_SUM_RETURN_TYPE = 
+    private static final Map<Type, Type> MULTI_DISTINCT_SUM_RETURN_TYPE =
              ImmutableMap.<Type, Type>builder()
                     .put(Type.TINYINT, Type.BIGINT)
                     .put(Type.SMALLINT, Type.BIGINT)
@@ -138,7 +138,7 @@ public class FunctionSet {
                     .put(Type.LARGEINT, Type.LARGEINT)
                     .put(Type.DECIMAL, Type.DECIMAL)
                     .put(Type.DECIMALV2, Type.DECIMALV2)
-                    .build(); 
+                    .build();
 
     private static final Map<Type, String> MULTI_DISTINCT_INIT_SYMBOL =
             ImmutableMap.<Type, String>builder()
@@ -229,8 +229,8 @@ public class FunctionSet {
                     .put(Type.LARGEINT,
                             
"38count_or_sum_distinct_numeric_finalizeIN9doris_udf11LargeIntValEEENS2_9BigIntValEPNS2_15FunctionContextERKNS2_9StringValE")
                     .build();
-   
-    
+
+
     private static final Map<Type, String> MULTI_DISTINCT_SUM_FINALIZE_SYMBOL =
              ImmutableMap.<Type, String>builder()
                     .put(Type.BIGINT,
@@ -290,7 +290,7 @@ public class FunctionSet {
                 .put(Type.LARGEINT,
                     
"10hll_updateIN9doris_udf11LargeIntValEEEvPNS2_15FunctionContextERKT_PNS2_9StringValE")
                 .build();
-   
+
 
     private static final Map<Type, String> HLL_UNION_AGG_UPDATE_SYMBOL =
         ImmutableMap.<Type, String>builder()
@@ -299,7 +299,7 @@ public class FunctionSet {
                 .put(Type.HLL,
                         
"_ZN5doris12HllFunctions9hll_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_")
                 .build();
- 
+
     private static final Map<Type, String> OFFSET_FN_INIT_SYMBOL =
         ImmutableMap.<Type, String>builder()
                 .put(Type.BOOLEAN,
@@ -571,7 +571,7 @@ public class FunctionSet {
     }
 
     /**
-     * There are essential differences in the implementation of some functions 
for different 
+     * There are essential differences in the implementation of some functions 
for different
      * types params, which should be prohibited.
      * @param desc
      * @param candicate
@@ -581,7 +581,7 @@ public class FunctionSet {
         final String functionName = desc.getFunctionName().getFunction();
         final Type[] descArgTypes = desc.getArgs();
         final Type[] candicateArgTypes = candicate.getArgs();
-        if (functionName.equalsIgnoreCase("hex") 
+        if (functionName.equalsIgnoreCase("hex")
                 || functionName.equalsIgnoreCase("greast")
                 || functionName.equalsIgnoreCase("least")) {
             final ScalarType descArgType = (ScalarType)descArgTypes[0];
@@ -704,18 +704,18 @@ public class FunctionSet {
                     prefix + 
"12count_removeEPN9doris_udf15FunctionContextERKNS1_6AnyValEPNS1_9BigIntValE",
                     null, false, true, true));
 
-           
+
             // count in multi distinct
             if (t == Type.CHAR || t == Type.VARCHAR) {
-               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t), 
+               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t),
                     Type.BIGINT,
-                    Type.VARCHAR, 
+                    Type.VARCHAR,
                     prefix + 
"26count_distinct_string_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
                     prefix + 
"28count_distinct_string_updateEPN9doris_udf15FunctionContextERNS1_9StringValEPS4_",
-                    prefix + 
"27count_distinct_string_mergeEPN9doris_udf15FunctionContextERNS1_9StringValEPS4_",
 
-                    prefix + 
"31count_distinct_string_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
  
-                    null,      
-                    null, 
+                    prefix + 
"27count_distinct_string_mergeEPN9doris_udf15FunctionContextERNS1_9StringValEPS4_",
+                    prefix + 
"31count_distinct_string_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
+                    null,
+                    null,
                     prefix + 
"30count_distinct_string_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
                     false, true, true));
 
@@ -729,31 +729,31 @@ public class FunctionSet {
                     
"_ZN5doris15BitmapFunctions16bitmap_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
                     true, false, true));
 
-            } else if (t == Type.TINYINT || t == Type.SMALLINT || t == 
Type.INT 
+            } else if (t == Type.TINYINT || t == Type.SMALLINT || t == Type.INT
                 || t == Type.BIGINT || t == Type.LARGEINT || t == Type.DOUBLE) 
{
-               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t), 
+               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t),
                     Type.BIGINT,
-                    Type.VARCHAR, 
+                    Type.VARCHAR,
                     prefix + MULTI_DISTINCT_INIT_SYMBOL.get(t),
                     prefix + MULTI_DISTINCT_UPDATE_SYMBOL.get(t),
-                    prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t), 
-                    prefix + MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t),  
-                    null,                      
-                    null, 
+                    prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t),
+                    prefix + MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t),
+                    null,
+                    null,
                     prefix + MULTI_DISTINCT_COUNT_FINALIZE_SYMBOL.get(t),
                     false, true, true));
             } else if (t == Type.DATE || t == Type.DATETIME) {
-               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t), 
+               
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t),
                     Type.BIGINT,
-                    Type.VARCHAR, 
+                    Type.VARCHAR,
                     prefix + 
"24count_distinct_date_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
                     prefix + 
"26count_distinct_date_updateEPN9doris_udf15FunctionContextERNS1_11DateTimeValEPNS1_9StringValE",
                     prefix + 
"25count_distinct_date_mergeEPN9doris_udf15FunctionContextERNS1_9StringValEPS4_",
-                    prefix + 
"29count_distinct_date_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
  
-                    null,    
-                    null, 
+                    prefix + 
"29count_distinct_date_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
+                    null,
+                    null,
                     prefix + 
"28count_distinct_date_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
-                    false, true, true)); 
+                    false, true, true));
             } else if (t == Type.DECIMAL) {
                
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t),
                     Type.BIGINT,
@@ -765,7 +765,7 @@ public class FunctionSet {
                     null,
                     null,
                     prefix + 
"31count_distinct_decimal_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
-                    false, true, true)); 
+                    false, true, true));
             } else if (t == Type.DECIMALV2) {
                
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_count", 
Lists.newArrayList(t),
                     Type.BIGINT,
@@ -777,20 +777,20 @@ public class FunctionSet {
                     null,
                     null,
                     prefix + 
"33count_distinct_decimalv2_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
-                    false, true, true)); 
+                    false, true, true));
             }
 
             // sum in multi distinct
             if (t == Type.BIGINT || t == Type.LARGEINT || t == Type.DOUBLE) {
-                
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_sum", 
Lists.newArrayList(t), 
+                
addBuiltin(AggregateFunction.createBuiltin("multi_distinct_sum", 
Lists.newArrayList(t),
                     t,
-                    Type.VARCHAR, 
+                    Type.VARCHAR,
                     prefix + MULTI_DISTINCT_INIT_SYMBOL.get(t),
                     prefix + MULTI_DISTINCT_UPDATE_SYMBOL.get(t),
-                    prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t), 
-                    prefix + MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t),  
-                    null,                      
-                    null, 
+                    prefix + MULTI_DISTINCT_MERGE_SYMBOL.get(t),
+                    prefix + MULTI_DISTINCT_SERIALIZE_SYMBOL.get(t),
+                    null,
+                    null,
                     prefix + MULTI_DISTINCT_SUM_FINALIZE_SYMBOL.get(t),
                     false, true, true));
             }  else if (t == Type.DECIMAL) {
@@ -998,7 +998,14 @@ public class FunctionSet {
                 prefix + 
"27percentile_approx_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
                 prefix + 
"26percentile_approx_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
                 false, false, false));
-
+        addBuiltin(AggregateFunction.createBuiltin("percentile_approx",
+                Lists.<Type>newArrayList(Type.DOUBLE, Type.DOUBLE, 
Type.DOUBLE), Type.DOUBLE, Type.VARCHAR,
+                prefix + 
"22percentile_approx_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
+                prefix + 
"24percentile_approx_updateIN9doris_udf9DoubleValEEEvPNS2_15FunctionContextERKT_RKS3_SA_PNS2_9StringValE",
+                prefix + 
"23percentile_approx_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_",
+                prefix + 
"27percentile_approx_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
+                prefix + 
"26percentile_approx_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
+                false, false, false));
 
         // Avg
         // TODO: switch to CHAR(sizeof(AvgIntermediateType) when that becomes 
available


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to