zuochunwei commented on a change in pull request #6625:
URL: https://github.com/apache/incubator-doris/pull/6625#discussion_r708986529
##########
File path: be/src/olap/hll.cpp
##########
@@ -100,36 +102,72 @@ void HyperLogLog::merge(const HyperLogLog& other) {
}
case HLL_DATA_EXPLICIT: {
switch (other._type) {
- case HLL_DATA_EXPLICIT:
+ case HLL_DATA_EXPLICIT: {
// Merge other's explicit values first, then check if the number is exceed
// HLL_EXPLICIT_INT64_NUM. This is OK because the max value is 2 * 160.
- _hash_set.insert(other._hash_set.begin(), other._hash_set.end());
- if (_hash_set.size() > HLL_EXPLICIT_INT64_NUM) {
- _convert_explicit_to_register();
- _type = HLL_DATA_FULL;
+ if (other._explicit_data_num > HLL_EXPLICIT_INT64_NUM / 2) {
//merge
+ uint64_t explicit_data[HLL_EXPLICIT_INT64_NUM * 2];
+ memcpy(explicit_data, _explicit_data, sizeof(*_explicit_data)
* _explicit_data_num);
+ uint32_t explicit_data_num = _explicit_data_num;
+ _explicit_data_num = 0;
+
+ // merge _explicit_data and other's _explicit_data to _explicit_data
+ uint32_t i = 0, j = 0, k = 0;
+ while (i < explicit_data_num || j < other._explicit_data_num) {
+ if (i == explicit_data_num) {
+ uint32_t n = other._explicit_data_num - j;
+ memcpy(_explicit_data + k, other._explicit_data + j, n
* sizeof(*_explicit_data));
+ k += n; break;
+ } else if (j == other._explicit_data_num) {
+ uint32_t n = explicit_data_num - i;
+ memcpy(_explicit_data + k, explicit_data + i, n *
sizeof(*_explicit_data));
+ k += n; break;
+ } else {
+ if (explicit_data[i] < other._explicit_data[j]) {
+ _explicit_data[k++] = explicit_data[i++];
+ } else if (explicit_data[i] > other._explicit_data[j])
{
+ _explicit_data[k++] = other._explicit_data[j++];
+ } else {
+ _explicit_data[k++] = explicit_data[i++]; j++;
+ }
+ }
+ }
+ _explicit_data_num = k;
+ } else { //依次插入
Review comment:
ok
##########
File path: be/src/olap/hll.cpp
##########
@@ -363,53 +427,53 @@ int64_t HyperLogLog::estimate_cardinality() const {
// there are relatively large fluctuations, we fixed the problem refer to redis.
double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate *
estimate) -
1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
- 1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 *
1.0e-3 * estimate +
- 83.3216;
+ 1.2940 * 1.0e-7 * (estimate * estimate) -
+ 5.2921 * 1.0e-3 * estimate + 83.3216;
estimate -= estimate * (bias / 100);
}
return (int64_t)(estimate + 0.5);
}
void HllSetResolver::parse() {
// skip LengthValueType
- char* pdata = _buf_ref;
+ char* pdata = _buf_ref;
_set_type = (HllDataType)pdata[0];
char* sparse_data = NULL;
switch (_set_type) {
- case HLL_DATA_EXPLICIT:
- // first byte : type
- // second~five byte : hash values's number
- // five byte later : hash value
- _explicit_num =
(ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]);
- _explicit_value =
- (uint64_t*)(pdata + sizeof(SetTypeValueType) +
sizeof(ExplicitLengthValueType));
- break;
- case HLL_DATA_SPARSE:
- // first byte : type
- // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not
zero
- // 2^HLL_COLUMN_PRECISION)/8 + 1以后value
- _sparse_count = (SparseLengthValueType*)(pdata +
sizeof(SetTypeValueType));
- sparse_data = pdata + sizeof(SetTypeValueType) +
sizeof(SparseLengthValueType);
- for (int i = 0; i < *_sparse_count; i++) {
- SparseIndexType* index = (SparseIndexType*)sparse_data;
- sparse_data += sizeof(SparseIndexType);
- SparseValueType* value = (SparseValueType*)sparse_data;
- _sparse_map[*index] = *value;
- sparse_data += sizeof(SetTypeValueType);
- }
- break;
- case HLL_DATA_FULL:
- // first byte : type
- // second byte later : hll register value
- _full_value_position = pdata + sizeof(SetTypeValueType);
- break;
- default:
- // HLL_DATA_EMPTY
- break;
+ case HLL_DATA_EXPLICIT:
Review comment:
ok
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]