morningman commented on a change in pull request #1845: Reduce size of HyperLogLog struct URL: https://github.com/apache/incubator-doris/pull/1845#discussion_r326670122
########## File path: be/src/olap/hll.cpp ########## @@ -20,172 +20,236 @@ #include <algorithm> #include <map> +#include "common/logging.h" +#include "util/coding.h" + using std::map; using std::nothrow; using std::string; using std::stringstream; namespace doris { -HyperLogLog::HyperLogLog(char* src) { - _type = (HllDataType)src[0]; +// TODO(zc): we should check if src is valid, it maybe corrupted +HyperLogLog::HyperLogLog(const uint8_t* src) { + deserialize(src); +} + +// Convert explicit values to register format, and clear explicit values. +// NOTE: this function won't modify _type. +void HyperLogLog::_convert_explicit_to_register() { + DCHECK(_type == HLL_DATA_EXPLICIT) << "_type(" << _type << ") should be explicit(" + << HLL_DATA_EXPLICIT << ")"; + _registers = new uint8_t[HLL_REGISTERS_COUNT]; memset(_registers, 0, HLL_REGISTERS_COUNT); - char* sparse_data = nullptr; - switch (_type) { - case HLL_DATA_EXPLICIT: - // first byte : type - // second~five byte : hash values's number - // five byte later : hash value - { - auto _explicit_num = (uint8_t) (src[sizeof(SetTypeValueType)]); - auto *_explicit_value = (uint64_t *) (src + sizeof(SetTypeValueType) + sizeof(uint8_t)); - for (int i = 0; i < _explicit_num; ++i) { - _hash_set.insert(_explicit_value[i]); - } - } - break; - case HLL_DATA_SPRASE: - // first byte : type - // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero - // 2^HLL_COLUMN_PRECISION)/8 + 1以后value - { - auto* _sparse_count = (SparseLengthValueType*)(src + sizeof (SetTypeValueType)); - sparse_data = src + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType); - std::map<SparseIndexType, SparseValueType> _sparse_map; - for (int i = 0; i < *_sparse_count; i++) { - auto* index = (SparseIndexType*)sparse_data; - sparse_data += sizeof(SparseIndexType); - auto* value = (SparseValueType*)sparse_data; - _sparse_map[*index] = *value; - sparse_data += sizeof(SetTypeValueType); - } + for (auto value : _hash_set) { + _update_registers(value); + } + // clear _hash_set + std::set<uint64_t>().swap(_hash_set); +} - for (auto iter: _sparse_map) { - _registers[iter.first] = (uint8_t)iter.second; - } - } +// Change HLL_DATA_EXPLICIT to HLL_DATA_FULL directly because they are +// implemented in the same way in memory. Review comment: What does this comment mean? It seems that HLL_DATA_EXPLICIT and HLL_DATA_FULL are not same in memory? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org