This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch theta_compression
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/theta_compression by this push:
new 3631c4f removed unused code, reused parser
3631c4f is described below
commit 3631c4fc113b2316c02d83a25534892ee6ed1554
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Jan 11 14:32:01 2023 -0800
removed unused code, reused parser
---
theta/include/theta_sketch.hpp | 6 -
theta/include/theta_sketch_impl.hpp | 249 ++++--------------------------------
2 files changed, 23 insertions(+), 232 deletions(-)
diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index 10133b6..8950b63 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -394,13 +394,7 @@ private:
uint64_t theta_;
std::vector<uint64_t, Allocator> entries_;
- vector_bytes serialize_version_4_SLZ(unsigned header_size_bytes = 0) const;
vector_bytes serialize_version_4_MLZ(unsigned header_size_bytes = 0) const;
- vector_bytes serialize_version_4_FLZ(unsigned header_size_bytes = 0) const;
- vector_bytes serialize_version_4_ULEB128(unsigned header_size_bytes = 0)
const;
-
- static compact_theta_sketch_alloc deserialize_version_4(const void* bytes,
size_t size,
- uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
virtual void print_specifics(std::ostringstream& os) const;
};
diff --git a/theta/include/theta_sketch_impl.hpp
b/theta/include/theta_sketch_impl.hpp
index e09022b..a49cb31 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -410,70 +410,6 @@ auto
compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
return serialize_version_4_MLZ(header_size_bytes);
}
-// should be called for ordered sketches that are not empty and not single item
-template<typename A>
-auto compact_theta_sketch_alloc<A>::serialize_version_4_FLZ(unsigned
header_size_bytes) const -> vector_bytes {
- const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
- // compression based on leading zeros in deltas between ordered hash values
- // assumes ordered sketch
- uint64_t previous = 0;
- uint8_t min_zeros = 64;
- uint8_t max_zeros = 0;
- std::vector<uint8_t> leading_zeros(entries_.get_allocator());
- leading_zeros.reserve(entries_.size());
- std::vector<uint64_t> deltas(entries_.get_allocator());
- deltas.reserve(entries_.size());
- size_t compressed_bits = 0;
- for (unsigned i = 0; i < entries_.size(); ++i) {
- const uint64_t delta = entries_[i] - previous;
- deltas.push_back(delta);
- previous = entries_[i];
- const uint8_t zeros = count_leading_zeros_in_u64(delta);
- leading_zeros.push_back(zeros);
- min_zeros = std::min(min_zeros, zeros);
- max_zeros = std::max(max_zeros, zeros);
- compressed_bits += 63 - zeros; // the first 1 is understood
- }
- const uint8_t count_zeros_bits = 8 - byte_leading_zeros_table[max_zeros -
min_zeros];
- compressed_bits += count_zeros_bits * entries_.size();
-
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
- const uint8_t num_entries_bits = 31 -
count_leading_zeros_in_u32(num_entries); // the first 1 is understood
- compressed_bits += 5 + num_entries_bits;
-
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs +
std::ceil(compressed_bits / 8.0);
- vector_bytes bytes(size, 0, entries_.get_allocator());
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- ptr += copy_to_mem(preamble_longs, ptr);
- const uint8_t serial_version = 4;
- ptr += copy_to_mem(serial_version, ptr);
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(type, ptr);
- ptr += copy_to_mem(min_zeros, ptr);
- ptr += copy_to_mem(count_zeros_bits, ptr);
- const uint8_t flags_byte(
- (1 << flags::IS_COMPACT) |
- (1 << flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
- );
- ptr += copy_to_mem(flags_byte, ptr);
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(seed_hash, ptr);
- if (this->is_estimation_mode()) {
- ptr += copy_to_mem(theta_, ptr);
- }
-
- size_t offset_bits = pack_bits(num_entries_bits, 5, ptr, 0);
- offset_bits = pack_bits(num_entries, num_entries_bits, ptr, offset_bits);
- for (unsigned i = 0; i < entries_.size(); ++i) {
- offset_bits = pack_bits(leading_zeros[i] - min_zeros, count_zeros_bits,
ptr, offset_bits);
- offset_bits = pack_bits(deltas[i], 63 - leading_zeros[i], ptr,
offset_bits);
- }
- return bytes;
-}
-
template<typename A>
auto compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned
header_size_bytes) const -> vector_bytes {
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
@@ -547,95 +483,6 @@ auto
compact_theta_sketch_alloc<A>::serialize_version_4_MLZ(unsigned header_size
return bytes;
}
-template<typename A>
-auto compact_theta_sketch_alloc<A>::serialize_version_4_SLZ(unsigned
header_size_bytes) const -> vector_bytes {
- const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
- // compression based on leading zeros in deltas between ordered hash values
- // assumes ordered sketch
- const size_t fixed_size = header_size_bytes + sizeof(uint64_t) *
preamble_longs;
- const size_t estimated_variable_size = 5 // for num_entries
- + 9 * entries_.size();
- vector_bytes bytes(fixed_size + estimated_variable_size, 0,
entries_.get_allocator());
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- ptr += copy_to_mem(preamble_longs, ptr);
- const uint8_t serial_version = 4;
- ptr += copy_to_mem(serial_version, ptr);
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(type, ptr);
- ptr += sizeof(uint16_t); // unused
- const uint8_t flags_byte(
- (1 << flags::IS_COMPACT) |
- (1 << flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
- );
- ptr += copy_to_mem(flags_byte, ptr);
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(seed_hash, ptr);
- if (this->is_estimation_mode()) {
- ptr += copy_to_mem(theta_, ptr);
- }
-
- ptr += ULEB128(entries_.size(), ptr);
-
- uint8_t offset_bits = 0;
- uint64_t previous = 0;
- for (const uint64_t entry: entries_) {
- const uint64_t delta = entry - previous;
- previous = entry;
- const uint8_t zeros = count_leading_zeros_in_u64(delta);
- offset_bits = pack_bits(zeros, 6, ptr, offset_bits);
- offset_bits = pack_bits(delta, 63 - zeros, ptr, offset_bits);
- }
- if (offset_bits > 0) ++ptr;
- bytes.resize(ptr - bytes.data());
- return bytes;
-}
-
-template<typename A>
-auto compact_theta_sketch_alloc<A>::serialize_version_4_ULEB128(unsigned
header_size_bytes) const -> vector_bytes {
- const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
- // compression based on ULEB128 code of deltas between ordered hash values
- // assumes ordered sketch
-
- const size_t fixed_size = header_size_bytes + sizeof(uint64_t) *
preamble_longs;
- const size_t estimated_variable_size = 5 // for num_entries
- + 10 * entries_.size();
- vector_bytes bytes(fixed_size + estimated_variable_size, 0,
entries_.get_allocator());
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- ptr += copy_to_mem(preamble_longs, ptr);
- const uint8_t serial_version = 4;
- ptr += copy_to_mem(serial_version, ptr);
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(type, ptr);
- ptr += sizeof(uint16_t); // unused
- const uint8_t flags_byte(
- (1 << flags::IS_COMPACT) |
- (1 << flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
- );
- ptr += copy_to_mem(flags_byte, ptr);
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(seed_hash, ptr);
- if (this->is_estimation_mode()) {
- ptr += copy_to_mem(theta_, ptr);
- }
-
- ptr += ULEB128(entries_.size(), ptr);
-
- uint64_t previous = 0;
- for (unsigned i = 0; i < entries_.size(); ++i) {
- const uint64_t delta = entries_[i] - previous;
- previous = entries_[i];
- ptr += pack_ULEB128(delta, ptr);
- }
- bytes.resize(ptr - bytes.data());
- return bytes;
-}
-
template<typename A>
compact_theta_sketch_alloc<A>
compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed,
const A& allocator) {
const auto preamble_longs = read<uint8_t>(is);
@@ -740,81 +587,31 @@ compact_theta_sketch_alloc<A>
compact_theta_sketch_alloc<A>::deserialize(std::is
template<typename A>
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const
void* bytes, size_t size, uint64_t seed, const A& allocator) {
- if (static_cast<const uint8_t*>(bytes)[1] == 4) return
deserialize_version_4(bytes, size, seed, allocator); // TODO: check size
auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed,
false);
- const uint64_t* entries = reinterpret_cast<const
uint64_t*>(data.entries_start_ptr);
- return compact_theta_sketch_alloc(data.is_empty, data.is_ordered,
data.seed_hash, data.theta,
- std::vector<uint64_t, A>(entries, entries + data.num_entries,
allocator));
-}
-
-// MLZ
-template<typename A>
-compact_theta_sketch_alloc<A>
compact_theta_sketch_alloc<A>::deserialize_version_4(const void* bytes, size_t
size,
- uint64_t seed, const A& allocator) {
- ensure_minimum_memory(size, 8);
- const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, preamble_longs);
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, serial_version);
- uint8_t type;
- ptr += copy_from_mem(ptr, type);
- uint8_t min_entry_zeros;
- ptr += copy_from_mem(ptr, min_entry_zeros);
-// uint8_t num_entries_zeros;
-// ptr += copy_from_mem(ptr, num_entries_zeros);
- ptr++; // unused
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, flags_byte);
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, seed_hash);
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
- checker<true>::check_serial_version(serial_version, 4);
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
- if (!is_empty) checker<true>::check_seed_hash(seed_hash,
compute_seed_hash(seed));
- // check counts of zeros?
-
- uint64_t theta = theta_constants::MAX_THETA;
- if (preamble_longs == 2) {
- ensure_minimum_memory(size, sizeof(uint64_t) * 2);
- ptr += copy_from_mem(ptr, theta);
- }
-
- // TODO: check size
-
-// const size_t expected_bits = (64 - min_entry_zeros) * num_entries +
num_entries_bits * (num_entries > 1);
-// const size_t expected_size = sizeof(uint64_t) * (preamble_longs +
std::ceil(expected_bits / 64.0));
-// ensure_minimum_memory(size, expected_size);
-
- // uncompressed num_entries for now
- uint32_t num_entries;
- ptr += copy_from_mem(ptr, num_entries);
-
- const size_t expected_bits = (64 - min_entry_zeros) * num_entries;
- const size_t expected_size = sizeof(uint64_t) * preamble_longs +
std::ceil(expected_bits / 8.0) + sizeof(uint32_t);
- ensure_minimum_memory(size, expected_size);
-
- const uint8_t entry_bits = 64 - min_entry_zeros;
- uint64_t previous = 0;
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
-
- unsigned i;
- for (i = 0; i + 7 < num_entries; i += 8) {
- unpack_bits_block8(&entries[i], ptr, entry_bits);
- ptr += entry_bits;
- }
- uint8_t offset = 0;
- for (; i < num_entries; ++i) {
- offset = unpack_bits(entries[i], entry_bits, ptr, offset);
- }
- // undo deltas
- for (i = 0; i < num_entries; ++i) {
- entries[i] += previous;
- previous = entries[i];
+ if (data.entry_bits == 64) { // versions 1 to 3
+ const uint64_t* entries = reinterpret_cast<const
uint64_t*>(data.entries_start_ptr);
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered,
data.seed_hash, data.theta,
+ std::vector<uint64_t, A>(entries, entries + data.num_entries,
allocator));
+ } else { // version 4
+ std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
+ const uint8_t* ptr = reinterpret_cast<const
uint8_t*>(data.entries_start_ptr);
+ unsigned i;
+ for (i = 0; i + 7 < data.num_entries; i += 8) {
+ unpack_bits_block8(&entries[i], ptr, data.entry_bits);
+ ptr += data.entry_bits;
+ }
+ uint8_t offset = 0;
+ for (; i < data.num_entries; ++i) {
+ offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
+ }
+ // undo deltas
+ uint64_t previous = 0;
+ for (i = 0; i < data.num_entries; ++i) {
+ entries[i] += previous;
+ previous = entries[i];
+ }
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered,
data.seed_hash, data.theta, std::move(entries));
}
-
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta,
std::move(entries));
}
// wrapped compact sketch
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]