AlexanderSaydakov commented on a change in pull request #253:
URL: https://github.com/apache/datasketches-cpp/pull/253#discussion_r765236700
##########
File path: theta/include/compact_theta_sketch_parser_impl.hpp
##########
@@ -29,32 +29,97 @@ template<bool dummy>
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size,
uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual
" + std::to_string(size)
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const
uint8_t*>(ptr), size)) : ""));
- checker<true>::check_serial_version(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE],
COMPACT_SKETCH_SERIAL_VERSION);
- checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
- uint64_t theta = theta_constants::MAX_THETA;
- const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 <<
COMPACT_SKETCH_IS_EMPTY_FLAG)) {
- return {true, true, seed_hash, 0, theta, nullptr};
+
+ uint8_t serial_version = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
+
+ switch(serial_version) {
+ case COMPACT_SKETCH_SERIAL_VERSION: {
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ uint64_t theta = theta_constants::MAX_THETA;
+ const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] &
(1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ const bool has_theta = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
+ if (has_theta) {
+ if (size < 16) throw std::invalid_argument("at least 16 bytes
expected, actual " + std::to_string(size));
+ theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ }
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE]
== 1) {
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+ }
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ const size_t entries_start_u64 = has_theta ?
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
entries_start_u64;
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) *
sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + "
bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ const bool is_ordered = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 <<
COMPACT_SKETCH_IS_ORDERED_FLAG);
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
}
- const bool has_theta = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
- if (has_theta) {
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected,
actual " + std::to_string(size));
- theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ case 1: {
+ uint16_t seed_hash = compute_seed_hash(seed);
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ uint64_t theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ if (is_empty) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+ const size_t expected_size_bytes =
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + "
bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ return {false, false, seed_hash, num_entries, theta, entries};
}
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] ==
1) {
- return {false, true, seed_hash, 1, theta, reinterpret_cast<const
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+ case 2: {
+ uint8_t preamble_size = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ if (preamble_size == 1) {
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA,
nullptr};
+ } else if (preamble_size == 2) {
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ if (num_entries == 0) {
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA,
nullptr};
+ } else {
+ const size_t expected_size_bytes = (preamble_size + num_entries)
<< 3;
+ if (size < expected_size_bytes) {
+ throw
std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected,
actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr)
+ COMPACT_SKETCH_ENTRIES_EXACT_U64;
+ return {false, true, seed_hash, num_entries,
theta_constants::MAX_THETA, entries};
+ }
+ } else if (preamble_size == 3) {
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ uint64_t theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ if (is_empty) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+ const size_t expected_size_bytes =
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) +
" bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ return {false, false, seed_hash, num_entries, theta, entries};
+ } else {
+ throw std::invalid_argument(std::to_string(preamble_size) + " longs
of premable, but expected 1, 2, or 3");
+ }
}
- const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
- const size_t entries_start_u64 = has_theta ?
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
- const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
entries_start_u64;
- const size_t expected_size_bytes = (entries_start_u64 + num_entries) *
sizeof(uint64_t);
- if (size < expected_size_bytes) {
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes
expected, actual " + std::to_string(size)
- + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ default:
+ // this should always fail since the valid cases are handled above
+ checker<true>::check_serial_version(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE],
COMPACT_SKETCH_SERIAL_VERSION);
Review comment:
I don't quite like this, but not a problem. Perhaps can be addressed
later. This also reminds me that I wanted to call this parser from the regular
deserialization to avoid code duplication, but never got around to it.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]