AlexanderSaydakov commented on a change in pull request #253:
URL: https://github.com/apache/datasketches-cpp/pull/253#discussion_r765236700



##########
File path: theta/include/compact_theta_sketch_parser_impl.hpp
##########
@@ -29,32 +29,97 @@ template<bool dummy>
 auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, 
uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
   if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual 
" + std::to_string(size)
       + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const 
uint8_t*>(ptr), size)) : ""));
-  checker<true>::check_serial_version(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], 
COMPACT_SKETCH_SERIAL_VERSION);
-  checker<true>::check_sketch_type(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
-  uint64_t theta = theta_constants::MAX_THETA;
-  const uint16_t seed_hash = reinterpret_cast<const 
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
-  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
-  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << 
COMPACT_SKETCH_IS_EMPTY_FLAG)) {
-    return {true, true, seed_hash, 0, theta, nullptr};
+
+  uint8_t serial_version = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
+
+  switch(serial_version) {
+  case COMPACT_SKETCH_SERIAL_VERSION: {
+      checker<true>::check_sketch_type(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+      uint64_t theta = theta_constants::MAX_THETA;
+      const uint16_t seed_hash = reinterpret_cast<const 
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+      if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & 
(1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
+        return {true, true, seed_hash, 0, theta, nullptr};
+      }
+      checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+      const bool has_theta = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
+      if (has_theta) {
+        if (size < 16) throw std::invalid_argument("at least 16 bytes 
expected, actual " + std::to_string(size));
+        theta = reinterpret_cast<const 
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+      }
+      if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] 
== 1) {
+        return {false, true, seed_hash, 1, theta, reinterpret_cast<const 
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+      }
+      const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+      const size_t entries_start_u64 = has_theta ? 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
+      const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
entries_start_u64;
+      const size_t expected_size_bytes = (entries_start_u64 + num_entries) * 
sizeof(uint64_t);
+      if (size < expected_size_bytes) {
+        throw std::invalid_argument(std::to_string(expected_size_bytes) + " 
bytes expected, actual " + std::to_string(size)
+            + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+      }
+      const bool is_ordered = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << 
COMPACT_SKETCH_IS_ORDERED_FLAG);
+      return {false, is_ordered, seed_hash, num_entries, theta, entries};
   }
-  const bool has_theta = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
-  if (has_theta) {
-    if (size < 16) throw std::invalid_argument("at least 16 bytes expected, 
actual " + std::to_string(size));
-    theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+  case 1:  {
+      uint16_t seed_hash = compute_seed_hash(seed);
+      checker<true>::check_sketch_type(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+      const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+      uint64_t theta = reinterpret_cast<const 
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+      bool is_empty = (num_entries == 0) && (theta == 
theta_constants::MAX_THETA);
+      if (is_empty) {
+          return {true, true, seed_hash, 0, theta, nullptr};
+      }
+      const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+      const size_t expected_size_bytes = 
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+      if (size < expected_size_bytes) {
+        throw std::invalid_argument(std::to_string(expected_size_bytes) + " 
bytes expected, actual " + std::to_string(size)
+            + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+      }
+      return {false, false, seed_hash, num_entries, theta, entries};
   }
-  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 
1) {
-    return {false, true, seed_hash, 1, theta, reinterpret_cast<const 
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+  case 2:  {
+      uint8_t preamble_size =  reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
+      checker<true>::check_sketch_type(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+      const uint16_t seed_hash = reinterpret_cast<const 
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+      checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+      if (preamble_size == 1) {
+          return {true, true, seed_hash, 0, theta_constants::MAX_THETA, 
nullptr};
+      } else if (preamble_size == 2) {
+          const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+          if (num_entries == 0) {
+              return {true, true, seed_hash, 0, theta_constants::MAX_THETA, 
nullptr};
+          } else {
+              const size_t expected_size_bytes = (preamble_size + num_entries) 
<< 3;
+              if (size < expected_size_bytes) {
+                  throw 
std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, 
actual " + std::to_string(size)
+                      + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+              }
+              const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) 
+ COMPACT_SKETCH_ENTRIES_EXACT_U64;
+              return {false, true, seed_hash, num_entries, 
theta_constants::MAX_THETA, entries};
+          }
+      } else if (preamble_size == 3) {
+          const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+          uint64_t theta = reinterpret_cast<const 
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+          bool is_empty = (num_entries == 0) && (theta == 
theta_constants::MAX_THETA);
+          if (is_empty) {
+              return {true, true, seed_hash, 0, theta, nullptr};
+          }
+          const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+          const size_t expected_size_bytes = 
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+          if (size < expected_size_bytes) {
+            throw std::invalid_argument(std::to_string(expected_size_bytes) + 
" bytes expected, actual " + std::to_string(size)
+                + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+          }
+          return {false, false, seed_hash, num_entries, theta, entries};
+      } else {
+          throw std::invalid_argument(std::to_string(preamble_size) + " longs 
of premable, but expected 1, 2, or 3");
+      }
   }
-  const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
-  const size_t entries_start_u64 = has_theta ? 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
-  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
entries_start_u64;
-  const size_t expected_size_bytes = (entries_start_u64 + num_entries) * 
sizeof(uint64_t);
-  if (size < expected_size_bytes) {
-    throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes 
expected, actual " + std::to_string(size)
-        + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+  default:
+      // this should always fail since the valid cases are handled above
+      checker<true>::check_serial_version(reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], 
COMPACT_SKETCH_SERIAL_VERSION);

Review comment:
       I don't quite like this, but not a problem. Perhaps can be addressed 
later. This also reminds me that I wanted to call this parser from the regular 
deserialization to avoid code duplication, but never got around to it.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to