This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch theta_common_reading_bytes
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 6d883c1213694b1b55967dc15ac06e7c64816f43
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Tue Dec 21 12:14:26 2021 -0800

    reuse parser in deserialize from bytes
---
 theta/include/compact_theta_sketch_parser_impl.hpp | 13 +++---
 theta/include/theta_sketch_impl.hpp                | 51 +++-------------------
 theta/test/theta_sketch_test.cpp                   | 29 +++++++++++-
 3 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/theta/include/compact_theta_sketch_parser_impl.hpp b/theta/include/compact_theta_sketch_parser_impl.hpp
index c2d786b..7b231aa 100644
--- a/theta/include/compact_theta_sketch_parser_impl.hpp
+++ b/theta/include/compact_theta_sketch_parser_impl.hpp
@@ -27,7 +27,7 @@ namespace datasketches {
 
 template<bool dummy>
 auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, 
uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
-  if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual 
" + std::to_string(size)
+  if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + 
std::to_string(size)
       + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const 
uint8_t*>(ptr), size)) : ""));
 
   uint8_t serial_version = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
@@ -43,10 +43,11 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
       checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
       const bool has_theta = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
       if (has_theta) {
-        if (size < 16) throw std::invalid_argument("at least 16 bytes 
expected, actual " + std::to_string(size));
+        if (size < 16) throw std::out_of_range("at least 16 bytes expected, 
actual " + std::to_string(size));
         theta = reinterpret_cast<const 
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
       }
       if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] 
== 1) {
+        if (size < 16) throw std::out_of_range("at least 16 bytes expected, 
actual " + std::to_string(size));
         return {false, true, seed_hash, 1, theta, reinterpret_cast<const 
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
       }
       const uint32_t num_entries = reinterpret_cast<const 
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
@@ -54,7 +55,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
       const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
entries_start_u64;
       const size_t expected_size_bytes = (entries_start_u64 + num_entries) * 
sizeof(uint64_t);
       if (size < expected_size_bytes) {
-        throw std::invalid_argument(std::to_string(expected_size_bytes) + " 
bytes expected, actual " + std::to_string(size)
+        throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes 
expected, actual " + std::to_string(size)
             + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
       }
       const bool is_ordered = reinterpret_cast<const 
uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << 
COMPACT_SKETCH_IS_ORDERED_FLAG);
@@ -72,7 +73,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
       const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
       const size_t expected_size_bytes = 
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
       if (size < expected_size_bytes) {
-        throw std::invalid_argument(std::to_string(expected_size_bytes) + " 
bytes expected, actual " + std::to_string(size)
+        throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes 
expected, actual " + std::to_string(size)
             + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
       }
       return {false, true, seed_hash, num_entries, theta, entries};
@@ -91,7 +92,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
           } else {
               const size_t expected_size_bytes = (preamble_size + num_entries) 
<< 3;
               if (size < expected_size_bytes) {
-                  throw 
std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, 
actual " + std::to_string(size)
+                  throw std::out_of_range(std::to_string(expected_size_bytes) 
+ " bytes expected, actual " + std::to_string(size)
                       + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
               }
               const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) 
+ COMPACT_SKETCH_ENTRIES_EXACT_U64;
@@ -107,7 +108,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
           const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + 
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
           const size_t expected_size_bytes = 
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
           if (size < expected_size_bytes) {
-            throw std::invalid_argument(std::to_string(expected_size_bytes) + 
" bytes expected, actual " + std::to_string(size)
+            throw std::out_of_range(std::to_string(expected_size_bytes) + " 
bytes expected, actual " + std::to_string(size)
                 + (dump_on_error ? (", sketch dump: " + 
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
           }
           return {false, true, seed_hash, num_entries, theta, entries};
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 8596216..7d49f04 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -453,7 +453,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
       const auto num_entries = read<uint32_t>(is);
       read<uint32_t>(is); //unused
       const auto theta = read<uint64_t>(is);
-      std::vector<uint64_t> entries(num_entries, 0, allocator);
+      std::vector<uint64_t, A> entries(num_entries, 0, allocator);
       bool is_empty = (num_entries == 0) && (theta == 
theta_constants::MAX_THETA);
       if (!is_empty)
           read(is, entries.data(), sizeof(uint64_t) * entries.size());
@@ -470,12 +470,12 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
       if (preamble_longs == 1) {
           if (!is.good())
               throw std::runtime_error("error reading from std::istream");
-          std::vector<uint64_t> entries(0, 0, allocator);
+          std::vector<uint64_t, A> entries(0, 0, allocator);
           return compact_theta_sketch_alloc(true, true, seed_hash, 
theta_constants::MAX_THETA, std::move(entries));
       } else if (preamble_longs == 2) {
           const uint32_t num_entries = read<uint32_t>(is);
           read<uint32_t>(is); // unused
-          std::vector<uint64_t> entries(num_entries, 0, allocator);
+          std::vector<uint64_t, A> entries(num_entries, 0, allocator);
           if (num_entries == 0) {
               return compact_theta_sketch_alloc(true, true, seed_hash, 
theta_constants::MAX_THETA, std::move(entries));
           }
@@ -488,7 +488,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
           read<uint32_t>(is); // unused
           const auto theta = read<uint64_t>(is);
           bool is_empty = (num_entries == 0) && (theta == 
theta_constants::MAX_THETA);
-          std::vector<uint64_t> entries(num_entries, 0, allocator);
+          std::vector<uint64_t, A> entries(num_entries, 0, allocator);
           if (is_empty) {
               if (!is.good())
                   throw std::runtime_error("error reading from std::istream");
@@ -514,47 +514,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
 
 template<typename A>
 compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const 
void* bytes, size_t size, uint64_t seed, const A& allocator) {
-  ensure_minimum_memory(size, 8);
-  const char* ptr = static_cast<const char*>(bytes);
-  const char* base = ptr;
-  uint8_t preamble_longs;
-  ptr += copy_from_mem(ptr, preamble_longs);
-  uint8_t serial_version;
-  ptr += copy_from_mem(ptr, serial_version);
-  uint8_t type;
-  ptr += copy_from_mem(ptr, type);
-  ptr += sizeof(uint16_t); // unused
-  uint8_t flags_byte;
-  ptr += copy_from_mem(ptr, flags_byte);
-  uint16_t seed_hash;
-  ptr += copy_from_mem(ptr, seed_hash);
-  checker<true>::check_sketch_type(type, SKETCH_TYPE);
-  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
-  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
-  if (!is_empty) checker<true>::check_seed_hash(seed_hash, 
compute_seed_hash(seed));
-
-  uint64_t theta = theta_constants::MAX_THETA;
-  uint32_t num_entries = 0;
-  if (!is_empty) {
-    if (preamble_longs == 1) {
-      num_entries = 1;
-    } else {
-      ensure_minimum_memory(size, 8); // read the first prelong before this 
method
-      ptr += copy_from_mem(ptr, num_entries);
-      ptr += sizeof(uint32_t); // unused
-      if (preamble_longs > 2) {
-        ensure_minimum_memory(size, (preamble_longs - 1) << 3);
-        ptr += copy_from_mem(ptr, theta);
-      }
-    }
-  }
-  const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
-  check_memory_size(ptr - base + entries_size_bytes, size);
-  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
-  if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
-
-  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
-  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, 
std::move(entries));
+  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, 
false);
+  return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, 
data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries 
+ data.num_entries, allocator));
 }
 
 // wrapped compact sketch
diff --git a/theta/test/theta_sketch_test.cpp b/theta/test/theta_sketch_test.cpp
index 091a9c7..00505e0 100644
--- a/theta/test/theta_sketch_test.cpp
+++ b/theta/test/theta_sketch_test.cpp
@@ -394,7 +394,13 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
   }
 }
 
-TEST_CASE("theta sketch: deserialize compact single item buffer overrun", 
"[theta_sketch]") {
+TEST_CASE("theta sketch: deserialize empty buffer overrun", "[theta_sketch]") {
+  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+  auto bytes = update_sketch.compact().serialize();
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size() - 1), std::out_of_range);
+}
+
+TEST_CASE("theta sketch: deserialize single item buffer overrun", 
"[theta_sketch]") {
   update_theta_sketch update_sketch = update_theta_sketch::builder().build();
   update_sketch.update(1);
   auto bytes = update_sketch.compact().serialize();
@@ -402,6 +408,27 @@ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[thet
   REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size() - 1), std::out_of_range);
 }
 
+TEST_CASE("theta sketch: deserialize exact mode buffer overrun", 
"[theta_sketch]") {
+  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+  for (int i = 0; i < 1000; ++i) update_sketch.update(i);
+  auto bytes = update_sketch.compact().serialize();
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 8), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 16), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size() - 1), std::out_of_range);
+}
+
+TEST_CASE("theta sketch: deserialize estimation mode buffer overrun", 
"[theta_sketch]") {
+  update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+  for (int i = 0; i < 10000; ++i) update_sketch.update(i);
+  auto bytes = update_sketch.compact().serialize();
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 8), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 16), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 24), 
std::out_of_range);
+  REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size() - 1), std::out_of_range);
+}
+
 TEST_CASE("theta sketch: conversion constructor and wrapped compact", 
"[theta_sketch]") {
   update_theta_sketch update_sketch = update_theta_sketch::builder().build();
   const int n = 8192;

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to