This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch theta_common_reading_bytes
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 6d883c1213694b1b55967dc15ac06e7c64816f43 Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Dec 21 12:14:26 2021 -0800 reuse parser in deserialize from bytes --- theta/include/compact_theta_sketch_parser_impl.hpp | 13 +++--- theta/include/theta_sketch_impl.hpp | 51 +++------------------- theta/test/theta_sketch_test.cpp | 29 +++++++++++- 3 files changed, 41 insertions(+), 52 deletions(-) diff --git a/theta/include/compact_theta_sketch_parser_impl.hpp b/theta/include/compact_theta_sketch_parser_impl.hpp index c2d786b..7b231aa 100644 --- a/theta/include/compact_theta_sketch_parser_impl.hpp +++ b/theta/include/compact_theta_sketch_parser_impl.hpp @@ -27,7 +27,7 @@ namespace datasketches { template<bool dummy> auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data { - if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size) + if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size) + (dump_on_error ? 
(", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : "")); uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE]; @@ -43,10 +43,11 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed)); const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2; if (has_theta) { - if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size)); + if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size)); theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64]; } if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) { + if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size)); return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64}; } const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32]; @@ -54,7 +55,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64; const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t); if (size < expected_size_bytes) { - throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + (dump_on_error ? 
(", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : "")); } const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG); @@ -72,7 +73,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64; const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t); if (size < expected_size_bytes) { - throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : "")); } return {false, true, seed_hash, num_entries, theta, entries}; @@ -91,7 +92,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin } else { const size_t expected_size_bytes = (preamble_size + num_entries) << 3; if (size < expected_size_bytes) { - throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + (dump_on_error ? 
(", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : "")); } const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64; @@ -107,7 +108,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64; const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t); if (size < expected_size_bytes) { - throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size) + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : "")); } return {false, true, seed_hash, num_entries, theta, entries}; diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp index 8596216..7d49f04 100644 --- a/theta/include/theta_sketch_impl.hpp +++ b/theta/include/theta_sketch_impl.hpp @@ -453,7 +453,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is const auto num_entries = read<uint32_t>(is); read<uint32_t>(is); //unused const auto theta = read<uint64_t>(is); - std::vector<uint64_t> entries(num_entries, 0, allocator); + std::vector<uint64_t, A> entries(num_entries, 0, allocator); bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA); if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size()); @@ -470,12 +470,12 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is if (preamble_longs == 1) { if (!is.good()) throw std::runtime_error("error reading from std::istream"); - std::vector<uint64_t> entries(0, 0, allocator); + std::vector<uint64_t, A> entries(0, 0, allocator); return compact_theta_sketch_alloc(true, true, seed_hash, 
theta_constants::MAX_THETA, std::move(entries)); } else if (preamble_longs == 2) { const uint32_t num_entries = read<uint32_t>(is); read<uint32_t>(is); // unused - std::vector<uint64_t> entries(num_entries, 0, allocator); + std::vector<uint64_t, A> entries(num_entries, 0, allocator); if (num_entries == 0) { return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries)); } @@ -488,7 +488,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is read<uint32_t>(is); // unused const auto theta = read<uint64_t>(is); bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA); - std::vector<uint64_t> entries(num_entries, 0, allocator); + std::vector<uint64_t, A> entries(num_entries, 0, allocator); if (is_empty) { if (!is.good()) throw std::runtime_error("error reading from std::istream"); @@ -514,47 +514,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is template<typename A> compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) { - ensure_minimum_memory(size, 8); - const char* ptr = static_cast<const char*>(bytes); - const char* base = ptr; - uint8_t preamble_longs; - ptr += copy_from_mem(ptr, preamble_longs); - uint8_t serial_version; - ptr += copy_from_mem(ptr, serial_version); - uint8_t type; - ptr += copy_from_mem(ptr, type); - ptr += sizeof(uint16_t); // unused - uint8_t flags_byte; - ptr += copy_from_mem(ptr, flags_byte); - uint16_t seed_hash; - ptr += copy_from_mem(ptr, seed_hash); - checker<true>::check_sketch_type(type, SKETCH_TYPE); - checker<true>::check_serial_version(serial_version, SERIAL_VERSION); - const bool is_empty = flags_byte & (1 << flags::IS_EMPTY); - if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed)); - - uint64_t theta = theta_constants::MAX_THETA; - uint32_t num_entries = 0; - if (!is_empty) { - if 
(preamble_longs == 1) { - num_entries = 1; - } else { - ensure_minimum_memory(size, 8); // read the first prelong before this method - ptr += copy_from_mem(ptr, num_entries); - ptr += sizeof(uint32_t); // unused - if (preamble_longs > 2) { - ensure_minimum_memory(size, (preamble_longs - 1) << 3); - ptr += copy_from_mem(ptr, theta); - } - } - } - const size_t entries_size_bytes = sizeof(uint64_t) * num_entries; - check_memory_size(ptr - base + entries_size_bytes, size); - std::vector<uint64_t, A> entries(num_entries, 0, allocator); - if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes); - - const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED); - return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries)); + auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false); + return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator)); } // wrapped compact sketch diff --git a/theta/test/theta_sketch_test.cpp b/theta/test/theta_sketch_test.cpp index 091a9c7..00505e0 100644 --- a/theta/test/theta_sketch_test.cpp +++ b/theta/test/theta_sketch_test.cpp @@ -394,7 +394,13 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[ } } -TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[theta_sketch]") { +TEST_CASE("theta sketch: deserialize empty buffer overrun", "[theta_sketch]") { + update_theta_sketch update_sketch = update_theta_sketch::builder().build(); + auto bytes = update_sketch.compact().serialize(); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); +} + +TEST_CASE("theta sketch: deserialize single item buffer overrun", "[theta_sketch]") { update_theta_sketch update_sketch = update_theta_sketch::builder().build(); update_sketch.update(1); auto bytes = 
update_sketch.compact().serialize(); @@ -402,6 +408,27 @@ TEST_CASE("theta sketch: deserialize compact single item buffer overrun", "[thet REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); } +TEST_CASE("theta sketch: deserialize exact mode buffer overrun", "[theta_sketch]") { + update_theta_sketch update_sketch = update_theta_sketch::builder().build(); + for (int i = 0; i < 1000; ++i) update_sketch.update(i); + auto bytes = update_sketch.compact().serialize(); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 8), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 16), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); +} + +TEST_CASE("theta sketch: deserialize estimation mode buffer overrun", "[theta_sketch]") { + update_theta_sketch update_sketch = update_theta_sketch::builder().build(); + for (int i = 0; i < 10000; ++i) update_sketch.update(i); + auto bytes = update_sketch.compact().serialize(); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 7), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 8), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 16), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), 24), std::out_of_range); + REQUIRE_THROWS_AS(compact_theta_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range); +} + TEST_CASE("theta sketch: conversion constructor and wrapped compact", "[theta_sketch]") { update_theta_sketch update_sketch = update_theta_sketch::builder().build(); const int n = 8192; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, 
e-mail: [email protected]
