AlexanderSaydakov commented on a change in pull request #253:
URL: https://github.com/apache/datasketches-cpp/pull/253#discussion_r765102061
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -214,6 +244,78 @@ TEST_CASE("theta sketch: deserialize compact estimation
from java", "[theta_sket
}
}
+TEST_CASE("theta sketch: deserialize compact v1 estimation from java",
"[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_estimation_from_java_v1.sk",
std::ios::binary);
+ auto sketch = compact_theta_sketch::deserialize(is);
+ REQUIRE_FALSE(sketch.is_empty());
+ REQUIRE(sketch.is_estimation_mode());
+// REQUIRE(sketch.is_ordered()); // v1 sketches may not be ordered
+ REQUIRE(sketch.get_num_retained() == 4342);
+ REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
+ REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(7996.956955317471).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(8339.090301078124).margin(1e-10));
+
+ // the same construction process in Java must have produced exactly the same
sketch
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+ const int n = 8192;
+ for (int i = 0; i < n; i++) update_sketch.update(i);
+ REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
+ REQUIRE(sketch.get_theta() ==
Approx(update_sketch.get_theta()).margin(1e-10));
+ REQUIRE(sketch.get_estimate() ==
Approx(update_sketch.get_estimate()).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(1) ==
Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(1) ==
Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(3) ==
Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(3) ==
Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
+ compact_theta_sketch compact_sketch = update_sketch.compact();
+ // the sketches are ordered, so the iteration sequence must match exactly
+ auto iter = sketch.begin();
+ for (const auto& key: compact_sketch) {
+ REQUIRE(*iter == key);
+ ++iter;
+ }
+}
+
+TEST_CASE("theta sketch: deserialize compact v2 estimation from java",
"[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_estimation_from_java_v2.sk",
std::ios::binary);
+ auto sketch = compact_theta_sketch::deserialize(is);
+ REQUIRE_FALSE(sketch.is_empty());
+ REQUIRE(sketch.is_estimation_mode());
+// REQUIRE(sketch.is_ordered()); // v1 sketches may not be ordered
+ REQUIRE(sketch.get_num_retained() == 4342);
+ REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
+ REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(7996.956955317471).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(8339.090301078124).margin(1e-10));
+
+ // the same construction process in Java must have produced exactly the same
sketch
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+ const int n = 8192;
+ for (int i = 0; i < n; i++) update_sketch.update(i);
+ REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
+ REQUIRE(sketch.get_theta() ==
Approx(update_sketch.get_theta()).margin(1e-10));
+ REQUIRE(sketch.get_estimate() ==
Approx(update_sketch.get_estimate()).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(1) ==
Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(1) ==
Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(3) ==
Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(3) ==
Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
+ compact_theta_sketch compact_sketch = update_sketch.compact();
+ // the sketches are ordered, so the iteration sequence must match exactly
Review comment:
same here. ordering check was disabled above, but this loop relies on
the same order
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -296,4 +398,234 @@ TEST_CASE("theta sketch: conversion constructor and
wrapped compact", "[theta_sk
REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(),
bytes.size(), 0), std::invalid_argument);
}
+TEST_CASE("theta sketch: wrap compact empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary |
std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap compact v1 empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java_v1.sk", std::ios::binary
| std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap compact v2 empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java_v2.sk", std::ios::binary
| std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap single item from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_single_item_from_java.sk",
std::ios::binary | std::ios::ate);
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
Review comment:
and here, and more. I am not going to mark every instance
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -296,4 +398,234 @@ TEST_CASE("theta sketch: conversion constructor and
wrapped compact", "[theta_sk
REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(),
bytes.size(), 0), std::invalid_argument);
}
+TEST_CASE("theta sketch: wrap compact empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary |
std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap compact v1 empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java_v1.sk", std::ios::binary
| std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
Review comment:
same here. I don't think this works as intended
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -296,4 +398,234 @@ TEST_CASE("theta sketch: conversion constructor and
wrapped compact", "[theta_sk
REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(),
bytes.size(), 0), std::invalid_argument);
}
+TEST_CASE("theta sketch: wrap compact empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary |
std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap compact v1 empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java_v1.sk", std::ios::binary
| std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
+ buf.reserve(size);
+ buf.assign(size, 0);
+ is.seekg(0, std::ios_base::beg);
+ is.read((char*)(buf.data()), buf.size());
+ }
+
+ auto sketch = wrapped_compact_theta_sketch::wrap(buf.data(), buf.size());
+ REQUIRE(sketch.is_empty());
+ REQUIRE_FALSE(sketch.is_estimation_mode());
+ REQUIRE(sketch.get_num_retained() == 0);
+ REQUIRE(sketch.get_theta() == 1.0);
+ REQUIRE(sketch.get_estimate() == 0.0);
+ REQUIRE(sketch.get_lower_bound(1) == 0.0);
+ REQUIRE(sketch.get_upper_bound(1) == 0.0);
+}
+
+TEST_CASE("theta sketch: wrap compact v2 empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java_v2.sk", std::ios::binary
| std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
Review comment:
and here
##########
File path: theta/include/compact_theta_sketch_parser_impl.hpp
##########
@@ -29,32 +29,97 @@ template<bool dummy>
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size,
uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual
" + std::to_string(size)
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const
uint8_t*>(ptr), size)) : ""));
- checker<true>::check_serial_version(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE],
COMPACT_SKETCH_SERIAL_VERSION);
- checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
- uint64_t theta = theta_constants::MAX_THETA;
- const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 <<
COMPACT_SKETCH_IS_EMPTY_FLAG)) {
- return {true, true, seed_hash, 0, theta, nullptr};
+
+ uint8_t serial_version = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
+
+ switch(serial_version) {
+ case COMPACT_SKETCH_SERIAL_VERSION: {
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ uint64_t theta = theta_constants::MAX_THETA;
+ const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] &
(1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ const bool has_theta = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
+ if (has_theta) {
+ if (size < 16) throw std::invalid_argument("at least 16 bytes
expected, actual " + std::to_string(size));
+ theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ }
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE]
== 1) {
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+ }
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ const size_t entries_start_u64 = has_theta ?
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
entries_start_u64;
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) *
sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + "
bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ const bool is_ordered = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 <<
COMPACT_SKETCH_IS_ORDERED_FLAG);
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
}
- const bool has_theta = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
- if (has_theta) {
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected,
actual " + std::to_string(size));
- theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ case 1: {
+ uint16_t seed_hash = compute_seed_hash(seed);
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ uint64_t theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ if (is_empty) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+ const size_t expected_size_bytes =
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + "
bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ return {false, false, seed_hash, num_entries, theta, entries};
}
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] ==
1) {
- return {false, true, seed_hash, 1, theta, reinterpret_cast<const
uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
+ case 2: {
+ uint8_t preamble_size = reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
+ checker<true>::check_sketch_type(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
+ const uint16_t seed_hash = reinterpret_cast<const
uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ if (preamble_size == 1) {
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA,
nullptr};
+ } else if (preamble_size == 2) {
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ if (num_entries == 0) {
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA,
nullptr};
+ } else {
+ const size_t expected_size_bytes = (preamble_size + num_entries)
<< 3;
+ if (size < expected_size_bytes) {
+ throw
std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected,
actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr)
+ COMPACT_SKETCH_ENTRIES_EXACT_U64;
+ return {false, true, seed_hash, num_entries,
theta_constants::MAX_THETA, entries};
+ }
+ } else if (preamble_size == 3) {
+ const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
+ uint64_t theta = reinterpret_cast<const
uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ if (is_empty) {
+ return {true, true, seed_hash, 0, theta, nullptr};
+ }
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
+ const size_t expected_size_bytes =
(COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
+ if (size < expected_size_bytes) {
+ throw std::invalid_argument(std::to_string(expected_size_bytes) +
" bytes expected, actual " + std::to_string(size)
+ + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ }
+ return {false, false, seed_hash, num_entries, theta, entries};
+ } else {
+ throw std::invalid_argument(std::to_string(preamble_size) + " longs
of premable, but expected 1, 2, or 3");
+ }
}
- const uint32_t num_entries = reinterpret_cast<const
uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
- const size_t entries_start_u64 = has_theta ?
COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
- const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) +
entries_start_u64;
- const size_t expected_size_bytes = (entries_start_u64 + num_entries) *
sizeof(uint64_t);
- if (size < expected_size_bytes) {
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes
expected, actual " + std::to_string(size)
- + (dump_on_error ? (", sketch dump: " +
hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
+ default:
+ // this should always fail since the valid cases are handled above
+ checker<true>::check_serial_version(reinterpret_cast<const
uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE],
COMPACT_SKETCH_SERIAL_VERSION);
Review comment:
why check here? shouldn't we just throw?
##########
File path: theta/include/theta_sketch_impl.hpp
##########
@@ -413,33 +413,99 @@ compact_theta_sketch_alloc<A>
compact_theta_sketch_alloc<A>::deserialize(std::is
const auto preamble_longs = read<uint8_t>(is);
const auto serial_version = read<uint8_t>(is);
const auto type = read<uint8_t>(is);
- read<uint16_t>(is); // unused
- const auto flags_byte = read<uint8_t>(is);
- const auto seed_hash = read<uint16_t>(is);
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
- if (!is_empty) checker<true>::check_seed_hash(seed_hash,
compute_seed_hash(seed));
+ switch (serial_version) {
+ case SERIAL_VERSION: {
+ read<uint16_t>(is); // unused
+ const auto flags_byte = read<uint8_t>(is);
+ const auto seed_hash = read<uint16_t>(is);
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash,
compute_seed_hash(seed));
+
+ uint64_t theta = theta_constants::MAX_THETA;
+ uint32_t num_entries = 0;
+ if (!is_empty) {
+ if (preamble_longs == 1) {
+ num_entries = 1;
+ } else {
+ num_entries = read<uint32_t>(is);
+ read<uint32_t>(is); // unused
+ if (preamble_longs > 2) {
+ theta = read<uint64_t>(is);
+ }
+ }
+ }
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) *
entries.size());
- uint64_t theta = theta_constants::MAX_THETA;
- uint32_t num_entries = 0;
- if (!is_empty) {
- if (preamble_longs == 1) {
- num_entries = 1;
- } else {
- num_entries = read<uint32_t>(is);
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+ if (!is.good()) throw std::runtime_error("error reading from
std::istream");
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash,
theta, std::move(entries));
+ }
+ case 1: {
+ const auto seed_hash = compute_seed_hash(seed);
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ read<uint8_t>(is); // unused
read<uint32_t>(is); // unused
- if (preamble_longs > 2) {
- theta = read<uint64_t>(is);
+ const auto num_entries = read<uint32_t>(is);
+ read<uint32_t>(is); //unused
+ const auto theta = read<uint64_t>(is);
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ if (!is_empty)
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(is_empty, false, seed_hash, theta,
std::move(entries));
+ }
+ case 2: {
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ read<uint8_t>(is); // unused
+ read<uint16_t>(is); // unused
+ const uint16_t seed_hash = read<uint16_t>(is);
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+ if (preamble_longs == 1) {
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
+ std::vector<uint64_t> entries(0, 0, allocator);
+ return compact_theta_sketch_alloc(true, true, seed_hash,
theta_constants::MAX_THETA, std::move(entries));
+ } else if (preamble_longs == 2) {
+ const uint32_t num_entries = read<uint32_t>(is);
+ read<uint32_t>(is); // unused
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
+ if (num_entries == 0) {
+ return compact_theta_sketch_alloc(true, true, seed_hash,
theta_constants::MAX_THETA, std::move(entries));
+ }
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(false, true, seed_hash,
theta_constants::MAX_THETA, std::move(entries));
+ } else if (preamble_longs == 3) {
+ const uint32_t num_entries = read<uint32_t>(is);
+ read<uint32_t>(is); // unused
+ const auto theta = read<uint64_t>(is);
+ bool is_empty = (num_entries == 0) && (theta ==
theta_constants::MAX_THETA);
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
+ if (is_empty) {
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta,
std::move(entries));
+ } else {
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(false, false, seed_hash,
theta, std::move(entries));
+ }
+ } else {
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs
of premable, but expected 1, 2, or 3");
}
- }
}
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
-
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta,
std::move(entries));
+ default:
+ // this should always fail since the valid cases are handled above
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
Review comment:
why check? shouldn't we just throw here?
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -214,6 +244,78 @@ TEST_CASE("theta sketch: deserialize compact estimation
from java", "[theta_sket
}
}
+TEST_CASE("theta sketch: deserialize compact v1 estimation from java",
"[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_estimation_from_java_v1.sk",
std::ios::binary);
+ auto sketch = compact_theta_sketch::deserialize(is);
+ REQUIRE_FALSE(sketch.is_empty());
+ REQUIRE(sketch.is_estimation_mode());
+// REQUIRE(sketch.is_ordered()); // v1 sketches may not be ordered
+ REQUIRE(sketch.get_num_retained() == 4342);
+ REQUIRE(sketch.get_theta() == Approx(0.531700444213199).margin(1e-10));
+ REQUIRE(sketch.get_estimate() == Approx(8166.25234614053).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(7996.956955317471).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(8339.090301078124).margin(1e-10));
+
+ // the same construction process in Java must have produced exactly the same
sketch
+ update_theta_sketch update_sketch = update_theta_sketch::builder().build();
+ const int n = 8192;
+ for (int i = 0; i < n; i++) update_sketch.update(i);
+ REQUIRE(sketch.get_num_retained() == update_sketch.get_num_retained());
+ REQUIRE(sketch.get_theta() ==
Approx(update_sketch.get_theta()).margin(1e-10));
+ REQUIRE(sketch.get_estimate() ==
Approx(update_sketch.get_estimate()).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(1) ==
Approx(update_sketch.get_lower_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(1) ==
Approx(update_sketch.get_upper_bound(1)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(2) ==
Approx(update_sketch.get_lower_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(2) ==
Approx(update_sketch.get_upper_bound(2)).margin(1e-10));
+ REQUIRE(sketch.get_lower_bound(3) ==
Approx(update_sketch.get_lower_bound(3)).margin(1e-10));
+ REQUIRE(sketch.get_upper_bound(3) ==
Approx(update_sketch.get_upper_bound(3)).margin(1e-10));
+ compact_theta_sketch compact_sketch = update_sketch.compact();
+ // the sketches are ordered, so the iteration sequence must match exactly
Review comment:
this iteration relies on ordering, but there is a note above that v1
sketch may not be ordered
##########
File path: theta/test/theta_sketch_test.cpp
##########
@@ -296,4 +398,234 @@ TEST_CASE("theta sketch: conversion constructor and
wrapped compact", "[theta_sk
REQUIRE_THROWS_AS(wrapped_compact_theta_sketch::wrap(bytes.data(),
bytes.size(), 0), std::invalid_argument);
}
+TEST_CASE("theta sketch: wrap compact empty from java", "[theta_sketch]") {
+ std::ifstream is;
+ is.exceptions(std::ios::failbit | std::ios::badbit);
+ is.open(inputPath + "theta_compact_empty_from_java.sk", std::ios::binary |
std::ios::ate);
+
+ std::vector<uint8_t> buf;
+ if(is) {
+ auto size = is.tellg();
Review comment:
I would think that is.seekg(0, is.end) is needed first to obtain the
file size with is.tellg()
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]