jmalkin commented on code in PR #438: URL: https://github.com/apache/datasketches-cpp/pull/438#discussion_r1718031035
########## filters/include/bloom_filter_impl.hpp: ########## @@ -0,0 +1,907 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BLOOM_FILTER_IMPL_HPP_ +#define _BLOOM_FILTER_IMPL_HPP_ + +#include <memory> +#include <sstream> +#include <vector> + +#include "common_defs.hpp" +#include "bit_array_ops.hpp" +#include "memory_operations.hpp" +#include "xxhash64.h" + +// memory scenarios: +// * on-heap: owned, bit_array_ set, memory_ null +// * direct: not owned, bit_array_ set, memory_ set +// * read-only an option for direct + +namespace datasketches { + +template<typename A> +bloom_filter_alloc<A>::bloom_filter_alloc(const uint64_t num_bits, const uint16_t num_hashes, uint64_t seed, const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + is_dirty_(false), + is_owned_(true), + is_read_only_(false), + capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks + num_bits_set_(0) +{ + if (num_hashes == 0) { + throw std::invalid_argument("Must have at least 1 hash function"); + } + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits > MAX_FILTER_SIZE_BITS) { + 
throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); + } + + const uint64_t num_bytes = capacity_bits_ >> 3; + bit_array_ = AllocUint8(allocator_).allocate(num_bytes); + std::fill_n(bit_array_, num_bytes, 0); + if (bit_array_ == nullptr) { + throw std::bad_alloc(); + } + memory_ = nullptr; +} + +template<typename A> +bloom_filter_alloc<A>::bloom_filter_alloc(uint8_t* memory, + size_t length_bytes, + const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed, + const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + is_dirty_(false), + is_owned_(false), + is_read_only_(false), + capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks + num_bits_set_(0) +{ + if (num_hashes == 0) { + throw std::invalid_argument("Must have at least 1 hash function"); + } + if (num_bits == 0) { + throw std::invalid_argument("Number of bits must be greater than zero"); + } else if (num_bits > MAX_FILTER_SIZE_BITS) { + throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits"); + } + + const size_t num_bytes = get_serialized_size_bytes(capacity_bits_); + if (length_bytes < num_bytes) { + throw std::invalid_argument("Input memory block is too small"); + } + + // fill in header info + uint8_t* ptr = memory; + const uint8_t preamble_longs = PREAMBLE_LONGS_STANDARD; // no resizing so assume non-empty + ptr += copy_to_mem(preamble_longs, ptr); + const uint8_t serial_version = SER_VER; + ptr += copy_to_mem(serial_version, ptr); + const uint8_t family = FAMILY_ID; + ptr += copy_to_mem(family, ptr); + const uint8_t flags_byte = 0; // again, assuming non-empty + ptr += copy_to_mem(flags_byte, ptr); + + ptr += copy_to_mem(num_hashes_, ptr); + ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused + ptr += copy_to_mem(seed_, ptr); + ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), 
ptr); // sized in java longs + ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused + + // rest of memory is num bits and bit array, so start with zeroes + std::fill_n(ptr, sizeof(uint64_t) * ((capacity_bits_ >> 6) + 1), 0); + bit_array_ = memory + BIT_ARRAY_OFFSET_BYTES; + memory_ = memory; +} + +template<typename A> +bloom_filter_alloc<A>::bloom_filter_alloc(const uint64_t seed, + const uint16_t num_hashes, + const bool is_dirty, + const bool is_owned, + const bool is_read_only, + const uint64_t capacity_bits, + const uint64_t num_bits_set, + uint8_t* bit_array, + uint8_t* memory, + const A& allocator) : + allocator_(allocator), + seed_(seed), + num_hashes_(num_hashes), + is_dirty_(is_dirty), + is_owned_(is_owned), + is_read_only_(is_read_only), + capacity_bits_((capacity_bits + 63) & ~0x3F), + num_bits_set_(num_bits_set), + bit_array_(bit_array), + memory_(memory) +{ + // private constructor + // no consistency checks since we should have done those prior to calling this + if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) { + num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3); + } +} + +template<typename A> +bloom_filter_alloc<A>::bloom_filter_alloc(const bloom_filter_alloc& other) : + allocator_(other.allocator_), + seed_(other.seed_), + num_hashes_(other.num_hashes_), + is_dirty_(other.is_dirty_), + is_owned_(other.is_owned_), + is_read_only_(other.is_read_only_), + capacity_bits_(other.capacity_bits_), + num_bits_set_(other.num_bits_set_) +{ + if (is_owned_) { + const size_t num_bytes = capacity_bits_ >> 3; + bit_array_ = AllocUint8(allocator_).allocate(num_bytes); + if (bit_array_ == nullptr) { + throw std::bad_alloc(); + } + std::copy_n(other.bit_array_, num_bytes, bit_array_); + memory_ = nullptr; + } else { + bit_array_ = other.bit_array_; + memory_ = other.memory_; + } +} + +template<typename A> +bloom_filter_alloc<A>::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept : + 
allocator_(std::move(other.allocator_)), + seed_(other.seed_), + num_hashes_(other.num_hashes_), + is_dirty_(other.is_dirty_), + is_owned_(other.is_owned_), + is_read_only_(other.is_read_only_), + capacity_bits_(other.capacity_bits_), + num_bits_set_(other.num_bits_set_), + bit_array_(std::move(other.bit_array_)), Review Comment: Yeah, I was trying to use Copilot, and while it helps with some boilerplate, it also inserts stuff like this; for whatever reason I missed removing it. I'm thinking I need to turn it off altogether. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
