http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/d0756e7e/storage/FastHashTable.hpp ---------------------------------------------------------------------- diff --git a/storage/FastHashTable.hpp b/storage/FastHashTable.hpp new file mode 100644 index 0000000..12e447f --- /dev/null +++ b/storage/FastHashTable.hpp @@ -0,0 +1,2640 @@ +/** + * Copyright 2011-2015 Quickstep Technologies LLC. + * Copyright 2015-2016 Pivotal Software, Inc. + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#ifndef QUICKSTEP_STORAGE_FAST_HASH_TABLE_HPP_ +#define QUICKSTEP_STORAGE_FAST_HASH_TABLE_HPP_ + +#include <atomic> +#include <cstddef> +#include <cstdlib> +#include <type_traits> +#include <vector> + +#include "catalog/CatalogTypedefs.hpp" +#include "storage/HashTableBase.hpp" +#include "storage/StorageBlob.hpp" +#include "storage/StorageBlockInfo.hpp" +#include "storage/StorageConstants.hpp" +#include "storage/StorageManager.hpp" +#include "storage/TupleReference.hpp" +#include "storage/ValueAccessor.hpp" +#include "storage/ValueAccessorUtil.hpp" +#include "expressions/aggregation/AggregationHandleAvg.hpp" +#include "threading/SpinSharedMutex.hpp" +#include "threading/SpinMutex.hpp" +#include "types/Type.hpp" +#include "types/TypedValue.hpp" +#include "utility/BloomFilter.hpp" +#include "utility/HashPair.hpp" +#include "utility/Macros.hpp" +#include "storage/HashTable.hpp" + +namespace quickstep { + +/** \addtogroup Storage + * @{ + */ + +/** + * @brief Base class for hash table. + * + * This class is templated so that the core hash-table logic can be reused in + * different contexts requiring different value types and semantics (e.g. + * hash-joins vs. hash-based grouping for aggregates vs. hash-based indices). + * The base template defines the interface that HashTables provide to clients + * and implements some common functionality for all HashTables. There a few + * different (also templated) implementation classes that inherit from this + * base class and have different physical layouts with different performance + * characteristics. As of this writing, they are: + * 1. LinearOpenAddressingHashTable - All keys/values are stored directly + * in a single array of buckets. Collisions are handled by simply + * advancing to the "next" adjacent bucket until an empty bucket is + * found. This implementation is vulnerable to performance degradation + * due to the formation of bucket chains when there are many duplicate + * and/or consecutive keys. + * 2. SeparateChainingHashTable - Keys/values are stored in a separate + * region of memory from the base hash table slot array. Every bucket + * has a "next" pointer so that entries that collide (i.e. map to the + * same base slot) form chains of pointers with each other. Although + * this implementation has some extra indirection compared to + * LinearOpenAddressingHashTable, it does not have the same + * vulnerabilities to key skew, and it additionally supports a very + * efficient bucket-preallocation mechanism that minimizes cache + * coherency overhead when multiple threads are building a HashTable + * as part of a hash-join. + * 3. SimpleScalarSeparateChainingHashTable - A simplified version of + * SeparateChainingHashTable that is only usable for single, scalar + * keys with a reversible hash function. This implementation exploits + * the reversible hash to avoid storing separate copies of keys at all, + * and to skip an extra key comparison when hash codes collide. + * + * @note If you need to create a HashTable and not just use it as a client, see + * HashTableFactory, which simplifies the process of creating a + * HashTable. + * + * @param ValueT The mapped value in this hash table. Must be + * copy-constructible. For a serializable hash table, ValueT must also + * be trivially copyable and trivially destructible (and beware of + * pointers to external memory). + * @param resizable Whether this hash table is resizable (using memory from a + * StorageManager) or not (using a private, fixed memory allocation). + * @param serializable If true, this hash table can safely be saved to and + * loaded from disk. If false, some out of band memory may be used (e.g. + * to store variable length keys). + * @param force_key_copy If true, inserted keys are always copied into this + * HashTable's memory. If false, pointers to external values may be + * stored instead. force_key_copy should be true if the hash table will + * outlive the external key values which are inserted into it. Note that + * if serializable is true and force_key_copy is false, then relative + * offsets will be used instead of absolute pointers to keys, meaning + * that the pointed-to keys must be serialized and deserialized in + * exactly the same relative byte order (e.g. as part of the same + * StorageBlock), and keys must not change position relative to this + * HashTable (beware TupleStorageSubBlocks that may self-reorganize when + * modified). If serializable and resizable are both true, then + * force_key_copy must also be true. + * @param allow_duplicate_keys If true, multiple values can be mapped to the + * same key. If false, one and only one value may be mapped. + **/ +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +class FastHashTable : public HashTableBase<resizable, + serializable, + force_key_copy, + allow_duplicate_keys> { + static_assert(!(serializable && resizable && !force_key_copy), + "A HashTable must have force_key_copy=true when serializable " + "and resizable are both true."); + + // TODO(chasseur): GCC 4.8.3 doesn't yet implement + // std::is_trivially_copyable. In the future, we should include a + // static_assert that prevents a serializable HashTable from being used with + // a ValueT which is not trivially copyable. + + public: + // Shadow template parameters. This is useful for shared test harnesses. +// typedef ValueT value_type; + static constexpr bool template_resizable = resizable; + static constexpr bool template_serializable = serializable; + static constexpr bool template_force_key_copy = force_key_copy; + static constexpr bool template_allow_duplicate_keys = allow_duplicate_keys; + + // Some HashTable implementations (notably LinearOpenAddressingHashTable) + // use a special hash code to represent an empty bucket, and another special + // code to indicate that a bucket is currently being inserted into. For those + // HashTables, this is a surrogate hash value for empty buckets. Keys which + // actually hash to this value should have their hashes mutated (e.g. by + // adding 1). We use zero, since we will often be using memory which is + // already zeroed-out and this saves us the trouble of a memset. This has + // some downside, as the hash function we use is the identity hash for + // integers, and the integer 0 is common in many data sets and must be + // adjusted (and will then spuriously collide with 1). Nevertheless, this + // expense is outweighed by no longer having to memset large regions of + // memory when initializing a HashTable. + static constexpr unsigned char kEmptyHashByte = 0x0; + static constexpr std::size_t kEmptyHash = 0x0; + + // A surrogate hash value for a bucket which is currently being inserted + // into. As with kEmptyHash, keys which actually hash to this value should + // have their hashes adjusted. + static constexpr std::size_t kPendingHash = ~kEmptyHash; + + /** + * @brief Virtual destructor. + **/ + virtual ~FastHashTable() { + if (resizable) { + if (blob_.valid()) { + if (serializable) { + DEV_WARNING("Destroying a resizable serializable HashTable's underlying " + "StorageBlob."); + } + const block_id blob_id = blob_->getID(); + blob_.release(); + storage_manager_->deleteBlockOrBlobFile(blob_id); + } + } + } + + /** + * @brief Get the ID of the StorageBlob used to store a resizable HashTable. + * + * @warning This method must not be used for a non-resizable HashTable. + * + * @return The ID of the StorageBlob used to store this HashTable. + **/ + inline block_id getBlobId() const { + DEBUG_ASSERT(resizable); + return blob_->getID(); + } + + /** + * @brief Erase all entries in this hash table. + * + * @warning This method is not guaranteed to be threadsafe. + **/ + virtual void clear() = 0; + + /** + * @brief Add a new entry into the hash table. + * + * @warning The key must not be null. + * @warning This method is threadsafe with regard to other calls to put(), + * putCompositeKey(), putValueAccessor(), and + * putValueAccessorCompositeKey(), but should not be used + * simultaneously with upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey(). + * @note This version is for single scalar keys, see also putCompositeKey(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param key The key. + * @param value The value payload. + * @return HashTablePutResult::kOK if an entry was successfully inserted, + * HashTablePutResult::kDuplicateKey if allow_duplicate_keys is false + * and key was a duplicate, or HashTablePutResult::kOutOfSpace if + * resizable is false and storage space for the hash table has been + * exhausted. + **/ + HashTablePutResult put(const TypedValue &key, + const uint8_t &value); + + /** + * @brief Add a new entry into the hash table (composite key version). + * + * @warning No component of the key may be null. + * @warning This method is threadsafe with regard to other calls to put(), + * putCompositeKey(), putValueAccessor(), and + * putValueAccessorCompositeKey(), but should not be used + * simultaneously with upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey(). + * @note This version is for composite keys, see also put(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param key The components of the key. + * @param value The value payload. + * @return HashTablePutResult::kOK if an entry was successfully inserted, + * HashTablePutResult::kDuplicateKey if allow_duplicate_keys is false + * and key was a duplicate, or HashTablePutResult::kOutOfSpace if + * resizable is false and storage space for the hash table has been + * exhausted. + **/ + HashTablePutResult putCompositeKey(const std::vector<TypedValue> &key, + const uint8_t &value); + + HashTablePutResult putCompositeKeyFast(const std::vector<TypedValue> &key, + const uint8_t *value_ptr); + + /** + * @brief Add (multiple) new entries into the hash table from a + * ValueAccessor. + * + * @warning This method is threadsafe with regard to other calls to put(), + * putCompositeKey(), putValueAccessor(), and + * putValueAccessorCompositeKey(), but should not be used + * simultaneously with upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey(). + * @note This version is for scalar keys, see also + * putValueAccessorCompositeKey(). + * @note If the hash table fills up while this call is in progress and + * resizable is true, this might result in rebuilding the entire hash + * table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before inserting it (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes const ValueAccessor& as an argument (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as an argument) and returns either + * a ValueT or a reference to a ValueT. The functor should generate + * the appropriate mapped value for the current tuple the accessor is + * iterating on. + * @return HashTablePutResult::kOK if all keys and generated values from + * accessor were successfully inserted. + * HashTablePutResult::kOutOfSpace is returned if this hash-table is + * non-resizable and ran out of space (note that some entries may + * still have been inserted, and accessor's iteration will be left on + * the first tuple which could not be inserted). + * HashTablePutResult::kDuplicateKey is returned if + * allow_duplicate_keys is false and a duplicate key is encountered + * (as with HashTablePutResult::kOutOfSpace, some entries may have + * been inserted, and accessor will be left on the tuple with a + * duplicate key). + **/ + template <typename FunctorT> + HashTablePutResult putValueAccessor(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor); + + /** + * @brief Add (multiple) new entries into the hash table from a + * ValueAccessor (composite key version). + * + * @warning This method is threadsafe with regard to other calls to put(), + * putCompositeKey(), putValueAccessor(), and + * putValueAccessorCompositeKey(), but should not be used + * simultaneously with upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey(). + * @note This version is for composite keys, see also putValueAccessor(). + * @note If the hash table fills up while this call is in progress and + * resizable is true, this might result in rebuilding the entire hash + * table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_ids The attribute IDs of each key component to be read + * from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * has a null component before inserting it (null keys are skipped). + * This must be set to true if some of the keys that will be read from + * accessor may be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes const ValueAccessor& as an argument (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as an argument) and returns either + * a ValueT or a reference to a ValueT. The functor should generate + * the appropriate mapped value for the current tuple the accessor is + * iterating on. + * @return HashTablePutResult::kOK if all keys and generated values from + * accessor were successfully inserted. + * HashTablePutResult::kOutOfSpace is returned if this hash-table is + * non-resizable and ran out of space (note that some entries may + * still have been inserted, and accessor's iteration will be left on + * the first tuple which could not be inserted). + * HashTablePutResult::kDuplicateKey is returned if + * allow_duplicate_keys is false and a duplicate key is encountered + * (as with HashTablePutResult::kOutOfSpace, some entries may have + * been inserted, and accessor will be left on the tuple with a + * duplicate key). + **/ + template <typename FunctorT> + HashTablePutResult putValueAccessorCompositeKey( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor); + + /** + * @brief Apply a functor to the value mapped to a key, first inserting a new + * value if one is not already present. + * + * @warning The key must not be null. + * @warning This method is only usable if allow_duplicate_keys is false. + * @warning This method is threadsafe with regard to other calls to upsert(), + * upsertCompositeKey(), upsertValueAccessor(), and + * upsertValueAccessorCompositeKey(), but should not be used + * simultaneously with put(), putCompositeKey(), putValueAccessor(), + * or putValueAccessorCompositeKey(). + * @warning The ValueT* pointer passed to functor's call operator is only + * guaranteed to be valid for the duration of the call. The functor + * should not store a copy of the pointer and assume that it remains + * valid. + * @warning Although this method itself is threadsafe, the ValueT object + * accessed by functor is not guaranteed to be (although it is + * guaranteed that its initial insertion will be atomic). If it is + * possible for multiple threads to call upsert() with the same key + * at the same time, then their access to ValueT should be made + * threadsafe (e.g. with the use of atomic types, mutexes, or some + * other external synchronization). + * @note This version is for single scalar keys, see also + * upsertCompositeKey(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param key The key. + * @param initial_value If there was not already a preexisting entry in this + * HashTable for the specified key, then the value will be initialized + * with a copy of initial_value. This parameter is ignored if a value + * is already present for key. + * @param functor A pointer to a functor, which should provide a call + * operator which takes ValueT* as an argument. The call operator will + * be invoked once on the value corresponding to key (which may be + * newly inserted and default-constructed). + * @return True on success, false if upsert failed because there was not + * enough space to insert a new entry in this HashTable. + **/ + template <typename FunctorT> + bool upsert(const TypedValue &key, + const uint8_t &initial_value, + FunctorT *functor); + + /** + * @brief Apply a functor to the value mapped to a key, first inserting a new + * value if one is not already present. + * + * @warning The key must not be null. + * @warning This method is only usable if allow_duplicate_keys is false. + * @warning This method is threadsafe with regard to other calls to upsert(), + * upsertCompositeKey(), upsertValueAccessor(), and + * upsertValueAccessorCompositeKey(), but should not be used + * simultaneously with put(), putCompositeKey(), putValueAccessor(), + * or putValueAccessorCompositeKey(). + * @warning The ValueT* pointer passed to functor's call operator is only + * guaranteed to be valid for the duration of the call. The functor + * should not store a copy of the pointer and assume that it remains + * valid. + * @warning Although this method itself is threadsafe, the ValueT object + * accessed by functor is not guaranteed to be (although it is + * guaranteed that its initial insertion will be atomic). If it is + * possible for multiple threads to call upsertCompositeKey() with + * the same key at the same time, then their access to ValueT should + * be made threadsafe (e.g. with the use of atomic types, mutexes, + * or some other external synchronization). + * @note This version is for composite keys, see also upsert(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param key The key. + * @param initial_value If there was not already a preexisting entry in this + * HashTable for the specified key, then the value will be initialized + * with a copy of initial_value. This parameter is ignored if a value + * is already present for key. + * @param functor A pointer to a functor, which should provide a call + * operator which takes ValueT* as an argument. The call operator will + * be invoked once on the value corresponding to key (which may be + * newly inserted and default-constructed). + * @return True on success, false if upsert failed because there was not + * enough space to insert a new entry in this HashTable. + **/ + template <typename FunctorT> + bool upsertCompositeKey(const std::vector<TypedValue> &key, + const uint8_t &initial_value, + FunctorT *functor); + + + template <typename FunctorT> + bool upsertCompositeKeyFast(const std::vector<TypedValue> &key, + const uint8_t *init_value_ptr, + FunctorT *functor); + + bool upsertCompositeKeyNewFast(const std::vector<TypedValue> &key, + const uint8_t *init_value_ptr, + const uint8_t *source_state); + + /** + * @brief Apply a functor to (multiple) entries in this hash table, with keys + * drawn from a ValueAccessor. New values are first inserted if not + * already present. + * + * @warning This method is only usable if allow_duplicate_keys is false. + * @warning This method is threadsafe with regard to other calls to upsert(), + * upsertCompositeKey(), upsertValueAccessor(), and + * upsertValueAccessorCompositeKey(), but should not be used + * simultaneously with put(), putCompositeKey(), putValueAccessor(), + * or putValueAccessorCompositeKey(). + * @warning The ValueAccessor reference and ValueT* pointer passed to + * functor's call operator are only guaranteed to be valid for the + * duration of the call. The functor should not store a copy of + * these pointers and assume that they remain valid. + * @warning Although this method itself is threadsafe, the ValueT object + * accessed by functor is not guaranteed to be (although it is + * guaranteed that its initial insertion will be atomic). If it is + * possible for multiple threads to call upsertValueAccessor() with + * the same key at the same time, then their access to ValueT should + * be made threadsafe (e.g. with the use of atomic types, mutexes, + * or some other external synchronization). + * @note This version is for single scalar keys, see also + * upsertValueAccessorCompositeKey(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before upserting it (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes two arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and ValueT*. + * The call operator will be invoked once for every tuple with a + * non-null key in accessor. + * @return True on success, false if upsert failed because there was not + * enough space to insert new entries for all the keys in accessor + * (note that some entries may still have been upserted, and + * accessor's iteration will be left on the first tuple which could + * not be inserted). + **/ + template <typename FunctorT> + bool upsertValueAccessor(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + const uint8_t &initial_value, + FunctorT *functor); + + + bool upsertValueAccessorFast(const std::vector<std::vector<attribute_id>> &argument_ids, + ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys); + + /** + * @brief Apply a functor to (multiple) entries in this hash table, with keys + * drawn from a ValueAccessor. New values are first inserted if not + * already present. Composite key version. + * + * @warning This method is only usable if allow_duplicate_keys is false. + * @warning This method is threadsafe with regard to other calls to upsert(), + * upsertCompositeKey(), upsertValueAccessor(), and + * upsertValueAccessorCompositeKey(), but should not be used + * simultaneously with put(), putCompositeKey(), putValueAccessor(), + * or putValueAccessorCompositeKey(). + * @warning The ValueAccessor reference and ValueT* pointer passed to + * functor's call operator are only guaranteed to be valid for the + * duration of the call. The functor should not store a copy of + * these pointers and assume that they remain valid. + * @warning Although this method itself is threadsafe, the ValueT object + * accessed by functor is not guaranteed to be (although it is + * guaranteed that its initial insertion will be atomic). If it is + * possible for multiple threads to call upsertValueAccessor() with + * the same key at the same time, then their access to ValueT should + * be made threadsafe (e.g. with the use of atomic types, mutexes, + * or some other external synchronization). + * @note This version is for composite keys, see also upsertValueAccessor(). + * @note If the hash table is (close to) full and resizable is true, this + * routine might result in rebuilding the entire hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_ids The attribute IDs of each key component to be read + * from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before upserting it (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes two arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and ValueT*. + * The call operator will be invoked once for every tuple with a + * non-null key in accessor. + * @return True on success, false if upsert failed because there was not + * enough space to insert new entries for all the keys in accessor + * (note that some entries may still have been upserted, and + * accessor's iteration will be left on the first tuple which could + * not be inserted). + **/ + template <typename FunctorT> + bool upsertValueAccessorCompositeKey( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + const uint8_t &initial_value, + FunctorT *functor); + + bool upsertValueAccessorCompositeKeyFast( + const std::vector<std::vector<attribute_id>> &argument, + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys); + + /** + * @brief Determine the number of entries (key-value pairs) contained in this + * HashTable. + * @note For some HashTable implementations, this is O(1), but for others it + * may be O(n) where n is the number of buckets. + * + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * + * @return The number of entries in this HashTable. + **/ + virtual std::size_t numEntries() const = 0; + + /** + * @brief Lookup a key against this hash table to find a matching entry. + * + * @warning Only usable with the hash table that does not allow duplicate + * keys. + * @warning The key must not be null. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for single scalar keys. See also + * getSingleCompositeKey(). + * + * @param key The key to look up. + * @return The value of a matched entry if a matching key is found. + * Otherwise, return NULL. + **/ + virtual const uint8_t* getSingle(const TypedValue &key) const = 0; + + /** + * @brief Lookup a composite key against this hash table to find a matching + * entry. + * + * @warning Only usable with the hash table that does not allow duplicate + * keys. + * @warning The key must not be null. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for composite keys. See also getSingle(). + * + * @param key The key to look up. + * @return The value of a matched entry if a matching key is found. + * Otherwise, return NULL. + **/ + virtual const uint8_t* getSingleCompositeKey(const std::vector<TypedValue> &key) const = 0; + virtual const uint8_t* getSingleCompositeKey(const std::vector<TypedValue> &key, int index) const = 0; + + /** + * @brief Lookup a key against this hash table to find matching entries. + * + * @warning The key must not be null. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note It is more efficient to call getSingle() if the hash table does not + * allow duplicate keys. + * @note This version is for single scalar keys. See also + * getAllCompositeKey(). + * + * @param key The key to look up. + * @param values A vector to hold values of all matching entries. Matches + * will be appended to the vector. + **/ + virtual void getAll(const TypedValue &key, std::vector<const uint8_t*> *values) const = 0; + + /** + * @brief Lookup a composite key against this hash table to find matching + * entries. + * + * @warning The key must not be null. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note It is more efficient to call getSingleCompositeKey() if the hash + * table does not allow duplicate keys. + * @note This version is for composite keys. See also getAll(). + * + * @param key The key to look up. + * @param values A vector to hold values of all matching entries. Matches + * will be appended to the vector. + **/ + virtual void getAllCompositeKey(const std::vector<TypedValue> &key, + std::vector<const uint8_t*> *values) const = 0; + + /** + * @brief Lookup (multiple) keys from a ValueAccessor and apply a functor to + * the matching values. + * + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for single scalar keys. See also + * getAllFromValueAccessorCompositeKey(). + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes 2 arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and + * const ValueT&. The functor will be invoked once for each pair of a + * key taken from accessor and matching value. + **/ + template <typename FunctorT> + void getAllFromValueAccessor(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) const; + + /** + * @brief Lookup (multiple) keys from a ValueAccessor, apply a functor to the + * matching values and additionally call a recordMatch() function of + * the functor when the first match for a key is found. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for single scalar keys. See also + * getAllFromValueAccessorCompositeKeyWithExtraWorkForFirstMatch(). + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide two functions: + * 1) An operator that takes 2 arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and + * const ValueT&. The operator will be invoked once for each pair of a + * key taken from accessor and matching value. + * 2) A function hasMatch that takes 1 argument: const ValueAccessor&. + * The function will be called only once for a key from accessor when + * the first match is found. + */ + template <typename FunctorT> + void getAllFromValueAccessorWithExtraWorkForFirstMatch( + ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) const; + + /** + * @brief Lookup (multiple) keys from a ValueAccessor, apply a functor to the + * matching values and additionally call a recordMatch() function of + * the functor when the first match for a key is found. Composite key + * version. + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor, which should provide two functions: + * 1) An operator that takes 2 arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and + * const ValueT&. The operator will be invoked once for each pair of a + * key taken from accessor and matching value. + * 2) A function hasMatch that takes 1 argument: const ValueAccessor&. + * The function will be called only once for a key from accessor when + * the first match is found. + */ + template <typename FunctorT> + void getAllFromValueAccessorCompositeKeyWithExtraWorkForFirstMatch( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) const; + + /** + * @brief Lookup (multiple) keys from a ValueAccessor and apply a functor to + * the matching values. Composite key version. + * + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for composite keys. See also + * getAllFromValueAccessor(). + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_ids The attribute IDs of each key component to be read + * from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * has a null component before inserting it (null keys are skipped). + * This must be set to true if some of the keys that will be read from + * accessor may be null. + * @param functor A pointer to a functor, which should provide a call + * operator that takes 2 arguments: const ValueAccessor& (or better + * yet, a templated call operator which takes a const reference to + * some subclass of ValueAccessor as its first argument) and + * const ValueT&. The functor will be invoked once for each pair of a + * key taken from accessor and matching value. + **/ + template <typename FunctorT> + void getAllFromValueAccessorCompositeKey(ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) const; + + /** + * @brief Apply the functor to each key with a match in the hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor which should provide an operator that + * takes 1 argument: const ValueAccessor&. The operator will be called + * only once for a key from accessor if there is a match. + */ + template <typename FunctorT> + void runOverKeysFromValueAccessorIfMatchFound(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) const { + return runOverKeysFromValueAccessor<true>(accessor, + key_attr_id, + check_for_null_keys, + functor); + } + + /** + * @brief Apply the functor to each key with a match in the hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor which should provide an operator that + * takes 1 argument: const ValueAccessor&. The operator will be called + * only once for a key from accessor if there is a match. + */ + template <typename FunctorT> + void runOverKeysFromValueAccessorIfMatchFoundCompositeKey( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) const { + return runOverKeysFromValueAccessorCompositeKey<true>(accessor, + key_attr_ids, + check_for_null_keys, + functor); + } + + /** + * @brief Apply the functor to each key without a match in the hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor which should provide an operator that + * takes 1 argument: const ValueAccessor&. The operator will be called + * only once for a key from accessor if there is no match. + */ + template <typename FunctorT> + void runOverKeysFromValueAccessorIfMatchNotFound( + ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) const { + return runOverKeysFromValueAccessor<false>(accessor, + key_attr_id, + check_for_null_keys, + functor); + } + + /** + * @brief Apply the functor to each key without a match in the hash table. + * + * @param accessor A ValueAccessor which will be used to access keys. + * beginIteration() should be called on accessor before calling this + * method. + * @param key_attr_id The attribute ID of the keys to be read from accessor. + * @param check_for_null_keys If true, each key will be checked to see if it + * is null before looking it up (null keys are skipped). This must be + * set to true if some of the keys that will be read from accessor may + * be null. + * @param functor A pointer to a functor which should provide an operator that + * takes 1 argument: const ValueAccessor&. The operator will be called + * only once for a key from accessor if there is no match. + */ + template <typename FunctorT> + void runOverKeysFromValueAccessorIfMatchNotFoundCompositeKey( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) const { + return runOverKeysFromValueAccessorCompositeKey<false>(accessor, + key_attr_ids, + check_for_null_keys, + functor); + } + + /** + * @brief Apply a functor to each key, value pair in this hash table. + * + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for single scalar keys. See also + * forEachCompositeKey(). + * + * @param functor A pointer to a functor, which should provide a call + * operator which takes 2 arguments: const TypedValue&, const ValueT&. + * The call operator will be invoked once on each key, value pair in + * this hash table (note that if allow_duplicate_keys is true, + * the call may occur multiple times for the same key with different + * values). + * @return The number of key-value pairs visited. + **/ + template <typename FunctorT> + std::size_t forEach(FunctorT *functor) const; + + /** + * @brief Apply a functor to each key, value pair in this hash table. + * + * @warning This method assumes that no concurrent calls to put(), + * putCompositeKey(), putValueAccessor(), + * putValueAccessorCompositeKey(), upsert(), upsertCompositeKey(), + * upsertValueAccessor(), or upsertValueAccessorCompositeKey() are + * taking place (i.e. that this HashTable is immutable for the + * duration of the call and as long as the returned pointer may be + * dereferenced). Concurrent calls to getSingle(), + * getSingleCompositeKey(), getAll(), getAllCompositeKey(), + * getAllFromValueAccessor(), getAllFromValueAccessorCompositeKey(), + * forEach(), and forEachCompositeKey() are safe. + * @note This version is for composite keys. See also forEach(). + * + * @param functor A pointer to a functor, which should provide a call + * operator which takes 2 arguments: const std::vector<TypedValue>&, + * const ValueT&. The call operator will be invoked once on each key, + * value pair in this hash table (note that if allow_duplicate_keys is + * true, the call may occur multiple times for the same key with + * different values). + * @return The number of key-value pairs visited. + **/ + template <typename FunctorT> + std::size_t forEachCompositeKey(FunctorT *functor) const; + + template <typename FunctorT> + std::size_t forEachCompositeKeyFast(FunctorT *functor) const; + + template <typename FunctorT> + std::size_t forEachCompositeKeyFast(FunctorT *functor, int index) const; + /** + * @brief A call to this function will cause a bloom filter to be built + * during the build phase of this hash table. + **/ + inline void enableBuildSideBloomFilter() { + has_build_side_bloom_filter_ = true; + } + + /** + * @brief A call to this function will cause a set of bloom filters to be + * probed during the probe phase of this hash table. + **/ + inline void enableProbeSideBloomFilter() { + has_probe_side_bloom_filter_ = true; + } + + /** + * @brief This function sets the pointer to the bloom filter to be + * used during the build phase of this hash table. + * @warning Should call enable_build_side_bloom_filter() first to enable + * bloom filter usage during build phase. + * @note The ownership of the bloom filter lies with the caller. + * + * @param bloom_filter The pointer to the bloom filter. + **/ + inline void setBuildSideBloomFilter(BloomFilter *bloom_filter) { + build_bloom_filter_ = bloom_filter; + } + + /** + * @brief This function adds a pointer to the list of bloom filters to be + * used during the probe phase of this hash table. + * @warning Should call enable_probe_side_bloom_filter() first to enable + * bloom filter usage during probe phase. + * @note The ownership of the bloom filter lies with the caller. + * + * @param bloom_filter The pointer to the bloom filter. + **/ + inline void addProbeSideBloomFilter(const BloomFilter *bloom_filter) { + probe_bloom_filters_.emplace_back(bloom_filter); + } + + /** + * @brief This function adds a vector of attribute ids corresponding to a + * bloom filter used during the probe phase of this hash table. + * @warning Should call enable_probe_side_bloom_filter() first to enable + * bloom filter usage during probe phase. + * + * @param probe_attribute_ids The vector of attribute ids to use for probing + * the bloom filter. + **/ + inline void addProbeSideAttributeIds(std::vector<attribute_id> &&probe_attribute_ids) { + probe_attribute_ids_.push_back(probe_attribute_ids); + } + + protected: + /** + * @brief Constructor for new resizable hash table. + * + * @param key_types A vector of one or more types (>1 indicates a composite + * key). + * @param num_entries The estimated number of entries this hash table will + * hold. + * @param storage_manager The StorageManager to use (a StorageBlob will be + * allocated to hold this hash table's contents). + * @param adjust_hashes If true, the hash of a key should be modified by + * applying AdjustHash() so that it does not collide with one of the + * special values kEmptyHash or kPendingHash. If false, the hash is + * used as-is. + * @param use_scalar_literal_hash If true, the key is a single scalar literal + * (non-composite) that it is safe to use the simplified hash function + * TypedValue::getHashScalarLiteral() on. If false, the generic + * TypedValue::getHash() method will be used. + * @param preallocate_supported If true, this HashTable overrides + * preallocateForBulkInsert() to allow bulk-allocation of resources + * (i.e. buckets and variable-length key storage) in a single up-front + * pass when bulk-inserting entries. If false, resources are allocated + * on the fly for each entry. + **/ + FastHashTable(const std::vector<const Type*> &key_types, + const std::size_t num_entries, + const std::vector<AggregationHandle *> &handles, + const std::vector<std::size_t> &payload_sizes, + StorageManager *storage_manager, + const bool adjust_hashes, + const bool use_scalar_literal_hash, + const bool preallocate_supported) + : key_types_(key_types), + scalar_key_inline_(true), + key_inline_(nullptr), + adjust_hashes_(adjust_hashes), + use_scalar_literal_hash_(use_scalar_literal_hash), + preallocate_supported_(preallocate_supported), + handles_(handles), + total_payload_size_(std::accumulate(payload_sizes.begin(), payload_sizes.end(), sizeof(SpinMutex))), + storage_manager_(storage_manager), + hash_table_memory_(nullptr), + hash_table_memory_size_(0) { + DEBUG_ASSERT(resizable); + std::size_t running_sum = sizeof(SpinMutex); + for (auto size : payload_sizes) { + payload_offsets_.emplace_back(running_sum); + running_sum+=size; + } + } + + /** + * @brief Constructor for non-resizable hash table. + * + * @param key_types A vector of one or more types (>1 indicates a composite + * key). + * @param hash_table_memory A pointer to memory to use for this hash table. + * @param hash_table_memory_size The size of hash_table_memory in bytes. + * @param new_hash_table If true, this hash table is being constructed for + * the first time and hash_table_memory will be cleared. If false, + * reload a pre-existing hash table. + * @param hash_table_memory_zeroed If new_hash_table is true, setting this to + * true means that this HashTable will assume that hash_table_memory + * has already been zeroed-out (any newly-allocated block or blob + * memory from StorageManager is zeroed-out). If false, this HashTable + * will explicitly zero-fill its memory as neccessary. This parameter + * has no effect when new_hash_table is false. + * @param adjust_hashes If true, the hash of a key should be modified by + * applying AdjustHash() so that it does not collide with one of the + * special values kEmptyHash or kPendingHash. If false, the hash is + * used as-is. + * @param use_scalar_literal_hash If true, the key is a single scalar literal + * (non-composite) that it is safe to use the simplified hash function + * TypedValue::getHashScalarLiteral() on. If false, the generic + * TypedValue::getHash() method will be used. + * @param preallocate_supported If true, this HashTable overrides + * preallocateForBulkInsert() to allow bulk-allocation of resources + * (i.e. buckets and variable-length key storage) in a single up-front + * pass when bulk-inserting entries. If false, resources are allocated + * on the fly for each entry. + **/ + FastHashTable(const std::vector<const Type*> &key_types, + void *hash_table_memory, + const std::size_t hash_table_memory_size, + const bool new_hash_table, + const bool hash_table_memory_zeroed, + const bool adjust_hashes, + const bool use_scalar_literal_hash, + const bool preallocate_supported) + : key_types_(key_types), + scalar_key_inline_(true), + key_inline_(nullptr), + adjust_hashes_(adjust_hashes), + use_scalar_literal_hash_(use_scalar_literal_hash), + preallocate_supported_(preallocate_supported), + storage_manager_(nullptr), + hash_table_memory_(hash_table_memory), + hash_table_memory_size_(hash_table_memory_size) { + DEBUG_ASSERT(!resizable); + } + + // Adjust 'hash' so that it is not exactly equal to either of the special + // values kEmptyHash or kPendingHash. + inline constexpr static std::size_t AdjustHash(const std::size_t hash) { + return hash + (hash == kEmptyHash) - (hash == kPendingHash); + } + + // Set information about which key components are stored inline. This usually + // comes from a HashTableKeyManager, and is set by the constructor of a + // subclass of HashTable. + inline void setKeyInline(const std::vector<bool> *key_inline) { + scalar_key_inline_ = key_inline->front(); + key_inline_ = key_inline; + } + + // Generate a hash for a composite key by hashing each component of 'key' and + // mixing their bits with CombineHashes(). + inline std::size_t hashCompositeKey(const std::vector<TypedValue> &key) const; + + // If 'force_key_copy' is true and some part of a composite key is + // variable-length, calculate the total number of bytes for variable-length + // key components that need to be copied. Otherwise, return 0 to indicate + // that no variable-length copy is required. + inline std::size_t calculateVariableLengthCompositeKeyCopySize( + const std::vector<TypedValue> &key) const; + + // Helpers for put. If this HashTable is resizable, 'resize_shared_mutex_' + // should be locked in shared mode before calling either of these methods. + virtual HashTablePutResult putInternal(const TypedValue &key, + const std::size_t variable_key_size, + const uint8_t &value, + HashTablePreallocationState *prealloc_state) = 0; + virtual HashTablePutResult putCompositeKeyInternal(const std::vector<TypedValue> &key, + const std::size_t variable_key_size, + const uint8_t &value, + HashTablePreallocationState *prealloc_state) = 0; + + virtual HashTablePutResult putCompositeKeyInternalFast(const std::vector<TypedValue> &key, + const std::size_t variable_key_size, + const std::uint8_t *init_value_ptr, + HashTablePreallocationState *prealloc_state) = 0; + + + // Helpers for upsert. Both return a pointer to the value corresponding to + // 'key'. If this HashTable is resizable, 'resize_shared_mutex_' should be + // locked in shared mode while calling and using the returned pointer. May + // return NULL if there is not enough space to insert a new key, in which + // case a resizable HashTable should release the 'resize_shared_mutex_' and + // call resize(), then try again. + virtual uint8_t* upsertInternal(const TypedValue &key, + const std::size_t variable_key_size, + const uint8_t &initial_value) = 0; + virtual uint8_t* upsertInternalFast(const TypedValue &key, + const std::uint8_t *init_value_ptr, + const std::size_t variable_key_size) = 0; + virtual uint8_t* upsertCompositeKeyInternal(const std::vector<TypedValue> &key, + const std::size_t variable_key_size, + const uint8_t &initial_value) = 0; + + virtual uint8_t* upsertCompositeKeyInternalFast(const std::vector<TypedValue> &key, + const std::uint8_t *init_value_ptr, + const std::size_t variable_key_size) = 0; + + // Helpers for forEach. Each return true on success, false if no more entries + // exist to iterate over. After a successful call, '*key' is overwritten with + // the key of the next entry, '*value' points to the associated value, and + // '*entry_num' is incremented to the next (implementation defined) entry to + // check ('*entry_num' should initially be set to zero). + virtual bool getNextEntry(TypedValue *key, + const uint8_t **value, + std::size_t *entry_num) const = 0; + virtual bool getNextEntryCompositeKey(std::vector<TypedValue> *key, + const uint8_t **value, + std::size_t *entry_num) const = 0; + + // Helpers for getAllFromValueAccessor. Each return true on success, false if + // no more entries exist for the specified key. After a successful call, + // '*value' points to the associated value, and '*entry_num' is incremented + // to the next (implementation defined) entry to check ('*entry_num' should + // initially be set to zero). + virtual bool getNextEntryForKey(const TypedValue &key, + const std::size_t hash_code, + const uint8_t **value, + std::size_t *entry_num) const = 0; + virtual bool getNextEntryForCompositeKey(const std::vector<TypedValue> &key, + const std::size_t hash_code, + const uint8_t **value, + std::size_t *entry_num) const = 0; + + // Return true if key exists in the hash table. + virtual bool hasKey(const TypedValue &key) const = 0; + virtual bool hasCompositeKey(const std::vector<TypedValue> &key) const = 0; + + // For a resizable HashTable, grow to accomodate more entries. If + // 'extra_buckets' is not zero, it may serve as a "hint" to implementations + // that at least the requested number of extra buckets are required when + // resizing (mainly used in putValueAccessor() and + // putValueAccessorCompositeKey() when 'preallocate_supported_' is true). + // Implementations are free to ignore 'extra_buckets'. If + // 'extra_variable_storage' is not zero, implementations will attempt to + // allocate at least enough additional variable-key storage space to + // accomodate the number of bytes specified. 'retry_num' is intended ONLY for + // when resize() recursively calls itself and should not be set to nonzero by + // any other caller. + virtual void resize(const std::size_t extra_buckets, + const std::size_t extra_variable_storage, + const std::size_t retry_num = 0) = 0; + + // In the case where 'allow_duplicate_keys' is true, it is possible to + // pre-calculate the number of key-value entries and the amount of + // variable-length key storage that will be needed to insert all the + // entries from a ValueAccessor in putValueAccessor() or + // putValueAccessorCompositeKey() before actually inserting anything. Some + // HashTable implemetations (notably SeparateChainingHashTable) can achieve + // better performance by ammortizing the cost of allocating certain resources + // (buckets and variable-length key storage) in one up-front allocation. This + // method is intended to support that. Returns true and fills in + // '*prealloc_state' if pre-allocation was successful. Returns false if a + // resize() is needed. + virtual bool preallocateForBulkInsert(const std::size_t total_entries, + const std::size_t total_variable_key_size, + HashTablePreallocationState *prealloc_state) { + FATAL_ERROR("Called HashTable::preallocateForBulkInsert() on a HashTable " + "implementation that does not support preallocation."); + } + + // Type(s) of keys. + const std::vector<const Type*> key_types_; + + // Information about whether key components are stored inline or in a + // separate variable-length storage region. This is usually determined by a + // HashTableKeyManager and set by calling setKeyInline(). + bool scalar_key_inline_; + const std::vector<bool> *key_inline_; + + // Whether hashes should be adjusted by AdjustHash() before being used. + const bool adjust_hashes_; + // Whether it is safe to use the simplified TypedValue::getHashScalarLiteral() + // method instead of the generic TypedValue::getHash() method. + const bool use_scalar_literal_hash_; + // Whether preallocateForBulkInsert() is supported by this HashTable. + const bool preallocate_supported_; + + const std::vector<AggregationHandle *> handles_; + const std::size_t total_payload_size_; + std::vector<std::size_t> payload_offsets_; + + // Used only when resizable is true: + StorageManager *storage_manager_; + MutableBlobReference blob_; + // Locked in shared mode for most operations, exclusive mode during resize. + // Not locked at all for non-resizable HashTables. + alignas(kCacheLineBytes) SpinSharedMutex<true> resize_shared_mutex_; + + // Used only when resizable is false: + void *hash_table_memory_; + const std::size_t hash_table_memory_size_; +virtual size_t get_buckets_allocated() const {return 0;} + + private: + // Assign '*key_vector' with the attribute values specified by 'key_attr_ids' + // at the current position of 'accessor'. If 'check_for_null_keys' is true, + // stops and returns true if any of the values is null, otherwise returns + // false. + template <typename ValueAccessorT> + inline static bool GetCompositeKeyFromValueAccessor( + const ValueAccessorT &accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + std::vector<TypedValue> *key_vector) { + for (std::vector<attribute_id>::size_type key_idx = 0; + key_idx < key_attr_ids.size(); + ++key_idx) { + (*key_vector)[key_idx] = accessor.getTypedValue(key_attr_ids[key_idx]); + if (check_for_null_keys && (*key_vector)[key_idx].isNull()) { + return true; + } + } + return false; + } + + // If run_if_match_found is true, apply the functor to each key if a match is + // found; otherwise, apply the functor if no match is found. + template <bool run_if_match_found, typename FunctorT> + void runOverKeysFromValueAccessor(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) const; + + template <bool run_if_match_found, typename FunctorT> + void runOverKeysFromValueAccessorCompositeKey( + ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) const; + + // Method containing the actual logic implementing getAllFromValueAccessor(). + // Has extra template parameters that control behavior to avoid some + // inner-loop branching. + template <typename FunctorT, + bool check_for_null_keys, + bool adjust_hashes_template, + bool use_scalar_literal_hash_template> + void getAllFromValueAccessorImpl(ValueAccessor *accessor, + const attribute_id key_attr_id, + FunctorT *functor) const; + + // Data structures used for bloom filter optimized semi-joins. + bool has_build_side_bloom_filter_ = false; + bool has_probe_side_bloom_filter_ = false; + BloomFilter *build_bloom_filter_; + std::vector<const BloomFilter*> probe_bloom_filters_; + std::vector<std::vector<attribute_id>> probe_attribute_ids_; + DISALLOW_COPY_AND_ASSIGN(FastHashTable); +}; + + +/** + * @brief An instantiation of the HashTable template for use in aggregations. + * @note This has force_key_copy = true, so that we don't have dangling pointers + * to blocks that are evicted. + **/ +using AggregationStateFastHashTable = FastHashTable<true, false, true, false>; + +/** @} */ + +// ---------------------------------------------------------------------------- +// Implementations of template class methods follow. + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +HashTablePutResult FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::put(const TypedValue &key, + const uint8_t &value) { + const std::size_t variable_size = (force_key_copy && !scalar_key_inline_) ? key.getDataSize() + : 0; + if (resizable) { + HashTablePutResult result = HashTablePutResult::kOutOfSpace; + while (result == HashTablePutResult::kOutOfSpace) { + { + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + result = putInternal(key, variable_size, value, nullptr); + } + if (result == HashTablePutResult::kOutOfSpace) { + resize(0, variable_size); + } + } + return result; + } else { + return putInternal(key, variable_size, value, nullptr); + } +} + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +HashTablePutResult FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::putCompositeKey(const std::vector<TypedValue> &key, + const uint8_t& value) { + const std::size_t variable_size = calculateVariableLengthCompositeKeyCopySize(key); + if (resizable) { + HashTablePutResult result = HashTablePutResult::kOutOfSpace; + while (result == HashTablePutResult::kOutOfSpace) { + { + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + result = putCompositeKeyInternal(key, variable_size, value, nullptr); + } + if (result == HashTablePutResult::kOutOfSpace) { + resize(0, variable_size); + } + } + return result; + } else { + return putCompositeKeyInternal(key, variable_size, value, nullptr); + } +} + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +HashTablePutResult FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::putCompositeKeyFast(const std::vector<TypedValue> &key, + const std::uint8_t* init_value_ptr) { + const std::size_t variable_size = calculateVariableLengthCompositeKeyCopySize(key); + if (resizable) { + HashTablePutResult result = HashTablePutResult::kOutOfSpace; + while (result == HashTablePutResult::kOutOfSpace) { + { + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + result = putCompositeKeyInternalFast(key, variable_size, init_value_ptr, nullptr); + } + if (result == HashTablePutResult::kOutOfSpace) { + resize(0, variable_size); + } + } + return result; + } else { + return putCompositeKeyInternalFast(key, variable_size, init_value_ptr, nullptr); + } +} + + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +template <typename FunctorT> +HashTablePutResult FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::putValueAccessor(ValueAccessor *accessor, + const attribute_id key_attr_id, + const bool check_for_null_keys, + FunctorT *functor) { + HashTablePutResult result = HashTablePutResult::kOutOfSpace; + std::size_t variable_size; + HashTablePreallocationState prealloc_state; + bool using_prealloc = allow_duplicate_keys && preallocate_supported_; + return InvokeOnAnyValueAccessor( + accessor, + [&](auto *accessor) -> HashTablePutResult { // NOLINT(build/c++11) + if (using_prealloc) { + std::size_t total_entries = 0; + std::size_t total_variable_key_size = 0; + if (check_for_null_keys || (force_key_copy && !scalar_key_inline_)) { + // If we need to filter out nulls OR make variable copies, make a + // prepass over the ValueAccessor. + while (accessor->next()) { + TypedValue key = accessor->getTypedValue(key_attr_id); + if (check_for_null_keys && key.isNull()) { + continue; + } + ++total_entries; + total_variable_key_size += (force_key_copy && !scalar_key_inline_) ? key.getDataSize() : 0; + } + accessor->beginIteration(); + } else { + total_entries = accessor->getNumTuples(); + } + if (resizable) { + bool prealloc_succeeded = false; + while (!prealloc_succeeded) { + { + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + prealloc_succeeded = this->preallocateForBulkInsert(total_entries, + total_variable_key_size, + &prealloc_state); + } + if (!prealloc_succeeded) { + this->resize(total_entries, total_variable_key_size); + } + } + } else { + using_prealloc = this->preallocateForBulkInsert(total_entries, + total_variable_key_size, + &prealloc_state); + } + } + std::unique_ptr<BloomFilter> thread_local_bloom_filter; + if (has_build_side_bloom_filter_) { + thread_local_bloom_filter.reset(new BloomFilter(build_bloom_filter_->getRandomSeed(), + build_bloom_filter_->getNumberOfHashes(), + build_bloom_filter_->getBitArraySize())); + } + if (resizable) { + while (result == HashTablePutResult::kOutOfSpace) { + { + result = HashTablePutResult::kOK; + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + while (accessor->next()) { + TypedValue key = accessor->getTypedValue(key_attr_id); + if (check_for_null_keys && key.isNull()) { + continue; + } + variable_size = (force_key_copy && !scalar_key_inline_) ? key.getDataSize() : 0; + result = this->putInternal(key, + variable_size, + (*functor)(*accessor), + using_prealloc ? &prealloc_state : nullptr); + // Insert into bloom filter, if enabled. + if (has_build_side_bloom_filter_) { + thread_local_bloom_filter->insertUnSafe(static_cast<const std::uint8_t *>(key.getDataPtr()), + key.getDataSize()); + } + if (result == HashTablePutResult::kDuplicateKey) { + DEBUG_ASSERT(!using_prealloc); + return result; + } else if (result == HashTablePutResult::kOutOfSpace) { + DEBUG_ASSERT(!using_prealloc); + break; + } + } + } + if (result == HashTablePutResult::kOutOfSpace) { + this->resize(0, variable_size); + accessor->previous(); + } + } + } else { + while (accessor->next()) { + TypedValue key = accessor->getTypedValue(key_attr_id); + if (check_for_null_keys && key.isNull()) { + continue; + } + variable_size = (force_key_copy && !scalar_key_inline_) ? key.getDataSize() : 0; + result = this->putInternal(key, + variable_size, + (*functor)(*accessor), + using_prealloc ? &prealloc_state : nullptr); + // Insert into bloom filter, if enabled. + if (has_build_side_bloom_filter_) { + thread_local_bloom_filter->insertUnSafe(static_cast<const std::uint8_t *>(key.getDataPtr()), + key.getDataSize()); + } + if (result != HashTablePutResult::kOK) { + return result; + } + } + } + // Update the build side bloom filter with thread local copy, if available. + if (has_build_side_bloom_filter_) { + build_bloom_filter_->bitwiseOr(thread_local_bloom_filter.get()); + } + + return HashTablePutResult::kOK; + }); +} + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +template <typename FunctorT> +HashTablePutResult FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::putValueAccessorCompositeKey(ValueAccessor *accessor, + const std::vector<attribute_id> &key_attr_ids, + const bool check_for_null_keys, + FunctorT *functor) { + DEBUG_ASSERT(key_types_.size() == key_attr_ids.size()); + HashTablePutResult result = HashTablePutResult::kOutOfSpace; + std::size_t variable_size; + HashTablePreallocationState prealloc_state; + bool using_prealloc = allow_duplicate_keys && preallocate_supported_; + std::vector<TypedValue> key_vector; + key_vector.resize(key_attr_ids.size()); + return InvokeOnAnyValueAccessor( + accessor, + [&](auto *accessor) -> HashTablePutResult { // NOLINT(build/c++11) + if (using_prealloc) { + std::size_t total_entries = 0; + std::size_t total_variable_key_size = 0; + if (check_for_null_keys || force_key_copy) { + // If we need to filter out nulls OR make variable copies, make a + // prepass over the ValueAccessor. + while (accessor->next()) { + if (this->GetCompositeKeyFromValueAccessor(*accessor, + key_attr_ids, + check_for_null_keys, + &key_vector)) { + continue; + } + ++total_entries; + total_variable_key_size += this->calculateVariableLengthCompositeKeyCopySize(key_vector); + } + accessor->beginIteration(); + } else { + total_entries = accessor->getNumTuples(); + } + if (resizable) { + bool prealloc_succeeded = false; + while (!prealloc_succeeded) { + { + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + prealloc_succeeded = this->preallocateForBulkInsert(total_entries, + total_variable_key_size, + &prealloc_state); + } + if (!prealloc_succeeded) { + this->resize(total_entries, total_variable_key_size); + } + } + } else { + using_prealloc = this->preallocateForBulkInsert(total_entries, + total_variable_key_size, + &prealloc_state); + } + } + if (resizable) { + while (result == HashTablePutResult::kOutOfSpace) { + { + result = HashTablePutResult::kOK; + SpinSharedMutexSharedLock<true> lock(resize_shared_mutex_); + while (accessor->next()) { + if (this->GetCompositeKeyFromValueAccessor(*accessor, + key_attr_ids, + check_for_null_keys, + &key_vector)) { + continue; + } + variable_size = this->calculateVariableLengthCompositeKeyCopySize(key_vector); + result = this->putCompositeKeyInternal(key_vector, + variable_size, + (*functor)(*accessor), + using_prealloc ? &prealloc_state : nullptr); + if (result == HashTablePutResult::kDuplicateKey) { + DEBUG_ASSERT(!using_prealloc); + return result; + } else if (result == HashTablePutResult::kOutOfSpace) { + DEBUG_ASSERT(!using_prealloc); + break; + } + } + } + if (result == HashTablePutResult::kOutOfSpace) { + this->resize(0, variable_size); + accessor->previous(); + } + } + } else { + while (accessor->next()) { + if (this->GetCompositeKeyFromValueAccessor(*accessor, + key_attr_ids, + check_for_null_keys, + &key_vector)) { + continue; + } + variable_size = this->calculateVariableLengthCompositeKeyCopySize(key_vector); + result = this->putCompositeKeyInternal(key_vector, + variable_size, + (*functor)(*accessor), + using_prealloc ? &prealloc_state : nullptr); + if (result != HashTablePutResult::kOK) { + return result; + } + } + } + + return HashTablePutResult::kOK; + }); +} + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +template <typename FunctorT> +bool FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::upsert(const TypedValue &key, + const uint8_t &initial_value, + FunctorT *functor) { + DEBUG_ASSERT(!allow_duplicate_keys); + const std::size_t variable_size = (force_key_copy && !scalar_key_inline_) ? key.getDataSize() : 0; + if (resizable) { + for (;;) { + { + SpinSharedMutexSharedLock<true> resize_lock(resize_shared_mutex_); + uint8_t *value = upsertInternal(key, variable_size, initial_value); + if (value != nullptr) { + (*functor)(value); + return true; + } + } + resize(0, force_key_copy && !scalar_key_inline_ ? key.getDataSize() : 0); + } + } else { + uint8_t *value = upsertInternal(key, variable_size, initial_value); + if (value == nullptr) { + return false; + } else { + (*functor)(value); + return true; + } + } +} + +template <bool resizable, + bool serializable, + bool force_key_copy, + bool allow_duplicate_keys> +template <typename FunctorT> +bool FastHashTable<resizable, serializable, force_key_copy, allow_duplicate_keys> + ::upsertCompositeKey(const std::vector<TypedValue> &key, + const uint8_t &initial_value, + FunctorT *functor) { + DEBUG_ASSERT(!allow_duplicate_keys); + const std::size_t variable_size = calculateVariableLengthCompositeKeyCopySize(key); + if (resizable) { + for (;;) { + { + SpinSharedMutexSharedLock<true> resize_lock(resize_shared_mutex_); + uint8_t *value = upsertCompositeKeyInternal(key, variable_size, initial_value); + if (value
<TRUNCATED>