Initial commit for partitioned hash tables.
Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/2eb8c9d3 Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/2eb8c9d3 Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/2eb8c9d3 Branch: refs/heads/partitioned-aggregation Commit: 2eb8c9d37bfd35871ed670d4f9dd7bc0930ac0d5 Parents: 06f3990 Author: Harshad Deshmukh <hbdeshm...@apache.org> Authored: Mon Aug 15 12:21:19 2016 -0500 Committer: Harshad Deshmukh <hbdeshm...@apache.org> Committed: Fri Sep 16 13:17:23 2016 -0500 ---------------------------------------------------------------------- storage/AggregationOperationState.hpp | 1 + storage/CMakeLists.txt | 11 ++ storage/PartitionedHashTablePool.hpp | 201 +++++++++++++++++++++++++++++ 3 files changed, 213 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2eb8c9d3/storage/AggregationOperationState.hpp ---------------------------------------------------------------------- diff --git a/storage/AggregationOperationState.hpp b/storage/AggregationOperationState.hpp index 7956bc6..66af517 100644 --- a/storage/AggregationOperationState.hpp +++ b/storage/AggregationOperationState.hpp @@ -32,6 +32,7 @@ #include "storage/AggregationOperationState.pb.h" #include "storage/HashTableBase.hpp" #include "storage/HashTablePool.hpp" +#include "storage/PartitionedHashTablePool.hpp" #include "storage/StorageBlockInfo.hpp" #include "utility/Macros.hpp" http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2eb8c9d3/storage/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/storage/CMakeLists.txt b/storage/CMakeLists.txt index f05cc46..3d6e8d4 100644 --- a/storage/CMakeLists.txt +++ b/storage/CMakeLists.txt @@ -235,6 +235,7 @@ add_library(quickstep_storage_PackedRowStoreTupleStorageSubBlock add_library(quickstep_storage_PackedRowStoreValueAccessor ../empty_src.cpp PackedRowStoreValueAccessor.hpp) +add_library(quickstep_storage_PartitionedHashTablePool ../empty_src.cpp PartitionedHashTablePool.hpp) add_library(quickstep_storage_PreloaderThread PreloaderThread.cpp PreloaderThread.hpp) add_library(quickstep_storage_SMAIndexSubBlock SMAIndexSubBlock.cpp SMAIndexSubBlock.hpp) add_library(quickstep_storage_SeparateChainingHashTable ../empty_src.cpp SeparateChainingHashTable.hpp) @@ -289,6 +290,7 @@ target_link_libraries(quickstep_storage_AggregationOperationState quickstep_storage_HashTableFactory quickstep_storage_HashTablePool quickstep_storage_InsertDestination + quickstep_storage_PartitionedHashTablePool quickstep_storage_StorageBlock quickstep_storage_StorageBlockInfo quickstep_storage_StorageManager @@ -850,6 +852,14 @@ target_link_libraries(quickstep_storage_PackedRowStoreValueAccessor quickstep_types_TypedValue quickstep_utility_BitVector quickstep_utility_Macros) +target_link_libraries(quickstep_storage_PartitionedHashTablePool + glog + quickstep_expressions_aggregation_AggregationHandle + quickstep_storage_FastHashTable + quickstep_storage_FastHashTableFactory + quickstep_storage_HashTableBase + quickstep_utility_Macros + quickstep_utility_StringUtil) target_link_libraries(quickstep_storage_PreloaderThread glog quickstep_catalog_CatalogDatabase @@ -1167,6 +1177,7 @@ target_link_libraries(quickstep_storage quickstep_storage_LinearOpenAddressingHashTable quickstep_storage_PackedRowStoreTupleStorageSubBlock quickstep_storage_PackedRowStoreValueAccessor + quickstep_storage_PartitionedHashTablePool quickstep_storage_PreloaderThread quickstep_storage_SMAIndexSubBlock quickstep_storage_SeparateChainingHashTable http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2eb8c9d3/storage/PartitionedHashTablePool.hpp ---------------------------------------------------------------------- diff --git a/storage/PartitionedHashTablePool.hpp b/storage/PartitionedHashTablePool.hpp new file mode 100644 index 0000000..a71af44 --- /dev/null +++ b/storage/PartitionedHashTablePool.hpp @@ -0,0 +1,201 @@ +/** + * Copyright 2016, Quickstep Research Group, Computer Sciences Department, + * University of WisconsinâMadison. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +#ifndef QUICKSTEP_STORAGE_PARTITIONED_HASH_TABLE_POOL_HPP_ +#define QUICKSTEP_STORAGE_PARTITIONED_HASH_TABLE_POOL_HPP_ + +#include <chrono> +#include <memory> +#include <utility> +#include <vector> + +#include "expressions/aggregation/AggregationHandle.hpp" +#include "storage/HashTableBase.hpp" +#include "storage/FastHashTable.hpp" +#include "storage/FastHashTableFactory.hpp" +#include "utility/Macros.hpp" +#include "utility/StringUtil.hpp" + +#include "glog/logging.h" + +namespace quickstep { + +class StorageManager; +class Type; + +/** \addtogroup Storage + * @{ + */ + +/** + * @brief A pool of HashTables used for a single aggregation handle. Each + * HashTable represents values from a given partition, which is + * determined by the keys in the group by clause. + **/ +class PartitionedHashTablePool { + public: + /** + * @brief Constructor. + * + * @param estimated_num_entries The maximum number of entries in a hash table. + * @param num_partitions The number of partitions (i.e. number of HashTables) + * @param hash_table_impl_type The type of hash table implementation. + * @param group_by_types A vector of pointer of types which form the group by + * key. + * @param agg_handle The aggregation handle. + * @param storage_manager A pointer to the storage manager. + * + * @note The estimate of number of entries is quite inaccurate at this time. + * If we go by the current estimate, each hash table demands much + * larger space than it actually needs, which causes the system to + * either trigger evictions or worse - run out of memory. To fix this + * issue, we divide the estimate by 100. The division will not affect + * correctness, however it may allocate some hash tables smaller space + * than their requirement, causing them to be resized during build + * phase, which has a performance penalty. + **/ + PartitionedHashTablePool(const std::size_t estimated_num_entries, + const std::size_t num_partitions, + const HashTableImplType hash_table_impl_type, + const std::vector<const Type *> &group_by_types, + AggregationHandle *agg_handle, + StorageManager *storage_manager) + : estimated_num_entries_( + reduceEstimatedCardinality(estimated_num_entries, num_partitions)), + num_partitions_(num_partitions), + hash_table_impl_type_(hash_table_impl_type), + group_by_types_(group_by_types), + agg_handle_(DCHECK_NOTNULL(agg_handle)), + storage_manager_(DCHECK_NOTNULL(storage_manager)) { + initializeAllHashTables(); + } + + PartitionedHashTablePool(const std::size_t estimated_num_entries, + const std::size_t num_partitions, + const HashTableImplType hash_table_impl_type, + const std::vector<const Type *> &group_by_types, + const std::vector<std::size_t> &payload_sizes, + const std::vector<AggregationHandle *> &handles, + StorageManager *storage_manager) + : estimated_num_entries_( + reduceEstimatedCardinality(estimated_num_entries, num_partitions)), + num_partitions_(num_partitions), + hash_table_impl_type_(hash_table_impl_type), + group_by_types_(group_by_types), + payload_sizes_(payload_sizes), + handles_(handles), + storage_manager_(DCHECK_NOTNULL(storage_manager)) { + initializeAllHashTables(); + } + + /** + * @brief Check out a hash table for insertion. + * + * @param partition_id The ID of the partitioned HashTable. + * + * @return A hash table pointer for the given HashTable. + **/ + AggregationStateHashTableBase* getHashTable(const std::size_t partition_id) { + DCHECK_LT(partition_id, num_partitions_); + DCHECK_LT(partition_id, hash_tables_.size()); + return hash_tables_[partition_id].get(); + } + + /** + * @brief Check out a hash table for insertion. + * + * @param partition_id The ID of the partitioned HashTable. + * + * @return A hash table pointer for the given HashTable. + **/ + AggregationStateHashTableBase* getHashTableFast(const std::size_t partition_id) { + DCHECK_LT(partition_id, num_partitions_); + DCHECK_LT(partition_id, hash_tables_.size()); + return hash_tables_[partition_id].get(); + } + + /** + * @brief Get all the hash tables from the pool. + * + * @warning The caller should ensure that this call is made when no hash table + * is being checked in or checked out from the pool. In other words + * the hash table pool is in read-only state. + * + * @param All the hash tables in the pool. + * + **/ + std::vector<std::unique_ptr<AggregationStateHashTableBase>>* + getAllHashTables() { + return &hash_tables_; + } + + private: + void initializeAllHashTables() { + for (std::size_t part_num = 0; part_num < num_partitions_; ++part_num) { + AggregationStateHashTableBase *part_hash_table = createNewHashTableFast(); + hash_tables_.push_back( + std::unique_ptr<AggregationStateHashTableBase>(part_hash_table)); + } + } + + AggregationStateHashTableBase* createNewHashTable() { + return agg_handle_->createGroupByHashTable(hash_table_impl_type_, + group_by_types_, + estimated_num_entries_, + storage_manager_); + } + + AggregationStateHashTableBase* createNewHashTableFast() { + return AggregationStateFastHashTableFactory::CreateResizable( + hash_table_impl_type_, + group_by_types_, + estimated_num_entries_, + payload_sizes_, + handles_, + storage_manager_); + } + + inline std::size_t reduceEstimatedCardinality( + const std::size_t original_estimate, + const std::size_t num_partitions) const { + CHECK_NE(num_partitions, 0u); + return original_estimate / num_partitions; + } + + std::vector<std::unique_ptr<AggregationStateHashTableBase>> hash_tables_; + + const std::size_t estimated_num_entries_; + const std::size_t num_partitions_; + + const HashTableImplType hash_table_impl_type_; + + const std::vector<const Type *> group_by_types_; + + std::vector<std::size_t> payload_sizes_; + + AggregationHandle *agg_handle_; + const std::vector<AggregationHandle *> handles_; + StorageManager *storage_manager_; + + DISALLOW_COPY_AND_ASSIGN(PartitionedHashTablePool); +}; + +/** @} */ + +} // namespace quickstep + +#endif // QUICKSTEP_STORAGE_HASH_TABLE_POOL_HPP_