cache: dynamically determine the number of cache shards I noticed in a heavily cache-bound workload on a 48-core machine that there was a lot of contention on the cache shard spin locks. Manually bumping the number of cache shards to 64 instead of the default 16 seemed to reduce the amount of contention significantly.
Change-Id: I2181ba4863065a554068a9c6d390b7f49e094abc Reviewed-on: http://gerrit.cloudera.org:8080/4292 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/bfb6f233 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/bfb6f233 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/bfb6f233 Branch: refs/heads/master Commit: bfb6f2338cd57a1c7f8405c97b66c3050c8920ff Parents: 22d68a0 Author: Todd Lipcon <[email protected]> Authored: Thu Sep 1 16:18:20 2016 -0700 Committer: Todd Lipcon <[email protected]> Committed: Fri Sep 2 01:57:37 2016 +0000 ---------------------------------------------------------------------- src/kudu/util/cache.cc | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/bfb6f233/src/kudu/util/cache.cc ---------------------------------------------------------------------- diff --git a/src/kudu/util/cache.cc b/src/kudu/util/cache.cc index 7dd8e8e..964d93c 100644 --- a/src/kudu/util/cache.cc +++ b/src/kudu/util/cache.cc @@ -10,9 +10,11 @@ #include <vector> #include "kudu/gutil/atomic_refcount.h" +#include "kudu/gutil/bits.h" #include "kudu/gutil/hash/city.h" #include "kudu/gutil/stl_util.h" #include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" #include "kudu/util/alignment.h" #include "kudu/util/atomic.h" #include "kudu/util/cache.h" @@ -368,8 +370,13 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) { } } -static const int kNumShardBits = 4; -static const int kNumShards = 1 << kNumShardBits; +// Determine the number of bits of the hash that should be used to determine +// the cache shard. This, in turn, determines the number of shards. +int DetermineShardBits() { + int bits = Bits::Log2Ceiling(base::NumCPUs()); + VLOG(1) << "Will use " << (1 << bits) << " shards for LRU cache."; + return bits; +} class ShardedLRUCache : public Cache { private: @@ -379,26 +386,31 @@ class ShardedLRUCache : public Cache { MutexType id_mutex_; uint64_t last_id_; + // Number of bits of hash used to determine the shard. + const int shard_bits_; + static inline uint32_t HashSlice(const Slice& s) { return util_hash::CityHash64( reinterpret_cast<const char *>(s.data()), s.size()); } - static uint32_t Shard(uint32_t hash) { - return hash >> (32 - kNumShardBits); + uint32_t Shard(uint32_t hash) { + return hash >> (32 - shard_bits_); } public: explicit ShardedLRUCache(size_t capacity, const string& id) - : last_id_(0) { + : last_id_(0), + shard_bits_(DetermineShardBits()) { // A cache is often a singleton, so: // 1. We reuse its MemTracker if one already exists, and // 2. It is directly parented to the root MemTracker. mem_tracker_ = MemTracker::FindOrCreateTracker( -1, strings::Substitute("$0-sharded_lru_cache", id)); - const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards; - for (int s = 0; s < kNumShards; s++) { + int num_shards = 1 << shard_bits_; + const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; + for (int s = 0; s < num_shards; s++) { gscoped_ptr<LRUCache> shard(new LRUCache(mem_tracker_.get())); shard->SetCapacity(per_shard); shards_.push_back(shard.release());
