Using perf, I see that the sequence lock is a bottleneck, since it is read by every reader. Giving it its own cache line seems to help things quite a bit.
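For context, a minimal standalone sketch of the idea (illustrative only, not QEMU code; the bucket layout, entry count and 64-byte cache line are assumptions). Aligning the type that holds the read-mostly sequence counter pads it to a full cache line, so the spinlock and payload that updaters write land on the next line and no longer invalidate the line readers poll on:

/* false_sharing_sketch.c -- illustrative only, not QEMU code.
 * Assumes a 64-byte cache line. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64

/* Read-mostly sequence counter.  Aligning the *type* pads it out to a
 * full cache line, so whatever follows it in a struct starts on the
 * next line. */
struct seq {
    unsigned sequence;
} __attribute__((aligned(CACHE_LINE)));

/* Toy bucket: readers poll 'sequence' while updaters write 'lock' and
 * the payload.  With the alignment, those writes no longer dirty the
 * line the readers are spinning on. */
struct bucket {
    struct seq sequence;       /* cache line 0, read-mostly            */
    int lock;                  /* stand-in spinlock, starts on line 1  */
    uint32_t hashes[4];
    void *pointers[4];
    struct bucket *next;
} __attribute__((aligned(CACHE_LINE)));

/* The price: the bucket now spans two cache lines instead of one,
 * mirroring the relaxed QEMU_BUILD_BUG_ON() in the patch below. */
_Static_assert(sizeof(struct bucket) <= 2 * CACHE_LINE,
               "bucket must fit in two cache lines");

int main(void)
{
    printf("sizeof(struct bucket) = %zu\n", sizeof(struct bucket));
    printf("offsetof(lock)        = %zu\n", offsetof(struct bucket, lock));
    return 0;
}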
Using qht-bench, I measured the following for:

$ ./tests/qht-bench -d 10 -n 24 -u <x>

              throughput
           base     patch    %change
 update
      0    8.07     13.33      +65%
     10    7.10      8.90      +25%
     20    6.34      7.02      +10%
     30    5.48      6.11     +9.6%
     40    4.90      5.46   +11.42%

I am not able to see any significant increases for lower thread counts, though.

Signed-off-by: Pranith Kumar <bobby.pr...@gmail.com>
---
 include/qemu/seqlock.h | 2 +-
 util/qht.c             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/qemu/seqlock.h b/include/qemu/seqlock.h
index 8dee11d..954abe8 100644
--- a/include/qemu/seqlock.h
+++ b/include/qemu/seqlock.h
@@ -21,7 +21,7 @@ typedef struct QemuSeqLock QemuSeqLock;
 
 struct QemuSeqLock {
     unsigned sequence;
-};
+} QEMU_ALIGNED(64);
 
 static inline void seqlock_init(QemuSeqLock *sl)
 {
diff --git a/util/qht.c b/util/qht.c
index ff4d2e6..4d82609 100644
--- a/util/qht.c
+++ b/util/qht.c
@@ -101,14 +101,14 @@
  * be grabbed first.
  */
 struct qht_bucket {
-    QemuSpin lock;
     QemuSeqLock sequence;
+    QemuSpin lock;
     uint32_t hashes[QHT_BUCKET_ENTRIES];
     void *pointers[QHT_BUCKET_ENTRIES];
     struct qht_bucket *next;
 } QEMU_ALIGNED(QHT_BUCKET_ALIGN);
 
-QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > QHT_BUCKET_ALIGN);
+QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > 2 * QHT_BUCKET_ALIGN);
 
 /**
  * struct qht_map - structure to track an array of buckets
-- 
2.10.1