perf analysis of the cow filter showed that a great deal of time was spent holding the global lock. This change makes the locking finer-grained, so the global lock is now used only to protect the bitmap from concurrent access and nothing else.
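To make the new scheme concrete, here is a minimal, self-contained sketch in plain C of what "only the bitmap is protected" means. It is not the filter's actual code: the bool array and the helper names (block_is_allocated, block_write) are simplified stand-ins for the real bitmap and blk_* functions. The mutex is held only around the map lookup or update; the expensive overlay and plugin I/O runs outside it:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <pthread.h>

  #define NR_BLOCKS 1024

  /* The mutex protects only this in-memory map, nothing else. */
  static pthread_mutex_t bm_lock = PTHREAD_MUTEX_INITIALIZER;
  static bool allocated[NR_BLOCKS];

  static bool
  block_is_allocated (uint64_t blknum)
  {
    bool r;

    pthread_mutex_lock (&bm_lock);   /* short critical section */
    r = allocated[blknum];
    pthread_mutex_unlock (&bm_lock);
    return r;
  }

  static void
  block_write (uint64_t blknum, const char *data)
  {
    /* The expensive part (writing the overlay, calling the plugin)
     * happens outside the lock, so parallel requests no longer
     * serialize here.  The printf is a stand-in for that work.
     */
    printf ("writing block %llu: %s\n", (unsigned long long) blknum, data);

    pthread_mutex_lock (&bm_lock);   /* lock only to update the map */
    allocated[blknum] = true;
    pthread_mutex_unlock (&bm_lock);
  }

  int
  main (void)
  {
    block_write (2, "hello");
    printf ("block 2 allocated: %d\n", block_is_allocated (2));
    return 0;
  }

Under the old scheme the lock was held across the whole request, so parallel requests serialized on the I/O as well as on the bitmap.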
I added a new read-modify-write lock to protect a few places where we
use an RMW cycle to modify an unaligned block, which should be a
fairly rare occurrence. Previously the global lock protected these.

A benchmark shows this is a considerable improvement. Running the
test from here:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA6.sh;h=e19dca2504a9b3d76b8f472dc7caefd85113a54b;hb=HEAD

Before this change:

  real    3m51.086s
  user    0m0.994s
  sys     0m3.568s

After this change:

  real    2m1.091s
  user    0m1.038s
  sys     0m3.576s

Compared to a similar pipeline using qemu-img convert and qcow2:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA4.sh;h=88085d0aa6bb479cad83346292dc72d2b4d3117f;hb=HEAD

  real    2m22.368s
  user    0m54.415s
  sys     1m27.410s
---
 filters/cow/blk.h |  7 -------
 filters/cow/blk.c | 27 ++++++++++++++++++++++++++-
 filters/cow/cow.c | 41 ++++++++++++++---------------------------
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index d96b9924..be3e764a 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -44,13 +44,6 @@ extern int blk_init (void);
 /* Close the overlay, free the bitmap. */
 extern void blk_free (void);

-/*----------------------------------------------------------------------
- * ** NOTE **
- *
- * An exclusive lock must be held when you call any function below
- * this line.
- */
-
 /* Allocate or resize the overlay and bitmap. */
 extern int blk_set_size (uint64_t new_size);

diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 51ba91c4..cbd40188 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -90,9 +90,12 @@
 #include <alloca.h>
 #endif

+#include <pthread.h>
+
 #include <nbdkit-filter.h>

 #include "bitmap.h"
+#include "cleanup.h"
 #include "fdatasync.h"
 #include "rounding.h"
 #include "pread.h"
@@ -104,6 +107,9 @@
 /* The temporary overlay. */
 static int fd = -1;

+/* This lock protects the bitmap from parallel access. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
 /* Bitmap. */
 static struct bitmap bm;

@@ -186,6 +192,8 @@ static uint64_t size = 0;
 int
 blk_set_size (uint64_t new_size)
 {
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+
   size = new_size;

   if (bitmap_resize (&bm, size) == -1)
@@ -205,6 +213,7 @@ blk_set_size (uint64_t new_size)
 void
 blk_status (uint64_t blknum, bool *present, bool *trimmed)
 {
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);

   *present = state != BLOCK_NOT_ALLOCATED;
@@ -219,7 +228,18 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
           uint64_t blknum, uint8_t *block, int *err)
 {
   off_t offset = blknum * BLKSIZE;
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+  enum bm_entry state;
+
+  /* The state might be modified from another thread - for example
+   * another thread might write (BLOCK_NOT_ALLOCATED ->
+   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
+   * the old data. However a read issued after the write returns
+   * should always return the correct data.
+   */
+  {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+  }

   nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
                 blknum, (uint64_t) offset, state_to_string (state));
@@ -260,6 +280,8 @@ int
 blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
            uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
 {
+  /* XXX Could make this lock more fine-grained with some thought. */
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
   unsigned n = BLKSIZE, tail = 0;
@@ -322,6 +344,8 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
     nbdkit_error ("pwrite: %m");
     return -1;
   }
+
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);

   return 0;
@@ -339,6 +363,7 @@ blk_trim (uint64_t blknum, int *err)
    * here. However it's not trivial since BLKSIZE is unrelated to the
    * overlay filesystem block size.
    */
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
   return 0;
 }
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 93e10f24..54291476 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -45,16 +45,17 @@
 #include <nbdkit-filter.h>

 #include "cleanup.h"
-
-#include "blk.h"
 #include "isaligned.h"
 #include "minmax.h"
 #include "rounding.h"

-/* In order to handle parallel requests safely, this lock must be held
- * when calling any blk_* functions.
+#include "blk.h"
+
+/* Read-modify-write requests are serialized through this global lock.
+ * This is only used for unaligned requests which should be
+ * infrequent.
  */
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;

 bool cow_on_cache;

@@ -117,7 +118,6 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,

   nbdkit_debug ("cow: underlying file size: %" PRIi64, size);

-  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   r = blk_set_size (size);
   if (r == -1)
     return -1;
@@ -221,7 +221,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);

     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;
@@ -242,7 +241,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
    * smarter here.
    */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, buf, err);
     if (r == -1)
       return -1;
@@ -256,7 +254,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
   /* Unaligned tail */
   if (count) {
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;
@@ -294,10 +291,10 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
-     * Hold the lock over the whole operation.
+     * Hold the rmw_lock over the whole operation.
      */
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memcpy (&block[blkoffs], buf, n);
@@ -314,7 +311,6 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,

   /* Aligned body */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_write (blknum, buf, err);
     if (r == -1)
       return -1;
@@ -328,7 +324,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
   /* Unaligned tail */
   if (count) {
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memcpy (block, buf, count);
@@ -376,9 +372,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
-     * Hold the lock over the whole operation.
+     * Hold the rmw_lock over the whole operation.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[blkoffs], 0, n);
@@ -399,7 +395,6 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     /* XXX There is the possibility of optimizing this: since this loop is
      * writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_write (blknum, block, err);
     if (r == -1)
       return -1;
@@ -411,7 +406,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,

   /* Unaligned tail */
   if (count) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[count], 0, BLKSIZE - count);
@@ -455,7 +450,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[blkoffs], 0, n);
@@ -471,7 +466,6 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,

   /* Aligned body */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_trim (blknum, err);
     if (r == -1)
       return -1;
@@ -483,7 +477,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,

   /* Unaligned tail */
   if (count) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[count], 0, BLKSIZE - count);
@@ -553,7 +547,6 @@ cow_cache (struct nbdkit_next_ops *next_ops, void *nxdata,

   /* Aligned body */
   while (remaining) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_cache (next_ops, nxdata, blknum, block, mode, err);
     if (r == -1)
       return -1;
@@ -588,12 +581,6 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
   assert (IS_ALIGNED (count, BLKSIZE));
   assert (count > 0); /* We must make forward progress. */

-  /* We hold the lock for the whole time, even when requesting extents
-   * from the plugin, because we want to present an atomic picture of
-   * the current state.
-   */
-  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-
   while (count > 0) {
     bool present, trimmed;
     struct nbdkit_extent e;
-- 
2.29.0.rc2

_______________________________________________
Libguestfs mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/libguestfs
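A note on why the unaligned paths still need a lock at all: a partial-block write must read the whole block, patch the requested byte range, and write the whole block back, and two such cycles on the same block must not interleave. The following self-contained sketch shows the pattern; BLKSIZE, the in-memory overlay array and the block_read/block_write helpers are made-up stand-ins rather than the filter's real code. In the patch itself the explicit lock/unlock pairs are handled by nbdkit's ACQUIRE_LOCK_FOR_CURRENT_SCOPE macro (from cleanup.h, included by blk.c above), which unlocks automatically when the scope is left.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <pthread.h>

  #define BLKSIZE   4096        /* arbitrary block size for the sketch */
  #define NR_BLOCKS 16

  /* Made-up in-memory "overlay" so the sketch runs stand-alone. */
  static uint8_t overlay[NR_BLOCKS][BLKSIZE];

  static void block_read (uint64_t blknum, uint8_t *block)
  { memcpy (block, overlay[blknum], BLKSIZE); }

  static void block_write (uint64_t blknum, const uint8_t *block)
  { memcpy (overlay[blknum], block, BLKSIZE); }

  /* Serializes whole read-modify-write cycles on unaligned requests. */
  static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Write n bytes at offset offs inside block blknum, where the
   * request does not cover the whole block.
   */
  static void
  unaligned_pwrite (uint64_t blknum, uint32_t offs, const void *buf, uint32_t n)
  {
    uint8_t block[BLKSIZE];

    pthread_mutex_lock (&rmw_lock);  /* hold across the whole RMW cycle */
    block_read (blknum, block);      /* read the full block */
    memcpy (&block[offs], buf, n);   /* modify only the requested slice */
    block_write (blknum, block);     /* write the full block back */
    pthread_mutex_unlock (&rmw_lock);
  }

  int
  main (void)
  {
    unaligned_pwrite (3, 100, "hello", 5);
    printf ("%.5s\n", (const char *) &overlay[3][100]);  /* prints "hello" */
    return 0;
  }

Aligned, whole-block requests never take rmw_lock: blk_write simply writes the block and flips the bitmap entry under the small bitmap lock, which is where the speedup comes from.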
