Re: [Qemu-devel] [PATCH v13 19/20] file-posix: Add image locking in perm operations

2017-04-20 Thread Kevin Wolf
Am 20.04.2017 um 09:52 hat Fam Zheng geschrieben:
> virtlockd in libvirt locks the first byte, so we start looking at the
> file bytes from 0x10.
> 
> The complication is in the transactional interface.  To make the reopen
> logic managable, and allow better reuse, the code is internally
> organized with a table from old mode to the new one.
> 
> Signed-off-by: Fam Zheng 

Looking at the very early patches in this series, I think it quickly
becomes obvious that we need to discuss one thing first:

> +static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t 
> shared,
> +  Error **errp)
> +{
> +bool is_shared;
> +BDRVRawState *s = bs->opaque;
> +
> +if (!RAW_LOCK_SUPPORTED) {
> +return 0;
> +}
> +if (s->lock_update) {
> +/* Override the previously stashed update. */
> +g_free(s->lock_update);
> +s->lock_update = NULL;
> +}
> +is_shared = !(perm & BLK_PERM_CONSISTENT_READ) && (shared & 
> BLK_PERM_WRITE);

Why do you check BLK_PERM_CONSISTENT_READ? The locks that we said we
would take on the image file represent BLK_PERM_WRITE, both in perm and
in shared, so this is what they should be checked against. Opening the
image in another process is fine if BLK_PERM_WRITE is set in shared,
even if BLK_PERM_CONSISTENT_READ is also set in perm.

BLK_PERM_CONSISTENT_READ is for cases where the contents of an image
is inherently invalid, not just because of a concurrent writer that we
might not be aware of, but because the image just doesn't makes sense on
it own. It may make sense as part of larger backing chain, though (the
only place where we clear the flag is for intermediate nodes in the
commit job). This semantics is more or less separate from what we want
to achieve here.

Of course, if we wanted, I guess we could individually map all 64
bits of each perm and shared to bytes to be locked in the file, so that
all permissions would be shared between qemu instances. That's probably
not worth the effort though.

And even if we did that, most likely you still wouldn't need any special
exceptions for BLK_PERM_CONSISTENT_READ, because it is always shared
except when one process wants to run a commit job - and for a commit
job, making sure that nobody else uses the image would probably be
right.

So if we really treat the file system level locks just as a mapping of
BLK_PERM_WRITE, things should become a bit easier in the early patches
of this series.

Kevin



Re: [Qemu-devel] [PATCH v13 19/20] file-posix: Add image locking in perm operations

2017-04-20 Thread Fam Zheng
On Thu, 04/20 15:52, Fam Zheng wrote:
> virtlockd in libvirt locks the first byte, so we start looking at the
> file bytes from 0x10.
> 
> The complication is in the transactional interface.  To make the reopen
> logic managable, and allow better reuse, the code is internally
> organized with a table from old mode to the new one.
> 
> Signed-off-by: Fam Zheng 
> ---
>  block/file-posix.c | 744 
> -
>  1 file changed, 741 insertions(+), 3 deletions(-)

Need to squash this in to fix the patchew make check error on centos 6:

diff --git a/block/file-posix.c b/block/file-posix.c
index b85ac9c..f1563ae 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1069,7 +1069,7 @@ static int raw_handle_lock_update(BlockDriverState *bs,
 int lock_fd;
 
 if (!RAW_LOCK_SUPPORTED) {
-return 0;
+goto cleanup;
 }
 
 if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {



[Qemu-devel] [PATCH v13 19/20] file-posix: Add image locking in perm operations

2017-04-20 Thread Fam Zheng
virtlockd in libvirt locks the first byte, so we start looking at the
file bytes from 0x10.

The complication is in the transactional interface.  To make the reopen
logic managable, and allow better reuse, the code is internally
organized with a table from old mode to the new one.

Signed-off-by: Fam Zheng 
---
 block/file-posix.c | 744 -
 1 file changed, 741 insertions(+), 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 24ea3ff..b85ac9c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -131,8 +131,54 @@ do { \
 
 #define MAX_BLOCKSIZE  4096
 
+/* Posix file locking bytes. Libvirt takes byte 0, we start from byte 0x10,
+ * leaving a few more bytes for its future use. */
+#define RAW_LOCK_BYTE_MIN 0x10
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 0x10
+#define RAW_LOCK_BYTE_WRITE   0x11
+#ifdef F_OFD_SETLK
+#define RAW_LOCK_SUPPORTED 1
+#else
+#define RAW_LOCK_SUPPORTED 0
+#endif
+
+/*
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take shared lock on byte 0x10. Test
+ *  byte 0x11 is unlocked.
+ *
+ ** shared writer: Take shared lock on byte 0x11. Test byte 0x10 is unlocked.
+ *
+ ** exclusive writer: Take exclusive locks on both bytes.
+ */
+
+typedef enum {
+/* Read only and accept other writers. */
+RAW_L_READ_SHARE_RW,
+/* Read only and try to forbid other writers. */
+RAW_L_READ,
+/* Read/write and accept other writers. */
+RAW_L_WRITE_SHARE_RW,
+/* Read/write and try to forbid other writers. */
+RAW_L_WRITE,
+} BDRVRawLockMode;
+
+typedef struct BDRVRawLockUpdateState {
+/* A dup of @fd used for acquiring lock. */
+int image_fd;
+int lock_fd;
+int open_flags;
+BDRVRawLockMode new_lock;
+bool use_lock;
+} BDRVRawLockUpdateState;
+
 typedef struct BDRVRawState {
 int fd;
+/* A dup of @fd to make manipulating lock easier, especially during reopen,
+ * where this will accept BDRVRawReopenState.lock_fd. */
+int lock_fd;
+bool use_lock;
 int type;
 int open_flags;
 size_t buf_align;
@@ -147,6 +193,11 @@ typedef struct BDRVRawState {
 bool page_cache_inconsistent:1;
 bool has_fallocate;
 bool needs_alignment;
+/* The current lock mode we are in. Note that in incoming migration this is
+ * the "desired" mode to be applied at bdrv_invalidate_cache. */
+BDRVRawLockMode cur_lock_mode;
+/* Used by raw_check_perm/raw_set_perm. */
+BDRVRawLockUpdateState *lock_update;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
@@ -369,6 +420,64 @@ static void raw_parse_flags(int bdrv_flags, int 
*open_flags)
 }
 }
 
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+int ret;
+assert(fd >= 0);
+assert(RAW_LOCK_SUPPORTED);
+switch (mode) {
+case RAW_L_READ_SHARE_RW:
+ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to unlock fd");
+goto fail;
+}
+break;
+case RAW_L_READ:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock share byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Write byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE_SHARE_RW:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock write byte 
exclusively");
+goto fail;
+}
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to downgrade lock write 
byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Share byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_MIN, 2, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock image");
+goto fail;
+}
+break;
+default:
+abort();
+}
+return 0;
+fail:
+qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+return ret;
+}
+
 static void raw_parse_filename(const char *filename, QDict *options,
Error **errp)
 {
@@ -403,6 +512,23 @@ static QemuOptsList raw_runtime_opts = {
 },
 };
 
+static BDRVRawLockMode raw_get_lock_mode(bool write, bool shared)
+{
+if (write) {
+if (shared) {
+return RAW_L_WRITE_SHARE_RW;
+} else {
+return RAW_L_WRITE;
+}
+} else {