virtlockd in libvirt locks the first byte, so we start looking at the
file bytes from 0x10.
The complication is in the transactional interface. To make the reopen
logic managable, and allow better reuse, the code is internally
organized with a table from old mode to the new one.
Signed-off-by: Fam Zheng
---
block/file-posix.c | 744 -
1 file changed, 741 insertions(+), 3 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 24ea3ff..b85ac9c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -131,8 +131,54 @@ do { \
#define MAX_BLOCKSIZE 4096
+/* Posix file locking bytes. Libvirt takes byte 0, we start from byte 0x10,
+ * leaving a few more bytes for its future use. */
+#define RAW_LOCK_BYTE_MIN 0x10
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 0x10
+#define RAW_LOCK_BYTE_WRITE 0x11
+#ifdef F_OFD_SETLK
+#define RAW_LOCK_SUPPORTED 1
+#else
+#define RAW_LOCK_SUPPORTED 0
+#endif
+
+/*
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take shared lock on byte 0x10. Test
+ * byte 0x11 is unlocked.
+ *
+ ** shared writer: Take shared lock on byte 0x11. Test byte 0x10 is unlocked.
+ *
+ ** exclusive writer: Take exclusive locks on both bytes.
+ */
+
+typedef enum {
+/* Read only and accept other writers. */
+RAW_L_READ_SHARE_RW,
+/* Read only and try to forbid other writers. */
+RAW_L_READ,
+/* Read/write and accept other writers. */
+RAW_L_WRITE_SHARE_RW,
+/* Read/write and try to forbid other writers. */
+RAW_L_WRITE,
+} BDRVRawLockMode;
+
+typedef struct BDRVRawLockUpdateState {
+/* A dup of @fd used for acquiring lock. */
+int image_fd;
+int lock_fd;
+int open_flags;
+BDRVRawLockMode new_lock;
+bool use_lock;
+} BDRVRawLockUpdateState;
+
typedef struct BDRVRawState {
int fd;
+/* A dup of @fd to make manipulating lock easier, especially during reopen,
+ * where this will accept BDRVRawReopenState.lock_fd. */
+int lock_fd;
+bool use_lock;
int type;
int open_flags;
size_t buf_align;
@@ -147,6 +193,11 @@ typedef struct BDRVRawState {
bool page_cache_inconsistent:1;
bool has_fallocate;
bool needs_alignment;
+/* The current lock mode we are in. Note that in incoming migration this is
+ * the "desired" mode to be applied at bdrv_invalidate_cache. */
+BDRVRawLockMode cur_lock_mode;
+/* Used by raw_check_perm/raw_set_perm. */
+BDRVRawLockUpdateState *lock_update;
} BDRVRawState;
typedef struct BDRVRawReopenState {
@@ -369,6 +420,64 @@ static void raw_parse_flags(int bdrv_flags, int
*open_flags)
}
}
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+int ret;
+assert(fd >= 0);
+assert(RAW_LOCK_SUPPORTED);
+switch (mode) {
+case RAW_L_READ_SHARE_RW:
+ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to unlock fd");
+goto fail;
+}
+break;
+case RAW_L_READ:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock share byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Write byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE_SHARE_RW:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock write byte
exclusively");
+goto fail;
+}
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to downgrade lock write
byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Share byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_MIN, 2, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock image");
+goto fail;
+}
+break;
+default:
+abort();
+}
+return 0;
+fail:
+qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+return ret;
+}
+
static void raw_parse_filename(const char *filename, QDict *options,
Error **errp)
{
@@ -403,6 +512,23 @@ static QemuOptsList raw_runtime_opts = {
},
};
+static BDRVRawLockMode raw_get_lock_mode(bool write, bool shared)
+{
+if (write) {
+if (shared) {
+return RAW_L_WRITE_SHARE_RW;
+} else {
+return RAW_L_WRITE;
+}
+} else {