[Qemu-block] [PATCH] iotests: Drop use of bash keyword 'function'

2018-11-16 Thread Eric Blake
Bash allows functions to be declared with or without the leading
keyword 'function'; but including the keyword does not comply with
POSIX syntax, and is confusing to ksh users where the use of the
keyword changes the scoping rules for functions.  Stick to the
POSIX form through iotests.

Done mechanically with:
  sed -i 's/^function //' $(git ls-files tests/qemu-iotests)

Signed-off-by: Eric Blake 
---

Based-on: <20181116155325.22428-1-berra...@redhat.com>
[0/6 Misc fixes to NBD]

 tests/qemu-iotests/common.nbd | 12 ++--
 tests/qemu-iotests/common.pattern | 16 
 tests/qemu-iotests/common.qemu|  8 
 tests/qemu-iotests/common.tls | 10 +-
 tests/qemu-iotests/035|  2 +-
 tests/qemu-iotests/037|  2 +-
 tests/qemu-iotests/038|  6 +++---
 tests/qemu-iotests/046|  6 +++---
 tests/qemu-iotests/047|  2 +-
 tests/qemu-iotests/049|  4 ++--
 tests/qemu-iotests/051|  4 ++--
 tests/qemu-iotests/067|  4 ++--
 tests/qemu-iotests/071|  4 ++--
 tests/qemu-iotests/077|  4 ++--
 tests/qemu-iotests/081|  4 ++--
 tests/qemu-iotests/082|  2 +-
 tests/qemu-iotests/085| 10 +-
 tests/qemu-iotests/086|  2 +-
 tests/qemu-iotests/087|  6 +++---
 tests/qemu-iotests/099|  6 +++---
 tests/qemu-iotests/109|  2 +-
 tests/qemu-iotests/112|  2 +-
 tests/qemu-iotests/142|  8 
 tests/qemu-iotests/153|  4 ++--
 tests/qemu-iotests/157|  4 ++--
 tests/qemu-iotests/172|  6 +++---
 tests/qemu-iotests/176|  2 +-
 tests/qemu-iotests/177|  2 +-
 tests/qemu-iotests/184|  4 ++--
 tests/qemu-iotests/186|  4 ++--
 tests/qemu-iotests/195|  4 ++--
 tests/qemu-iotests/204|  2 +-
 tests/qemu-iotests/223|  4 ++--
 tests/qemu-iotests/227|  4 ++--
 tests/qemu-iotests/232|  6 +++---
 35 files changed, 86 insertions(+), 86 deletions(-)

diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index 0483ea7c55a..afcdb7ae7a0 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -23,7 +23,7 @@ nbd_unix_socket="${TEST_DIR}/qemu-nbd.sock"
 nbd_tcp_addr="127.0.0.1"
 nbd_pid_file="${TEST_DIR}/qemu-nbd.pid"

-function nbd_server_stop()
+nbd_server_stop()
 {
 local NBD_PID
 if [ -f "$nbd_pid_file" ]; then
@@ -36,7 +36,7 @@ function nbd_server_stop()
 rm -f "$nbd_unix_socket"
 }

-function nbd_server_wait_for_unix_socket()
+nbd_server_wait_for_unix_socket()
 {
 pid=$1

@@ -57,14 +57,14 @@ function nbd_server_wait_for_unix_socket()
 exit 1
 }

-function nbd_server_start_unix_socket()
+nbd_server_start_unix_socket()
 {
 nbd_server_stop
 $QEMU_NBD -v -t -k "$nbd_unix_socket" $@ &
 nbd_server_wait_for_unix_socket $!
 }

-function nbd_server_set_tcp_port()
+nbd_server_set_tcp_port()
 {
 for port in `seq 10809 10909`
 do
@@ -80,7 +80,7 @@ function nbd_server_set_tcp_port()
 exit 1
 }

-function nbd_server_wait_for_tcp_socket()
+nbd_server_wait_for_tcp_socket()
 {
 pid=$1

@@ -103,7 +103,7 @@ function nbd_server_wait_for_tcp_socket()
 exit 1
 }

-function nbd_server_start_tcp_socket()
+nbd_server_start_tcp_socket()
 {
 nbd_server_stop
 $QEMU_NBD -v -t -b $nbd_tcp_addr -p $nbd_tcp_port $@ &
diff --git a/tests/qemu-iotests/common.pattern 
b/tests/qemu-iotests/common.pattern
index 34f4a8dc9b4..b67bb341360 100644
--- a/tests/qemu-iotests/common.pattern
+++ b/tests/qemu-iotests/common.pattern
@@ -16,7 +16,7 @@
 # along with this program.  If not, see .
 #

-function do_is_allocated() {
+do_is_allocated() {
 local start=$1
 local size=$2
 local step=$3
@@ -27,11 +27,11 @@ function do_is_allocated() {
 done
 }

-function is_allocated() {
+is_allocated() {
 do_is_allocated "$@" | $QEMU_IO "$TEST_IMG" | _filter_qemu_io
 }

-function do_io() {
+do_io() {
 local op=$1
 local start=$2
 local size=$3
@@ -45,22 +45,22 @@ function do_io() {
 done
 }

-function io_pattern() {
+io_pattern() {
 do_io "$@" | $QEMU_IO "$TEST_IMG" | _filter_qemu_io
 }

-function io() {
+io() {
 local start=$2
 local pattern=$(( (start >> 9) % 256 ))

 do_io "$@" $pattern | $QEMU_IO "$TEST_IMG" | _filter_qemu_io
 }

-function io_zero() {
+io_zero() {
 do_io "$@" 0 | $QEMU_IO "$TEST_IMG" | _filter_qemu_io
 }

-function io_test() {
+io_test() {
 local op=$1
 local offset=$2
 local cluster_size=$3
@@ -100,7 +100,7 @@ function io_test() {
 offset=$((offset + num_large * ( l2_size + half_cluster )))
 }

-function io_test2() {
+io_test2() {
 local orig_offset=$1
 local cluster_size=$2
 local num=$3
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu

Re: [Qemu-block] [PATCH 3/6] tests: pull qemu-nbd iotest helpers into common.nbd file

2018-11-16 Thread Eric Blake

On 11/16/18 3:41 PM, Eric Blake wrote:


+#!/bin/bash


I know we're using bash,


+
+function nbd_server_stop()
+{



+function nbd_server_wait_for_unix_socket()


and bash supports 'function', but it is an obsolete syntactic sugar 
thing that I don't recommend using.  (In ksh, it actually makes a 
difference in behavior whether you use 'function' or not, and using it 
in 'bash' makes it harder to port code over to 'ksh' - and hence in bash 
it is obsolete because here it does NOT cause the change in behavior 
that ksh users expect)




Of course, I hit send too soon, before getting to my punchline:

Since we already have so many existing iotests that use 'function', it's 
better to clean that up as a separate patch.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH 3/6] tests: pull qemu-nbd iotest helpers into common.nbd file

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

The helpers for starting/stopping qemu-nbd in 058 will be useful in
other test cases, so move them into a common.nbd file.

Signed-off-by: Daniel P. Berrangé 
---
  tests/qemu-iotests/058| 47 +
  tests/qemu-iotests/common.nbd | 56 +++
  2 files changed, 64 insertions(+), 39 deletions(-)
  create mode 100644 tests/qemu-iotests/common.nbd




-
-_cleanup_nbd()
-{



-_wait_for_nbd()
-{



+++ b/tests/qemu-iotests/common.nbd
@@ -0,0 +1,56 @@
+#!/bin/bash


I know we're using bash,


+
+function nbd_server_stop()
+{



+function nbd_server_wait_for_unix_socket()


and bash supports 'function', but it is an obsolete syntactic sugar 
thing that I don't recommend using.  (In ksh, it actually makes a 
difference in behavior whether you use 'function' or not, and using it 
in 'bash' makes it harder to port code over to 'ksh' - and hence in bash 
it is obsolete because here it does NOT cause the change in behavior 
that ksh users expect)


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



[Qemu-block] aio-posix: Fix concurrent aio_poll/set_fd_handler.

2018-11-16 Thread remy . noel
From: Remy Noel 

We no longer modify existing handler entries; instead, we always insert
new entries after having properly initialized them.

Also, we do not call aio_epoll_update for deleted handlers, as this has
no impact whatsoever.

Signed-off-by: Remy Noel 
---
 util/aio-posix.c | 85 
 util/aio-win32.c | 54 ++
 2 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index 51c41ed3c9..b34d97292a 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -200,6 +200,31 @@ static AioHandler *find_aio_handler(AioContext *ctx, int 
fd)
 return NULL;
 }
 
+static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+{
+/* If the GSource is in the process of being destroyed then
+ * g_source_remove_poll() causes an assertion failure.  Skip
+ * removal in that case, because glib cleans up its state during
+ * destruction anyway.
+ */
+if (!g_source_is_destroyed(>source)) {
+g_source_remove_poll(>source, >pfd);
+}
+
+/* If a read is in progress, just mark the node as deleted */
+if (qemu_lockcnt_count(>list_lock)) {
+node->deleted = 1;
+node->pfd.revents = 0;
+return false;
+}
+/* Otherwise, delete it for real.  We can't just mark it as
+ * deleted because deleted nodes are only cleaned up while
+ * no one is walking the handlers list.
+ */
+QLIST_REMOVE(node, node);
+return true;
+}
+
 void aio_set_fd_handler(AioContext *ctx,
 int fd,
 bool is_external,
@@ -209,6 +234,7 @@ void aio_set_fd_handler(AioContext *ctx,
 void *opaque)
 {
 AioHandler *node;
+AioHandler *new_node = NULL;
 bool is_new = false;
 bool deleted = false;
 int poll_disable_change;
@@ -223,50 +249,35 @@ void aio_set_fd_handler(AioContext *ctx,
 qemu_lockcnt_unlock(>list_lock);
 return;
 }
-
-/* If the GSource is in the process of being destroyed then
- * g_source_remove_poll() causes an assertion failure.  Skip
- * removal in that case, because glib cleans up its state during
- * destruction anyway.
- */
-if (!g_source_is_destroyed(>source)) {
-g_source_remove_poll(>source, >pfd);
-}
-
-/* If a read is in progress, just mark the node as deleted */
-if (qemu_lockcnt_count(>list_lock)) {
-node->deleted = 1;
-node->pfd.revents = 0;
-} else {
-/* Otherwise, delete it for real.  We can't just mark it as
- * deleted because deleted nodes are only cleaned up while
- * no one is walking the handlers list.
- */
-QLIST_REMOVE(node, node);
-deleted = true;
-}
+deleted = aio_remove_fd_handler(ctx, node);
 poll_disable_change = -!node->io_poll;
 } else {
 poll_disable_change = !io_poll - (node && !node->io_poll);
 if (node == NULL) {
-/* Alloc and insert if it's not already there */
-node = g_new0(AioHandler, 1);
-node->pfd.fd = fd;
-QLIST_INSERT_HEAD_RCU(>aio_handlers, node, node);
-
-g_source_add_poll(>source, >pfd);
 is_new = true;
 }
+/* Alloc and insert if it's not already there */
+new_node = g_new0(AioHandler, 1);
 
 /* Update handler with latest information */
-node->io_read = io_read;
-node->io_write = io_write;
-node->io_poll = io_poll;
-node->opaque = opaque;
-node->is_external = is_external;
+new_node->io_read = io_read;
+new_node->io_write = io_write;
+new_node->io_poll = io_poll;
+new_node->opaque = opaque;
+new_node->is_external = is_external;
+
+if (is_new) {
+new_node->pfd.fd = fd;
+} else {
+deleted = aio_remove_fd_handler(ctx, node);
+new_node->pfd = node->pfd;
+}
+g_source_add_poll(>source, _node->pfd);
 
-node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
-node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
+new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
+new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
+
+QLIST_INSERT_HEAD_RCU(>aio_handlers, new_node, node);
 }
 
 /* No need to order poll_disable_cnt writes against other updates;
@@ -278,7 +289,9 @@ void aio_set_fd_handler(AioContext *ctx,
 atomic_set(>poll_disable_cnt,
atomic_read(>poll_disable_cnt) + poll_disable_change);
 
-aio_epoll_update(ctx, node, is_new);
+if (new_node) {
+aio_epoll_update(ctx, new_node, is_new);
+}
 qemu_lockcnt_unlock(>list_lock);
 aio_notify(ctx);
 
diff --git a/util/aio-win32.c 

[Qemu-block] aio: Do not use list_lock as a sync mechanism for aio_handlers anymore.

2018-11-16 Thread remy . noel
From: Remy Noel 

It is still used for bottom halves, though, and to avoid concurrent
set_fd_handler calls (we could probably decorrelate the two, but
set_fd_handler calls are quite rare, so it probably isn't worth it).

Signed-off-by: Remy Noel 
---
 include/block/aio.h |  4 +++-
 util/aio-posix.c| 20 
 util/aio-win32.c|  9 -
 util/async.c|  7 +--
 4 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index 0ca25dfec6..99c17a22f7 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -57,7 +57,9 @@ struct AioContext {
 /* Used by AioContext users to protect from multi-threaded access.  */
 QemuRecMutex lock;
 
-/* The list of registered AIO handlers.  Protected by ctx->list_lock. */
+/* The list of registered AIO handlers.
+ * RCU-enabled, writes protected by ctx->list_lock.
+ */
 QLIST_HEAD(, AioHandler) aio_handlers;
 
 /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 83db3f65f4..46b7c571cc 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -341,7 +341,6 @@ static void poll_set_started(AioContext *ctx, bool started)
 
 ctx->poll_started = started;
 
-qemu_lockcnt_inc(>list_lock);
 rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
 IOHandler *fn;
@@ -357,7 +356,6 @@ static void poll_set_started(AioContext *ctx, bool started)
 }
 }
 rcu_read_unlock();
-qemu_lockcnt_dec(>list_lock);
 }
 
 
@@ -374,12 +372,6 @@ bool aio_pending(AioContext *ctx)
 AioHandler *node;
 bool result = false;
 
-/*
- * We have to walk very carefully in case aio_set_fd_handler is
- * called while we're walking.
- */
-qemu_lockcnt_inc(>list_lock);
-
 rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
 int revents;
@@ -397,7 +389,6 @@ bool aio_pending(AioContext *ctx)
 }
 }
 rcu_read_unlock();
-qemu_lockcnt_dec(>list_lock);
 
 return result;
 }
@@ -438,10 +429,8 @@ static bool aio_dispatch_handlers(AioContext *ctx)
 
 void aio_dispatch(AioContext *ctx)
 {
-qemu_lockcnt_inc(>list_lock);
 aio_bh_poll(ctx);
 aio_dispatch_handlers(ctx);
-qemu_lockcnt_dec(>list_lock);
 
 timerlistgroup_run_timers(>tlg);
 }
@@ -524,8 +513,6 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t 
*timeout)
  * Note that ctx->notify_me must be non-zero so this function can detect
  * aio_notify().
  *
- * Note that the caller must have incremented ctx->list_lock.
- *
  * Returns: true if progress was made, false otherwise
  */
 static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t 
*timeout)
@@ -534,7 +521,6 @@ static bool run_poll_handlers(AioContext *ctx, int64_t 
max_ns, int64_t *timeout)
 int64_t start_time, elapsed_time;
 
 assert(ctx->notify_me);
-assert(qemu_lockcnt_count(>list_lock) > 0);
 
 trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
 
@@ -563,8 +549,6 @@ static bool run_poll_handlers(AioContext *ctx, int64_t 
max_ns, int64_t *timeout)
  *
  * ctx->notify_me must be non-zero so this function can detect aio_notify().
  *
- * Note that the caller must have incremented ctx->list_lock.
- *
  * Returns: true if progress was made, false otherwise
  */
 static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
@@ -609,8 +593,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
 atomic_add(>notify_me, 2);
 }
 
-qemu_lockcnt_inc(>list_lock);
-
 if (ctx->poll_max_ns) {
 start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 }
@@ -713,8 +695,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
 progress |= aio_dispatch_handlers(ctx);
 }
 
-qemu_lockcnt_dec(>list_lock);
-
 progress |= timerlistgroup_run_timers(>tlg);
 
 return progress;
diff --git a/util/aio-win32.c b/util/aio-win32.c
index d7c694e5ac..881bad0c28 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -176,7 +176,6 @@ bool aio_prepare(AioContext *ctx)
  * We have to walk very carefully in case aio_set_fd_handler is
  * called while we're walking.
  */
-qemu_lockcnt_inc(>list_lock);
 rcu_read_lock();
 
 /* fill fd sets */
@@ -206,7 +205,6 @@ bool aio_prepare(AioContext *ctx)
 }
 }
 rcu_read_unlock();
-qemu_lockcnt_dec(>list_lock);
 return have_select_revents;
 }
 
@@ -219,7 +217,6 @@ bool aio_pending(AioContext *ctx)
  * We have to walk very carefully in case aio_set_fd_handler is
  * called while we're walking.
  */
-qemu_lockcnt_inc(>list_lock);
 rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
 if (node->pfd.revents && node->io_notify) {
@@ -238,7 +235,6 @@ bool aio_pending(AioContext *ctx)
 }
 
 rcu_read_unlock();
-qemu_lockcnt_dec(>list_lock);
 return result;
 }
 
@@ -296,10 +292,8 @@ static bool 

[Qemu-block] util/aio-posix: Use RCU for handler insertion.

2018-11-16 Thread remy . noel
From: Remy Noel 

Get rid of the 'deleted' attribute.

We still need to get rid of the context list lock.

Signed-off-by: Remy Noel 
---
 util/aio-posix.c | 75 ++--
 util/aio-win32.c | 43 ++-
 2 files changed, 49 insertions(+), 69 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index b34d97292a..83db3f65f4 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -16,6 +16,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "block/block.h"
+#include "qemu/rcu.h"
 #include "qemu/rcu_queue.h"
 #include "qemu/sockets.h"
 #include "qemu/cutils.h"
@@ -26,13 +27,14 @@
 
 struct AioHandler
 {
+struct rcu_head rcu;
+
 GPollFD pfd;
 IOHandler *io_read;
 IOHandler *io_write;
 AioPollFn *io_poll;
 IOHandler *io_poll_begin;
 IOHandler *io_poll_end;
-int deleted;
 void *opaque;
 bool is_external;
 QLIST_ENTRY(AioHandler) node;
@@ -65,19 +67,25 @@ static bool aio_epoll_try_enable(AioContext *ctx)
 {
 AioHandler *node;
 struct epoll_event event;
+int r = 0;
+
 
+rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
-int r;
-if (node->deleted || !node->pfd.events) {
+if (!node->pfd.events) {
 continue;
 }
 event.events = epoll_events_from_pfd(node->pfd.events);
 event.data.ptr = node;
 r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, );
 if (r) {
-return false;
+break;
 }
 }
+rcu_read_unlock();
+if (r) {
+return false;
+}
 ctx->epoll_enabled = true;
 return true;
 }
@@ -193,14 +201,13 @@ static AioHandler *find_aio_handler(AioContext *ctx, int 
fd)
 
 QLIST_FOREACH(node, >aio_handlers, node) {
 if (node->pfd.fd == fd)
-if (!node->deleted)
-return node;
+return node;
 }
 
 return NULL;
 }
 
-static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+static void aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
 {
 /* If the GSource is in the process of being destroyed then
  * g_source_remove_poll() causes an assertion failure.  Skip
@@ -210,19 +217,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, 
AioHandler *node)
 if (!g_source_is_destroyed(>source)) {
 g_source_remove_poll(>source, >pfd);
 }
-
-/* If a read is in progress, just mark the node as deleted */
-if (qemu_lockcnt_count(>list_lock)) {
-node->deleted = 1;
-node->pfd.revents = 0;
-return false;
-}
-/* Otherwise, delete it for real.  We can't just mark it as
- * deleted because deleted nodes are only cleaned up while
- * no one is walking the handlers list.
- */
-QLIST_REMOVE(node, node);
-return true;
+QLIST_REMOVE_RCU(node, node);
 }
 
 void aio_set_fd_handler(AioContext *ctx,
@@ -249,7 +244,8 @@ void aio_set_fd_handler(AioContext *ctx,
 qemu_lockcnt_unlock(>list_lock);
 return;
 }
-deleted = aio_remove_fd_handler(ctx, node);
+aio_remove_fd_handler(ctx, node);
+deleted = true;
 poll_disable_change = -!node->io_poll;
 } else {
 poll_disable_change = !io_poll - (node && !node->io_poll);
@@ -269,7 +265,8 @@ void aio_set_fd_handler(AioContext *ctx,
 if (is_new) {
 new_node->pfd.fd = fd;
 } else {
-deleted = aio_remove_fd_handler(ctx, node);
+aio_remove_fd_handler(ctx, node);
+deleted = true;
 new_node->pfd = node->pfd;
 }
 g_source_add_poll(>source, _node->pfd);
@@ -296,7 +293,7 @@ void aio_set_fd_handler(AioContext *ctx,
 aio_notify(ctx);
 
 if (deleted) {
-g_free(node);
+g_free_rcu(node, rcu);
 }
 }
 
@@ -345,13 +342,10 @@ static void poll_set_started(AioContext *ctx, bool 
started)
 ctx->poll_started = started;
 
 qemu_lockcnt_inc(>list_lock);
+rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
 IOHandler *fn;
 
-if (node->deleted) {
-continue;
-}
-
 if (started) {
 fn = node->io_poll_begin;
 } else {
@@ -362,6 +356,7 @@ static void poll_set_started(AioContext *ctx, bool started)
 fn(node->opaque);
 }
 }
+rcu_read_unlock();
 qemu_lockcnt_dec(>list_lock);
 }
 
@@ -385,6 +380,7 @@ bool aio_pending(AioContext *ctx)
  */
 qemu_lockcnt_inc(>list_lock);
 
+rcu_read_lock();
 QLIST_FOREACH_RCU(node, >aio_handlers, node) {
 int revents;
 
@@ -400,6 +396,7 @@ bool aio_pending(AioContext *ctx)
 break;
 }
 }
+rcu_read_unlock();
 qemu_lockcnt_dec(>list_lock);
 
 return result;
@@ -410,14 +407,14 @@ static bool aio_dispatch_handlers(AioContext *ctx)
 AioHandler *node, *tmp;
 bool progress = false;
 
+

[Qemu-block] [PATCH] migration/block-dirty-bitmap: Silence coverity CID 1390625

2018-11-16 Thread John Snow
Coverity warns that backing_bs() could give us a NULL pointer, which
we then use without checking that it isn't.

In our loop condition, we check bs && bs->drv as a point of habit, but
by nature of the block graph, we cannot have null bs pointers here.

This loop skips only implicit nodes, which always have children, so
this loop should never encounter a null value.

Tighten the loop condition to coax Coverity into dropping
its false positive.

Suggested-by: Stefan Hajnoczi 
Signed-off-by: John Snow 
---
 migration/block-dirty-bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 5e90f44c2f..00c068fda3 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -284,7 +284,7 @@ static int init_dirty_bitmap_migration(void)
 const char *drive_name = bdrv_get_device_or_node_name(bs);
 
 /* skip automatically inserted nodes */
-while (bs && bs->drv && bs->implicit) {
+while (bs->drv && bs->implicit) {
 bs = backing_bs(bs);
 }
 
-- 
2.17.2




Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Max Reitz
I feel bad for trimming your mail so much, but that's just because I've
read it and I agree.  So, it's trimming of the good kind.
(Then again, when is trimming of a long mail ever bad?)

On 16.11.18 18:13, Kevin Wolf wrote:

[...]

> What we noted down about counters was more meaningful progress because
> we think this can actually provide the semantics that we need. Of
> course, there is still some hand-waving involved (we'll just check the
> counter when modifying the graph), which might not be that trivial to
> implement because bdrv_replace_child() can't return an error...

Sure, there is much to talk about there.  You're right, we probably need
a transaction system there, too, which makes things not so simple again.

And then there's the fact that bdrv_detach_child() currently cannot
fail, and we really don't want it to fail in e.g. bdrv_close().
Expressing that may be tricky, too.

>>> It doesn't feel too bad to me, but that's subjective.
>>
>> It's not worse than quiescing/draining, that's for sure.
> 
> How would _you_ know? :-)

I seem to recall to at least have read your patches... ;-)

[...]

> For the implementation with counters, maybe we actually need two of them
> like perm/shared in the permission system. One that means that we're
> going to modify the graph, and the other one that means that we can't
> tolerate graph modifications.
> 
> For the public interface, we might actually treat it mostly like
> permissions, just edge permissions, not node permissions.

Hm.  I see your point.  But having a "perm" field which would allow a
party to be able to do graph modifications for as long as it has claimed
that field would mean that the probably also want other parties not to
modify that exact same part of the graph in the meantime.  Like commit
which will modify part of the graph while over the whole lifetime of the
job it doesn't want that part to be modified by something else.

But then again, exactly that "perm" field solves the issue, too.  If you
are required to take the permission (and in the process block other
modifications on that edge) before doing graph modifications, the check
function can simply see that there is exactly one blocker, and infer
that this must be the party doing the modification itself.

>> So, no, I do not have a good technical counter-argument.  But my
>> argument of complexity still stands after having gone through
>> understanding how GRAPH_MOD might actually work for at least the third
>> time now, without it ever having made the next time any easier.
>>
>> Even if GRAPH_MOD were correctly implemented tomorrow, in a way
>> conforming to what we've discussed now, I have the strong feeling that I
>> still would get to a point where I forgot its meaning and would have to
>> go through all of this again.
>>
>> And that's completely disregarding people new to the codebase.
> 
> As I said, it's moot anyway. It doesn't provide the right semantics.

True, but the discussion itself isn't moot to me.  I wanted to stress
that given something that is rather hard to grasp, I prefer to throw it
away as long as it is not functional yet, whenever a more intuitive
alternative presents itself.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH 6/6] tests: exercise NBD server in TLS mode

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

Add tests that validate it is possible to connect to an NBD server
running TLS mode. Also test mis-matched TLS vs non-TLS connections
correctly fail.
---
  tests/qemu-iotests/233| 99 +++
  tests/qemu-iotests/233.out| 33 
  tests/qemu-iotests/common.nbd | 47 +
  tests/qemu-iotests/group  |  1 +
  4 files changed, 180 insertions(+)
  create mode 100755 tests/qemu-iotests/233
  create mode 100644 tests/qemu-iotests/233.out


Adding tests is non-invasive, so I have no objection to taking the 
entire series in 3.1.  Do you want me to touch up the nits I found 
earlier, or should I wait for a v2?


Reviewed-by: Eric Blake 


+# creator
+owner=berra...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`


Dead code. Mao Zhongyi has a patch to remove it.  I'm thinking of 
putting his patches in first, and then yours.




+
+echo
+echo "== check TLS client to plain server fails =="
+nbd_server_start_tcp_socket "$TEST_IMG"


The negative testing is just as important as positive testing, to prove 
TLS is adding the promised authorization.



+
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0
+
+nbd_server_stop
+
+echo
+echo "== check plain client to TLS server fails =="
+
+nbd_server_start_tcp_socket --object 
tls-creds-x509,dir=${tls_dir}/server1,endpoint=server,id=tls0,verify-peer=yes --tls-creds 
tls0 "$TEST_IMG"


Long line


+
+$QEMU_IMG info nbd://localhost:$nbd_tcp_port
+
+echo
+echo "== check TLS works =="
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0


Is it also worth reading or even writing, to ensure that more than just 
the handshake is working?



+
+echo
+echo "== check TLS with different CA fails =="
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client2,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0
+
+# success, all done



+== check TLS client to plain server fails ==
+option negotiation failed: read failed: Unexpected end-of-file before all 
bytes were read


Annoying message; I wonder if we can clean that up. But not this patch's 
problem.



+qemu-img: Could not open 
'driver=nbd,host=127.0.0.1,port=10809,tls-creds=tls0': Denied by server for 
option 5 (starttls)
+server reported: TLS not configured


This 2-line message is better (and as such, explains why I think the 
message about early EOF should be silenced in this case).



+
+== check plain client to TLS server fails ==
+option negotiation failed: read failed: Unexpected end-of-file before all 
bytes were read
+qemu-img: Could not open 'nbd://localhost:10809': TLS negotiation required 
before option 8 (structured reply)
+server reported: Option 0x8 not permitted before TLS


Same story about a redundant message.


+write failed (error message): Unable to write to socket: Broken pipe


Hmm - is this the client complaining that it can't send more to the 
server (that's a bug if the server hung up, since the protocol allows 
the client to change its mind and try TLS after all), or the server 
complaining that the client hung up abruptly without sending 
NBD_OPT_ABORT? Qemu as client either tries TLS right away, or knows that 
if the server requires TLS and the client has no TLS credentials then 
the client will never succeed, so the client gives up - but it SHOULD be 
giving up with NBD_OPT_ABORT rather than just disconnecting.  I'll have 
to play with that.  Doesn't impact your patch, but might spur a followup 
fix :)



+
+== check TLS works ==
+image: nbd://127.0.0.1:10809
+file format: nbd
+virtual size: 64M (67108864 bytes)
+disk size: unavailable
+


Again, also doing a read and/or write to ensure the encrypted data is 
handled correctly would be nice. Can be in a followup patch.



+== check TLS with different CA fails ==
+option negotiation failed: Verify failed: No certificate was found.
+qemu-img: Could not open 
'driver=nbd,host=127.0.0.1,port=10809,tls-creds=tls0': The certificate hasn't 
got a known issuer
+*** done
diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index 61e9e90fee..0483ea7c55 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -20,6 +20,7 @@
  #
  
  nbd_unix_socket="${TEST_DIR}/qemu-nbd.sock"

+nbd_tcp_addr="127.0.0.1"


Should this be in your earlier patch that created common.nbd? Maybe not, 
as it doesn't get used until the functions you add here for tcp support. 
Still, I wonder if we should split the addition of tcp support separate 
from the new test.



  nbd_pid_file="${TEST_DIR}/qemu-nbd.pid"
  
  function nbd_server_stop()

@@ -62,3 +63,49 @@ function 

Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Alberto Garcia
(I just wanted to reply quickly to this point, I'll read the rest of the
e-mail later)

On Fri 16 Nov 2018 05:34:08 PM CET, Max Reitz  wrote:
> > GRAPH_MOD with the meaning that Berto suggested isn't weird or
> > complicated to understand. It's only the wrong tool because it
> > blocks more than we want to block. But if we didn't care about that,
> > it could be just another permission like any other.

That's the way that I understood it. Or, more precisely, I'm basing my
current code on those assumptions because it's a simple starting point
to have something working with the current permission system. But I'm of
course open to trying a different approach.

> The meaning Berto has suggested (AFAIU) is never taking the permission
> at all but just querying whether it is shared by all current parents
> before doing a graph change.

That would be a quick way to test it, and I agree that it's unorthodox,
but it's useful to explain how I understood the meaning of GRAPH_MOD.

But you could for example make BDRVReopenState a parent of the BDS (and
replace BDRVReopenState.bs with a BdrvChild).

Berto



Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Kevin Wolf
Am 16.11.2018 um 17:34 hat Max Reitz geschrieben:
> On 16.11.18 16:51, Kevin Wolf wrote:
> > Am 16.11.2018 um 16:27 hat Max Reitz geschrieben:
> >> On 16.11.18 16:18, Kevin Wolf wrote:
> >>> Am 16.11.2018 um 16:03 hat Alberto Garcia geschrieben:
> > I don't think anything needs a way to generally block graph changes
> > around some node.  We only need to prevent changes to very specific
> > sets of edges.  This is something that the permission system just
> > cannot do.
> 
>  But what would you do then?
> >>>
> >>> I agree with you mostly in that I think that most problems that Max
> >>> mentioned aren't real. The only real problem I see with GRAPH_MOD as a
> >>> permission on the node level is this overblocking
> >>
> >> I wholeheartedly disagree.  Yes, it is true that most of the issues I
> >> thought of can be fixed, and so those problems are not problems in a
> >> technical sense.  But to me this whole discussion points to the greatest
> >> issue I have, which is that GRAPH_MOD is just too complicated to
> >> understand.  And I don't like a solution that works on a technical level
> >> but that everybody is too afraid to touch because it's too weird.
> > 
> > GRAPH_MOD with the meaning that Berto suggested isn't weird or
> > complicated to understand. It's only the wrong tool because it blocks
> > more than we want to block. But if we didn't care about that, it could
> > be just another permission like any other.
> 
> The meaning Berto has suggested (AFAIU) is never taking the permission
> at all but just querying whether it is shared by all current parents
> before doing a graph change.
> 
> That is very weird to me because permissions are there to be taken.

It would be a shortcut for taking the permission temporarily, which
would possibly work inside QEMU because we're holding the BQL, though
I'm not sure whether I/O threads couldn't influence this. Anyway, with
our mapping of permissions to filesystem locks, it's definitely not
correct any more and you do need to actually temporarily acquire the
locks.

> The meaning I have suggested is that it would be taken before a graph
> change and released right afterwards.  I think this is still weird
> because we don't take, say, the WRITE permission before every
> bdrv*_write*() call and release it afterwards.
> 
> Sure you could say "actually, why not".  Please don't.[1]

Actually, why not? Not in the sense that it would be the right model for
WRITE, but it might still be the right one for GRAPH_MOD in most cases.

We do take the RESIZE permission in qmp_block_resize() before calling
blk_truncate(), and drop it right afterwards (by means of creating a
temporary BlockBackend only for the purpose of resizing), so there is
precedence.

Of course, all of this is moot because GRAPH_MOD can't provide the exact
semantics we want to have.

> > If you want to change the graph, you'd need to get the permission first,
> > and bdrv_replace_child_noperm() could assert that at least one parent of
> > the parent node has acquired the permission (unless you want to pass the
> > exact parent BdrvChild to it; maybe this is what we would really do
> > then).
> 
> Yep, that assertion would be like we do it in bdrv_co_write_req_prepare().
> 
> Thing is, you (as in "the caller that actually wants to do something",
> like mirror) don't just "get the permission".  Permissions are
> associated with children, so you cannot get this permission before you
> create your child.
> 
> So this is where "parent of a parent node" comes in?  I'm afraid I don't
> understand that.

Including GRAPH_MOD in the permission system means that it controls an
operation that reaches a node through a BdrvChild. In this case, the
operation is "modifying a BdrvChild of the node that the permission
holding BdrvChild points to".

Okay, if you say that affecting children of a node instead of just the
node itself is a bit unusual, I might have to agree.

> What would make sense to me would be for the generic block layer to take
> this permission on new children (and drop it immediately after having
> attached the child) and children that are about to be detached.  Which
> again is just different from any other permission, because things
> outside of block.c can never take it, they can only choose not to share
> it.  So it'd be an internal permission, which is visible to the outside,
> and I think this is at least weirder than just having an internal
> counter with a clear external interface (inc/dec).

I don't think I'd want to implement it like that (but then, I'll repeat
that I wouldn't want to implement it at all because it's the wrong
tool).

> Also, please excuse me for it being Friday and all, but I don't quite
> believe in "It's actually clear, I have no idea why you claim to always
> take so long to understand it."  The code clearly has no idea what it's
> supposed to do, and I do not believe that the sole reason for that is
> that someone who did understand didn't have the 

Re: [Qemu-block] [PATCH] migration/block-dirty-bitmap: fix Coverity CID1390625

2018-11-16 Thread Peter Maydell
On 16 November 2018 at 16:16, John Snow  wrote:
> On 11/16/18 5:04 AM, Peter Maydell wrote:
>> Personally I think the main benefit of this sort of Coverity
>> warning is that it nudges you to work through and figure out
>> whether something really can be NULL or not. Stefan's fix
>> will satisfy Coverity, because what it's complaining about
>> is that the code in one place assumes a pointer can't be NULL
>> but in another place does have handling for NULL: it is the
>> inconsistency that triggers the warning. If backing_bs()
>> can't return NULL (ie if you would be happy in theory to put
>> an assert() in after the call) then I think Stefan's fix is
>> better. If it can return NULL ever then Vladimir's fix is
>> what you want.

> I really don't think it can, but I don't actually know how to verify it
> or to convince Coverity of the same. Stefan's suggestion seems most
> appropriate if it actually does calm Coverity down.

I think it will quieten Coverity; if it doesn't, then at that point
we can happily mark the issue a false-positive. But as I say what
Coverity is really identifying here is "you checked whether
this pointer was NULL, but then there's a code path forward
to a place where you used it without checking" -- so removing
the unnecessary check will make it happy.

> (Is it mechanically possible to violate this? That's very hard to audit
> and promise for you. There are always bugs lurking somewhere else.)

If you believe by design that it can't be NULL, then we're OK:
if it ever turns out that it isn't, then we will get a nice
easy-to-debug SEGV when we dereference the NULL pointer,
and we can find out then what bug resulted in our design
assumption being broken.

> Stefan's suggestion is probably most appropriate, *if* it actually
> shushes Coverity. I'll send you a small patch and you can find out? I
> don't want to task offload but I also genuinely don't know if that will
> hint to coverity strongly enough that this is a false positive.

Coverity runs only on git master, so fixes have to go into
master to be tested, unfortunately.

thanks
-- PMM



[Qemu-block] [PATCH for-3.1? v2 3/3] iotests: Test file-posix locking and reopen

2018-11-16 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/182 | 71 ++
 tests/qemu-iotests/182.out |  9 +
 2 files changed, 80 insertions(+)

diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182
index 4b31592fb8..3b7689c1d5 100755
--- a/tests/qemu-iotests/182
+++ b/tests/qemu-iotests/182
@@ -31,6 +31,7 @@ status=1  # failure is the default!
 _cleanup()
 {
 _cleanup_test_img
+rm -f "$TEST_IMG.overlay"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -71,6 +72,76 @@ echo 'quit' | $QEMU -nographic -monitor stdio \
 
 _cleanup_qemu
 
+echo
+echo '=== Testing reopen ==='
+echo
+
+# This tests that reopening does not unshare any permissions it should
+# not unshare
+# (There was a bug where reopening shared exactly the opposite of the
+# permissions it was supposed to share)
+
+_launch_qemu
+
+_send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'qmp_capabilities'}" \
+'return'
+
+# Open the image without any format layer (we are not going to access
+# it, so that is fine)
+# This should keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-add',
+  'arguments': {
+  'node-name': 'node0',
+  'driver': 'file',
+  'filename': '$TEST_IMG',
+  'locking': 'on'
+  } }" \
+'return' \
+'error'
+
+# This snapshot will perform a reopen to drop R/W to RO.
+# It should still keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-snapshot-sync',
+  'arguments': {
+  'node-name': 'node0',
+  'snapshot-file': '$TEST_IMG.overlay',
+  'snapshot-node-name': 'node1'
+  } }" \
+'return' \
+'error'
+
+# Now open the same file again
+# This does not require any permissions (and does not unshare any), so
+# this will not conflict with node0.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-add',
+  'arguments': {
+  'node-name': 'node1',
+  'driver': 'file',
+  'filename': '$TEST_IMG',
+  'locking': 'on'
+  } }" \
+'return' \
+'error'
+
+# Now we attach the image to a virtio-blk device.  This device does
+# require some permissions (at least WRITE and READ_CONSISTENT), so if
+# reopening node0 unshared any (which it should not have), this will
+# fail (but it should not).
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'device_add',
+  'arguments': {
+  'driver': 'virtio-blk',
+  'drive': 'node1'
+  } }" \
+'return' \
+'error'
+
+_cleanup_qemu
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/182.out b/tests/qemu-iotests/182.out
index f1463c8862..af501ca3f3 100644
--- a/tests/qemu-iotests/182.out
+++ b/tests/qemu-iotests/182.out
@@ -5,4 +5,13 @@ Starting QEMU
 Starting a second QEMU using the same image should fail
 QEMU_PROG: -drive file=TEST_DIR/t.qcow2,if=none,id=drive0,file.locking=on: 
Failed to get "write" lock
 Is another process using the image [TEST_DIR/t.qcow2]?
+
+=== Testing reopen ===
+
+{"return": {}}
+{"return": {}}
+Formatting 'TEST_DIR/t.qcow2.overlay', fmt=qcow2 size=197120 
backing_file=TEST_DIR/t.qcow2 backing_fmt=file cluster_size=65536 
lazy_refcounts=off refcount_bits=16
+{"return": {}}
+{"return": {}}
+{"return": {}}
 *** done
-- 
2.17.2




[Qemu-block] [PATCH for-3.1? v2 2/3] file-posix: Fix shared locks on reopen commit

2018-11-16 Thread Max Reitz
s->locked_shared_perm is the set of bits locked in the file, which is
the inverse of the permissions actually shared.  So we need to pass them
as they are to raw_apply_lock_bytes() instead of inverting them again.

Reported-by: Alberto Garcia 
Signed-off-by: Max Reitz 
---
 block/file-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index df3a8d7cdf..8460d003f0 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -963,7 +963,7 @@ static void raw_reopen_commit(BDRVReopenState *state)
 
 /* Copy locks to the new fd before closing the old one. */
 raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
- ~s->locked_shared_perm, false, &local_err);
+ s->locked_shared_perm, false, &local_err);
 if (local_err) {
 /* shouldn't fail in a sane host, but report it just in case. */
 error_report_err(local_err);
-- 
2.17.2




[Qemu-block] [PATCH for-3.1? v2 1/3] block: Always abort reopen after prepare succeeded

2018-11-16 Thread Max Reitz
bdrv_reopen_multiple() does not invoke bdrv_reopen_abort() for the
element of the reopen queue for which bdrv_reopen_prepare() failed,
because it assumes that the prepare function will have rolled back all
changes already.

However, bdrv_reopen_prepare() does not do this in every case: It may
notice an error after BlockDriver.bdrv_reopen_prepare() succeeded, and
it will not invoke BlockDriver.bdrv_reopen_abort() then; and neither
will bdrv_reopen_multiple(), as explained above.

This is wrong because we must always call .bdrv_reopen_commit() or
.bdrv_reopen_abort() after .bdrv_reopen_prepare() has succeeded.
Otherwise, the block driver has no chance to undo what it has done in
its implementation of .bdrv_reopen_prepare().

To fix this, bdrv_reopen_prepare() has to call .bdrv_reopen_abort() if
it wants to return an error after .bdrv_reopen_prepare() has succeeded.

Signed-off-by: Max Reitz 
---
 block.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/block.c b/block.c
index fd67e14dfa..3feac08535 100644
--- a/block.c
+++ b/block.c
@@ -3201,6 +3201,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
 QDict *orig_reopen_opts;
 char *discard = NULL;
 bool read_only;
+bool drv_prepared = false;
 
 assert(reopen_state != NULL);
 assert(reopen_state->bs->drv != NULL);
@@ -3285,6 +3286,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
 goto error;
 }
 
+drv_prepared = true;
+
 /* Options that are not handled are only okay if they are unchanged
  * compared to the old state. It is expected that some options are only
  * used for the initial open, but not reopen (e.g. filename) */
@@ -3350,6 +3353,15 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
 reopen_state->options = qobject_ref(orig_reopen_opts);
 
 error:
+if (ret < 0 && drv_prepared) {
+/* drv->bdrv_reopen_prepare() has succeeded, so we need to
+ * call drv->bdrv_reopen_abort() before signaling an error
+ * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
+ * when the respective bdrv_reopen_prepare() has failed) */
+if (drv->bdrv_reopen_abort) {
+drv->bdrv_reopen_abort(reopen_state);
+}
+}
 qemu_opts_del(opts);
 qobject_unref(orig_reopen_opts);
 g_free(discard);
-- 
2.17.2




[Qemu-block] [PATCH for-3.1? v2 0/3] block: Fix two minor reopen issues

2018-11-16 Thread Max Reitz
These are fixes for issues I found when looking after something Berto
has reported.  The second patch fixes that issue Berto found, the first
one is only kind of related.

For the first patch:  bdrv_reopen_abort() or bdrv_reopen_commit() are
called only if the respective bdrv_reopen_prepare() has succeeded.
However, bdrv_reopen_prepare() can fail even if the block driver’s
.bdrv_reopen_prepare() succeeded.  In that case, the block driver is
expecting a .bdrv_reopen_abort() or .bdrv_reopen_commit(), but will
never get it.

This is fixed by making bdrv_reopen_prepare() call .bdrv_reopen_abort()
if an error occurs after .bdrv_reopen_prepare() has already succeeded.

In practice this just prevents resource leaks, so nothing too dramatic
(fine for qemu-next).  It also means I’ve been incapable of writing a
test, sorry.


The second issue is more severe: file-posix applies the inverse share
locks when reopening.  Now the user can only directly do reopens from
the monitor for now, so that wouldn’t be so bad.  But of course there
are other places in qemu that implicitly reopen nodes, like dropping
R/W to RO or gaining R/W from RO.  Most of these places are symmetrical
at least (they’ll get R/W on an RO image for a short period of time and
then drop back to RO), so you’ll see the wrong shared permission locks
only for a short duration.  But at least blockdev-snapshot and
blockdev-snapshot-sync are one way; they drop R/W to RO and stay there.
So if you do a blockdev-snapshot*, the shared permission bits will be
flipped.  This is therefore very much user-visible.

It’s not catastrophic, though, so I’m not sure whether we need it in
3.1.  It’s definitely a bugfix, and I think we have patches queued for
the next RC already, so I think it makes sense to include at least
patches 2 and 3 as well.


v2:
- Patch 1: Reuse the existing error path instead of creating a new one
   [Berto]


git-backport-diff against v1:

Key:
[] : patches are identical
[] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/3:[0025] [FC] 'block: Always abort reopen after prepare succeeded'
002/3:[] [--] 'file-posix: Fix shared locks on reopen commit'
003/3:[] [--] 'iotests: Test file-posix locking and reopen'


Max Reitz (3):
  block: Always abort reopen after prepare succeeded
  file-posix: Fix shared locks on reopen commit
  iotests: Test file-posix locking and reopen

 block.c| 12 +++
 block/file-posix.c |  2 +-
 tests/qemu-iotests/182 | 71 ++
 tests/qemu-iotests/182.out |  9 +
 4 files changed, 93 insertions(+), 1 deletion(-)

-- 
2.17.2




Re: [Qemu-block] [PATCH 5/6] tests: add iotests helpers for dealing with TLS certificates

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

Add helpers to common.tls for creating TLS certificates for a CA,
server and client.


MUCH appreciated!  We NEED this coverage, easily automated.



Signed-off-by: Daniel P. Berrangé 
---
  tests/qemu-iotests/common.tls | 139 ++
  1 file changed, 139 insertions(+)
  create mode 100644 tests/qemu-iotests/common.tls

diff --git a/tests/qemu-iotests/common.tls b/tests/qemu-iotests/common.tls
new file mode 100644


I was a bit surprised that this wasn't 100755, but this matches the fact 
that none of the other common.* are executable. And after thinking more, 
it makes sense - they aren't standalone scripts, but designed to be 
sourced, and 'source' doesn't care about execute bits.



+tls_dir="${TEST_DIR}/tls"
+
+function tls_x509_cleanup()
+{
+rm -f ${tls_dir}/*.pem
+rm -f ${tls_dir}/*/*.pem
+rmdir ${tls_dir}/*
+rmdir ${tls_dir}


Why not just:
rm -rf $tls_dir

Also, the quoting is a bit inconsistent. if ${TEST_DIR} can contain 
spaces, then all uses of ${tls_dir} need to be in "".



+}
+
+
+function tls_x509_init()
+{
+mkdir "${tls_dir}"


And this just highlights the quoting inconsistency.  Should this use 
mkdir -p?



+
+function tls_x509_create_root_ca()
+{
+name=$1
+
+test -z "$name" && name=ca-cert


Could also be shortened as:

name=${1:-ca-cert}


+
+cat > ${tls_dir}/ca.info <

s/Cthulu/Cthulhu/ - after all, we don't want him coming after us just 
because we botched the spelling of his name :)



+ca
+cert_signing_key
+EOF
+
+certtool --generate-self-signed \
+ --load-privkey ${tls_dir}/key.pem \
+ --template ${tls_dir}/ca.info \
+ --outfile ${tls_dir}/$name-cert.pem 2>&1 | head -1


More missing ""


+
+rm -f ${tls_dir}/ca.info
+}
+
+
+function tls_x509_create_server()
+{
+caname=$1
+name=$2
+
+mkdir ${tls_dir}/$name
+cat > ${tls_dir}/cert.info <

Matched spelling


+function tls_x509_create_client()
+{
+caname=$1
+name=$2
+
+mkdir ${tls_dir}/$name
+cat > ${tls_dir}/cert.info <

And again

Needs several touch-ups, but the idea itself is sound.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH for-3.1? 1/3] block: Always abort reopen after prepare succeeded

2018-11-16 Thread Max Reitz
On 16.11.18 17:02, Alberto Garcia wrote:
> On Fri 16 Nov 2018 04:53:00 PM CET, Max Reitz wrote:
>> bdrv_reopen_multiple() does not invoke bdrv_reopen_abort() for the
>> element of the reopen queue for which bdrv_reopen_prepare() failed,
>> because it assumes that the prepare function will have rolled back all
>> changes already.
>>
>> However, bdrv_reopen_prepare() does not do this in every case: It may
>> notice an error after BlockDriver.bdrv_reopen_prepare() succeeded, and
>> it will not invoke BlockDriver.bdrv_reopen_abort() then; and neither
>> will bdrv_reopen_multiple(), as explained above.
>>
>> This is wrong because we must always call .bdrv_reopen_commit() or
>> .bdrv_reopen_abort() after .bdrv_reopen_prepare() has succeeded.
>> Otherwise, the block driver has no chance to undo what it has done in
>> its implementation of .bdrv_reopen_prepare().
>>
>> To fix this, bdrv_reopen_prepare() has to call .bdrv_reopen_abort() if
>> it wants to return an error after .bdrv_reopen_prepare() has succeeded.
>>
>> Signed-off-by: Max Reitz 
>> ---
>>  block.c | 17 +++--
>>  1 file changed, 15 insertions(+), 2 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index fd67e14dfa..7f5859aa74 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -3332,7 +3332,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
>> BlockReopenQueue *queue,
>>  if (!qobject_is_equal(new, old)) {
>>  error_setg(errp, "Cannot change the option '%s'", 
>> entry->key);
>>  ret = -EINVAL;
>> -goto error;
>> +goto late_error;
>>  }
>>  } while ((entry = qdict_next(reopen_state->options, entry)));
>>  }
>> @@ -3340,7 +3340,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
>> BlockReopenQueue *queue,
>>  ret = bdrv_check_perm(reopen_state->bs, queue, reopen_state->perm,
>>reopen_state->shared_perm, NULL, errp);
>>  if (ret < 0) {
>> -goto error;
>> +goto late_error;
>>  }
>>  
>>  ret = 0;
>> @@ -3354,6 +3354,19 @@ error:
>>  qobject_unref(orig_reopen_opts);
>>  g_free(discard);
>>  return ret;
>> +
>> +late_error:
>> +/* drv->bdrv_reopen_prepare() has succeeded, so we need to call
>> + * drv->bdrv_reopen_abort() before signaling an error
>> + * (bdrv_reopen_multiple() will not call bdrv_reopen_abort() when
>> + * the respective bdrv_reopen_prepare() failed) */
>> +if (drv->bdrv_reopen_abort) {
>> +drv->bdrv_reopen_abort(reopen_state);
>> +}
>> +qemu_opts_del(opts);
>> +qobject_unref(orig_reopen_opts);
>> +g_free(discard);
>> +return ret;
>>  }
> 
> Instead of having two exit points you could also have something like
> bool drv_prepared, set it to 'true' after drv->bdrv_reopen_prepare() has
> succeeded and then simply add this at the end:
> 
> if (ret < 0 && drv_prepared) {
>drv->bdrv_reopen_abort(reopen_state);
> } 

Yup, sure.

Max




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Max Reitz
On 16.11.18 16:51, Kevin Wolf wrote:
> Am 16.11.2018 um 16:27 hat Max Reitz geschrieben:
>> On 16.11.18 16:18, Kevin Wolf wrote:
>>> Am 16.11.2018 um 16:03 hat Alberto Garcia geschrieben:
> I don't think anything needs a way to generally block graph changes
> around some node.  We only need to prevent changes to very specific
> sets of edges.  This is something that the permission system just
> cannot do.

 But what would you do then?
>>>
>>> I agree with you mostly in that I think that most problems that Max
>>> mentioned aren't real. The only real problem I see with GRAPH_MOD as a
>>> permission on the node level is this overblocking
>>
>> I wholeheartedly disagree.  Yes, it is true that most of the issues I
>> thought of can be fixed, and so those problems are not problems in a
>> technical sense.  But to me this whole discussion points to the greatest
>> issue I have, which is that GRAPH_MOD is just too complicated to
>> understand.  And I don't like a solution that works on a technical level
>> but that everybody is too afraid to touch because it's too weird.
> 
> GRAPH_MOD with the meaning that Berto suggested isn't weird or
> complicated to understand. It's only the wrong tool because it blocks
> more than we want to block. But if we didn't care about that, it could
> be just another permission like any other.

The meaning Berto has suggested (AFAIU) is never taking the permission
at all but just querying whether it is shared by all current parents
before doing a graph change.

That is very weird to me because permissions are there to be taken.


The meaning I have suggested is that it would be taken before a graph
change and released right afterwards.  I think this is still weird
because we don't take, say, the WRITE permission before every
bdrv*_write*() call and release it afterwards.

Sure you could say "actually, why not".  Please don't.[1]

> If you want to change the graph, you'd need to get the permission first,
> and bdrv_replace_child_noperm() could assert that at least one parent of
> the parent node has acquired the permission (unless you want to pass the
> exact parent BdrvChild to it; maybe this is what we would really do
> then).

Yep, that assertion would be like we do it in bdrv_co_write_req_prepare().

Thing is, you (as in "the caller that actually wants to do something",
like mirror) don't just "get the permission".  Permissions are
associated with children, so you cannot get this permission before you
create your child.

So this is where "parent of a parent node" comes in?  I'm afraid I don't
understand that.

What would make sense to me would be for the generic block layer to take
this permission on new children (and drop it immediately after having
attached the child) and children that are about to be detached.  Which
again is just different from any other permission, because things
outside of block.c can never take it, they can only choose not to share
it.  So it'd be an internal permission, which is visible to the outside,
and I think this is at least weirder than just having an internal
counter with a clear external interface (inc/dec).


Also, please excuse me for it being Friday and all, but I don't quite
believe in "It's actually clear, I have no idea why you claim to always
take so long to understand it."  The code clearly has no idea what it's
supposed to do, and I do not believe that the sole reason for that is
that someone who did understand didn't have the time or the need to fix
it, or to implement it correctly in the first place.

On the other hand it sounded a bit like Berto was about to fix it,
though, so there's that.

Or I just got you wrong and this is just the usual case of "Now that
we've talked about this through a couple of lengthy emails, it does make
sense" which is just all too familiar and exactly my issue with the
whole thing.

>> We have this discussion again and again, and in the end we always come
>> up with something that looks like it might work, but it's just so weird
>> that we can't even remember it.
> 
> I don't think we ever come up with something, weird or not, that
> achieves what we wanted to achieve - because the problem simply can't be
> solved properly at the node level.

Yes, I've phrased my disagreement wrong.  Of course I don't disagree
with you on that.  I just disagree that this is the only issue with
GRAPH_MOD.

>> Maybe it's just me, though.  Frankly, I think the permission system
>> itself is already too complicated as it is, but I don't have a simpler
>> solution there.
> 
> It doesn't feel too bad to me, but that's subjective.

It's not worse than quiescing/draining, that's for sure.

Max


[1] Please don't because on one hand I wouldn't have a counter-argument
for this.  Yes, maybe we should actually take permissions right before
the operations that require them.  Maybe keeping them around for as long
as a child stays attached to a parent is just being lazy.

I suppose the main argument against 

Re: [Qemu-block] [PATCH v2 05/13] block: Switch to 64-bit bl.max_transfer

2018-11-16 Thread Kevin Wolf
Am 16.11.2018 um 16:54 hat Eric Blake geschrieben:
> On 11/16/18 9:32 AM, Kevin Wolf wrote:
> > Am 15.11.2018 um 17:28 hat Eric Blake geschrieben:
> > > On 11/15/18 9:45 AM, Kevin Wolf wrote:
> > > > Am 15.11.2018 um 03:03 hat Eric Blake geschrieben:
> > > > > This change has no semantic impact: all drivers either leave the
> > > > > value at 0 (no inherent 32-bit limit is still translated into
> > > > > fragmentation below 2G; see the previous patch for that audit), or
> > > > > set it to a value less than 2G.  However, switching to a larger
> > > > > type and enforcing the 2G cap at the block layer makes it easier
> > > > > to audit specific drivers for their robustness to larger sizing,
> > > > > by letting them specify a value larger than INT_MAX if they have
> > > > > been audited to be 64-bit clean.
> > > > > 
> 
> > > > > +
> > > > > +/* Clamp max_transfer to 2G */
> > > > > +if (bs->bl.max_transfer > UINT32_MAX) {
> > > > 
> > > > UINT32_MAX is 4G, not 2G.
> > > > 
> > > > Would it make more sense to make BDRV_REQUEST_MAX_BYTES the maximum
> > > > anyway?
> > > 
> > > D'oh.  Yes, that's what I intended, possibly by spelling it INT_MAX (the
> > > fact that the 'if' goes away in patch 13 is not an excuse for sloppy 
> > > coding
> > > in the meantime).
> > 
> > INT_MAX is not a different spelling of BDRV_REQUEST_MAX_BYTES. The
> > latter is slightly lower (0x7ffffe00).
> 
> Ah, but:
> 
> > +if (bs->bl.max_transfer > UINT32_MAX) {
> > +bs->bl.max_transfer = QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
> > +  MAX(bs->bl.opt_transfer,
> > +  
> > bs->bl.request_alignment));
> > +}
> 
> QEMU_ALIGN_DOWN() will change INT_MAX to BDRV_REQUEST_MAX_BYTES if
> bs->bl.request_alignment is 512 and opt_transfer is 0; and if either
> alignment number is larger than 512, max_transfer is capped even lower
> regardless of whether I was aligning INT_MAX or BDRV_REQUEST_MAX_BYTES down
> to an alignment boundary.

That's true. But why use INT_MAX and argue that it will be rounded to
the right value later when you can just use BDRV_REQUEST_MAX_BYTES,
which is obviously correct without additional correction?

Kevin



Re: [Qemu-block] [PATCH 4/6] tests: check if qemu-nbd is still alive before waiting

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

If the qemu-nbd UNIX socket has not shown up, the tests will sleep a bit
and then check again repeatedly for upto 30 seconds. This is pointless


s/upto/up to/


if the qemu-nbd process has quit due to an error, so check whether the
pid is still alive before waiting and retrying.


"But our tests are perfect and qemu-nbd never fails" :)

Yes, this makes sense.  Not 3.1 material on its own (after all, our 
testsuite isn't showing such failures, so we aren't wasting that time at 
the moment) - but worth including if the later patches end up in 3.1.




Signed-off-by: Daniel P. Berrangé 
---
  tests/qemu-iotests/common.nbd | 10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)


Reviewed-by: Eric Blake 



diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index f920a578f1..61e9e90fee 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -37,11 +37,19 @@ function nbd_server_stop()
  
  function nbd_server_wait_for_unix_socket()

  {
+pid=$1
+
  for ((i = 0; i < 300; i++))
  do
  if [ -r "$nbd_unix_socket" ]; then
  return
  fi
+kill -s 0 $pid 2>/dev/null
+if test $? != 0
+then
+echo "qemu-nbd unexpectedly quit"
+exit 1


Maybe the echo should be redirected to stderr.  But we aren't 
consistently doing that in other tests (_init_error does it, but other 
spots in check are not), so I'm not changing it.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH] migration/block-dirty-bitmap: fix Coverity CID1390625

2018-11-16 Thread John Snow



On 11/16/18 5:04 AM, Peter Maydell wrote:
> On 16 November 2018 at 03:28, John Snow  wrote:
>> I looked again. I think Vladimir's patch will shut up Coverity for sure,
>> feel free to apply it if you want this out of your hair.
>>
>> Stefan suggests the following, however;
>>
>>
>> diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
>> index 5e90f44c2f..00c068fda3 100644
>> --- a/migration/block-dirty-bitmap.c
>> +++ b/migration/block-dirty-bitmap.c
>> @@ -284,7 +284,7 @@ static int init_dirty_bitmap_migration(void)
>>  const char *drive_name = bdrv_get_device_or_node_name(bs);
>>
>>  /* skip automatically inserted nodes */
>> -while (bs && bs->drv && bs->implicit) {
>> +while (bs->drv && bs->implicit) {
>>  bs = backing_bs(bs);
>>  }
>>
>>
>> that by removing the assumption that bs could ever be null here (it
>> shouldn't), that we'll coax coverity into not warning anymore. I don't
>> know if that will work, because backing_bs can theoretically return NULL
>> and might convince coverity there's a problem. In practice it won't be.
>>
>> I don't know how to check this to see if Stefan's suggestion is appropriate.
>>
>> For such a small, trivial issue though, just merge this and be done with
>> it, in my opinion. If you want to take this fix directly as a "build
>> fix" I wouldn't object.
> 
> Personally I think the main benefit of this sort of Coverity
> warning is that it nudges you to work through and figure out
> whether something really can be NULL or not. Stefan's fix
> will satisfy Coverity, because what it's complaining about
> is that the code in one place assumes a pointer can't be NULL
> but in another place does have handling for NULL: it is the
> inconsistency that triggers the warning. If backing_bs()
> can't return NULL (ie if you would be happy in theory to put
> an assert() in after the call) then I think Stefan's fix is
> better. If it can return NULL ever then Vladimir's fix is
> what you want.
> 
> thanks
> -- PMM
> 

I understand the point of Coverity.

I really don't think it can, but I don't actually know how to verify it
or to convince Coverity of the same. Stefan's suggestion seems most
appropriate if it actually does calm Coverity down.

The loop above, there, just iterates down a chain in a graph, looking
for the first node that satisfies a certain criteria -- that it's not an
implicit node. These are reserved for intermediary filter nodes that
should ALWAYS have a child. That is their intended design.

(Is it mechanically possible to violate this? That's very hard to audit
and promise for you. There are always bugs lurking somewhere else.)

I think the only two places where we create such nodes are in:

block/commit.c:commit_top_bs->implicit = true;
block/mirror.c:mirror_top_bs->implicit = true;


in commit, it's inserted explicitly above `top`.
in mirror, we use bdrv_append(mirror_top_bs, bs, ...) to put the
implicit node above the target.

In both cases, the loop *cannot* end at a node without a backing file.

Now, this is not to say there might not be a bug somewhere else when we
do graph manipulation that we might accidentally end up with such an
errant configuration ...

Stefan's suggestion is probably most appropriate, *if* it actually
shushes Coverity. I'll send you a small patch and you can find out? I
don't want to task offload but I also genuinely don't know if that will
hint to coverity strongly enough that this is a false positive.

--js



Re: [Qemu-block] [PATCH 3/6] tests: pull qemu-nbd iotest helpers into common.nbd file

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

The helpers for starting/stopping qemu-nbd in 058 will be useful in
other test cases, so move them into a common.nbd file.


In fact, I already have a proposed patch 228 that I'm trying to clean up 
for potential inclusion in 3.1 that could use this!  [I'm still 
hammering on those patches, so I don't know if they will make 3.1 or be 
delayed to 4.0 - this patch is not needed in 3.1 unless either your 
later patches are 3.1 material or my patch ends up ready in time]


https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg00308.html



Signed-off-by: Daniel P. Berrangé 
---
  tests/qemu-iotests/058| 47 +
  tests/qemu-iotests/common.nbd | 56 +++
  2 files changed, 64 insertions(+), 39 deletions(-)
  create mode 100644 tests/qemu-iotests/common.nbd



Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH 2/6 for-3.1] nbd: stop waiting for a NBD response with NBD_CMD_DISC

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

When sending a NBD_CMD_DISC message there is no reply expected,
however, the nbd_read_eof() coroutine is still waiting for a reply.
In a plain NBD connection this doesn't matter as it will just get an
EOF, however, on a TLS connection it will get an interrupted TLS data
packet.


Does that depend on whether the server tried to gracefully close the TLS 
connection (and flushed enough packets to do so)?  At any rate, while 
it's nice to wait for the server to cleanly end TLS, we can't control 
the behavior of all servers, and I agree with your analysis that having 
the server silently deal with an abrupt shutdown is nicer than polluting 
stderr, as it's not actually a data loss situation for either the client 
or the server.



The nbd_read_eof() will then print an error message on the
console due to missing reply to NBD_CMD_DISC.

This can be seen with qemu-img

   $ qemu-img info \
   --object tls-creds-x509,dir=tlsdir,id=tls0,endpoint=client \
   --image-opts driver=nbd,host=127.0.0.1,port=9000,tls-creds=tls0
   qemu-img: Cannot read from TLS channel: Input/output error
   image: nbd://127.0.0.1:9000
   file format: nbd
   virtual size: 10M (10485760 bytes)
   disk size: unavailable

Simply setting the 'quit' flag after sending NBD_CMD_DISC is enough to
get the coroutine to stop waiting for a reply and thus supress the error


s/supress/suppress/


message. This fixes a regression introduced in

   commit be41c100c0d67f6072ddd4910c4b6f7d239f025f
   Author: Vladimir Sementsov-Ogievskiy 
   Date:   Fri May 26 14:09:13 2017 +0300

 nbd/client.c: use errp instead of LOG

 Move to modern errp scheme from just LOGging errors.

 Signed-off-by: Vladimir Sementsov-Ogievskiy 
 Message-Id: <20170526110913.89098-1-vsement...@virtuozzo.com>
 Signed-off-by: Paolo Bonzini 

Signed-off-by: Daniel P. Berrangé 
---
  block/nbd-client.c | 1 +
  1 file changed, 1 insertion(+)


Reviewed-by: Eric Blake 

Agree that this is 3.1 material; will queue.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH 1/6 for-3.1] nbd: fix whitespace in server error message

2018-11-16 Thread Eric Blake

On 11/16/18 9:53 AM, Daniel P. Berrangé wrote:

A space was missing after the option number was printed:

   Option 0x8not permitted before TLS

becomes

   Option 0x8 not permitted before TLS

This fixes

   commit 3668328303429f3bc93ab3365c66331600b06a2d
   Author: Eric Blake 
   Date:   Fri Oct 14 13:33:09 2016 -0500

 nbd: Send message along with server NBD_REP_ERR errors

Signed-off-by: Daniel P. Berrangé 
---
  nbd/server.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbd/server.c b/nbd/server.c
index 4e8f5ae51b..12e8139f95 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1135,7 +1135,7 @@ static int nbd_negotiate_options(NBDClient *client, 
uint16_t myflags,
  
  default:

  ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD, errp,
-   "Option 0x%" PRIx32
+   "Option 0x%" PRIx32 " "
 "not permitted before TLS", option);


Visually, I'd include the space in the next line instead of on its own 
(but that's aesthetics, not semantic). Agree that this is 3.1 material; 
I'll queue it up and send a PR on Monday.


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH for-3.1? 1/3] block: Always abort reopen after prepare succeeded

2018-11-16 Thread Alberto Garcia
On Fri 16 Nov 2018 04:53:00 PM CET, Max Reitz wrote:
> bdrv_reopen_multiple() does not invoke bdrv_reopen_abort() for the
> element of the reopen queue for which bdrv_reopen_prepare() failed,
> because it assumes that the prepare function will have rolled back all
> changes already.
>
> However, bdrv_reopen_prepare() does not do this in every case: It may
> notice an error after BlockDriver.bdrv_reopen_prepare() succeeded, and
> it will not invoke BlockDriver.bdrv_reopen_abort() then; and neither
> will bdrv_reopen_multiple(), as explained above.
>
> This is wrong because we must always call .bdrv_reopen_commit() or
> .bdrv_reopen_abort() after .bdrv_reopen_prepare() has succeeded.
> Otherwise, the block driver has no chance to undo what it has done in
> its implementation of .bdrv_reopen_prepare().
>
> To fix this, bdrv_reopen_prepare() has to call .bdrv_reopen_abort() if
> it wants to return an error after .bdrv_reopen_prepare() has succeeded.
>
> Signed-off-by: Max Reitz 
> ---
>  block.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/block.c b/block.c
> index fd67e14dfa..7f5859aa74 100644
> --- a/block.c
> +++ b/block.c
> @@ -3332,7 +3332,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
> BlockReopenQueue *queue,
>  if (!qobject_is_equal(new, old)) {
>  error_setg(errp, "Cannot change the option '%s'", 
> entry->key);
>  ret = -EINVAL;
> -goto error;
> +goto late_error;
>  }
>  } while ((entry = qdict_next(reopen_state->options, entry)));
>  }
> @@ -3340,7 +3340,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
> BlockReopenQueue *queue,
>  ret = bdrv_check_perm(reopen_state->bs, queue, reopen_state->perm,
>reopen_state->shared_perm, NULL, errp);
>  if (ret < 0) {
> -goto error;
> +goto late_error;
>  }
>  
>  ret = 0;
> @@ -3354,6 +3354,19 @@ error:
>  qobject_unref(orig_reopen_opts);
>  g_free(discard);
>  return ret;
> +
> +late_error:
> +/* drv->bdrv_reopen_prepare() has succeeded, so we need to call
> + * drv->bdrv_reopen_abort() before signaling an error
> + * (bdrv_reopen_multiple() will not call bdrv_reopen_abort() when
> + * the respective bdrv_reopen_prepare() failed) */
> +if (drv->bdrv_reopen_abort) {
> +drv->bdrv_reopen_abort(reopen_state);
> +}
> +qemu_opts_del(opts);
> +qobject_unref(orig_reopen_opts);
> +g_free(discard);
> +return ret;
>  }

Instead of having two exit points you could also have something like
bool drv_prepared, set it to 'true' after drv->bdrv_reopen_prepare() has
succeeded and then simply add this at the end:

if (ret < 0 && drv_prepared) {
   drv->bdrv_reopen_abort(reopen_state);
} 

Berto



[Qemu-block] [PATCH 6/6] tests: exercise NBD server in TLS mode

2018-11-16 Thread Daniel P . Berrangé
Add tests that validate it is possible to connect to an NBD server
running TLS mode. Also test mis-matched TLS vs non-TLS connections
correctly fail.
---
 tests/qemu-iotests/233| 99 +++
 tests/qemu-iotests/233.out| 33 
 tests/qemu-iotests/common.nbd | 47 +
 tests/qemu-iotests/group  |  1 +
 4 files changed, 180 insertions(+)
 create mode 100755 tests/qemu-iotests/233
 create mode 100644 tests/qemu-iotests/233.out

diff --git a/tests/qemu-iotests/233 b/tests/qemu-iotests/233
new file mode 100755
index 00..8f3d5bd013
--- /dev/null
+++ b/tests/qemu-iotests/233
@@ -0,0 +1,99 @@
+#!/bin/bash
+#
+# Test NBD TLS certificate / authorization integration
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=berra...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+nbd_server_stop
+_cleanup_test_img
+tls_x509_cleanup
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+. ./common.tls
+. ./common.nbd
+
+_supported_fmt raw qcow2
+_supported_proto file
+_supported_os Linux
+_require_command QEMU_NBD
+
+nbd_server_set_tcp_port
+tls_x509_init
+
+echo
+echo "== preparing TLS creds =="
+
+tls_x509_create_root_ca "ca1"
+tls_x509_create_root_ca "ca2"
+tls_x509_create_server "ca1" "server1"
+tls_x509_create_client "ca1" "client1"
+tls_x509_create_client "ca2" "client2"
+
+echo
+echo "== preparing image =="
+_make_test_img 64M
+
+
+echo
+echo "== check TLS client to plain server fails =="
+nbd_server_start_tcp_socket "$TEST_IMG"
+
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0
+
+nbd_server_stop
+
+echo
+echo "== check plain client to TLS server fails =="
+
+nbd_server_start_tcp_socket --object 
tls-creds-x509,dir=${tls_dir}/server1,endpoint=server,id=tls0,verify-peer=yes 
--tls-creds tls0 "$TEST_IMG"
+
+$QEMU_IMG info nbd://localhost:$nbd_tcp_port
+
+echo
+echo "== check TLS works =="
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0
+
+echo
+echo "== check TLS with different CA fails =="
+$QEMU_IMG info --image-opts \
+--object tls-creds-x509,dir=${tls_dir}/client2,endpoint=client,id=tls0 \
+driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/233.out b/tests/qemu-iotests/233.out
new file mode 100644
index 00..eaa410c270
--- /dev/null
+++ b/tests/qemu-iotests/233.out
@@ -0,0 +1,33 @@
+QA output created by 233
+
+== preparing TLS creds ==
+Generating a self signed certificate...
+Generating a self signed certificate...
+Generating a signed certificate...
+Generating a signed certificate...
+Generating a signed certificate...
+
+== preparing image ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+
+== check TLS client to plain server fails ==
+option negotiation failed: read failed: Unexpected end-of-file before all 
bytes were read
+qemu-img: Could not open 
'driver=nbd,host=127.0.0.1,port=10809,tls-creds=tls0': Denied by server for 
option 5 (starttls)
+server reported: TLS not configured
+
+== check plain client to TLS server fails ==
+option negotiation failed: read failed: Unexpected end-of-file before all 
bytes were read
+qemu-img: Could not open 'nbd://localhost:10809': TLS negotiation required 
before option 8 (structured reply)
+server reported: Option 0x8 not permitted before TLS
+write failed (error message): Unable to write to socket: Broken pipe
+
+== check TLS works ==
+image: nbd://127.0.0.1:10809
+file format: nbd
+virtual size: 64M (67108864 bytes)
+disk size: unavailable
+
+== check TLS with different CA fails ==
+option negotiation failed: Verify failed: No certificate was found.
+qemu-img: Could not open 
'driver=nbd,host=127.0.0.1,port=10809,tls-creds=tls0': The certificate hasn't 
got a known issuer
+*** done
diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index 

Re: [Qemu-block] [PATCH v2 05/13] block: Switch to 64-bit bl.max_transfer

2018-11-16 Thread Eric Blake

On 11/16/18 9:32 AM, Kevin Wolf wrote:

Am 15.11.2018 um 17:28 hat Eric Blake geschrieben:

On 11/15/18 9:45 AM, Kevin Wolf wrote:

Am 15.11.2018 um 03:03 hat Eric Blake geschrieben:

This change has no semantic impact: all drivers either leave the
value at 0 (no inherent 32-bit limit is still translated into
fragmentation below 2G; see the previous patch for that audit), or
set it to a value less than 2G.  However, switching to a larger
type and enforcing the 2G cap at the block layer makes it easier
to audit specific drivers for their robustness to larger sizing,
by letting them specify a value larger than INT_MAX if they have
been audited to be 64-bit clean.




+
+/* Clamp max_transfer to 2G */
+if (bs->bl.max_transfer > UINT32_MAX) {


UINT32_MAX is 4G, not 2G.

Would it make more sense to make BDRV_REQUEST_MAX_BYTES the maximum
anyway?


D'oh.  Yes, that's what I intended, possibly by spelling it INT_MAX (the
fact that the 'if' goes away in patch 13 is not an excuse for sloppy coding
in the meantime).


INT_MAX is not a different spelling of BDRV_REQUEST_MAX_BYTES. The
latter is slightly lower (0x7e00).


Ah, but:


+if (bs->bl.max_transfer > UINT32_MAX) {
+bs->bl.max_transfer = QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
+  MAX(bs->bl.opt_transfer,
+  bs->bl.request_alignment));
+}


QEMU_ALIGN_DOWN() will change INT_MAX to BDRV_REQUEST_MAX_BYTES if 
bs->bl.request_alignment is 512 and opt_transfer is 0; and if either 
alignment number is larger than 512, max_transfer is capped even lower 
regardless of whether I was aligning INT_MAX or BDRV_REQUEST_MAX_BYTES 
down to an alignment boundary.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



[Qemu-block] [PATCH 5/6] tests: add iotests helpers for dealing with TLS certificates

2018-11-16 Thread Daniel P . Berrangé
Add helpers to common.tls for creating TLS certificates for a CA,
server and client.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/common.tls | 139 ++
 1 file changed, 139 insertions(+)
 create mode 100644 tests/qemu-iotests/common.tls

diff --git a/tests/qemu-iotests/common.tls b/tests/qemu-iotests/common.tls
new file mode 100644
index 00..6178ca5764
--- /dev/null
+++ b/tests/qemu-iotests/common.tls
@@ -0,0 +1,139 @@
+#!/bin/bash
+#
+# Helpers for TLS related config
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+tls_dir="${TEST_DIR}/tls"
+
+function tls_x509_cleanup()
+{
+rm -f ${tls_dir}/*.pem
+rm -f ${tls_dir}/*/*.pem
+rmdir ${tls_dir}/*
+rmdir ${tls_dir}
+}
+
+
+function tls_x509_init()
+{
+mkdir "${tls_dir}"
+
+# use a fixed key so we don't waste system entropy on
+# each test run
+cat > ${tls_dir}/key.pem < ${tls_dir}/ca.info <&1 | head -1
+
+rm -f ${tls_dir}/ca.info
+}
+
+
+function tls_x509_create_server()
+{
+caname=$1
+name=$2
+
+mkdir ${tls_dir}/$name
+cat > ${tls_dir}/cert.info <&1 | head -1
+ln -s ${tls_dir}/$caname-cert.pem ${tls_dir}/$name/ca-cert.pem
+ln -s ${tls_dir}/key.pem ${tls_dir}/$name/server-key.pem
+
+rm -f ${tls_dir}/cert.info
+}
+
+
+function tls_x509_create_client()
+{
+caname=$1
+name=$2
+
+mkdir ${tls_dir}/$name
+cat > ${tls_dir}/cert.info <&1 | head -1
+ln -s ${tls_dir}/$caname-cert.pem ${tls_dir}/$name/ca-cert.pem
+ln -s ${tls_dir}/key.pem ${tls_dir}/$name/client-key.pem
+
+rm -f  ${tls_dir}/cert.info
+}
-- 
2.19.1




Re: [Qemu-block] [PATCH for 3.1 v9] qcow2: Document some maximum size constraints

2018-11-16 Thread Kevin Wolf
Am 15.11.2018 um 19:34 hat Eric Blake geschrieben:
> Although off_t permits up to 63 bits (8EB) of file offsets, in
> practice, we're going to hit other limits first.  Document some
> of those limits in the qcow2 spec (some are inherent, others are
> implementation choices of qemu), and how choice of cluster size
> can influence some of the limits.
> 
> While we cannot map any uncompressed virtual cluster to any
> address higher than 64 PB (56 bits) (due to the current L1/L2
> field encoding stopping at bit 55), qemu's cap of 8M for the
> refcount table can still access larger host addresses for some
> combinations of large clusters and small refcount_order.  For
> comparison, ext4 with 4k blocks caps files at 16PB.
> 
> Another interesting limit: for compressed clusters, the L2 layout
> requires an ever-smaller maximum host offset as cluster size gets
> larger, down to a 512 TB maximum with 2M clusters.  In particular,
> note that with a cluster size of 8k or smaller, the L2 entry for
> a compressed cluster could technically point beyond the 64PB mark,
> but when you consider that with 8k clusters and refcount_order = 0,
> you cannot access beyond 512T without exceeding qemu's limit of an
> 8M cap on the refcount table, it is unlikely that any image in the
> wild has attempted to do so.  To be safe, let's document that bits
> beyond 55 in a compressed cluster must be 0.
> 
> Signed-off-by: Eric Blake 
> 
> ---
> v9: Yet more wording tweaks, to call out the difference between
> inherent (L2 reserved bit) and implementation (qemu's 32M L1 and
> 8M refcount) limits.
> 
> Designed to replace commit 1da4dc02 on Kevin's block branch.

Thanks, replaced the commit.

Kevin



[Qemu-block] [PATCH 3/6] tests: pull qemu-nbd iotest helpers into common.nbd file

2018-11-16 Thread Daniel P . Berrangé
The helpers for starting/stopping qemu-nbd in 058 will be useful in
other test cases, so move them into a common.nbd file.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/058| 47 +
 tests/qemu-iotests/common.nbd | 56 +++
 2 files changed, 64 insertions(+), 39 deletions(-)
 create mode 100644 tests/qemu-iotests/common.nbd

diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058
index 5eb8784669..cd73250c48 100755
--- a/tests/qemu-iotests/058
+++ b/tests/qemu-iotests/058
@@ -29,55 +29,19 @@ echo "QA output created by $seq"
 here=`pwd`
 status=1   # failure is the default!
 
-nbd_unix_socket=$TEST_DIR/test_qemu_nbd_socket
-nbd_snapshot_img="nbd:unix:$nbd_unix_socket"
-rm -f "${TEST_DIR}/qemu-nbd.pid"
-
-_cleanup_nbd()
-{
-local NBD_SNAPSHOT_PID
-if [ -f "${TEST_DIR}/qemu-nbd.pid" ]; then
-read NBD_SNAPSHOT_PID < "${TEST_DIR}/qemu-nbd.pid"
-rm -f "${TEST_DIR}/qemu-nbd.pid"
-if [ -n "$NBD_SNAPSHOT_PID" ]; then
-kill "$NBD_SNAPSHOT_PID"
-fi
-fi
-rm -f "$nbd_unix_socket"
-}
-
-_wait_for_nbd()
-{
-for ((i = 0; i < 300; i++))
-do
-if [ -r "$nbd_unix_socket" ]; then
-return
-fi
-sleep 0.1
-done
-echo "Failed in check of unix socket created by qemu-nbd"
-exit 1
-}
-
-converted_image=$TEST_IMG.converted
-
 _export_nbd_snapshot()
 {
-_cleanup_nbd
-$QEMU_NBD -v -t -k "$nbd_unix_socket" "$TEST_IMG" -l $1 &
-_wait_for_nbd
+nbd_server_start_unix_socket "$TEST_IMG" -l $1
 }
 
 _export_nbd_snapshot1()
 {
-_cleanup_nbd
-$QEMU_NBD -v -t -k "$nbd_unix_socket" "$TEST_IMG" -l snapshot.name=$1 &
-_wait_for_nbd
+nbd_server_start_unix_socket "$TEST_IMG" -l snapshot.name=$1
 }
 
 _cleanup()
 {
-_cleanup_nbd
+nbd_server_stop
 _cleanup_test_img
 rm -f "$converted_image"
 }
@@ -87,6 +51,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 . ./common.pattern
+. ./common.nbd
 
 _supported_fmt qcow2
 _supported_proto file
@@ -95,6 +60,10 @@ _require_command QEMU_NBD
 # Internal snapshots are (currently) impossible with refcount_bits=1
 _unsupported_imgopts 'refcount_bits=1[^0-9]'
 
+nbd_snapshot_img="nbd:unix:$nbd_unix_socket"
+
+converted_image=$TEST_IMG.converted
+
 # Use -f raw instead of -f $IMGFMT for the NBD connection
 QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE"
 
diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
new file mode 100644
index 00..f920a578f1
--- /dev/null
+++ b/tests/qemu-iotests/common.nbd
@@ -0,0 +1,56 @@
+#!/bin/bash
+# -*- shell-script-mode -*-
+#
+# Helpers for NBD server related config
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+nbd_unix_socket="${TEST_DIR}/qemu-nbd.sock"
+nbd_pid_file="${TEST_DIR}/qemu-nbd.pid"
+
+function nbd_server_stop()
+{
+local NBD_PID
+if [ -f "$nbd_pid_file" ]; then
+read NBD_PID < "$nbd_pid_file"
+rm -f "$nbd_pid_file"
+if [ -n "$NBD_PID" ]; then
+kill "$NBD_PID"
+fi
+fi
+rm -f "$nbd_unix_socket"
+}
+
+function nbd_server_wait_for_unix_socket()
+{
+for ((i = 0; i < 300; i++))
+do
+if [ -r "$nbd_unix_socket" ]; then
+return
+fi
+sleep 0.1
+done
+echo "Failed in check of unix socket created by qemu-nbd"
+exit 1
+}
+
+function nbd_server_start_unix_socket()
+{
+nbd_server_stop
+$QEMU_NBD -v -t -k "$nbd_unix_socket" $@ &
+nbd_server_wait_for_unix_socket
+}
-- 
2.19.1




[Qemu-block] [PATCH 2/6 for-3.1] nbd: stop waiting for a NBD response with NBD_CMD_DISC

2018-11-16 Thread Daniel P . Berrangé
When sending a NBD_CMD_DISC message there is no reply expected,
however, the nbd_read_eof() coroutine is still waiting for a reply.
In a plain NBD connection this doesn't matter as it will just get an
EOF, however, on a TLS connection it will get an interrupted TLS data
packet. The nbd_read_eof() will then print an error message on the
console due to missing reply to NBD_CMD_DISC.

This can be seen with qemu-img

  $ qemu-img info \
  --object tls-creds-x509,dir=tlsdir,id=tls0,endpoint=client \
  --image-opts driver=nbd,host=127.0.0.1,port=9000,tls-creds=tls0
  qemu-img: Cannot read from TLS channel: Input/output error
  image: nbd://127.0.0.1:9000
  file format: nbd
  virtual size: 10M (10485760 bytes)
  disk size: unavailable

Simply setting the 'quit' flag after sending NBD_CMD_DISC is enough to
get the coroutine to stop waiting for a reply and thus suppress the error
message. This fixes a regression introduced in

  commit be41c100c0d67f6072ddd4910c4b6f7d239f025f
  Author: Vladimir Sementsov-Ogievskiy 
  Date:   Fri May 26 14:09:13 2017 +0300

nbd/client.c: use errp instead of LOG

Move to modern errp scheme from just LOGging errors.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20170526110913.89098-1-vsement...@virtuozzo.com>
Signed-off-by: Paolo Bonzini 

Signed-off-by: Daniel P. Berrangé 
---
 block/nbd-client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 76e9ca3abe..5f63e4b8f1 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -961,6 +961,7 @@ void nbd_client_close(BlockDriverState *bs)
 }
 
 nbd_send_request(client->ioc, &request);
+client->quit = true;
 
 nbd_teardown_connection(bs);
 }
-- 
2.19.1




[Qemu-block] [PATCH 4/6] tests: check if qemu-nbd is still alive before waiting

2018-11-16 Thread Daniel P . Berrangé
If the qemu-nbd UNIX socket has not shown up, the tests will sleep a bit
and then check again repeatedly for up to 30 seconds. This is pointless
if the qemu-nbd process has quit due to an error, so check whether the
pid is still alive before waiting and retrying.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/common.nbd | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index f920a578f1..61e9e90fee 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -37,11 +37,19 @@ function nbd_server_stop()
 
 function nbd_server_wait_for_unix_socket()
 {
+pid=$1
+
 for ((i = 0; i < 300; i++))
 do
 if [ -r "$nbd_unix_socket" ]; then
 return
 fi
+kill -s 0 $pid 2>/dev/null
+if test $? != 0
+then
+echo "qemu-nbd unexpectedly quit"
+exit 1
+fi
 sleep 0.1
 done
 echo "Failed in check of unix socket created by qemu-nbd"
@@ -52,5 +60,5 @@ function nbd_server_start_unix_socket()
 {
 nbd_server_stop
 $QEMU_NBD -v -t -k "$nbd_unix_socket" $@ &
-nbd_server_wait_for_unix_socket
+nbd_server_wait_for_unix_socket $!
 }
-- 
2.19.1




[Qemu-block] [PATCH for-3.1? 2/3] file-posix: Fix shared locks on reopen commit

2018-11-16 Thread Max Reitz
s->locked_shared_perm is the set of bits locked in the file, which is
the inverse of the permissions actually shared.  So we need to pass them
as they are to raw_apply_lock_bytes() instead of inverting them again.

Reported-by: Alberto Garcia 
Signed-off-by: Max Reitz 
---
 block/file-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index df3a8d7cdf..8460d003f0 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -963,7 +963,7 @@ static void raw_reopen_commit(BDRVReopenState *state)
 
 /* Copy locks to the new fd before closing the old one. */
 raw_apply_lock_bytes(NULL, rs->fd, s->locked_perm,
- ~s->locked_shared_perm, false, &local_err);
+ s->locked_shared_perm, false, &local_err);
 if (local_err) {
 /* shouldn't fail in a sane host, but report it just in case. */
 error_report_err(local_err);
-- 
2.17.2




[Qemu-block] [PATCH 1/6 for-3.1] nbd: fix whitespace in server error message

2018-11-16 Thread Daniel P . Berrangé
A space was missing after the option number was printed:

  Option 0x8not permitted before TLS

becomes

  Option 0x8 not permitted before TLS

This fixes

  commit 3668328303429f3bc93ab3365c66331600b06a2d
  Author: Eric Blake 
  Date:   Fri Oct 14 13:33:09 2016 -0500

nbd: Send message along with server NBD_REP_ERR errors

Signed-off-by: Daniel P. Berrangé 
---
 nbd/server.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbd/server.c b/nbd/server.c
index 4e8f5ae51b..12e8139f95 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1135,7 +1135,7 @@ static int nbd_negotiate_options(NBDClient *client, 
uint16_t myflags,
 
 default:
 ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD, errp,
-   "Option 0x%" PRIx32
+   "Option 0x%" PRIx32 " "
"not permitted before TLS", option);
 /* Let the client keep trying, unless they asked to
  * quit. In this mode, we've already sent an error, so
-- 
2.19.1




[Qemu-block] [PATCH for-3.1? 3/3] iotests: Test file-posix locking and reopen

2018-11-16 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/182 | 71 ++
 tests/qemu-iotests/182.out |  9 +
 2 files changed, 80 insertions(+)

diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182
index 4b31592fb8..3b7689c1d5 100755
--- a/tests/qemu-iotests/182
+++ b/tests/qemu-iotests/182
@@ -31,6 +31,7 @@ status=1  # failure is the default!
 _cleanup()
 {
 _cleanup_test_img
+rm -f "$TEST_IMG.overlay"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -71,6 +72,76 @@ echo 'quit' | $QEMU -nographic -monitor stdio \
 
 _cleanup_qemu
 
+echo
+echo '=== Testing reopen ==='
+echo
+
+# This tests that reopening does not unshare any permissions it should
+# not unshare
+# (There was a bug where reopening shared exactly the opposite of the
+# permissions it was supposed to share)
+
+_launch_qemu
+
+_send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'qmp_capabilities'}" \
+'return'
+
+# Open the image without any format layer (we are not going to access
+# it, so that is fine)
+# This should keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-add',
+  'arguments': {
+  'node-name': 'node0',
+  'driver': 'file',
+  'filename': '$TEST_IMG',
+  'locking': 'on'
+  } }" \
+'return' \
+'error'
+
+# This snapshot will perform a reopen to drop R/W to RO.
+# It should still keep all permissions shared.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-snapshot-sync',
+  'arguments': {
+  'node-name': 'node0',
+  'snapshot-file': '$TEST_IMG.overlay',
+  'snapshot-node-name': 'node1'
+  } }" \
+'return' \
+'error'
+
+# Now open the same file again
+# This does not require any permissions (and does not unshare any), so
+# this will not conflict with node0.
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'blockdev-add',
+  'arguments': {
+  'node-name': 'node1',
+  'driver': 'file',
+  'filename': '$TEST_IMG',
+  'locking': 'on'
+  } }" \
+'return' \
+'error'
+
+# Now we attach the image to a virtio-blk device.  This device does
+# require some permissions (at least WRITE and READ_CONSISTENT), so if
+# reopening node0 unshared any (which it should not have), this will
+# fail (but it should not).
+success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
+"{'execute': 'device_add',
+  'arguments': {
+  'driver': 'virtio-blk',
+  'drive': 'node1'
+  } }" \
+'return' \
+'error'
+
+_cleanup_qemu
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/182.out b/tests/qemu-iotests/182.out
index f1463c8862..af501ca3f3 100644
--- a/tests/qemu-iotests/182.out
+++ b/tests/qemu-iotests/182.out
@@ -5,4 +5,13 @@ Starting QEMU
 Starting a second QEMU using the same image should fail
 QEMU_PROG: -drive file=TEST_DIR/t.qcow2,if=none,id=drive0,file.locking=on: 
Failed to get "write" lock
 Is another process using the image [TEST_DIR/t.qcow2]?
+
+=== Testing reopen ===
+
+{"return": {}}
+{"return": {}}
+Formatting 'TEST_DIR/t.qcow2.overlay', fmt=qcow2 size=197120 
backing_file=TEST_DIR/t.qcow2 backing_fmt=file cluster_size=65536 
lazy_refcounts=off refcount_bits=16
+{"return": {}}
+{"return": {}}
+{"return": {}}
 *** done
-- 
2.17.2




[Qemu-block] [PATCH 0/6] Misc fixes to NBD

2018-11-16 Thread Daniel P . Berrangé
This does two minor fixes to the NBD code and adds significant coverage
of the NBD TLS support to detect future problems.

The first two patches should be for 3.1.

The tests can wait till 4.0 if desired.

Daniel P. Berrangé (6):
  nbd: fix whitespace in server error message
  nbd: stop waiting for a NBD response with NBD_CMD_DISC
  tests: pull qemu-nbd iotest helpers into common.nbd file
  tests: check if qemu-nbd is still alive before waiting
  tests: add iotests helpers for dealing with TLS certificates
  tests: exercise NBD server in TLS mode

 block/nbd-client.c|   1 +
 nbd/server.c  |   2 +-
 tests/qemu-iotests/058|  47 ++--
 tests/qemu-iotests/233|  99 
 tests/qemu-iotests/233.out|  33 
 tests/qemu-iotests/common.nbd | 111 +++
 tests/qemu-iotests/common.tls | 139 ++
 tests/qemu-iotests/group  |   1 +
 8 files changed, 393 insertions(+), 40 deletions(-)
 create mode 100755 tests/qemu-iotests/233
 create mode 100644 tests/qemu-iotests/233.out
 create mode 100644 tests/qemu-iotests/common.nbd
 create mode 100644 tests/qemu-iotests/common.tls

-- 
2.19.1




[Qemu-block] [PATCH for-3.1? 1/3] block: Always abort reopen after prepare succeeded

2018-11-16 Thread Max Reitz
bdrv_reopen_multiple() does not invoke bdrv_reopen_abort() for the
element of the reopen queue for which bdrv_reopen_prepare() failed,
because it assumes that the prepare function will have rolled back all
changes already.

However, bdrv_reopen_prepare() does not do this in every case: It may
notice an error after BlockDriver.bdrv_reopen_prepare() succeeded, and
it will not invoke BlockDriver.bdrv_reopen_abort() then; and neither
will bdrv_reopen_multiple(), as explained above.

This is wrong because we must always call .bdrv_reopen_commit() or
.bdrv_reopen_abort() after .bdrv_reopen_prepare() has succeeded.
Otherwise, the block driver has no chance to undo what it has done in
its implementation of .bdrv_reopen_prepare().

To fix this, bdrv_reopen_prepare() has to call .bdrv_reopen_abort() if
it wants to return an error after .bdrv_reopen_prepare() has succeeded.

Signed-off-by: Max Reitz 
---
 block.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index fd67e14dfa..7f5859aa74 100644
--- a/block.c
+++ b/block.c
@@ -3332,7 +3332,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
 if (!qobject_is_equal(new, old)) {
 error_setg(errp, "Cannot change the option '%s'", entry->key);
 ret = -EINVAL;
-goto error;
+goto late_error;
 }
 } while ((entry = qdict_next(reopen_state->options, entry)));
 }
@@ -3340,7 +3340,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
 ret = bdrv_check_perm(reopen_state->bs, queue, reopen_state->perm,
   reopen_state->shared_perm, NULL, errp);
 if (ret < 0) {
-goto error;
+goto late_error;
 }
 
 ret = 0;
@@ -3354,6 +3354,19 @@ error:
 qobject_unref(orig_reopen_opts);
 g_free(discard);
 return ret;
+
+late_error:
+/* drv->bdrv_reopen_prepare() has succeeded, so we need to call
+ * drv->bdrv_reopen_abort() before signaling an error
+ * (bdrv_reopen_multiple() will not call bdrv_reopen_abort() when
+ * the respective bdrv_reopen_prepare() failed) */
+if (drv->bdrv_reopen_abort) {
+drv->bdrv_reopen_abort(reopen_state);
+}
+qemu_opts_del(opts);
+qobject_unref(orig_reopen_opts);
+g_free(discard);
+return ret;
 }
 
 /*
-- 
2.17.2




[Qemu-block] [PATCH for-3.1? 0/3] block: Fix two minor reopen issues

2018-11-16 Thread Max Reitz
These are fixes for issues I found when looking after something Berto
has reported.  The second patch fixes that issue Berto found, the first
one is only kind of related.

For the first patch:  bdrv_reopen_abort() or bdrv_reopen_commit() are
called only if the respective bdrv_reopen_prepare() has succeeded.
However, bdrv_reopen_prepare() can fail even if the block driver’s
.bdrv_reopen_prepare() succeeded.  In that case, the block driver is
expecting a .bdrv_reopen_abort() or .bdrv_reopen_commit(), but will
never get it.

This is fixed by making bdrv_reopen_prepare() call .bdrv_reopen_abort()
if an error occurs after .bdrv_reopen_prepare() has already succeeded.

In practice this just prevents resource leaks, so nothing too dramatic
(fine for qemu-next).  It also means I’ve been incapable of writing a
test, sorry.


The second issue is more severe: file-posix applies the inverse share
locks when reopening.  Now the user can only directly do reopens from
the monitor for now, so that wouldn’t be so bad.  But of course there
are other places in qemu that implicitly reopen nodes, like dropping
R/W to RO or gaining R/W from RO.  Most of these places are symmetrical
at least (they’ll get R/W on an RO image for a short period of time and
then drop back to RO), so you’ll see the wrong shared permission locks
only for a short duration.  But at least blockdev-snapshot and
blockdev-snapshot-sync are one way; they drop R/W to RO and stay there.
So if you do a blockdev-snapshot*, the shared permission bits will be
flipped.  This is therefore very much user-visible.

It’s not catastrophic, though, so I’m not sure whether we need it in
3.1.  It’s definitely a bugfix, and I think we have patches queued for
the next RC already, so I think it makes sense to include at least
patches 2 and 3 as well.


Max Reitz (3):
  block: Always abort reopen after prepare succeeded
  file-posix: Fix shared locks on reopen commit
  iotests: Test file-posix locking and reopen

 block.c| 17 +++--
 block/file-posix.c |  2 +-
 tests/qemu-iotests/182 | 71 ++
 tests/qemu-iotests/182.out |  9 +
 4 files changed, 96 insertions(+), 3 deletions(-)

-- 
2.17.2




Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Kevin Wolf
Am 16.11.2018 um 16:27 hat Max Reitz geschrieben:
> On 16.11.18 16:18, Kevin Wolf wrote:
> > Am 16.11.2018 um 16:03 hat Alberto Garcia geschrieben:
> >>> I don't think anything needs a way to generally block graph changes
> >>> around some node.  We only need to prevent changes to very specific
> >>> sets of edges.  This is something that the permission system just
> >>> cannot do.
> >>
> >> But what would you do then?
> > 
> > I agree with you mostly in that I think that most problems that Max
> > mentioned aren't real. The only real problem I see with GRAPH_MOD as a
> > permission on the node level is this overblocking
> 
> I wholeheartedly disagree.  Yes, it is true that most of the issues I
> thought of can be fixed, and so those problems are not problems in a
> technical sense.  But to me this whole discussion points to the greatest
> issue I have, which is that GRAPH_MOD is just too complicated to
> understand.  And I don't like a solution that works on a technical level
> but that everybody is too afraid to touch because it's too weird.

GRAPH_MOD with the meaning that Berto suggested isn't weird or
complicated to understand. It's only the wrong tool because it blocks
more than we want to block. But if we didn't care about that, it could
be just another permission like any other.

If you want to change the graph, you'd need to get the permission first,
and bdrv_replace_child_noperm() could assert that at least one parent of
the parent node has acquired the permission (unless you want to pass the
exact parent BdrvChild to it; maybe this is what we would really do
then).

> We have this discussion again and again, and in the end we always come
> up with something that looks like it might work, but it's just so weird
> that we can't even remember it.

I don't think we ever come up with something, weird or not, that
achieves what we wanted to achieve - because the problem simply can't be
solved properly at the node level.

> Maybe it's just me, though.  Frankly, I think the permission system
> itself is already too complicated as it is, but I don't have a simpler
> solution there.

It doesn't feel too bad to me, but that's subjective.

Kevin


signature.asc
Description: PGP signature


Re: [Qemu-block] Change in qemu 2.12 causes qemu-img convert to NBD to write more data

2018-11-16 Thread Kevin Wolf
Am 15.11.2018 um 23:27 hat Nir Soffer geschrieben:
> On Sun, Nov 11, 2018 at 6:11 PM Nir Soffer  wrote:
> 
> > On Wed, Nov 7, 2018 at 7:55 PM Nir Soffer  wrote:
> >
> >> On Wed, Nov 7, 2018 at 7:27 PM Kevin Wolf  wrote:
> >>
> >>> Am 07.11.2018 um 15:56 hat Nir Soffer geschrieben:
> >>> > Wed, Nov 7, 2018 at 4:36 PM Richard W.M. Jones 
> >>> wrote:
> >>> >
> >>> > > Another thing I tried was to change the NBD server (nbdkit) so that
> >>> it
> >>> > > doesn't advertise zero support to the client:
> >>> > >
> >>> > >   $ nbdkit --filter=log --filter=nozero memory size=6G
> >>> logfile=/tmp/log \
> >>> > >   --run './qemu-img convert ./fedora-28.img -n $nbd'
> >>> > >   $ grep '\.\.\.$' /tmp/log | sed 's/.*\([A-Z][a-z]*\).*/\1/' | uniq
> >>> -c
> >>> > >2154 Write
> >>> > >
> >>> > > Not surprisingly no zero commands are issued.  The size of the write
> >>> > > commands is very uneven -- it appears to be send one command per
> >>> block
> >>> > > of zeroes or data.
> >>> > >
> >>> > > Nir: If we could get information from imageio about whether zeroing
> >>> is
> >>> > > implemented efficiently or not by the backend, we could change
> >>> > > virt-v2v / nbdkit to advertise this back to qemu.
> >>> >
> >>> > There is no way to detect the capability, ioctl(BLKZEROOUT) always
> >>> > succeeds, falling back to manual zeroing in the kernel silently
> >>> >
> >>> > Even if we could, sending zero on the wire from qemu may be even
> >>> > slower, and it looks like qemu send even more requests in this case
> >>> > (2154 vs ~1300).
> >>> >
> >>> > Looks like this optimization in qemu side leads to worse performance,
> >>> > so it should not be enabled by default.
> >>>
> >>> Well, that's overgeneralising your case a bit. If the backend does
> >>> support efficient zero writes (which file systems, the most common case,
> >>> generally do), doing one big write_zeroes request at the start can
> >>> improve performance quite a bit.
> >>>
> >>> It seems the problem is that we can't really know whether the operation
> >>> will be efficient because the backends generally don't tell us. Maybe
> >>> NBD could introduce a flag for this, but in the general case it appears
> >>> to me that we'll have to have a command line option.
> >>>
> >>> However, I'm curious what your exact use case and the backend used in it
> >>> is? Can something be improved there to actually get efficient zero
> >>> writes and get even better performance than by just disabling the big
> >>> zero write?
> >>
> >>
> >> The backend is some NetApp storage connected via FC. I don't have
> >> more info on this. We get zero rate of about 1G/s on this storage, which
> >> is quite slow compared with other storage we tested.
> >>
> >> One option we check now is if this is the kernel silent fallback to manual
> >> zeroing when the server advertise wrong value of write_same_max_bytes.
> >>
> >
> > We eliminated this using blkdiscard. This is what we get on with this
> > storage
> > zeroing 100G LV:
> >
> > for i in 1 2 4 8 16 32; do time blkdiscard -z -p ${i}m
> > /dev/6e1d84f9-f939-46e9-b108-0427a08c280c/2d5c06ce-6536-4b3c-a7b6-13c6d8e55ade;
> > done
> >
> > real 4m50.851s
> > user 0m0.065s
> > sys 0m1.482s
> >
> > real 4m30.504s
> > user 0m0.047s
> > sys 0m0.870s
> >
> > real 4m19.443s
> > user 0m0.029s
> > sys 0m0.508s
> >
> > real 4m13.016s
> > user 0m0.020s
> > sys 0m0.284s
> >
> > real 2m45.888s
> > user 0m0.011s
> > sys 0m0.162s
> >
> > real 2m10.153s
> > user 0m0.003s
> > sys 0m0.100s
> >
> > We are investigating why we get low throughput on this server, and also
> > will check
> > several other servers.
> >
> > Having a command line option to control this behavior sounds good. I don't
> >> have enough data to tell what should be the default, but I think the safe
> >> way would be to keep old behavior.
> >>
> >
> > We file this bug:
> > https://bugzilla.redhat.com/1648622
> >
> 
> More data from even slower storage - zeroing 10G lv on Kaminario K2
> 
> # time blkdiscard -z -p 32m /dev/test_vg/test_lv2
> 
> real50m12.425s
> user0m0.018s
> sys 2m6.785s
> 
> Maybe something is wrong with this storage, since we see this:
> 
> # grep -s "" /sys/block/dm-29/queue/* | grep write_same_max_bytes
> /sys/block/dm-29/queue/write_same_max_bytes:512
> 
> Since BLKZEROOUT always fallback to manual slow zeroing silently,
> maybe we can disable the aggressive pre-zero of the entire device
> for block devices, and keep this optimization for files when fallocate()
> is supported?

I'm not sure what the detour through NBD changes, but qemu-img directly
on a block device doesn't use BLKZEROOUT first, but
FALLOC_FL_PUNCH_HOLE. Maybe we can add a flag that avoids anything that
could be slow, such as BLKZEROOUT, as a fallback (and also the slow
emulation that QEMU itself would do if all kernel calls fail).

Kevin



Re: [Qemu-block] [PATCH v2 05/13] block: Switch to 64-bit bl.max_transfer

2018-11-16 Thread Kevin Wolf
Am 15.11.2018 um 17:28 hat Eric Blake geschrieben:
> On 11/15/18 9:45 AM, Kevin Wolf wrote:
> > Am 15.11.2018 um 03:03 hat Eric Blake geschrieben:
> > > This change has no semantic impact: all drivers either leave the
> > > value at 0 (no inherent 32-bit limit is still translated into
> > > fragmentation below 2G; see the previous patch for that audit), or
> > > set it to a value less than 2G.  However, switching to a larger
> > > type and enforcing the 2G cap at the block layer makes it easier
> > > to audit specific drivers for their robustness to larger sizing,
> > > by letting them specify a value larger than INT_MAX if they have
> > > been audited to be 64-bit clean.
> > > 
> 
> > > +++ b/block/io.c
> > > @@ -159,6 +159,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error 
> > > **errp)
> > >   if (drv->bdrv_refresh_limits) {
> > >   drv->bdrv_refresh_limits(bs, errp);
> > >   }
> > > +
> > > +/* Clamp max_transfer to 2G */
> > > +if (bs->bl.max_transfer > UINT32_MAX) {
> > 
> > UINT32_MAX is 4G, not 2G.
> > 
> > Would it make more sense to make BDRV_REQUEST_MAX_BYTES the maximum
> > anyway?
> 
> D'oh.  Yes, that's what I intended, possibly by spelling it INT_MAX (the
> fact that the 'if' goes away in patch 13 is not an excuse for sloppy coding
> in the meantime).

INT_MAX is not a different spelling of BDRV_REQUEST_MAX_BYTES. The
latter is slightly lower (0x7e00).

Kevin



Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Max Reitz
On 16.11.18 16:18, Kevin Wolf wrote:
> Am 16.11.2018 um 16:03 hat Alberto Garcia geschrieben:
>>> I don't think anything needs a way to generally block graph changes
>>> around some node.  We only need to prevent changes to very specific
>>> sets of edges.  This is something that the permission system just
>>> cannot do.
>>
>> But what would you do then?
> 
> I agree with you mostly in that I think that most problems that Max
> mentioned aren't real. The only real problem I see with GRAPH_MOD as a
> permission on the node level is this overblocking

I wholeheartedly disagree.  Yes, it is true that most of the issues I
thought of can be fixed, and so those problems are not problems in a
technical sense.  But to me this whole discussion points to the greatest
issue I have, which is that GRAPH_MOD is just too complicated to
understand.  And I don't like a solution that works on a technical level
but that everybody is too afraid to touch because it's too weird.

We have this discussion again and again, and in the end we always come
up with something that looks like it might work, but it's just so weird
that we can't even remember it.

Maybe it's just me, though.  Frankly, I think the permission system
itself is already too complicated as it is, but I don't have a simpler
solution there.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Alberto Garcia
On Fri 16 Nov 2018 04:03:12 PM CET, Alberto Garcia wrote:
>> top (T) -> intermediate (I) -> base (B)
>>
>> Now you commit from top to base.  Clearly you don't want the backing
>> chain between top and base to change.  So say you unshare the GRAPH_MOD
>> permission (implying that whenever a parent doesn't care, it just shares
>> it).  But as said above, if someone just drops the backing link from I
>> to B, the permission system won't catch that.
>
> Yes it will, because the block job (either mirror or commit depending on
> the case) is the parent of all three nodes (BlockJob.nodes is where the
> list is stored) and does not alow GRAPH_MOD in any of them[*]. So
> dropping that backing link fails (I also have a test case for that).

I forgot to add the footnote [*] in my previous e-mail:

The mirror block job (in particular when in "commit-active" mode, but
also in other cases) creates a "mirror_top" node and then blocks the
target and all intermediate nodes by calling block_job_add_bdrv() in
each one of them:

   mirror_top (M) -> top (T) -> intermediate (I) -> base (B)

The problem is that those intermediate nodes start from top's
backing_bs(), so top itself is not added to the block job's list of
children and doesn't have its permissions restricted during the job's
lifetime.

This looks like a bug, I'll see if there's a way to reproduce and I'll
prepare a patch.

Berto



Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Max Reitz
On 16.11.18 16:03, Alberto Garcia wrote:
> On Fri 16 Nov 2018 01:14:37 PM CET, Max Reitz wrote:
>> Permission system
>> =
>>
>> GRAPH_MOD
>> -
>>
>> We need some way for the commit job to prevent graph changes on its
>> chain while it is running.  Our current blocker doesn’t do the job,
>> however.  What to do?
>>
>> - We have no idea how to make a *permission* work.  Maybe the biggest
>>   problem is that it just doesn’t work as a permission, because the
>>   commit job doesn’t own the BdrvChildren that would need to be
>>   blocked (namely the @backing BdrvChild).
>
> What I'm doing at the moment in my blockdev-reopen series is check
> whether all parents of the node I want to change share the GRAPH_MOD
> permission. If any of them doesn't then the operation fails.
>
> This can be checked by calling bdrv_get_cumulative_perm() or simply
> bdrv_check_update_perm(..., BLK_PERM_GRAPH_MOD, ...).

 Sure.

> It solves the problem because e.g. the stream block job only allows
> BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED, so no graph
> modifications allowed.

 But the problem is that the commit job needs to unshare the permission
 on all backing links between base and top, so none of the backing
 links can be broken.  But it cannot do that because those backing
 links do not belong to it (but to the respective overlay nodes).
>>>
>>> I'm not sure if I'm following you. The commit block job has references
>>> to all intermediate nodes (with block_job_add_bdrv()) and only shares
>>> BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE. Anyone trying to obtain
>>> BLK_PERM_GRAPH_MOD on an intermediate node would see that the block job
>>> doesn't allow it.
>>
>> But first of all, why would anyone try to obtain it?  You only obtain
>> a permission when you attach a new parent (or when an existing parent
>> tries to change the ones it has).
> 
> You don't need to obtain it, you only need to see that the permission is
> shared (with bdrv_get_cumulative_perm()). Or, as you suggest if I got it
> right, attach a new "reopen" parent during bdrv_reopen_prepare() and
> release it on commit / abort.

OK, but that's just completely different from how all other permissions
work.

>> For instance, in the commit case, if you just remove a backing link,
>> then the chain is clearly broken and the graph has been modified, but
>> noone will have taken any permission to do so.
> 
> If you do what I said in my previous paragraph then you can't remove the
> backing link (this is one of my test cases).
> 
>> But even beyond that, commit doesn't care about all of the graph.  All
>> it cares about are the backing links.  So it should not prevent anyone
>> else from changing any links outside of the backing chain (which a
>> permission would clearly do).
>>
>> Let me give a concrete example:
>>
>> top (T) -> intermediate (I) -> base (B)
>>
>> Now you commit from top to base.  Clearly you don't want the backing
>> chain between top and base to change.  So say you unshare the GRAPH_MOD
>> permission (implying that whenever a parent doesn't care, it just shares
>> it).  But as said above, if someone just drops the backing link from I
>> to B, the permission system won't catch that.
> 
> Yes it will, because the block job (either mirror or commit depending on
> the case) is the parent of all three nodes (BlockJob.nodes is where the
> list is stored) and does not allow GRAPH_MOD in any of them[*]. So
> dropping that backing link fails (I also have a test case for that).
> 
>> The next thing is that how would you even try to catch it in the first
>> place?  The normal case would be for everything to both share and
>> claim the GRAPH_MOD permission, the latter so it can see if something
>> unshared it.  But that would mean that everything is holding that
>> permission all the time and it's impossible to unshare it.
> 
> As things are now, GRAPH_MOD is shared by default on backing files
> (bdrv_format_default_perms()), but I don't understand why you need that
> everything takes GRAPH_MOD, you only need it when you're actually
> planning to change the graph (e.g a block job).
> 
>> So the solution might be to take it first, and release it after having
>> attached the child.  And maybe you'd say that for the removal case,
>> you'd have to take the GRAPH_MOD permission before you remove any
>> child.
> 
> Yes, something like that.
> 
>> But I think that's weird, and also, there's another issue still.
>>
>> What if you try to attach a read-only NBD server to B?  That should be
>> OK in my mind.  But it is a graph change, so it wouldn't be allowed if
>> GRAPH_MOD is a permission, even though commit doesn't care at all.
> 
> I haven't examined that case (I'm not familiar with NBD),

It doesn't matter whether it's NBD.  What matters is that commit doesn't
care about non-backing relationships.

> but if the
> 

Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Kevin Wolf
Am 16.11.2018 um 16:03 hat Alberto Garcia geschrieben:
> > I don't think anything needs a way to generally block graph changes
> > around some node.  We only need to prevent changes to very specific
> > sets of edges.  This is something that the permission system just
> > cannot do.
> 
> But what would you do then?

I agree with you mostly in that I think that most problems that Max
mentioned aren't real. The only real problem I see with GRAPH_MOD as a
permission on the node level is this overblocking - but that's bad
enough that I feel using permissions to block changes to a whole node is
not a good solution.

So what's the alternative? Max had a possible solution in the first
email in this thread:

> - A property of BdrvChild that can be set by a non-parent seems more
>   feasible, e.g. a counter where changing the child is possible only
>   if the counter is 0.  This also actually makes sense in what it
>   means.

The commit job would increment BdrvChild.block_change (or whatever we
would call it) for all bs->backing edges in the subchain.

Kevin



Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Alberto Garcia
On Fri 16 Nov 2018 01:14:37 PM CET, Max Reitz wrote:
> Permission system
> =
>
> GRAPH_MOD
> -
>
> We need some way for the commit job to prevent graph changes on its
> chain while it is running.  Our current blocker doesn’t do the job,
> however.  What to do?
>
> - We have no idea how to make a *permission* work.  Maybe the biggest
>   problem is that it just doesn’t work as a permission, because the
>   commit job doesn’t own the BdrvChildren that would need to be
>   blocked (namely the @backing BdrvChild).

 What I'm doing at the moment in my blockdev-reopen series is check
 whether all parents of the node I want to change share the GRAPH_MOD
 permission. If any of them doesn't then the operation fails.

 This can be checked by calling bdrv_get_cumulative_perm() or simply
 bdrv_check_update_perm(..., BLK_PERM_GRAPH_MOD, ...).
>>>
>>> Sure.
>>>
 It solves the problem because e.g. the stream block job only allows
 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED, so no graph
 modifications allowed.
>>>
>>> But the problem is that the commit job needs to unshare the permission
>>> on all backing links between base and top, so none of the backing
>>> links can be broken.  But it cannot do that because those backing
>>> links do not belong to it (but to the respective overlay nodes).
>> 
>> I'm not sure if I'm following you. The commit block job has references
>> to all intermediate nodes (with block_job_add_bdrv()) and only shares
>> BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE. Anyone trying to obtain
>> BLK_PERM_GRAPH_MOD on an intermediate node would see that the block job
>> doesn't allow it.
>
> But first of all, why would anyone try to obtain it?  You only obtain
> a permission when you attach a new parent (or when an existing parent
> tries to change the ones it has).

You don't need to obtain it, you only need to see that the permission is
shared (with bdrv_get_cumulative_perm()). Or, as you suggest if I got it
right, attach a new "reopen" parent during bdrv_reopen_prepare() and
release it on commit / abort.

> For instance, in the commit case, if you just remove a backing link,
> then the chain is clearly broken and the graph has been modified, but
> noone will have taken any permission to do so.

If you do what I said in my previous paragraph then you can't remove the
backing link (this is one of my test cases).

> But even beyond that, commit doesn't care about all of the graph.  All
> it cares about are the backing links.  So it should not prevent anyone
> else from changing any links outside of the backing chain (which a
> permission would clearly do).
>
> Let me give a concrete example:
>
> top (T) -> intermediate (I) -> base (B)
>
> Now you commit from top to base.  Clearly you don't want the backing
> chain between top and base to change.  So say you unshare the GRAPH_MOD
> permission (implying that whenever a parent doesn't care, it just shares
> it).  But as said above, if someone just drops the backing link from I
> to B, the permission system won't catch that.

Yes it will, because the block job (either mirror or commit depending on
the case) is the parent of all three nodes (BlockJob.nodes is where the
list is stored) and does not allow GRAPH_MOD in any of them[*]. So
dropping that backing link fails (I also have a test case for that).

> The next thing is that how would you even try to catch it in the first
> place?  The normal case would be for everything to both share and
> claim the GRAPH_MOD permission, the latter so it can see if something
> unshared it.  But that would mean that everything is holding that
> permission all the time and it's impossible to unshare it.

As things are now, GRAPH_MOD is shared by default on backing files
(bdrv_format_default_perms()), but I don't understand why you need that
everything takes GRAPH_MOD, you only need it when you're actually
planning to change the graph (e.g a block job).

> So the solution might be to take it first, and release it after having
> attached the child.  And maybe you'd say that for the removal case,
> you'd have to take the GRAPH_MOD permission before you remove any
> child.

Yes, something like that.

> But I think that's weird, and also, there's another issue still.
>
> What if you try to attach a read-only NBD server to B?  That should be
> OK in my mind.  But it is a graph change, so it wouldn't be allowed if
> GRAPH_MOD is a permission, even though commit doesn't care at all.

I haven't examined that case (I'm not familiar with NBD), but if the
only problem is that the permission system is unnecessarily strict in
certain corner cases then maybe it's a reasonable compromise _if_ there's
no simpler alternative (I'm not saying that there isn't!).

> I don't think anything needs a way to generally block graph changes
> around some node.  We only need to prevent changes to very specific
> sets of edges.  

Re: [Qemu-block] [Qemu-devel] qemu process crash: Assertion failed: QLIST_EMPTY(>tracked_requests)

2018-11-16 Thread Fernando Casas Schössow
Hi again,

I hit another occurrence of this issue and collected the qemu dump as well. So 
at the moment I have two qemu dumps for this issue happening on two different 
guests.
I wish I knew how to analyze them myself but that is not the case so if anyone in 
the list is willing to take a look I'm more than willing to share them.

Thanks in advance.

Fernando

On jue, oct 18, 2018 at 3:40 PM, Fernando Casas Schössow 
 wrote:
Hi Kevin,

Not at the moment. This is a production system and pretty much up to date but 
can't upgrade to 3.0 yet.
If the dump can be of any use, I can upload it somewhere for analysis.

BR,

Fernando

On jue, oct 18, 2018 at 2:38 PM, Kevin Wolf  wrote:
Hi Fernando, Am 18.10.2018 um 14:25 hat Fernando Casas Schössow geschrieben:
I hope this email finds you well and I apologize in advance for resurrecting 
this thread. I'm currently running on qemu 2.12.1 and I'm still having this 
issue every few days but now I managed to get a core dump generated (without 
including the guest memory). Would you take a look at the dump? Please let me 
know how do you prefer me to share it. The file is around 290MB as is but I can 
try to compress it.
would it be possible for you to test whether the bug still exists in QEMU 3.0? 
There were a few fixes related to request draining in that release, so maybe 
it's already solved now. Kevin






Re: [Qemu-block] [PATCH] migration/block-dirty-bitmap: fix Coverity CID1390625

2018-11-16 Thread Stefan Hajnoczi
On Tue, Oct 16, 2018 at 04:20:18PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Theoretically possible that we finish the skipping loop with bs = NULL
> and the following code will crash trying to dereference it. Fix that.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  migration/block-dirty-bitmap.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> index 477826330c..6de808f95f 100644
> --- a/migration/block-dirty-bitmap.c
> +++ b/migration/block-dirty-bitmap.c
> @@ -288,6 +288,10 @@ static int init_dirty_bitmap_migration(void)
>  bs = backing_bs(bs);
>  }
>  
> +if (!bs || bs->implicit) {

Why is it necessary to check bs->implicit?


signature.asc
Description: PGP signature


Re: [Qemu-block] [PATCH v5 2/3] file-posix: Drop s->lock_fd

2018-11-16 Thread Max Reitz
On 16.11.18 15:02, Max Reitz wrote:
> On 16.11.18 14:58, Alberto Garcia wrote:
>> On Fri 16 Nov 2018 02:34:25 PM CET, Max Reitz wrote:
>>> To me that looks like a problem in the general reopen code.
>>> raw_reopen_prepare() is called and succeeds.  Then
>>> bdrv_reopen_prepare() notices the option wasn't handled and therefore
>>> fails.  bdrv_reopen_multiple() thus doesn't set bs_entry->prepared to
>>> true, which means raw_reopen_abort() won't be called.
>>>
>>> We should always call either BlockDriver.bdrv_reopen_commit() or
>>> BlockDriver.bdrv_reopen_abort() when BlockDriver.bdrv_reopen_prepare()
>>> succeeded.
>>
>> So you mean getting rid of BlockReopenQueueEntry.prepared altogether?
> 
> I mean just calling .bdrv_reopen_abort() in bdrv_reopen_prepare() if
> there was an error after .bdrv_reopen_prepare() succeeded.
> 
> I have a patch, I'm just trying to think of a useful test...

To elaborate, I didn't want to use your test for two reasons.  First, it
seemed generally too dependant on what file-posix's implementation does.
 Second, I couldn't get it to work in any shorter form (which seemed
weird) and why would it even print something about consistent_read?
That seemed weird, too.

I would've thought that just any case where bdrv_reopen_prepare() fails
after a successful .bdrv_reopen_prepare() could trigger the issue, but
that's not the case.

This is because the file locks are not applied to the new FD just by
duping it, they are only applied in raw_reopen_commit().  So if you
prepare without abort without any commit before, the problem disappears.

So there had to be something wrong in raw_reopen_commit(), too.  It
turns out that this is indeed in the above commit, it should pass
"s->locked_shared_perm" instead of "~s->locked_shared_perm" to
raw_apply_lock_bytes().

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v5 2/3] file-posix: Drop s->lock_fd

2018-11-16 Thread Max Reitz
On 16.11.18 14:58, Alberto Garcia wrote:
> On Fri 16 Nov 2018 02:34:25 PM CET, Max Reitz wrote:
>> To me that looks like a problem in the general reopen code.
>> raw_reopen_prepare() is called and succeeds.  Then
>> bdrv_reopen_prepare() notices the option wasn't handled and therefore
>> fails.  bdrv_reopen_multiple() thus doesn't set bs_entry->prepared to
>> true, which means raw_reopen_abort() won't be called.
>>
>> We should always call either BlockDriver.bdrv_reopen_commit() or
>> BlockDriver.bdrv_reopen_abort() when BlockDriver.bdrv_reopen_prepare()
>> succeeded.
> 
> So you mean getting rid of BlockReopenQueueEntry.prepared altogether?

I mean just calling .bdrv_reopen_abort() in bdrv_reopen_prepare() if
there was an error after .bdrv_reopen_prepare() succeeded.

I have a patch, I'm just trying to think of a useful test...

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v5 2/3] file-posix: Drop s->lock_fd

2018-11-16 Thread Alberto Garcia
On Fri 16 Nov 2018 02:34:25 PM CET, Max Reitz wrote:
> To me that looks like a problem in the general reopen code.
> raw_reopen_prepare() is called and succeeds.  Then
> bdrv_reopen_prepare() notices the option wasn't handled and therefore
> fails.  bdrv_reopen_multiple() thus doesn't set bs_entry->prepared to
> true, which means raw_reopen_abort() won't be called.
>
> We should always call either BlockDriver.bdrv_reopen_commit() or
> BlockDriver.bdrv_reopen_abort() when BlockDriver.bdrv_reopen_prepare()
> succeeded.

So you mean getting rid of BlockReopenQueueEntry.prepared altogether?

Berto



Re: [Qemu-block] [PATCH v5 2/3] file-posix: Drop s->lock_fd

2018-11-16 Thread Max Reitz
On 14.11.18 14:54, Alberto Garcia wrote:
> On Thu 11 Oct 2018 09:21:34 AM CEST, Fam Zheng wrote:
>> The lock_fd field is not strictly necessary because transferring locked
>> bytes from old fd to the new one shouldn't fail anyway. This spares the
>> user one fd per image.
>>
>> Signed-off-by: Fam Zheng 
>> Reviewed-by: Max Reitz 
> 
> One of my tests (not published yet) starts to fail after this
> patch. Here's how you can reproduce the error:
> 
> $ qemu-img create -f qcow2 hd.qcow2 4G
> $ qemu-system-x86_64 -qmp stdio
> 
> { "execute": "qmp_capabilities" }
> { "execute": "blockdev-add", "arguments": {"driver": "qcow2", "node-name": 
> "hd0", "file": {"driver": "file", "filename": "hd.qcow2", "locking": "on" }}}
> { "execute": "human-monitor-command", "arguments": {"command-line": "qemu-io 
> hd0 \"reopen -o file.locking=on\""}}
> { "execute": "human-monitor-command", "arguments": {"command-line": "qemu-io 
> hd0 \"reopen -o file.locking=off\""}}
> { "execute": "blockdev-del", "arguments": {"node-name": "hd0"}}
> { "execute": "blockdev-add", "arguments": {"driver": "qcow2", "node-name": 
> "hd0", "file": {"driver": "file", "filename": "hd.qcow2"}}}
> 
> {"error": {"class": "GenericError", "desc": "Failed to get \"consistent 
> read\" lock"}}

To me that looks like a problem in the general reopen code.
raw_reopen_prepare() is called and succeeds.  Then bdrv_reopen_prepare()
notices the option wasn't handled and therefore fails.
bdrv_reopen_multiple() thus doesn't set bs_entry->prepared to true,
which means raw_reopen_abort() won't be called.

We should always call either BlockDriver.bdrv_reopen_commit() or
BlockDriver.bdrv_reopen_abort() when BlockDriver.bdrv_reopen_prepare()
succeeded.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH for-3.1] nvme: fix out-of-bounds access to the CMB

2018-11-16 Thread no-reply
Hi,

This series seems to have some coding style problems. See output below for
more information:

Message-id: 20181116093152.27227-1-pbonz...@redhat.com
Type: series
Subject: [Qemu-devel] [PATCH for-3.1] nvme: fix out-of-bounds access to the CMB

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
3ab66e7 nvme: fix out-of-bounds access to the CMB

=== OUTPUT BEGIN ===
Checking PATCH 1/1: nvme: fix out-of-bounds access to the CMB...
ERROR: space required after that ',' (ctx:VxV)
#99: FILE: tests/nvme-test.c:53:
+pdev = qpci_device_find(qs->pcibus, QPCI_DEVFN(4,0));
 ^

total: 1 errors, 0 warnings, 91 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [Qemu-block] KVM Forum block no[td]es

2018-11-16 Thread Max Reitz
On 15.11.18 15:28, Alberto Garcia wrote:
> On Wed 14 Nov 2018 06:24:10 PM CET, Max Reitz wrote:
 Permission system
 =

 GRAPH_MOD
 -

 We need some way for the commit job to prevent graph changes on its
 chain while it is running.  Our current blocker doesn’t do the job,
 however.  What to do?

 - We have no idea how to make a *permission* work.  Maybe the biggest
   problem is that it just doesn’t work as a permission, because the
   commit job doesn’t own the BdrvChildren that would need to be
   blocked (namely the @backing BdrvChild).
>>>
>>> What I'm doing at the moment in my blockdev-reopen series is check
>>> whether all parents of the node I want to change share the GRAPH_MOD
>>> permission. If any of them doesn't then the operation fails.
>>>
>>> This can be checked by calling bdrv_get_cumulative_perm() or simply
>>> bdrv_check_update_perm(..., BLK_PERM_GRAPH_MOD, ...).
>>
>> Sure.
>>
>>> It solves the problem because e.g. the stream block job only allows
>>> BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED, so no graph
>>> modifications allowed.
>>
>> But the problem is that the commit job needs to unshare the permission
>> on all backing links between base and top, so none of the backing
>> links can be broken.  But it cannot do that because those backing
>> links do not belong to it (but to the respective overlay nodes).
> 
> I'm not sure if I'm following you. The commit block job has references
> to all intermediate nodes (with block_job_add_bdrv()) and only shares
> BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE. Anyone trying to obtain
> BLK_PERM_GRAPH_MOD on an intermediate node would see that the block job
> doesn't allow it.

But first of all, why would anyone try to obtain it?  You only obtain a
permission when you attach a new parent (or when an existing parent
tries to change the ones it has).

For instance, in the commit case, if you just remove a backing link,
then the chain is clearly broken and the graph has been modified, but
no one will have taken any permission to do so.

But even beyond that, commit doesn't care about all of the graph.  All
it cares about are the backing links.  So it should not prevent anyone
else from changing any links outside of the backing chain (which a
permission would clearly do).

Let me give a concrete example:

top (T) -> intermediate (I) -> base (B)

Now you commit from top to base.  Clearly you don't want the backing
chain between top and base to change.  So say you unshare the GRAPH_MOD
permission (implying that whenever a parent doesn't care, it just shares
it).  But as said above, if someone just drops the backing link from I
to B, the permission system won't catch that.

The next thing is that how would you even try to catch it in the first
place?  The normal case would be for everything to both share and claim
the GRAPH_MOD permission, the latter so it can see if something unshared
it.  But that would mean that everything is holding that permission all
the time and it's impossible to unshare it.

(You'd take the GRAPH_MOD permission for the backing link from I to B,
and then commit couldn't unshare it because, well, it's taken already.)

So the solution might be to take it first, and release it after having
attached the child.  And maybe you'd say that for the removal case,
you'd have to take the GRAPH_MOD permission before you remove any child.
 But I think that's weird, and also, there's another issue still.


What if you try to attach a read-only NBD server to B?  That should be
OK in my mind.  But it is a graph change, so it wouldn't be allowed if
GRAPH_MOD is a permission, even though commit doesn't care at all.

The fact is that commit clearly only cares about edges it is not
involved in as either parent or child (the backing links), and therefore
has not control over the permissions taken.  It does not care about any
other edges, which is different from what permissions do, because they
always apply to a whole set of edges.


I don't think anything needs a way to generally block graph changes
around some node.  We only need to prevent changes to very specific sets
of edges.  This is something that the permission system just cannot do.

   (We never quite knew what “taking the GRAPH_PERMISSION” or
   “unsharing the GRAPH_MOD permission” was supposed to mean.
   Figuring that out always took like half an hour in any face-to-face
   meeting, and then we decided it was pretty much useless for any
   case we had at hand.)
>>>
>>> Yeah it's a bit unclear. It can mean "none of this node's children
>>> can be replaced / removed", which is what I'm using it for at the
>>> moment.
>>
>> Yes, but that's just not useful.  I don't think we have any case where
>> something would be annoyed by having its immediate child replaced
>> (other than the visible data changing, potentially).  Usually when we
>> say something (e.g. a block job) wants to prevent 

Re: [Qemu-block] [PATCH for-3.1] nvme: fix out-of-bounds access to the CMB

2018-11-16 Thread Li Qiang
Paolo Bonzini  于2018年11月16日周五 下午5:31写道:

> Because the CMB BAR has a min_access_size of 2, if you read the last
> byte it will try to memcpy *2* bytes from n->cmbuf, causing an off-by-one
> error.  This is CVE-2018-16847.
>
> Another way to fix this might be to register the CMB as a RAM memory
> region, which would also be more efficient.  However, that might be a
> change for big-endian machines; I didn't think this through and I don't
> know how real hardware works.  Add a basic testcase for the CMB in case
> somebody does this change later on.
>
> Cc: Keith Busch 
> Cc: qemu-block@nongnu.org
> Cc: Li Qiang 
> Signed-off-by: Paolo Bonzini 

---
>  hw/block/nvme.c|  2 +-
>  tests/Makefile.include |  2 +-
>  tests/nvme-test.c  | 58 +++---
>  3 files changed, 57 insertions(+), 5 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 09d7c90259..5d92794ef7 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1192,7 +1192,7 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  .write = nvme_cmb_write,
>  .endianness = DEVICE_LITTLE_ENDIAN,
>  .impl = {
> -.min_access_size = 2,
> +.min_access_size = 1,
>  .max_access_size = 8,
>  },
>  };
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 613242bc6e..fb0b449c02 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -730,7 +730,7 @@ tests/test-hmp$(EXESUF): tests/test-hmp.o
>  tests/machine-none-test$(EXESUF): tests/machine-none-test.o
>  tests/drive_del-test$(EXESUF): tests/drive_del-test.o
> $(libqos-virtio-obj-y)
>  tests/qdev-monitor-test$(EXESUF): tests/qdev-monitor-test.o
> $(libqos-pc-obj-y)
> -tests/nvme-test$(EXESUF): tests/nvme-test.o
> +tests/nvme-test$(EXESUF): tests/nvme-test.o $(libqos-pc-obj-y)
>  tests/pvpanic-test$(EXESUF): tests/pvpanic-test.o
>  tests/i82801b11-test$(EXESUF): tests/i82801b11-test.o
>  tests/ac97-test$(EXESUF): tests/ac97-test.o
> diff --git a/tests/nvme-test.c b/tests/nvme-test.c
> index 7674a446e4..2abb3b6d19 100644
> --- a/tests/nvme-test.c
> +++ b/tests/nvme-test.c
> @@ -8,11 +8,64 @@
>   */
>
>  #include "qemu/osdep.h"
> +#include "qemu/units.h"
>  #include "libqtest.h"
> +#include "libqos/libqos-pc.h"
> +
> +static QOSState *qnvme_start(const char *extra_opts)
> +{
> +QOSState *qs;
> +const char *arch = qtest_get_arch();
> +const char *cmd = "-drive id=drv0,if=none,file=null-co://,format=raw "
> +  "-device nvme,addr=0x4.0,serial=foo,drive=drv0 %s";
> +
> +if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
> +qs = qtest_pc_boot(cmd, extra_opts ? : "");
> +global_qtest = qs->qts;
> +return qs;
> +}
> +
> +g_printerr("nvme tests are only available on x86\n");
> +exit(EXIT_FAILURE);
> +}
> +
> +static void qnvme_stop(QOSState *qs)
> +{
> +qtest_shutdown(qs);
> +}
>
> -/* Tests only initialization so far. TODO: Replace with functional tests
> */
>  static void nop(void)
>  {
> +QOSState *qs;
> +
> +qs = qnvme_start(NULL);
> +qnvme_stop(qs);
> +}
> +
> +static void nvmetest_cmb_test(void)
> +{
> +const int cmb_bar_size = 2 * MiB;
> +QOSState *qs;
> +QPCIDevice *pdev;
> +QPCIBar bar;
> +
> +qs = qnvme_start("-global nvme.cmb_size_mb=2");
> +pdev = qpci_device_find(qs->pcibus, QPCI_DEVFN(4,0));
> +g_assert(pdev != NULL);
> +
> +qpci_device_enable(pdev);
> +bar = qpci_iomap(pdev, 2, NULL);
> +
> +qpci_io_writel(pdev, bar, 0, 0xccbbaa99);
> +g_assert_cmpint(qpci_io_readb(pdev, bar, 0), ==, 0x99);
> +g_assert_cmpint(qpci_io_readw(pdev, bar, 0), ==, 0xaa99);
> +
> +/* Test partially out-of-bounds accesses.  */
> +qpci_io_writel(pdev, bar, cmb_bar_size - 1, 0x44332211);
> +g_assert_cmpint(qpci_io_readb(pdev, bar, cmb_bar_size - 1), ==, 0x11);
> +g_assert_cmpint(qpci_io_readw(pdev, bar, cmb_bar_size - 1), !=,
> 0x2211);
> +g_assert_cmpint(qpci_io_readl(pdev, bar, cmb_bar_size - 1), !=,
> 0x44332211);
>


It seems a g_free(pdev) is needed here,




> +qnvme_stop(qs);
>  }
>
>  int main(int argc, char **argv)
> @@ -21,9 +74,8 @@ int main(int argc, char **argv)
>
>  g_test_init(, , NULL);
>  qtest_add_func("/nvme/nop", nop);
> +qtest_add_func("/nvme/cmb_test", nvmetest_cmb_test);
>
> -qtest_start("-drive id=drv0,if=none,file=null-co://,format=raw "
> -"-device nvme,drive=drv0,serial=foo");
>  ret = g_test_run();
>
>  qtest_end();
>

There is no qtest_start(), so this qtest_end() seems to trigger an assert in glib.
(tests/nvme-test:44053): GLib-CRITICAL **: g_hook_destroy_link: assertion
'hook != NULL' failed

Otherwise
Reviewed-by: Li Qiang 
Tested-by: Li Qiang 


Thanks,
Li Qiang



> --
> 2.19.1
>
>


Re: [Qemu-block] [PATCH] migration/block-dirty-bitmap: fix Coverity CID1390625

2018-11-16 Thread Peter Maydell
On 16 November 2018 at 03:28, John Snow  wrote:
> I looked again. I think Vladimir's patch will shut up Coverity for sure,
> feel free to apply it if you want this out of your hair.
>
> Stefan suggests the following, however;
>
>
> diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> index 5e90f44c2f..00c068fda3 100644
> --- a/migration/block-dirty-bitmap.c
> +++ b/migration/block-dirty-bitmap.c
> @@ -284,7 +284,7 @@ static int init_dirty_bitmap_migration(void)
>  const char *drive_name = bdrv_get_device_or_node_name(bs);
>
>  /* skip automatically inserted nodes */
> -while (bs && bs->drv && bs->implicit) {
> +while (bs->drv && bs->implicit) {
>  bs = backing_bs(bs);
>  }
>
>
> that by removing the assumption that bs could ever be null here (it
> shouldn't), that we'll coax coverity into not warning anymore. I don't
> know if that will work, because backing_bs can theoretically return NULL
> and might convince coverity there's a problem. In practice it won't be.
>
> I don't know how to check this to see if Stefan's suggestion is appropriate.
>
> For such a small, trivial issue though, just merge this and be done with
> it, in my opinion. If you want to take this fix directly as a "build
> fix" I wouldn't object.

Personally I think the main benefit of this sort of Coverity
warning is that it nudges you to work through and figure out
whether something really can be NULL or not. Stefan's fix
will satisfy Coverity, because what it's complaining about
is that the code in one place assumes a pointer can't be NULL
but in another place does have handling for NULL: it is the
inconsistency that triggers the warning. If backing_bs()
can't return NULL (ie if you would be happy in theory to put
an assert() in after the call) then I think Stefan's fix is
better. If it can return NULL ever then Vladimir's fix is
what you want.

thanks
-- PMM



[Qemu-block] [PATCH for-3.1] nvme: fix out-of-bounds access to the CMB

2018-11-16 Thread Paolo Bonzini
Because the CMB BAR has a min_access_size of 2, if you read the last
byte it will try to memcpy *2* bytes from n->cmbuf, causing an off-by-one
error.  This is CVE-2018-16847.

Another way to fix this might be to register the CMB as a RAM memory
region, which would also be more efficient.  However, that might be a
change for big-endian machines; I didn't think this through and I don't
know how real hardware works.  Add a basic testcase for the CMB in case
somebody does this change later on.

Cc: Keith Busch 
Cc: qemu-block@nongnu.org
Cc: Li Qiang 
Signed-off-by: Paolo Bonzini 
---
 hw/block/nvme.c|  2 +-
 tests/Makefile.include |  2 +-
 tests/nvme-test.c  | 58 +++---
 3 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 09d7c90259..5d92794ef7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1192,7 +1192,7 @@ static const MemoryRegionOps nvme_cmb_ops = {
 .write = nvme_cmb_write,
 .endianness = DEVICE_LITTLE_ENDIAN,
 .impl = {
-.min_access_size = 2,
+.min_access_size = 1,
 .max_access_size = 8,
 },
 };
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 613242bc6e..fb0b449c02 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -730,7 +730,7 @@ tests/test-hmp$(EXESUF): tests/test-hmp.o
 tests/machine-none-test$(EXESUF): tests/machine-none-test.o
 tests/drive_del-test$(EXESUF): tests/drive_del-test.o $(libqos-virtio-obj-y)
 tests/qdev-monitor-test$(EXESUF): tests/qdev-monitor-test.o $(libqos-pc-obj-y)
-tests/nvme-test$(EXESUF): tests/nvme-test.o
+tests/nvme-test$(EXESUF): tests/nvme-test.o $(libqos-pc-obj-y)
 tests/pvpanic-test$(EXESUF): tests/pvpanic-test.o
 tests/i82801b11-test$(EXESUF): tests/i82801b11-test.o
 tests/ac97-test$(EXESUF): tests/ac97-test.o
diff --git a/tests/nvme-test.c b/tests/nvme-test.c
index 7674a446e4..2abb3b6d19 100644
--- a/tests/nvme-test.c
+++ b/tests/nvme-test.c
@@ -8,11 +8,64 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "libqtest.h"
+#include "libqos/libqos-pc.h"
+
+static QOSState *qnvme_start(const char *extra_opts)
+{
+QOSState *qs;
+const char *arch = qtest_get_arch();
+const char *cmd = "-drive id=drv0,if=none,file=null-co://,format=raw "
+  "-device nvme,addr=0x4.0,serial=foo,drive=drv0 %s";
+
+if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
+qs = qtest_pc_boot(cmd, extra_opts ? : "");
+global_qtest = qs->qts;
+return qs;
+}
+
+g_printerr("nvme tests are only available on x86\n");
+exit(EXIT_FAILURE);
+}
+
+static void qnvme_stop(QOSState *qs)
+{
+qtest_shutdown(qs);
+}
 
-/* Tests only initialization so far. TODO: Replace with functional tests */
 static void nop(void)
 {
+QOSState *qs;
+
+qs = qnvme_start(NULL);
+qnvme_stop(qs);
+}
+
+static void nvmetest_cmb_test(void)
+{
+const int cmb_bar_size = 2 * MiB;
+QOSState *qs;
+QPCIDevice *pdev;
+QPCIBar bar;
+
+qs = qnvme_start("-global nvme.cmb_size_mb=2");
+pdev = qpci_device_find(qs->pcibus, QPCI_DEVFN(4,0));
+g_assert(pdev != NULL);
+
+qpci_device_enable(pdev);
+bar = qpci_iomap(pdev, 2, NULL);
+
+qpci_io_writel(pdev, bar, 0, 0xccbbaa99);
+g_assert_cmpint(qpci_io_readb(pdev, bar, 0), ==, 0x99);
+g_assert_cmpint(qpci_io_readw(pdev, bar, 0), ==, 0xaa99);
+
+/* Test partially out-of-bounds accesses.  */
+qpci_io_writel(pdev, bar, cmb_bar_size - 1, 0x44332211);
+g_assert_cmpint(qpci_io_readb(pdev, bar, cmb_bar_size - 1), ==, 0x11);
+g_assert_cmpint(qpci_io_readw(pdev, bar, cmb_bar_size - 1), !=, 0x2211);
+g_assert_cmpint(qpci_io_readl(pdev, bar, cmb_bar_size - 1), !=, 
0x44332211);
+qnvme_stop(qs);
 }
 
 int main(int argc, char **argv)
@@ -21,9 +74,8 @@ int main(int argc, char **argv)
 
 g_test_init(, , NULL);
 qtest_add_func("/nvme/nop", nop);
+qtest_add_func("/nvme/cmb_test", nvmetest_cmb_test);
 
-qtest_start("-drive id=drv0,if=none,file=null-co://,format=raw "
-"-device nvme,drive=drv0,serial=foo");
 ret = g_test_run();
 
 qtest_end();
-- 
2.19.1