Re: [PATCH v2 2/8] nbd: allow reconnect on open, with corresponding new options

2021-01-22 Thread Vladimir Sementsov-Ogievskiy

21.01.2021 04:44, Eric Blake wrote:

On 11/30/20 7:40 AM, Vladimir Sementsov-Ogievskiy wrote:

Note: currently, using new option with long timeout in qmp command
blockdev-add is not good idea, as qmp interface is blocking, so,
don't add it now, let's add it later after
"monitor: Optionally run handlers in coroutines" series merged.


If I'm not mistaken, that landed as of eb94b81a94.  Is it just the
commit message that needs an update, or does this patch need a respin?


Oh yes, you are right. I think the most reasonable thing is to keep this patch
separate (for simple backporting to downstream without Kevin's series), and to
add qmp support for the feature as an additional new patch. Will do it in the respin.





Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/nbd.c | 115 +---
  1 file changed, 92 insertions(+), 23 deletions(-)




@@ -474,6 +484,11 @@ nbd_co_establish_connection(BlockDriverState *bs, Error 
**errp)
  s->wait_connect = true;
  qemu_coroutine_yield();
  
+if (!s->connect_thread) {

+error_setg(errp, "Connection attempt cancelled by other operation");
+return NULL;
+}


Does this need to use atomics for proper access to s->connect_thread
across threads?  Or are all the operations done by other coroutines but
within the same thread, so we are safe?


s->connect_thread is not accessed from connect_thread_func, so in this respect we
are safe. Variables shared between connect_thread_func and other driver code
are protected by a mutex.

As for accessing the nbd bds from different threads: in my observation, all the
code is written under the assumption that everything inside a block driver may be
called from different coroutines, but from a single thread. We have a lot of s->*
variables that are neither atomic nor protected by mutexes, and all of this works
somehow :)

I remember Paolo telling me somewhere on the mailing list that, actually,
everything in block drivers and block/io must be thread-safe. But I don't see
this thread-safety in the current code, so I don't introduce it for new variables.





@@ -624,10 +645,15 @@ static coroutine_fn void 
nbd_reconnect_attempt(BDRVNBDState *s)
  bdrv_inc_in_flight(s->bs);
  
  out:

-s->connect_status = ret;
-error_free(s->connect_err);
-s->connect_err = NULL;
-error_propagate(&s->connect_err, local_err);
+if (s->connect_status == -ETIMEDOUT) {
+/* Don't rewrite timeout error by following cancel-provoked error */


Maybe:

/* Don't propagate a timeout error caused by a job cancellation. */


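No, we want to keep ETIMEDOUT: the comment means that the timeout error must
not be overwritten by the subsequent error provoked by the cancellation.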





+static void open_timer_cb(void *opaque)
+{
+BDRVNBDState *s = opaque;
+
+if (!s->connect_status) {
+/* First attempt was not finished. We should set an error */
+s->connect_status = -ETIMEDOUT;
+error_setg(&s->connect_err, "First connection attempt is cancelled by "
+   "timeout");
+}
+
+nbd_teardown_connection_async(s->bs);
+open_timer_del(s);
+}
+
+static void open_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
+{
+assert(!s->open_timer && s->state == NBD_CLIENT_OPENING);
+s->open_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
+  QEMU_CLOCK_REALTIME,
+  SCALE_NS,
+  open_timer_cb, s);
+timer_mod(s->open_timer, expire_time_ns);
+}
+




@@ -2180,6 +2235,14 @@ static QemuOptsList nbd_runtime_opts = {
  "future requests before a successful reconnect will "
  "immediately fail. Default 0",
  },
+{
+.name = "open-timeout",
+.type = QEMU_OPT_NUMBER,
+.help = "In seconds. If zero, nbd driver tries to establish "
+"connection only once, on fail open fails. If non-zero, "


If zero, the nbd driver tries the connection only once, and fails to
open if the connection fails.


+"nbd driver may do several attempts until success or "
+"@open-timeout seconds passed. Default 0",


If non-zero, the nbd driver will repeat connection attempts until
successful or until @open-timeout seconds have elapsed.


+},


Where is the QMP counterpart for setting this option?


Absent (as described in commit msg). Will do in a separate patch.




  { /* end of list */ }
  },
  };
@@ -2235,6 +2298,7 @@ static int nbd_process_options(BlockDriverState *bs, 
QDict *options,
  }
  
  s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);

+s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
  
  ret = 0;
  
@@ -2268,6 +2332,11 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,

  bdrv_inc_in_flight(bs);
  aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co);
  
+if (s->open_timeout) {

+open_timer_init(s, 

Re: [PATCH v2 2/8] nbd: allow reconnect on open, with corresponding new options

2021-01-20 Thread Eric Blake
On 11/30/20 7:40 AM, Vladimir Sementsov-Ogievskiy wrote:
> Note: currently, using new option with long timeout in qmp command
> blockdev-add is not good idea, as qmp interface is blocking, so,
> don't add it now, let's add it later after
> "monitor: Optionally run handlers in coroutines" series merged.

If I'm not mistaken, that landed as of eb94b81a94.  Is it just the
commit message that needs an update, or does this patch need a respin?

> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 115 +---
>  1 file changed, 92 insertions(+), 23 deletions(-)
> 

> @@ -474,6 +484,11 @@ nbd_co_establish_connection(BlockDriverState *bs, Error 
> **errp)
>  s->wait_connect = true;
>  qemu_coroutine_yield();
>  
> +if (!s->connect_thread) {
> +error_setg(errp, "Connection attempt cancelled by other operation");
> +return NULL;
> +}

Does this need to use atomics for proper access to s->connect_thread
across threads?  Or are all the operations done by other coroutines but
within the same thread, so we are safe?


> @@ -624,10 +645,15 @@ static coroutine_fn void 
> nbd_reconnect_attempt(BDRVNBDState *s)
>  bdrv_inc_in_flight(s->bs);
>  
>  out:
> -s->connect_status = ret;
> -error_free(s->connect_err);
> -s->connect_err = NULL;
> -error_propagate(&s->connect_err, local_err);
> +if (s->connect_status == -ETIMEDOUT) {
> +/* Don't rewrite timeout error by following cancel-provoked error */

Maybe:

/* Don't propagate a timeout error caused by a job cancellation. */


> +static void open_timer_cb(void *opaque)
> +{
> +BDRVNBDState *s = opaque;
> +
> +if (!s->connect_status) {
> +/* First attempt was not finished. We should set an error */
> +s->connect_status = -ETIMEDOUT;
> +error_setg(&s->connect_err, "First connection attempt is cancelled 
> by "
> +   "timeout");
> +}
> +
> +nbd_teardown_connection_async(s->bs);
> +open_timer_del(s);
> +}
> +
> +static void open_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
> +{
> +assert(!s->open_timer && s->state == NBD_CLIENT_OPENING);
> +s->open_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
> +  QEMU_CLOCK_REALTIME,
> +  SCALE_NS,
> +  open_timer_cb, s);
> +timer_mod(s->open_timer, expire_time_ns);
> +}
> +


> @@ -2180,6 +2235,14 @@ static QemuOptsList nbd_runtime_opts = {
>  "future requests before a successful reconnect will "
>  "immediately fail. Default 0",
>  },
> +{
> +.name = "open-timeout",
> +.type = QEMU_OPT_NUMBER,
> +.help = "In seconds. If zero, nbd driver tries to establish "
> +"connection only once, on fail open fails. If non-zero, "

If zero, the nbd driver tries the connection only once, and fails to
open if the connection fails.

> +"nbd driver may do several attempts until success or "
> +"@open-timeout seconds passed. Default 0",

If non-zero, the nbd driver will repeat connection attempts until
successful or until @open-timeout seconds have elapsed.

> +},

Where is the QMP counterpart for setting this option?

>  { /* end of list */ }
>  },
>  };
> @@ -2235,6 +2298,7 @@ static int nbd_process_options(BlockDriverState *bs, 
> QDict *options,
>  }
>  
>  s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
> +s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
>  
>  ret = 0;
>  
> @@ -2268,6 +2332,11 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  bdrv_inc_in_flight(bs);
>  aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co);
>  
> +if (s->open_timeout) {
> +open_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
> +s->open_timeout * NANOSECONDS_PER_SECOND);
> +}
> +
>  if (qemu_in_coroutine()) {
>  s->open_co = qemu_coroutine_self();
>  qemu_coroutine_yield();
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




[PATCH v2 2/8] nbd: allow reconnect on open, with corresponding new options

2020-11-30 Thread Vladimir Sementsov-Ogievskiy
Note: currently, using the new option with a long timeout in the qmp command
blockdev-add is not a good idea, as the qmp interface is blocking. So
don't add it to qmp now; let's add it later, after the
"monitor: Optionally run handlers in coroutines" series is merged.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 115 +---
 1 file changed, 92 insertions(+), 23 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 3e1d6c2b17..d25acafaad 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -125,12 +125,14 @@ typedef struct BDRVNBDState {
 bool wait_in_flight;
 
 QEMUTimer *reconnect_delay_timer;
+QEMUTimer *open_timer;
 
 NBDClientRequest requests[MAX_NBD_REQUESTS];
 NBDReply reply;
 BlockDriverState *bs;
 
 /* Connection parameters */
+uint64_t open_timeout;
 uint32_t reconnect_delay;
 SocketAddress *saddr;
 char *export, *tlscredsid;
@@ -305,7 +307,7 @@ static void coroutine_fn 
nbd_client_co_drain_end(BlockDriverState *bs)
 }
 
 
-static void nbd_teardown_connection(BlockDriverState *bs)
+static void nbd_teardown_connection_async(BlockDriverState *bs)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 
@@ -325,6 +327,14 @@ static void nbd_teardown_connection(BlockDriverState *bs)
 }
 nbd_co_establish_connection_cancel(bs, true);
 }
+}
+
+static void nbd_teardown_connection(BlockDriverState *bs)
+{
+BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
+
+nbd_teardown_connection_async(bs);
+
 if (qemu_in_coroutine()) {
 s->teardown_co = qemu_coroutine_self();
 /* connection_co resumes us when it terminates */
@@ -474,6 +484,11 @@ nbd_co_establish_connection(BlockDriverState *bs, Error 
**errp)
 s->wait_connect = true;
 qemu_coroutine_yield();
 
+if (!s->connect_thread) {
+error_setg(errp, "Connection attempt cancelled by other operation");
+return NULL;
+}
+
 qemu_mutex_lock(&thr->mutex);
 
 switch (thr->state) {
@@ -529,6 +544,12 @@ static void 
nbd_co_establish_connection_cancel(BlockDriverState *bs,
 bool wake = false;
 bool do_free = false;
 
+if (!thr) {
+/* already detached or finished */
+assert(!s->wait_connect);
+return;
+}
+
 qemu_mutex_lock(&thr->mutex);
 
 if (thr->state == CONNECT_THREAD_RUNNING) {
@@ -624,10 +645,15 @@ static coroutine_fn void 
nbd_reconnect_attempt(BDRVNBDState *s)
 bdrv_inc_in_flight(s->bs);
 
 out:
-s->connect_status = ret;
-error_free(s->connect_err);
-s->connect_err = NULL;
-error_propagate(&s->connect_err, local_err);
+if (s->connect_status == -ETIMEDOUT) {
+/* Don't rewrite timeout error by following cancel-provoked error */
+error_free(local_err);
+} else {
+s->connect_status = ret;
+error_free(s->connect_err);
+s->connect_err = NULL;
+error_propagate(&s->connect_err, local_err);
+}
 
 if (ret >= 0) {
 /* successfully connected */
@@ -636,11 +662,44 @@ out:
 }
 }
 
+static void open_timer_del(BDRVNBDState *s)
+{
+if (s->open_timer) {
+timer_del(s->open_timer);
+timer_free(s->open_timer);
+s->open_timer = NULL;
+}
+}
+
+static void open_timer_cb(void *opaque)
+{
+BDRVNBDState *s = opaque;
+
+if (!s->connect_status) {
+/* First attempt was not finished. We should set an error */
+s->connect_status = -ETIMEDOUT;
+error_setg(&s->connect_err, "First connection attempt is cancelled by "
+   "timeout");
+}
+
+nbd_teardown_connection_async(s->bs);
+open_timer_del(s);
+}
+
+static void open_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
+{
+assert(!s->open_timer && s->state == NBD_CLIENT_OPENING);
+s->open_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
+  QEMU_CLOCK_REALTIME,
+  SCALE_NS,
+  open_timer_cb, s);
+timer_mod(s->open_timer, expire_time_ns);
+}
+
 static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s)
 {
 uint64_t timeout = 1 * NANOSECONDS_PER_SECOND;
 uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND;
-bool initial_connect = s->state == NBD_CLIENT_OPENING;
 
 if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
 reconnect_delay_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
@@ -649,23 +708,9 @@ static coroutine_fn void 
nbd_co_reconnect_loop(BDRVNBDState *s)
 
 nbd_reconnect_attempt(s);
 
-if (initial_connect) {
-if (s->state == NBD_CLIENT_CONNECTED) {
-/* All good. Just kick nbd_open() to successfully return */
-if (s->open_co) {
-aio_co_wake(s->open_co);
-s->open_co = NULL;
-}
-aio_wait_kick();
-return;
-} else {
-/*
- * Failed. Currently, reconnect on open is not allowed, so quit.
- *