[Qemu-block] [RFC 3/3] blk: add 'reconnect' error action

Vladimir Sementsov-Ogievskiy Tue, 24 Apr 2018 06:14:26 -0700

The new action works as follows:

First, without stopping the VM, it tries bdrv_reconnect() several times
with a given pause. Then, if all reconnect attempts fail, it falls
through to the 'stop' error action.


TODO:
 - qapi docs
 - support other disks (only scsi here)
 - support block jobs
 - make the timeout and retry count parameters configurable

Signed-off-by: Vladimir Sementsov-Ogievskiy <[email protected]>
---
 qapi/block-core.json  |  4 ++--
 block/block-backend.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 hw/scsi/scsi-disk.c   |  4 +++-
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index c50517bff3..d4d87dbd4f 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1028,7 +1028,7 @@
 # Since: 1.3
 ##
 { 'enum': 'BlockdevOnError',
-  'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] }
+  'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'reconnect'] }
 
 ##
 # @MirrorSyncMode:
@@ -4351,7 +4351,7 @@
 # Since: 2.1
 ##
 { 'enum': 'BlockErrorAction',
-  'data': [ 'ignore', 'report', 'stop' ] }
+  'data': [ 'ignore', 'report', 'stop', 'reconnect' ] }
 
 
 ##
diff --git a/block/block-backend.c b/block/block-backend.c
index 681b240b12..81eb9a7bd0 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -89,6 +89,11 @@ struct BlockBackend {
      */
     unsigned int in_flight;
     AioWait wait;
+
+    bool reconnect_failed; /* TODO: worth tri-state variable? */
+    bool reconnecting;
+    unsigned int reconnect_max;
+    uint64_t reconnect_ns;
 };
 
 typedef struct BlockBackendAIOCB {
@@ -322,6 +327,8 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
     blk->refcnt = 1;
     blk->perm = perm;
     blk->shared_perm = shared_perm;
+    blk->reconnect_max = 10; /* TODO configure */
+    blk->reconnect_ns = 5000000000; /* 5 seconds, TODO configure */
     blk_set_enable_write_cache(blk, true);
 
     block_acct_init(&blk->stats);
@@ -1079,6 +1086,7 @@ void blk_iostatus_disable(BlockBackend *blk)
 
 void blk_iostatus_reset(BlockBackend *blk)
 {
+    blk->reconnect_failed = false;
     if (blk_iostatus_is_enabled(blk)) {
         BlockDriverState *bs = blk_bs(blk);
         blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
@@ -1635,6 +1643,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
     BlockdevOnError on_err = blk_get_on_error(blk, is_read);
 
     switch (on_err) {
+    case BLOCKDEV_ON_ERROR_RECONNECT:
+        return blk->reconnect_failed ? BLOCK_ERROR_ACTION_STOP :
+                                       BLOCK_ERROR_ACTION_RECONNECT;
     case BLOCKDEV_ON_ERROR_ENOSPC:
         return (error == ENOSPC) ?
                BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
@@ -1665,6 +1676,29 @@ static void send_qmp_error_event(BlockBackend *blk,
                                    &error_abort);
 }
 
+
+static void coroutine_fn blk_reconnect_co(void *opaque)
+{
+    BlockBackend *blk = opaque;
+    int i;
+
+    for (i = 0; i < blk->reconnect_max; i++) {
+        int ret;
+
+        qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, blk->reconnect_ns);
+
+        ret = bdrv_reconnect(blk_bs(blk), NULL);
+        if (ret == 0) {
+            blk->reconnecting = false;
+            blk_iostatus_reset(blk);
+            return;
+        }
+    }
+
+    blk->reconnecting = false;
+    blk->reconnect_failed = true;
+}
+
 /* This is done by device models because, while the block layer knows
  * about the error, it does not know whether an operation comes from
  * the device or the block layer (from a job, for example).
@@ -1674,7 +1708,19 @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action,
 {
     assert(error >= 0);
 
-    if (action == BLOCK_ERROR_ACTION_STOP) {
+    if (action == BLOCK_ERROR_ACTION_RECONNECT) {
+        Coroutine *co;
+        blk_iostatus_set_err(blk, error);
+
+        if (blk->reconnecting || blk->reconnect_failed) {
+            return;
+        }
+
+        blk->reconnecting = true;
+
+        co = qemu_coroutine_create(blk_reconnect_co, blk);
+        aio_co_enter(blk_get_aio_context(blk), co);
+    } else if (action == BLOCK_ERROR_ACTION_STOP) {
         /* First set the iostatus, so that "info block" returns an iostatus
          * that matches the events raised so far (an additional error iostatus
          * is fine, but not a lost one).
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index ded23d36ca..f1c166dfda 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -474,7 +474,9 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed)
     }
 
     blk_error_action(s->qdev.conf.blk, action, is_read, error);
-    if (action == BLOCK_ERROR_ACTION_STOP) {
+    if (action == BLOCK_ERROR_ACTION_STOP ||
+        action == BLOCK_ERROR_ACTION_RECONNECT)
+    {
         scsi_req_retry(&r->req);
     }
     return action != BLOCK_ERROR_ACTION_IGNORE;
-- 
2.11.1

[Qemu-block] [RFC 3/3] blk: add 'reconnect' error action

Reply via email to