I would like to withdraw this patch series. Sorry for the inconvenience, and thank you for your understanding.
CJ Chen <cjc...@igel.co.jp> 於 2025年9月1日 週一 下午6:20寫道: > > When an iSCSI session drops or an I/O stalls past a hard timeout, > requests could complete after teardown and coroutines might be woken > in an invalid state. This patch clarifies task ownership and hardens > completion paths. > > Changes: > - Add a per-command deadline (I/O hard timeout). On expiry, cancel > the libiscsi task, detach the coroutine (co = NULL), and let the > callback free the task safely. > - Track inflight tasks and fail them promptly on session disconnect > when fail-fast is enabled. Throttle reconnect attempts and drive > them via the periodic timer and NOP keepalives. > - Always refresh the fd event mask after state changes. Tidy event, > read, and write handlers, and remove unused labels/duplicates. > - Arm deadlines only for heap-allocated tasks (read/write/flush). > Stack-based helper paths continue to wait synchronously without > deadlines. > > User-visible effect: under error conditions we now return -ETIMEDOUT > or -ENOTCONN instead of hanging or crashing. Normal I/O behavior is > unchanged. Internally, a 5s hard timeout with fail-fast is enforced to > avoid indefinite stalls. > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/3067 > > Signed-off-by: CJ Chen <cjc...@igel.co.jp> > --- > block/iscsi.c | 449 ++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 358 insertions(+), 91 deletions(-) > > diff --git a/block/iscsi.c b/block/iscsi.c > index 15b96ee880..094b51c47c 100644 > --- a/block/iscsi.c > +++ b/block/iscsi.c > @@ -49,6 +49,7 @@ > #include "crypto/secret.h" > #include "scsi/utils.h" > #include "trace.h" > +#include "qemu/timer.h" > > /* Conflict between scsi/utils.h and libiscsi! :( */ > #define SCSI_XFER_NONE ISCSI_XFER_NONE > @@ -74,6 +75,8 @@ typedef struct IscsiLun { > QEMUTimer *nop_timer; > QEMUTimer *event_timer; > QemuMutex mutex; > + int64_t next_reconnect_ms; /* throttle repeated reconnects */ > + bool last_logged_in; /* for state-change logging */ > struct scsi_inquiry_logical_block_provisioning lbp; > struct scsi_inquiry_block_limits bl; > struct scsi_inquiry_device_designator *dd; > @@ -103,6 +106,9 @@ typedef struct IscsiLun { > bool dpofua; > bool has_write_same; > bool request_timed_out; > + uint32_t io_hard_timeout_ms; > + bool fail_fast; > + QTAILQ_HEAD(, IscsiTask) inflight; > } IscsiLun; > > typedef struct IscsiTask { > @@ -116,6 +122,12 @@ typedef struct IscsiTask { > QEMUTimer retry_timer; > int err_code; > char *err_str; > + QEMUTimer deadline_timer; > + bool deadline_armed; > + bool hard_timed_out; > + int64_t first_submit_ms; > + QTAILQ_ENTRY(IscsiTask) entry; > + bool on_list; > } IscsiTask; > > typedef struct IscsiAIOCB { > @@ -185,7 +197,9 @@ static void iscsi_co_generic_bh_cb(void *opaque) > struct IscsiTask *iTask = opaque; > > iTask->complete = 1; > - aio_co_wake(iTask->co); > + if (iTask->co) { > + aio_co_wake(iTask->co); > + } > } > > static void iscsi_retry_timer_expired(void *opaque) > @@ -232,75 +246,148 @@ static int iscsi_translate_sense(struct scsi_sense > *sense) > sense->ascq & 0xFF); > } > > +static void iscsi_fail_inflight(IscsiLun *s, int err) > +{ > + IscsiTask *it, *next; > + int n = 0; > + > + QTAILQ_FOREACH_SAFE(it, &s->inflight, entry, next) { > + if (it->deadline_armed) { > + timer_del(&it->deadline_timer); > + it->deadline_armed = false; > + } > + it->err_code = err ? err : -EIO; > + it->hard_timed_out = true; > + it->status = SCSI_STATUS_TIMEOUT; > + > + if (it->task) { > + iscsi_scsi_cancel_task(s->iscsi, it->task); > + } > + > + if (it->co) { > + replay_bh_schedule_oneshot_event(s->aio_context, > + iscsi_co_generic_bh_cb, it); > + } else { > + it->complete = 1; > + } > + QTAILQ_REMOVE(&s->inflight, it, entry); > + it->on_list = false; > + n++; > + } > +} > + > /* Called (via iscsi_service) with QemuMutex held. */ > static void > iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, > - void *command_data, void *opaque) > + void *command_data, void *opaque) > { > struct IscsiTask *iTask = opaque; > struct scsi_task *task = command_data; > > + if (iTask->deadline_armed) { > + timer_del(&iTask->deadline_timer); > + iTask->deadline_armed = false; > + } > + > iTask->status = status; > iTask->do_retry = 0; > iTask->err_code = 0; > iTask->task = task; > > if (status != SCSI_STATUS_GOOD) { > - iTask->err_code = -EIO; > - if (iTask->retries++ < ISCSI_CMD_RETRIES) { > - if (status == SCSI_STATUS_BUSY || > - status == SCSI_STATUS_TIMEOUT || > - status == SCSI_STATUS_TASK_SET_FULL) { > - unsigned retry_time = > - exp_random(iscsi_retry_times[iTask->retries - 1]); > - if (status == SCSI_STATUS_TIMEOUT) { > - /* make sure the request is rescheduled AFTER the > - * reconnect is initiated */ > - retry_time = EVENT_INTERVAL * 2; > - iTask->iscsilun->request_timed_out = true; > - } > - error_report("iSCSI Busy/TaskSetFull/TimeOut" > - " (retry #%u in %u ms): %s", > - iTask->retries, retry_time, > - iscsi_get_error(iscsi)); > - aio_timer_init(iTask->iscsilun->aio_context, > - &iTask->retry_timer, QEMU_CLOCK_REALTIME, > - SCALE_MS, iscsi_retry_timer_expired, iTask); > - timer_mod(&iTask->retry_timer, > - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + > retry_time); > - iTask->do_retry = 1; > - return; > - } else if (status == SCSI_STATUS_CHECK_CONDITION) { > - int error = iscsi_translate_sense(&task->sense); > - if (error == EAGAIN) { > - error_report("iSCSI CheckCondition: %s", > + if (iTask->hard_timed_out) { > + iTask->err_code = -ETIMEDOUT; > + iTask->err_str = g_strdup("iSCSI hard timeout"); > + iTask->do_retry = 0; > + } else { > + iTask->err_code = -EIO; > + if (iTask->retries++ < ISCSI_CMD_RETRIES) { > + if (status == SCSI_STATUS_BUSY || > + status == SCSI_STATUS_TIMEOUT || > + status == SCSI_STATUS_TASK_SET_FULL) { > + unsigned retry_time = > + exp_random(iscsi_retry_times[iTask->retries - 1]); > + if (status == SCSI_STATUS_TIMEOUT) { > + /* > + * make sure the request is rescheduled AFTER the > + * reconnect is initiated > + */ > + retry_time = EVENT_INTERVAL * 2; > + iTask->iscsilun->request_timed_out = true; > + } > + error_report("iSCSI Busy/TaskSetFull/TimeOut" > + " (retry #%u in %u ms): %s", > + iTask->retries, retry_time, > iscsi_get_error(iscsi)); > + aio_timer_init(iTask->iscsilun->aio_context, > + &iTask->retry_timer, QEMU_CLOCK_REALTIME, > + SCALE_MS, iscsi_retry_timer_expired, > iTask); > + timer_mod(&iTask->retry_timer, > + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + > retry_time); > iTask->do_retry = 1; > + return; > + } else if (status == SCSI_STATUS_CHECK_CONDITION) { > + int error = iscsi_translate_sense(&task->sense); > + if (error == EAGAIN) { > + error_report("iSCSI CheckCondition: %s", > + iscsi_get_error(iscsi)); > + iTask->do_retry = 1; > + } else { > + iTask->err_code = -error; > + iTask->err_str = g_strdup(iscsi_get_error(iscsi)); > + } > } else { > - iTask->err_code = -error; > + if (!iTask->err_str) { > + iTask->err_str = g_strdup(iscsi_get_error(iscsi)); > + } > + } > + } else { > + if (!iTask->err_str) { > iTask->err_str = g_strdup(iscsi_get_error(iscsi)); > } > } > } > } > - > if (iTask->co) { > replay_bh_schedule_oneshot_event(iTask->iscsilun->aio_context, > iscsi_co_generic_bh_cb, iTask); > } else { > iTask->complete = 1; > + if (iTask->task) { > + scsi_free_scsi_task(iTask->task); > + iTask->task = NULL; > + } > + g_free(iTask->err_str); > + g_free(iTask); > } > + > } > > static void coroutine_fn > iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) > { > *iTask = (struct IscsiTask) { > - .co = qemu_coroutine_self(), > - .iscsilun = iscsilun, > + .co = qemu_coroutine_self(), > + .iscsilun = iscsilun, > + .deadline_armed = false, > + .hard_timed_out = false, > + .first_submit_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME), > + .on_list = false, > }; > } > > +static IscsiTask * coroutine_fn iscsi_task_new(IscsiLun *iscsilun) > +{ > + IscsiTask *iTask = g_new0(IscsiTask, 1); > + iTask->co = qemu_coroutine_self(); > + iTask->iscsilun = iscsilun; > + iTask->deadline_armed = false; > + iTask->hard_timed_out = false; > + iTask->first_submit_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + iTask->on_list = false; > + return iTask; > +} > + > #ifdef __linux__ > > /* Called (via iscsi_service) with QemuMutex held. */ > @@ -371,17 +458,85 @@ iscsi_set_events(IscsiLun *iscsilun) > } > } > > +/* Try to (re)connect, but throttle to avoid storms. */ > +static void iscsi_maybe_reconnect(IscsiLun *iscsilun) > +{ > + int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); > + if (now < iscsilun->next_reconnect_ms) { > + return; > + } > + iscsi_reconnect(iscsilun->iscsi); > + iscsilun->next_reconnect_ms = now + 2000; /* 2s throttle */ > + /* After changing connection state, refresh event mask immediately. */ > + iscsi_set_events(iscsilun); > +} > + > +static void iscsi_deadline_timer_expired(void *opaque) > +{ > + struct IscsiTask *iTask = opaque; > + IscsiLun *iscsilun = iTask->iscsilun; > + > + if (!iTask->deadline_armed) { > + return; > + } > + iTask->deadline_armed = false; > + iTask->hard_timed_out = true; > + iTask->status = SCSI_STATUS_TIMEOUT; > + iTask->err_code = -ETIMEDOUT; > + > + if (iscsilun) { > + qemu_mutex_lock(&iscsilun->mutex); > + if (iTask->task) { > + iscsi_scsi_cancel_task(iscsilun->iscsi, iTask->task); > + if (iTask->co) { > + replay_bh_schedule_oneshot_event(iscsilun->aio_context, > + iscsi_co_generic_bh_cb, > iTask); > + } > + iscsi_set_events(iscsilun); > + } > + qemu_mutex_unlock(&iscsilun->mutex); > + } > +} > + > +static inline void iscsi_arm_deadline(struct IscsiTask *iTask) > +{ > + IscsiLun *iscsilun = iTask->iscsilun; > + > + if (!iscsilun->io_hard_timeout_ms || iTask->deadline_armed) { > + return; > + } > + aio_timer_init(iscsilun->aio_context, &iTask->deadline_timer, > + QEMU_CLOCK_REALTIME, SCALE_MS, > + iscsi_deadline_timer_expired, iTask); > + timer_mod(&iTask->deadline_timer, > + iTask->first_submit_ms + iscsilun->io_hard_timeout_ms); > + iTask->deadline_armed = true; > +} > + > static void iscsi_timed_check_events(void *opaque) > { > IscsiLun *iscsilun = opaque; > > WITH_QEMU_LOCK_GUARD(&iscsilun->mutex) { > + bool logged_in_before = iscsilun->last_logged_in; > + bool logged_in_now; > /* check for timed out requests */ > iscsi_service(iscsilun->iscsi, 0); > + logged_in_now = iscsi_is_logged_in(iscsilun->iscsi); > + if (logged_in_before != logged_in_now) { > + iscsilun->last_logged_in = logged_in_now; > + if (logged_in_before && !logged_in_now && iscsilun->fail_fast) { > + iscsi_fail_inflight(iscsilun, -ENOTCONN); > + } > + } > > if (iscsilun->request_timed_out) { > iscsilun->request_timed_out = false; > - iscsi_reconnect(iscsilun->iscsi); > + iscsi_maybe_reconnect(iscsilun); > + } > + > + if (!logged_in_now) { > + iscsi_maybe_reconnect(iscsilun); > } > > /* > @@ -605,7 +760,7 @@ iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, > int nb_sectors, > QEMUIOVector *iov, int flags) > { > IscsiLun *iscsilun = bs->opaque; > - struct IscsiTask iTask; > + IscsiTask *iTask; > uint64_t lba; > uint32_t num_sectors; > bool fua = flags & BDRV_REQ_FUA; > @@ -624,21 +779,27 @@ iscsi_co_writev(BlockDriverState *bs, int64_t > sector_num, int nb_sectors, > > lba = sector_qemu2lun(sector_num, iscsilun); > num_sectors = sector_qemu2lun(nb_sectors, iscsilun); > - iscsi_co_init_iscsitask(iscsilun, &iTask); > + iTask = iscsi_task_new(iscsilun); > qemu_mutex_lock(&iscsilun->mutex); > + if (iscsilun->fail_fast && !iscsi_is_logged_in(iscsilun->iscsi)) { > + qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask->err_str); > + g_free(iTask); > + return -ENOTCONN; > + } > retry: > if (iscsilun->use_16_for_rw) { > #if LIBISCSI_API_VERSION >= (20160603) > - iTask.task = iscsi_write16_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > + iTask->task = iscsi_write16_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > NULL, num_sectors * > iscsilun->block_size, > iscsilun->block_size, 0, 0, fua, > 0, 0, > - iscsi_co_generic_cb, &iTask, > + iscsi_co_generic_cb, iTask, > (struct scsi_iovec *)iov->iov, > iov->niov); > } else { > - iTask.task = iscsi_write10_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > + iTask->task = iscsi_write10_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > NULL, num_sectors * > iscsilun->block_size, > iscsilun->block_size, 0, 0, fua, > 0, 0, > - iscsi_co_generic_cb, &iTask, > + iscsi_co_generic_cb, iTask, > (struct scsi_iovec *)iov->iov, > iov->niov); > } > #else > @@ -653,41 +814,62 @@ retry: > iscsi_co_generic_cb, &iTask); > } > #endif > - if (iTask.task == NULL) { > + if (iTask->task == NULL) { > qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask); > return -ENOMEM; > } > #if LIBISCSI_API_VERSION < (20160603) > scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, > iov->niov); > #endif > - iscsi_co_wait_for_task(&iTask, iscsilun); > + iscsi_set_events(iscsilun); > + if (!iTask->on_list) { > + QTAILQ_INSERT_TAIL(&iscsilun->inflight, iTask, entry); > + iTask->on_list = true; > + } > + iscsi_arm_deadline(iTask); > + iscsi_co_wait_for_task(iTask, iscsilun); > > - if (iTask.task != NULL) { > - scsi_free_scsi_task(iTask.task); > - iTask.task = NULL; > + if (!iTask->hard_timed_out && iTask->task != NULL) { > + scsi_free_scsi_task(iTask->task); > + iTask->task = NULL; > } > > - if (iTask.do_retry) { > - iTask.complete = 0; > + if (iTask->do_retry) { > + iTask->complete = 0; > goto retry; > } > > - if (iTask.status != SCSI_STATUS_GOOD) { > + if (iTask->hard_timed_out) { > + r = iTask->err_code ? iTask->err_code : -ETIMEDOUT; > + iTask->co = NULL; > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > + qemu_mutex_unlock(&iscsilun->mutex); > + return r; > + } > + > + if (iTask->status != SCSI_STATUS_GOOD) { > iscsi_allocmap_set_invalid(iscsilun, sector_num * BDRV_SECTOR_SIZE, > nb_sectors * BDRV_SECTOR_SIZE); > error_report("iSCSI WRITE10/16 failed at lba %" PRIu64 ": %s", lba, > - iTask.err_str); > - r = iTask.err_code; > - goto out_unlock; > + iTask->err_str); > + r = iTask->err_code; > + } else { > + iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, > + nb_sectors * BDRV_SECTOR_SIZE); > } > > - iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE, > - nb_sectors * BDRV_SECTOR_SIZE); > - > -out_unlock: > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > qemu_mutex_unlock(&iscsilun->mutex); > - g_free(iTask.err_str); > + g_free(iTask->err_str); > + g_free(iTask); > return r; > } > > @@ -733,6 +915,8 @@ retry: > ret = -ENOMEM; > goto out_unlock; > } > + iscsi_arm_deadline(&iTask); > + iscsi_set_events(iscsilun); > iscsi_co_wait_for_task(&iTask, iscsilun); > > if (iTask.do_retry) { > @@ -801,7 +985,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState > *bs, > QEMUIOVector *iov) > { > IscsiLun *iscsilun = bs->opaque; > - struct IscsiTask iTask; > + IscsiTask *iTask; > uint64_t lba; > uint32_t num_sectors; > int r = 0; > @@ -856,22 +1040,28 @@ static int coroutine_fn > iscsi_co_readv(BlockDriverState *bs, > lba = sector_qemu2lun(sector_num, iscsilun); > num_sectors = sector_qemu2lun(nb_sectors, iscsilun); > > - iscsi_co_init_iscsitask(iscsilun, &iTask); > + iTask = iscsi_task_new(iscsilun); > qemu_mutex_lock(&iscsilun->mutex); > + if (iscsilun->fail_fast && !iscsi_is_logged_in(iscsilun->iscsi)) { > + qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask->err_str); > + g_free(iTask); > + return -ENOTCONN; > + } > retry: > if (iscsilun->use_16_for_rw) { > #if LIBISCSI_API_VERSION >= (20160603) > - iTask.task = iscsi_read16_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > + iTask->task = iscsi_read16_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > num_sectors * > iscsilun->block_size, > iscsilun->block_size, 0, 0, 0, 0, > 0, > - iscsi_co_generic_cb, &iTask, > + iscsi_co_generic_cb, iTask, > (struct scsi_iovec *)iov->iov, > iov->niov); > } else { > - iTask.task = iscsi_read10_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > + iTask->task = iscsi_read10_iov_task(iscsilun->iscsi, iscsilun->lun, > lba, > num_sectors * > iscsilun->block_size, > iscsilun->block_size, > 0, 0, 0, 0, 0, > - iscsi_co_generic_cb, &iTask, > + iscsi_co_generic_cb, iTask, > (struct scsi_iovec *)iov->iov, > iov->niov); > } > #else > @@ -887,70 +1077,119 @@ retry: > iscsi_co_generic_cb, &iTask); > } > #endif > - if (iTask.task == NULL) { > + if (iTask->task == NULL) { > qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask); > return -ENOMEM; > } > #if LIBISCSI_API_VERSION < (20160603) > scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, > iov->niov); > #endif > - > - iscsi_co_wait_for_task(&iTask, iscsilun); > - if (iTask.task != NULL) { > - scsi_free_scsi_task(iTask.task); > - iTask.task = NULL; > + if (!iTask->on_list) { > + QTAILQ_INSERT_TAIL(&iscsilun->inflight, iTask, entry); > + iTask->on_list = true; > + } > + iscsi_arm_deadline(iTask); > + iscsi_co_wait_for_task(iTask, iscsilun); > + if (!iTask->hard_timed_out && iTask->task != NULL) { > + scsi_free_scsi_task(iTask->task); > + iTask->task = NULL; > } > > - if (iTask.do_retry) { > - iTask.complete = 0; > + if (iTask->do_retry) { > + iTask->complete = 0; > goto retry; > } > > - if (iTask.status != SCSI_STATUS_GOOD) { > + if (iTask->hard_timed_out) { > + r = iTask->err_code ? iTask->err_code : -ETIMEDOUT; > + iTask->co = NULL; > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > + qemu_mutex_unlock(&iscsilun->mutex); > + return r; > + } > + > + if (iTask->status != SCSI_STATUS_GOOD) { > error_report("iSCSI READ10/16 failed at lba %" PRIu64 ": %s", > - lba, iTask.err_str); > - r = iTask.err_code; > + lba, iTask->err_str); > + r = iTask->err_code; > } > > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > qemu_mutex_unlock(&iscsilun->mutex); > - g_free(iTask.err_str); > + g_free(iTask->err_str); > + g_free(iTask); > return r; > } > > static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) > { > IscsiLun *iscsilun = bs->opaque; > - struct IscsiTask iTask; > + IscsiTask *iTask; > int r = 0; > > - iscsi_co_init_iscsitask(iscsilun, &iTask); > + iTask = iscsi_task_new(iscsilun); > qemu_mutex_lock(&iscsilun->mutex); > + if (iscsilun->fail_fast && !iscsi_is_logged_in(iscsilun->iscsi)) { > + qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask->err_str); > + g_free(iTask); > + return -ENOTCONN; > + } > retry: > if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, > 0, > - 0, iscsi_co_generic_cb, &iTask) == > NULL) { > + 0, iscsi_co_generic_cb, iTask) == > NULL) { > qemu_mutex_unlock(&iscsilun->mutex); > + g_free(iTask); > return -ENOMEM; > } > + iscsi_set_events(iscsilun); > + if (!iTask->on_list) { > + QTAILQ_INSERT_TAIL(&iscsilun->inflight, iTask, entry); > + iTask->on_list = true; > + } > + iscsi_arm_deadline(iTask); > + iscsi_co_wait_for_task(iTask, iscsilun); > > - iscsi_co_wait_for_task(&iTask, iscsilun); > - > - if (iTask.task != NULL) { > - scsi_free_scsi_task(iTask.task); > - iTask.task = NULL; > + if (!iTask->hard_timed_out && iTask->task != NULL) { > + scsi_free_scsi_task(iTask->task); > + iTask->task = NULL; > } > > - if (iTask.do_retry) { > - iTask.complete = 0; > + if (iTask->do_retry) { > + iTask->complete = 0; > goto retry; > } > > - if (iTask.status != SCSI_STATUS_GOOD) { > - error_report("iSCSI SYNCHRONIZECACHE10 failed: %s", iTask.err_str); > - r = iTask.err_code; > + if (iTask->hard_timed_out) { > + r = iTask->err_code ? iTask->err_code : -ETIMEDOUT; > + iTask->co = NULL; /* detach */ > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > + qemu_mutex_unlock(&iscsilun->mutex); > + return r; > } > > + if (iTask->status != SCSI_STATUS_GOOD) { > + error_report("iSCSI SYNCHRONIZECACHE10 failed: %s", iTask->err_str); > + r = iTask->err_code; > + } > + > + if (iTask->on_list) { > + QTAILQ_REMOVE(&iscsilun->inflight, iTask, entry); > + iTask->on_list = false; > + } > qemu_mutex_unlock(&iscsilun->mutex); > - g_free(iTask.err_str); > + g_free(iTask->err_str); > + g_free(iTask); > return r; > } > > @@ -1086,6 +1325,12 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState > *bs, > > data.size = 0; > qemu_mutex_lock(&iscsilun->mutex); > + if (iscsilun->fail_fast && !iscsi_is_logged_in(iscsilun->iscsi)) { > + qemu_mutex_unlock(&iscsilun->mutex); > + acb->status = -ENOTCONN; > + iscsi_schedule_bh(acb); > + return &acb->common; > + } > if (acb->task->xfer_dir == SCSI_XFER_WRITE) { > if (acb->ioh->iovec_count == 0) { > data.data = acb->ioh->dxferp; > @@ -1176,6 +1421,7 @@ retry: > goto out_unlock; > } > > + iscsi_set_events(iscsilun); > iscsi_co_wait_for_task(&iTask, iscsilun); > > if (iTask.task != NULL) { > @@ -1282,6 +1528,7 @@ retry: > return -ENOMEM; > } > > + iscsi_set_events(iscsilun); > iscsi_co_wait_for_task(&iTask, iscsilun); > > if (iTask.status == SCSI_STATUS_CHECK_CONDITION && > @@ -1415,14 +1662,24 @@ static void iscsi_nop_timed_event(void *opaque) > IscsiLun *iscsilun = opaque; > > QEMU_LOCK_GUARD(&iscsilun->mutex); > + /* If we are not logged in, use the nop timer as an additional reconnect > driver. */ > + if (!iscsi_is_logged_in(iscsilun->iscsi)) { > + iscsilun->request_timed_out = true; > + iscsi_maybe_reconnect(iscsilun); > + goto rearm; > + } > if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { > error_report("iSCSI: NOP timeout. Reconnecting..."); > iscsilun->request_timed_out = true; > } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != > 0) { > - error_report("iSCSI: failed to sent NOP-Out. Disabling NOP > messages."); > - return; > + /* Do NOT disable NOPs; treat as connection problem and try to > reconnect. */ > + error_report("iSCSI: failed to send NOP-Out. Triggering reconnect."); > + iscsilun->request_timed_out = true; > + iscsi_maybe_reconnect(iscsilun); > + /* keep NOPs enabled; next tick will try again */ > } > > +rearm: > timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + > NOP_INTERVAL); > iscsi_set_events(iscsilun); > } > @@ -1559,6 +1816,8 @@ static void iscsi_attach_aio_context(BlockDriverState > *bs, > IscsiLun *iscsilun = bs->opaque; > > iscsilun->aio_context = new_context; > + iscsilun->next_reconnect_ms = 0; > + iscsilun->last_logged_in = iscsi_is_logged_in(iscsilun->iscsi); > iscsi_set_events(iscsilun); > > /* Set up a timer for sending out iSCSI NOPs */ > @@ -1894,6 +2153,9 @@ static int iscsi_open(BlockDriverState *bs, QDict > *options, int flags, > warn_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); > } > #endif > + /* FORCE-ON policy: 5s hard timeout */ > + iscsilun->io_hard_timeout_ms = 5000; /* 5 seconds */ > + iscsilun->fail_fast = true; > > if (iscsi_full_connect_sync(iscsi, portal, lun) != 0) { > error_setg(errp, "iSCSI: Failed to connect to LUN : %s", > @@ -1905,6 +2167,8 @@ static int iscsi_open(BlockDriverState *bs, QDict > *options, int flags, > iscsilun->iscsi = iscsi; > iscsilun->aio_context = bdrv_get_aio_context(bs); > iscsilun->lun = lun; > + iscsilun->next_reconnect_ms = 0; > + iscsilun->last_logged_in = false; /* updated after connect */ > iscsilun->has_write_same = true; > > task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, > @@ -2007,6 +2271,8 @@ static int iscsi_open(BlockDriverState *bs, QDict > *options, int flags, > > qemu_mutex_init(&iscsilun->mutex); > iscsi_attach_aio_context(bs, iscsilun->aio_context); > + iscsilun->last_logged_in = iscsi_is_logged_in(iscsilun->iscsi); > + QTAILQ_INIT(&iscsilun->inflight); > > /* Guess the internal cluster (page) size of the iscsi target by the > means > * of opt_unmap_gran. Transfer the unmap granularity only if it has a > @@ -2387,6 +2653,7 @@ retry: > goto out_unlock; > } > > + iscsi_set_events(dst_lun); > iscsi_co_wait_for_task(&iscsi_task, dst_lun); > > if (iscsi_task.do_retry) { > -- > 2.25.1 >