On 07/09/2016 20:57, Ashish Mittal wrote: > diff --git a/block/vxhs.c b/block/vxhs.c > new file mode 100644 > index 0000000..813b7c2 > --- /dev/null > +++ b/block/vxhs.c > @@ -0,0 +1,1341 @@ > +/* > + * QEMU Block driver for Veritas HyperScale (VxHS) > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#include "vxhs.h" > +#include <qnio/qnio_api.h> > +#include "trace.h" > +#include "qapi/qmp/qerror.h" > +#include "qapi/qmp/qdict.h" > +#include "qapi/qmp/qstring.h" > + > +#define VXHS_OPT_FILENAME "filename" > +#define VXHS_OPT_VDISK_ID "vdisk_id" > +#define VXHS_OPT_SERVER "server." > +#define VXHS_OPT_HOST "host" > +#define VXHS_OPT_PORT "port" > + > +/* qnio client ioapi_ctx */ > +static void *global_qnio_ctx; > + > +/* vdisk prefix to pass to qnio */ > +static const char vdisk_prefix[] = "/dev/of/vdisk"; > + > +void vxhs_inc_acb_segment_count(void *ptr, int count) > +{ > + VXHSAIOCB *acb = ptr; > + BDRVVXHSState *s = acb->common.bs->opaque; > + > + VXHS_SPIN_LOCK(s->vdisk_acb_lock);
Please use QemuSpin instead. Paolo > + acb->segments += count; > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > +} > + > +void vxhs_dec_acb_segment_count(void *ptr, int count) > +{ > + VXHSAIOCB *acb = ptr; > + BDRVVXHSState *s = acb->common.bs->opaque; > + > + VXHS_SPIN_LOCK(s->vdisk_acb_lock); > + acb->segments -= count; > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > +} > + > +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count) > +{ > + VXHSAIOCB *acb = ptr; > + BDRVVXHSState *s = acb->common.bs->opaque; > + int segcount = 0; > + > + > + VXHS_SPIN_LOCK(s->vdisk_acb_lock); > + acb->segments -= count; > + segcount = acb->segments; > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > + > + return segcount; > +} > + > +void vxhs_set_acb_buffer(void *ptr, void *buffer) > +{ > + VXHSAIOCB *acb = ptr; > + > + acb->buffer = buffer; > +} > + > +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count) > +{ > + BDRVVXHSState *s = ptr; > + > + VXHS_SPIN_LOCK(s->vdisk_lock); > + s->vdisk_aio_count += count; > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > +} > + > +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count) > +{ > + BDRVVXHSState *s = ptr; > + > + VXHS_SPIN_LOCK(s->vdisk_lock); > + s->vdisk_aio_count -= count; > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > +} > + > +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m) > +{ > + VXHSAIOCB *acb = NULL; > + BDRVVXHSState *s = NULL; > + int rv = 0; > + int segcount = 0; > + uint32_t error = 0; > + uint32_t opcode = 0; > + > + assert(m); > + if (m) { > + /* TODO: need common get message attrs, not two separate lib calls */ > + error = qnio_iio_extract_msg_error(m); > + opcode = qnio_iio_extract_msg_opcode(m); > + } > + switch (opcode) { > + case IRP_READ_REQUEST: > + case IRP_WRITE_REQUEST: > + > + /* > + * ctx is VXHSAIOCB* > + * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is > IIO_REASON_HUP > + */ > + if (ctx) { > + acb = ctx; > + s = acb->common.bs->opaque; > + } else { > + trace_vxhs_iio_callback(error, reason); > + goto out; > + } > + > + if (error) { > + trace_vxhs_iio_callback_iofail(error, reason, acb, acb->segments); > + > + if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) { > + /* > + * Storage agent failed while I/O was in progress > + * Fail over only if the qnio channel dropped, indicating > + * storage agent failure. Don't fail over in response to other > + * I/O errors such as disk failure. > + */ > + if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP || > + error == VXERROR_CHANNEL_HUP || error == -1) { > + /* > + * Start vDisk IO failover once callback is > + * called against all the pending IOs. > + * If vDisk has no redundancy enabled > + * then IO failover routine will mark > + * the vDisk failed and fail all the > + * AIOs without retry (stateless vDisk) > + */ > + VXHS_SPIN_LOCK(s->vdisk_lock); > + if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { > + OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s); > + } > + /* > + * Check if this acb is already queued before. > + * It is possible in case if I/Os are submitted > + * in multiple segments (QNIO_MAX_IO_SIZE). > + */ > + VXHS_SPIN_LOCK(s->vdisk_acb_lock); > + if (!OF_AIOCB_FLAGS_QUEUED(acb)) { > + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, > + acb, retry_entry); > + OF_AIOCB_FLAGS_SET_QUEUED(acb); > + s->vdisk_aio_retry_qd++; > + trace_vxhs_iio_callback_retry(s->vdisk_guid, acb); > + } > + segcount = --acb->segments; > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > + /* > + * Decrement AIO count only when callback is called > + * against all the segments of aiocb. 
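To expand on the QemuSpin suggestion above: a QemuSpin can be embedded
directly in BDRVVXHSState, so the dynamic allocation and the whole set of
VXHS_SPIN_LOCK* wrappers around the qnio lock calls can go away. A minimal
sketch of one helper, assuming vdisk_acb_lock becomes "QemuSpin
vdisk_acb_lock;" initialized once in vxhs_open() with qemu_spin_init():

    #include "qemu/thread.h"

    void vxhs_inc_acb_segment_count(void *ptr, int count)
    {
        VXHSAIOCB *acb = ptr;
        BDRVVXHSState *s = acb->common.bs->opaque;

        qemu_spin_lock(&s->vdisk_acb_lock);
        acb->segments += count;
        qemu_spin_unlock(&s->vdisk_acb_lock);
    }

The other helpers follow the same pattern, and VXHS_SPIN_LOCK_ALLOC /
VXHS_SPIN_LOCK_DESTROY then have no equivalent to replace.
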
> + */ > + if (segcount == 0 && --s->vdisk_aio_count == 0) { > + /* > + * Start vDisk I/O failover > + */ > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + /* > + * TODO: > + * Need to explore further if it is possible to optimize > + * the failover operation on Virtual-Machine (global) > + * specific rather vDisk specific. > + */ > + vxhs_failover_io(s); > + goto out; > + } > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto out; > + } > + } else if (reason == IIO_REASON_HUP) { > + /* > + * Channel failed, spontaneous notification, > + * not in response to I/O > + */ > + trace_vxhs_iio_callback_chnlfail(error); > + /* > + * TODO: Start channel failover when no I/O is outstanding > + */ > + goto out; > + } else { > + trace_vxhs_iio_callback_fail(reason, acb, acb->segments, > + acb->size, error); > + } > + } > + /* > + * Set error into acb if not set. In case if acb is being > + * submitted in multiple segments then need to set the error > + * only once. > + * > + * Once acb done callback is called for the last segment > + * then acb->ret return status will be sent back to the > + * caller. > + */ > + VXHS_SPIN_LOCK(s->vdisk_acb_lock); > + if (error && !acb->ret) { > + acb->ret = error; > + } > + --acb->segments; > + segcount = acb->segments; > + assert(segcount >= 0); > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > + /* > + * Check if all the outstanding I/Os are done against acb. > + * If yes then send signal for AIO completion. > + */ > + if (segcount == 0) { > + rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); > + if (rv != sizeof(acb)) { > + error_report("VXHS AIO completion failed: %s", strerror(errno)); > + abort(); > + } > + } > + break; > + > + case IRP_VDISK_CHECK_IO_FAILOVER_READY: > + /* ctx is BDRVVXHSState* */ > + assert(ctx); > + trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid, > + error); > + vxhs_failover_ioctl_cb(error, ctx); > + break; > + > + default: > + if (reason == IIO_REASON_HUP) { > + /* > + * Channel failed, spontaneous notification, > + * not in response to I/O > + */ > + trace_vxhs_iio_callback_chnfail(error, errno); > + /* > + * TODO: Start channel failover when no I/O is outstanding > + */ > + } else { > + trace_vxhs_iio_callback_unknwn(opcode, error); > + } > + break; > + } > +out: > + return; > +} > + > +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) > +{ > + BlockCompletionFunc *cb = acb->common.cb; > + void *opaque = acb->common.opaque; > + int ret = 0; > + > + if (acb->ret != 0) { > + trace_vxhs_complete_aio(acb, acb->ret); > + /* > + * We mask all the IO errors generically as EIO for upper layers > + * Right now our IO Manager uses non standard error codes. Instead > + * of confusing upper layers with incorrect interpretation we are > + * doing this workaround. > + */ > + ret = (-EIO); > + } > + /* > + * Copy back contents from stablization buffer into original iovector > + * before returning the IO > + */ > + if (acb->buffer != NULL) { > + qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size); > + free(acb->buffer); > + acb->buffer = NULL; > + } > + vxhs_dec_vdisk_iocount(s, 1); > + acb->aio_done = VXHS_IO_COMPLETED; > + qemu_aio_unref(acb); > + cb(opaque, ret); > +} > + > +/* > + * This is the HyperScale event handler registered to QEMU. > + * It is invoked when any IO gets completed and written on pipe > + * by callback called from QNIO thread context. Then it marks > + * the AIO as completed, and releases HyperScale AIO callbacks. 
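On the completion path quoted above: writing the acb pointer down a pipe
from the qnio callback thread works, but a bottom half would avoid the
read()/write() round trip and the partial-read bookkeeping, since
qemu_bh_schedule() may be called from any thread. A rough sketch, where
completed_q, completed_lock, completed_entry and completion_bh are all
hypothetical additions to the driver state:

    static void vxhs_completion_bh(void *opaque)
    {
        BDRVVXHSState *s = opaque;
        VXHSAIOCB *acb;

        /* Drain completions queued by the qnio callback thread. */
        qemu_mutex_lock(&s->completed_lock);
        while ((acb = QSIMPLEQ_FIRST(&s->completed_q)) != NULL) {
            QSIMPLEQ_REMOVE_HEAD(&s->completed_q, completed_entry);
            qemu_mutex_unlock(&s->completed_lock);
            vxhs_complete_aio(acb, s);  /* now in QEMU thread context */
            qemu_mutex_lock(&s->completed_lock);
        }
        qemu_mutex_unlock(&s->completed_lock);
    }

vxhs_iio_callback() would then enqueue the acb under completed_lock and
call qemu_bh_schedule(s->completion_bh) instead of qemu_write_full(), with
the BH created in vxhs_open() via aio_bh_new(bdrv_get_aio_context(bs),
vxhs_completion_bh, s). Just a sketch, not tested.
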
> + */ > +void vxhs_aio_event_reader(void *opaque) > +{ > + BDRVVXHSState *s = opaque; > + ssize_t ret; > + > + do { > + char *p = (char *)&s->qnio_event_acb; > + > + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, > + sizeof(s->qnio_event_acb) - s->event_reader_pos); > + if (ret > 0) { > + s->event_reader_pos += ret; > + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { > + s->event_reader_pos = 0; > + vxhs_complete_aio(s->qnio_event_acb, s); > + } > + } > + } while (ret < 0 && errno == EINTR); > +} > + > +/* > + * Call QNIO operation to create channels to do IO on vDisk. > + */ > + > +void *vxhs_setup_qnio(void) > +{ > + void *qnio_ctx = NULL; > + > + qnio_ctx = qnio_iio_init(vxhs_iio_callback); > + > + if (qnio_ctx != NULL) { > + trace_vxhs_setup_qnio(qnio_ctx); > + } else { > + trace_vxhs_setup_qnio_nwerror('.'); > + } > + > + return qnio_ctx; > +} > + > +static QemuOptsList runtime_opts = { > + .name = "vxhs", > + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > + .desc = { > + { > + .name = VXHS_OPT_FILENAME, > + .type = QEMU_OPT_STRING, > + .help = "URI to the Veritas HyperScale image", > + }, > + { > + .name = VXHS_OPT_VDISK_ID, > + .type = QEMU_OPT_STRING, > + .help = "UUID of the VxHS vdisk", > + }, > + { /* end of list */ } > + }, > +}; > + > +static QemuOptsList runtime_tcp_opts = { > + .name = "vxhs_tcp", > + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), > + .desc = { > + { > + .name = VXHS_OPT_HOST, > + .type = QEMU_OPT_STRING, > + .help = "host address (ipv4 addresses)", > + }, > + { > + .name = VXHS_OPT_PORT, > + .type = QEMU_OPT_NUMBER, > + .help = "port number on which VxHSD is listening (default 9999)", > + .def_value_str = "9999" > + }, > + { > + .name = "to", > + .type = QEMU_OPT_NUMBER, > + .help = "max port number, not supported by VxHS", > + }, > + { > + .name = "ipv4", > + .type = QEMU_OPT_BOOL, > + .help = "ipv4 bool value, not supported by VxHS", > + }, > + { > + .name = "ipv6", > + .type = QEMU_OPT_BOOL, > + .help = "ipv6 bool value, not supported by VxHS", > + }, > + { /* end of list */ } > + }, > +}; > + > +/* > + * Parse the incoming URI and populate *options with all the host(s) > + * information. Host at index 0 is local storage agent. > + * Remaining are the reflection target storage agents. The local storage > agent > + * ip is the efficient internal address in the uri, e.g. 192.168.0.2. > + * The local storage agent address is stored at index 0. The reflection > target > + * ips, are the E-W data network addresses of the reflection node agents, > also > + * extracted from the uri. 
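Two notes on the URI parser that follows. First, a concrete example,
going by the error message further down: a filename such as

    vxhs://192.168.0.1:9999/{my-vdisk-id%7Dvxhs://192.168.0.2:9999/

(the closing '}' is URL-encoded as %7D, which is what the g_strsplit()
below keys on) should end up as server.0.host, server.0.port,
server.1.host, server.1.port and vdisk_id entries in *options.

Second, a bug: vdisk_id is g_free()d at the bottom of every loop
iteration but only reassigned when i == 0, so any URI with more than one
host frees the same pointer twice. Reset it after freeing:

    g_free(vdisk_id);
    vdisk_id = NULL;    /* avoid double free on the next iteration */
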
> + */ > +static int vxhs_parse_uri(const char *filename, QDict *options) > +{ > + gchar **target_list; > + URI *uri = NULL; > + char *hoststr, *portstr; > + char *vdisk_id = NULL; > + char *port; > + int ret = 0; > + int i = 0; > + > + trace_vxhs_parse_uri_filename(filename); > + target_list = g_strsplit(filename, "%7D", 0); > + assert(target_list != NULL && target_list[0] != NULL); > + > + for (i = 0; target_list[i] != NULL && *target_list[i]; i++) { > + uri = uri_parse(target_list[i]); > + if (!uri || !uri->server) { > + uri_free(uri); > + ret = -EINVAL; > + break; > + } > + > + hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i); > + qdict_put(options, hoststr, qstring_from_str(uri->server)); > + > + portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i); > + if (uri->port) { > + port = g_strdup_printf("%d", uri->port); > + qdict_put(options, portstr, qstring_from_str(port)); > + g_free(port); > + } > + > + if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) { > + vdisk_id = g_strdup_printf("%s%c", uri->path, '}'); > + qdict_put(options, "vdisk_id", qstring_from_str(vdisk_id)); > + } > + > + trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port); > + g_free(hoststr); > + g_free(portstr); > + g_free(vdisk_id); > + uri_free(uri); > + } > + > + g_strfreev(target_list); > + return ret; > +} > + > +static void vxhs_parse_filename(const char *filename, QDict *options, > + Error **errp) > +{ > + if (qdict_haskey(options, "host") > + || qdict_haskey(options, "port") > + || qdict_haskey(options, "path")) > + { > + error_setg(errp, "host/port/path and a file name may not be > specified " > + "at the same time"); > + return; > + } > + > + if (strstr(filename, "://")) { > + int ret = vxhs_parse_uri(filename, options); > + if (ret < 0) { > + error_setg(errp, "Invalid URI. 
URI should be of the form " > + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); > + } > + } > +} > + > +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, > + int *cfd, int *rfd, Error **errp) > +{ > + QDict *backing_options = NULL; > + QemuOpts *opts, *tcp_opts; > + const char *vxhs_filename; > + char *of_vsa_addr = NULL; > + Error *local_err = NULL; > + const char *vdisk_id_opt; > + char *file_name = NULL; > + size_t num_servers = 0; > + char *str = NULL; > + int ret = 0; > + int i; > + > + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); > + qemu_opts_absorb_qdict(opts, options, &local_err); > + if (local_err) { > + ret = -EINVAL; > + goto out; > + } > + > + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); > + if (vxhs_filename) { > + trace_vxhs_qemu_init_filename(vxhs_filename); > + } > + > + vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); > + if (!vdisk_id_opt) { > + error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID); > + ret = -EINVAL; > + goto out; > + } > + s->vdisk_guid = g_strdup(vdisk_id_opt); > + trace_vxhs_qemu_init_vdisk(vdisk_id_opt); > + > + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); > + if (num_servers < 1) { > + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); > + ret = -EINVAL; > + goto out; > + } else if (num_servers > 4) { > + error_setg(&local_err, QERR_INVALID_PARAMETER, "server"); > + error_append_hint(errp, "Maximum 4 servers allowed.\n"); > + ret = -EINVAL; > + goto out; > + } > + trace_vxhs_qemu_init_numservers(num_servers); > + > + for (i = 0; i < num_servers; i++) { > + str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i); > + qdict_extract_subqdict(options, &backing_options, str); > + > + /* Create opts info from runtime_tcp_opts list */ > + tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, > &error_abort); > + qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err); > + if (local_err) { > + qdict_del(backing_options, str); > + qemu_opts_del(tcp_opts); > + g_free(str); > + ret = -EINVAL; > + goto out; > + } > + > + s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(tcp_opts, > + VXHS_OPT_HOST)); > + s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(tcp_opts, > + > VXHS_OPT_PORT), > + NULL, 0); > + > + s->vdisk_hostinfo[i].qnio_cfd = -1; > + s->vdisk_hostinfo[i].vdisk_rfd = -1; > + trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip, > + s->vdisk_hostinfo[i].port); > + > + qdict_del(backing_options, str); > + qemu_opts_del(tcp_opts); > + g_free(str); > + } > + > + s->vdisk_nhosts = i; > + s->vdisk_cur_host_idx = 0; > + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); > + of_vsa_addr = g_strdup_printf("of://%s:%d", > + > s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip, > + > s->vdisk_hostinfo[s->vdisk_cur_host_idx].port); > + > + /* > + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. 
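One inconsistency in vxhs_qemu_init() above: in the num_servers > 4 case
the error is set in local_err, but the hint is appended to errp, which
carries no error yet at that point. The hint belongs on the same Error
object that is propagated later:

    error_setg(&local_err, QERR_INVALID_PARAMETER, "server");
    error_append_hint(&local_err, "Maximum 4 servers allowed.\n");

Also, the errno = -ret assignment at the out: label below is unusual for
a block driver; callers should rely on the return value and errp.
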
> + */ > + if (global_qnio_ctx == NULL) { > + global_qnio_ctx = vxhs_setup_qnio(); > + if (global_qnio_ctx == NULL) { > + error_setg(&local_err, "Failed vxhs_setup_qnio"); > + ret = -EINVAL; > + goto out; > + } > + } > + > + *cfd = qnio_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0); > + if (*cfd < 0) { > + error_setg(&local_err, "Failed qnio_open_iio_conn"); > + ret = -EIO; > + goto out; > + } > + *rfd = qnio_iio_devopen(global_qnio_ctx, *cfd, file_name, 0); > + if (*rfd < 0) { > + qnio_iio_close(global_qnio_ctx, *cfd); > + *cfd = -1; > + error_setg(&local_err, "Failed qnio_iio_devopen"); > + ret = -EIO; > + goto out; > + } > + > +out: > + g_free(file_name); > + g_free(of_vsa_addr); > + qemu_opts_del(opts); > + > + if (ret < 0) { > + for (i = 0; i < num_servers; i++) { > + g_free(s->vdisk_hostinfo[i].hostip); > + } > + g_free(s->vdisk_guid); > + s->vdisk_guid = NULL; > + errno = -ret; > + } > + error_propagate(errp, local_err); > + return ret; > +} > + > +int vxhs_open(BlockDriverState *bs, QDict *options, > + int bdrv_flags, Error **errp) > +{ > + BDRVVXHSState *s = bs->opaque; > + AioContext *aio_context; > + int qemu_qnio_cfd = -1; > + int device_opened = 0; > + int qemu_rfd = -1; > + int ret = 0; > + int i; > + > + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); > + if (ret < 0) { > + trace_vxhs_open_fail(ret); > + return ret; > + } else { > + device_opened = 1; > + } > + > + s->qnio_ctx = global_qnio_ctx; > + s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd; > + s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd; > + s->vdisk_size = 0; > + QSIMPLEQ_INIT(&s->vdisk_aio_retryq); > + > + /* > + * Create a pipe for communicating between two threads in different > + * context. Set handler for read event, which gets triggered when > + * IO completion is done by non-QEMU context. > + */ > + ret = qemu_pipe(s->fds); > + if (ret < 0) { > + trace_vxhs_open_epipe('.'); > + ret = -errno; > + goto errout; > + } > + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); > + > + aio_context = bdrv_get_aio_context(bs); > + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], > + false, vxhs_aio_event_reader, NULL, s); > + > + /* > + * Allocate/Initialize the spin-locks. > + * > + * NOTE: > + * Since spin lock is being allocated > + * dynamically hence moving acb struct > + * specific lock to BDRVVXHSState > + * struct. The reason being, > + * we don't want the overhead of spin > + * lock being dynamically allocated and > + * freed for every AIO. > + */ > + s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC; > + s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC; > + > + return 0; > + > +errout: > + /* > + * Close remote vDisk device if it was opened before > + */ > + if (device_opened) { > + for (i = 0; i < s->vdisk_nhosts; i++) { > + if (s->vdisk_hostinfo[i].vdisk_rfd >= 0) { > + qnio_iio_devclose(s->qnio_ctx, 0, > + s->vdisk_hostinfo[i].vdisk_rfd); > + s->vdisk_hostinfo[i].vdisk_rfd = -1; > + } > + /* > + * close QNIO channel against cached channel open-fd > + */ > + if (s->vdisk_hostinfo[i].qnio_cfd >= 0) { > + qnio_iio_close(s->qnio_ctx, > + s->vdisk_hostinfo[i].qnio_cfd); > + s->vdisk_hostinfo[i].qnio_cfd = -1; > + } > + } > + } > + trace_vxhs_open_fail(ret); > + return ret; > +} > + > +static const AIOCBInfo vxhs_aiocb_info = { > + .aiocb_size = sizeof(VXHSAIOCB) > +}; > + > +/* > + * This allocates QEMU-VXHS callback for each IO > + * and is passed to QNIO. When QNIO completes the work, > + * it will be passed back through the callback. 
> + */ > +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, > + BlockCompletionFunc *cb, > + void *opaque, int iodir) > +{ > + VXHSAIOCB *acb = NULL; > + BDRVVXHSState *s = bs->opaque; > + size_t size; > + uint64_t offset; > + int iio_flags = 0; > + int ret = 0; > + > + offset = sector_num * BDRV_SECTOR_SIZE; > + size = nb_sectors * BDRV_SECTOR_SIZE; > + > + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); > + /* > + * Setup or initialize VXHSAIOCB. > + * Every single field should be initialized since > + * acb will be picked up from the slab without > + * initializing with zero. > + */ > + acb->io_offset = offset; > + acb->size = size; > + acb->ret = 0; > + acb->flags = 0; > + acb->aio_done = VXHS_IO_INPROGRESS; > + acb->segments = 0; > + acb->buffer = 0; > + acb->qiov = qiov; > + acb->direction = iodir; > + > + VXHS_SPIN_LOCK(s->vdisk_lock); > + if (OF_VDISK_FAILED(s)) { > + trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto errout; > + } > + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { > + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); > + s->vdisk_aio_retry_qd++; > + OF_AIOCB_FLAGS_SET_QUEUED(acb); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1); > + goto out; > + } > + s->vdisk_aio_count++; > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + > + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); > + > + switch (iodir) { > + case VDISK_AIO_WRITE: > + vxhs_inc_acb_segment_count(acb, 1); > + ret = qnio_iio_writev(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + qiov->iov, qiov->niov, offset, (void *)acb, iio_flags); > + break; > + case VDISK_AIO_READ: > + vxhs_inc_acb_segment_count(acb, 1); > + ret = qnio_iio_readv(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + qiov->iov, qiov->niov, offset, (void *)acb, iio_flags); > + break; > + default: > + trace_vxhs_aio_rw_invalid(iodir); > + goto errout; > + } > + > + if (ret != 0) { > + trace_vxhs_aio_rw_ioerr( > + s->vdisk_guid, iodir, size, offset, > + acb, acb->segments, ret, errno); > + /* > + * Don't retry I/Os against vDisk having no > + * redundancy or stateful storage on compute > + * > + * TODO: Revisit this code path to see if any > + * particular error needs to be handled. > + * At this moment failing the I/O. > + */ > + VXHS_SPIN_LOCK(s->vdisk_lock); > + if (s->vdisk_nhosts == 1) { > + trace_vxhs_aio_rw_iofail(s->vdisk_guid); > + s->vdisk_aio_count--; > + vxhs_dec_acb_segment_count(acb, 1); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto errout; > + } > + if (OF_VDISK_FAILED(s)) { > + trace_vxhs_aio_rw_devfail( > + s->vdisk_guid, iodir, size, offset); > + s->vdisk_aio_count--; > + vxhs_dec_acb_segment_count(acb, 1); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto errout; > + } > + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { > + /* > + * Queue all incoming io requests after failover starts. > + * Number of requests that can arrive is limited by io queue > depth > + * so an app blasting independent ios will not exhaust memory. 
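In the switch above, the default: arm jumps to errout after
s->vdisk_aio_count has already been incremented, so an invalid iodir
would leak the in-flight counter that the failover logic later waits on.
Something like:

    default:
        VXHS_SPIN_LOCK(s->vdisk_lock);
        s->vdisk_aio_count--;
        VXHS_SPIN_UNLOCK(s->vdisk_lock);
        trace_vxhs_aio_rw_invalid(iodir);
        goto errout;

In practice iodir is always VDISK_AIO_READ or VDISK_AIO_WRITE from the
two wrappers, so this is defensive, but the invariant should still hold.
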
> + */ > + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); > + s->vdisk_aio_retry_qd++; > + OF_AIOCB_FLAGS_SET_QUEUED(acb); > + s->vdisk_aio_count--; > + vxhs_dec_acb_segment_count(acb, 1); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2); > + goto out; > + } > + OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s); > + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); > + s->vdisk_aio_retry_qd++; > + OF_AIOCB_FLAGS_SET_QUEUED(acb); > + vxhs_dec_acb_segment_count(acb, 1); > + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3); > + /* > + * Start I/O failover if there is no active > + * AIO within vxhs block driver. > + */ > + if (--s->vdisk_aio_count == 0) { > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + /* > + * Start IO failover > + */ > + vxhs_failover_io(s); > + goto out; > + } > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + } > + > +out: > + return &acb->common; > + > +errout: > + qemu_aio_unref(acb); > + return NULL; > +} > + > +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, > + BlockCompletionFunc *cb, void *opaque) > +{ > + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, > + cb, opaque, VDISK_AIO_READ); > +} > + > +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, > + int nb_sectors, > + BlockCompletionFunc *cb, void *opaque) > +{ > + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, > + cb, opaque, VDISK_AIO_WRITE); > +} > + > +/* > + * This is called by QEMU when a flush gets triggered from within > + * a guest at the block layer, either for IDE or SCSI disks. > + */ > +int vxhs_co_flush(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + uint64_t size = 0; > + int ret = 0; > + > + /* > + * VDISK_AIO_FLUSH ioctl is a no-op at present. > + */ > + ret = qnio_iio_ioctl(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC); > + > + if (ret < 0) { > + /* > + * Currently not handling the flush ioctl > + * failure because of network connection > + * disconnect. Since all the writes are > + * commited into persistent storage hence > + * this flush call is noop and we can safely > + * return success status to the caller. > + * > + * If any write failure occurs for inflight > + * write AIO because of network disconnect > + * then anyway IO failover will be triggered. > + */ > + trace_vxhs_co_flush(s->vdisk_guid, ret, errno); > + ret = 0; > + } > + > + return ret; > +} > + > +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) > +{ > + void *ctx = NULL; > + int flags = 0; > + int64_t vdisk_size = 0; > + int ret = 0; > + > + ret = qnio_iio_ioctl(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + VDISK_STAT, &vdisk_size, ctx, flags); > + > + if (ret < 0) { > + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); > + return 0; > + } > + > + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); > + return vdisk_size; > +} > + > +/* > + * Returns the size of vDisk in bytes. This is required > + * by QEMU block upper block layer so that it is visible > + * to guest. 
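vxhs_get_vdisk_stat() above returns unsigned long while the value it
carries is an int64_t; on 32-bit hosts that truncates large disk sizes.
The natural fix is to change the return type (here and in the vxhs.h
prototype):

    int64_t vxhs_get_vdisk_stat(BDRVVXHSState *s)
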
> + */ > +int64_t vxhs_getlength(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + int64_t vdisk_size = 0; > + > + if (s->vdisk_size > 0) { > + vdisk_size = s->vdisk_size; > + } else { > + /* > + * Fetch the vDisk size using stat ioctl > + */ > + vdisk_size = vxhs_get_vdisk_stat(s); > + if (vdisk_size > 0) { > + s->vdisk_size = vdisk_size; > + } > + } > + > + if (vdisk_size > 0) { > + return vdisk_size; /* return size in bytes */ > + } else { > + return -EIO; > + } > +} > + > +/* > + * Returns actual blocks allocated for the vDisk. > + * This is required by qemu-img utility. > + */ > +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + int64_t vdisk_size = 0; > + > + if (s->vdisk_size > 0) { > + vdisk_size = s->vdisk_size; > + } else { > + /* > + * TODO: > + * Once HyperScale storage-virtualizer provides > + * actual physical allocation of blocks then > + * fetch that information and return back to the > + * caller but for now just get the full size. > + */ > + vdisk_size = vxhs_get_vdisk_stat(s); > + if (vdisk_size > 0) { > + s->vdisk_size = vdisk_size; > + } > + } > + > + if (vdisk_size > 0) { > + return vdisk_size; /* return size in bytes */ > + } else { > + return -EIO; > + } > +} > + > +void vxhs_close(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + int i; > + > + trace_vxhs_close(s->vdisk_guid); > + close(s->fds[VDISK_FD_READ]); > + close(s->fds[VDISK_FD_WRITE]); > + > + /* > + * Clearing all the event handlers for oflame registered to QEMU > + */ > + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > + false, NULL, NULL, NULL); > + > + if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) { > + qnio_iio_devclose(s->qnio_ctx, 0, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd); > + } > + if (s->vdisk_lock) { > + VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock); > + s->vdisk_lock = NULL; > + } > + if (s->vdisk_acb_lock) { > + VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock); > + s->vdisk_acb_lock = NULL; > + } > + > + g_free(s->vdisk_guid); > + s->vdisk_guid = NULL; > + > + for (i = 0; i < VXHS_MAX_HOSTS; i++) { > + /* > + * Close vDisk device > + */ > + if (s->vdisk_hostinfo[i].vdisk_rfd >= 0) { > + qnio_iio_devclose(s->qnio_ctx, 0, > + s->vdisk_hostinfo[i].vdisk_rfd); > + s->vdisk_hostinfo[i].vdisk_rfd = -1; > + } > + > + /* > + * Close Iridium channel against cached channel-fd > + */ > + if (s->vdisk_hostinfo[i].qnio_cfd >= 0) { > + qnio_iio_close(s->qnio_ctx, > + s->vdisk_hostinfo[i].qnio_cfd); > + s->vdisk_hostinfo[i].qnio_cfd = -1; > + } > + > + /* > + * Free hostip string which is allocated dynamically > + */ > + g_free(s->vdisk_hostinfo[i].hostip); > + s->vdisk_hostinfo[i].hostip = NULL; > + s->vdisk_hostinfo[i].port = 0; > + } > +} > + > +/* > + * If errors are consistent with storage agent failure: > + * - Try to reconnect in case error is transient or storage agent restarted. > + * - Currently failover is being triggered on per vDisk basis. There is > + * a scope of further optimization where failover can be global (per VM). > + * - In case of network (storage agent) failure, for all the vDisks, having > + * no redundancy, I/Os will be failed without attempting for I/O failover > + * because of stateless nature of vDisk. > + * - If local or source storage agent is down then send an ioctl to remote > + * storage agent to check if remote storage agent in a state to accept > + * application I/Os. > + * - Once remote storage agent is ready to accept I/O, start I/O shipping. 
> + * - If I/Os cannot be serviced then vDisk will be marked failed so that > + * new incoming I/Os are returned with failure immediately. > + * - If vDisk I/O failover is in progress then all new/inflight I/Os will > + * queued and will be restarted or failed based on failover operation > + * is successful or not. > + * - I/O failover can be started either in I/O forward or I/O backward > + * path. > + * - I/O failover will be started as soon as all the pending acb(s) > + * are queued and there is no pending I/O count. > + * - If I/O failover couldn't be completed within QNIO_CONNECT_TIMOUT_SECS > + * then vDisk will be marked failed and all I/Os will be completed with > + * error. > + */ > + > +int vxhs_switch_storage_agent(BDRVVXHSState *s) > +{ > + int res = 0; > + int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE); > + > + trace_vxhs_switch_storage_agent( > + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip, > + s->vdisk_guid); > + > + res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx); > + if (res == 0) { > + res = qnio_iio_ioctl(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd, > + VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags); > + } else { > + trace_vxhs_switch_storage_agent_failed( > + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip, > + s->vdisk_guid, res, errno); > + /* > + * TODO: calling vxhs_failover_ioctl_cb from here ties up the qnio > epoll > + * loop if qnio_iio_ioctl fails synchronously (-1) for all hosts in > io > + * target list. > + */ > + > + /* try next host */ > + vxhs_failover_ioctl_cb(res, s); > + } > + return res; > +} > + > +void vxhs_failover_ioctl_cb(int res, void *ctx) > +{ > + BDRVVXHSState *s = ctx; > + > + if (res == 0) { > + /* found failover target */ > + s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx; > + s->vdisk_ask_failover_idx = 0; > + trace_vxhs_failover_ioctl_cb( > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip, > + s->vdisk_guid); > + VXHS_SPIN_LOCK(s->vdisk_lock); > + OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + vxhs_handle_queued_ios(s); > + } else { > + /* keep looking */ > + trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid); > + s->vdisk_ask_failover_idx++; > + if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) { > + /* pause and cycle through list again */ > + sleep(QNIO_CONNECT_RETRY_SECS); > + s->vdisk_ask_failover_idx = 0; > + } > + res = vxhs_switch_storage_agent(s); > + } > +} > + > +int vxhs_failover_io(BDRVVXHSState *s) > +{ > + int res = 0; > + > + trace_vxhs_failover_io(s->vdisk_guid); > + > + s->vdisk_ask_failover_idx = 0; > + res = vxhs_switch_storage_agent(s); > + > + return res; > +} > + > +/* > + * Try to reopen the vDisk on one of the available hosts > + * If vDisk reopen is successful on any of the host then > + * check if that node is ready to accept I/O. > + */ > +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index) > +{ > + char *of_vsa_addr = NULL; > + char *file_name = NULL; > + int res = 0; > + > + > + /* > + * Close stale vdisk device remote fd since > + * it could be invalid fd after channel disconnect. > + * Reopen the vdisk to get the new fd. > + */ > + if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) { > + qnio_iio_devclose(s->qnio_ctx, 0, > + s->vdisk_hostinfo[index].vdisk_rfd); > + s->vdisk_hostinfo[index].vdisk_rfd = -1; > + } > + > + /* > + * As part of vDisk reopen, close the QNIO channel > + * against cached channel-fd (fd is being cached into > + * vDisk hostinfo). 
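Two problems in the failover loop above: vxhs_failover_ioctl_cb() calls
sleep(QNIO_CONNECT_RETRY_SECS), which blocks whichever thread runs the
callback (the TODO in vxhs_switch_storage_agent() already notes this can
be the qnio epoll loop), and the two functions call each other, so a long
outage becomes unbounded recursion. A QEMUTimer would keep the retry
asynchronous and flatten the recursion; a rough sketch, where retry_timer
is a hypothetical new BDRVVXHSState field created with aio_timer_new():

    static void vxhs_failover_retry_cb(void *opaque)
    {
        BDRVVXHSState *s = opaque;

        s->vdisk_ask_failover_idx = 0;
        vxhs_switch_storage_agent(s);
    }

    /* In vxhs_failover_ioctl_cb(), instead of sleep(): */
    timer_mod(s->retry_timer,
              qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
              QNIO_CONNECT_RETRY_SECS * NANOSECONDS_PER_SECOND);
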
> + */ > + if (s->vdisk_hostinfo[index].qnio_cfd >= 0) { > + qnio_iio_close(s->qnio_ctx, > + s->vdisk_hostinfo[index].qnio_cfd); > + s->vdisk_hostinfo[index].qnio_cfd = -1; > + } > + > + /* > + * Build storage agent address and vdisk device name strings > + */ > + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); > + of_vsa_addr = g_strdup_printf("of://%s:%d", > + s->vdisk_hostinfo[index].hostip, s->vdisk_hostinfo[index].port); > + /* > + * Open qnio channel to storage agent if not opened before. > + */ > + if (s->vdisk_hostinfo[index].qnio_cfd < 0) { > + s->vdisk_hostinfo[index].qnio_cfd = > + qnio_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0); > + if (s->vdisk_hostinfo[index].qnio_cfd < 0) { > + trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip); > + res = ENODEV; > + goto out; > + } > + } > + > + /* > + * Open vdisk device > + */ > + s->vdisk_hostinfo[index].vdisk_rfd = > + qnio_iio_devopen(global_qnio_ctx, > + s->vdisk_hostinfo[index].qnio_cfd, file_name, 0); > + > + if (s->vdisk_hostinfo[index].vdisk_rfd < 0) { > + /* > + * Close QNIO channel against cached channel-fd > + */ > + if (s->vdisk_hostinfo[index].qnio_cfd >= 0) { > + qnio_iio_close(s->qnio_ctx, > + s->vdisk_hostinfo[index].qnio_cfd); > + s->vdisk_hostinfo[index].qnio_cfd = -1; > + } > + > + trace_vxhs_reopen_vdisk_openfail(file_name); > + res = EIO; > + goto out; > + } > + > +out: > + g_free(of_vsa_addr); > + g_free(file_name); > + return res; > +} > + > +int vxhs_handle_queued_ios(BDRVVXHSState *s) > +{ > + VXHSAIOCB *acb = NULL; > + int res = 0; > + > + VXHS_SPIN_LOCK(s->vdisk_lock); > + while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) { > + /* > + * Before we process the acb, check whether I/O failover > + * started again due to failback or cascading failure. 
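A small simplification in vxhs_reopen_vdisk() above: the channel fd has
just been closed and reset to -1, so the qnio_cfd < 0 guard around the
reopen is always true and the conditional can be dropped:

    s->vdisk_hostinfo[index].qnio_cfd =
        qnio_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0);
    if (s->vdisk_hostinfo[index].qnio_cfd < 0) {
        trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip);
        res = ENODEV;
        goto out;
    }

Note also that this function returns positive ENODEV/EIO while
vxhs_qemu_init() and vxhs_open() return negative errno values; please
pick one convention.
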
> + */ > + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto out; > + } > + QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry); > + s->vdisk_aio_retry_qd--; > + OF_AIOCB_FLAGS_RESET_QUEUED(acb); > + if (OF_VDISK_FAILED(s)) { > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + vxhs_fail_aio(acb, EIO); > + VXHS_SPIN_LOCK(s->vdisk_lock); > + } else { > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + res = vxhs_restart_aio(acb); > + trace_vxhs_handle_queued_ios(acb, res); > + VXHS_SPIN_LOCK(s->vdisk_lock); > + if (res) { > + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, > + acb, retry_entry); > + OF_AIOCB_FLAGS_SET_QUEUED(acb); > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > + goto out; > + } > + } > + } > + VXHS_SPIN_UNLOCK(s->vdisk_lock); > +out: > + return res; > +} > + > +int vxhs_restart_aio(VXHSAIOCB *acb) > +{ > + BDRVVXHSState *s = NULL; > + int iio_flags = 0; > + int res = 0; > + > + s = acb->common.bs->opaque; > + > + if (acb->direction == VDISK_AIO_WRITE) { > + vxhs_inc_vdisk_iocount(s, 1); > + vxhs_inc_acb_segment_count(acb, 1); > + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); > + res = qnio_iio_writev(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + acb->qiov->iov, acb->qiov->niov, > + acb->io_offset, (void *)acb, iio_flags); > + } > + > + if (acb->direction == VDISK_AIO_READ) { > + vxhs_inc_vdisk_iocount(s, 1); > + vxhs_inc_acb_segment_count(acb, 1); > + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); > + res = qnio_iio_readv(s->qnio_ctx, > + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, > + acb->qiov->iov, acb->qiov->niov, > + acb->io_offset, (void *)acb, iio_flags); > + } > + > + if (res != 0) { > + vxhs_dec_vdisk_iocount(s, 1); > + vxhs_dec_acb_segment_count(acb, 1); > + trace_vxhs_restart_aio(acb->direction, res, errno); > + } > + > + return res; > +} > + > +void vxhs_fail_aio(VXHSAIOCB *acb, int err) > +{ > + BDRVVXHSState *s = NULL; > + int segcount = 0; > + int rv = 0; > + > + s = acb->common.bs->opaque; > + > + trace_vxhs_fail_aio(s->vdisk_guid, acb); > + if (!acb->ret) { > + acb->ret = err; > + } > + VXHS_SPIN_LOCK(s->vdisk_acb_lock); > + segcount = acb->segments; > + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); > + if (segcount == 0) { > + /* > + * Complete the io request > + */ > + rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); > + if (rv != sizeof(acb)) { > + error_report("VXHS AIO completion failed: %s", > + strerror(errno)); > + abort(); > + } > + } > +} > + > +static void vxhs_detach_aio_context(BlockDriverState *bs) > +{ > + BDRVVXHSState *s = bs->opaque; > + > + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], > + false, NULL, NULL, NULL); > + > +} > + > +static void vxhs_attach_aio_context(BlockDriverState *bs, > + AioContext *new_context) > +{ > + BDRVVXHSState *s = bs->opaque; > + > + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], > + false, vxhs_aio_event_reader, NULL, s); > +} > + > +static BlockDriver bdrv_vxhs = { > + .format_name = "vxhs", > + .protocol_name = "vxhs", > + .instance_size = sizeof(BDRVVXHSState), > + .bdrv_file_open = vxhs_open, > + .bdrv_parse_filename = vxhs_parse_filename, > + .bdrv_close = vxhs_close, > + .bdrv_getlength = vxhs_getlength, > + .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks, > + .bdrv_aio_readv = vxhs_aio_readv, > + .bdrv_aio_writev = vxhs_aio_writev, > + .bdrv_co_flush_to_disk = vxhs_co_flush, > + .bdrv_detach_aio_context = vxhs_detach_aio_context, > + .bdrv_attach_aio_context = vxhs_attach_aio_context, > +}; > + > +void 
bdrv_vxhs_init(void) > +{ > + trace_vxhs_bdrv_init('.'); > + bdrv_register(&bdrv_vxhs); > +} > + > +block_init(bdrv_vxhs_init); > diff --git a/block/vxhs.h b/block/vxhs.h > new file mode 100644 > index 0000000..2f3e4de > --- /dev/null > +++ b/block/vxhs.h > @@ -0,0 +1,236 @@ > +/* > + * QEMU Block driver for Veritas HyperScale (VxHS) > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#ifndef VXHSD_H > +#define VXHSD_H > + > +#include <gmodule.h> > +#include <inttypes.h> > +#include <pthread.h> > + > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > +#include "block/block_int.h" > +#include "qemu/uri.h" > +#include "qemu/queue.h" > + > +#define OF_GUID_STR_LEN 40 > +#define OF_GUID_STR_SZ (OF_GUID_STR_LEN + 1) > +#define QNIO_CONNECT_RETRY_SECS 5 > +#define QNIO_CONNECT_TIMOUT_SECS 120 > + > +/* constants from io_qnio.h */ > +#define IIO_REASON_DONE 0x00000004 > +#define IIO_REASON_EVENT 0x00000008 > +#define IIO_REASON_HUP 0x00000010 > + > +/* > + * IO specific flags > + */ > +#define IIO_FLAG_ASYNC 0x00000001 > +#define IIO_FLAG_DONE 0x00000010 > +#define IIO_FLAG_SYNC 0 > + > +/* constants from error.h */ > +#define VXERROR_RETRY_ON_SOURCE 44 > +#define VXERROR_HUP 901 > +#define VXERROR_CHANNEL_HUP 903 > + > +/* constants from iomgr.h and opcode.h */ > +#define IRP_READ_REQUEST 0x1FFF > +#define IRP_WRITE_REQUEST 0x2FFF > +#define IRP_VDISK_CHECK_IO_FAILOVER_READY 2020 > + > +/* Lock specific macros */ > +#define VXHS_SPIN_LOCK_ALLOC \ > + (qnio_ck_initialize_lock()) > +#define VXHS_SPIN_LOCK(lock) \ > + (qnio_ck_spin_lock(lock)) > +#define VXHS_SPIN_UNLOCK(lock) \ > + (qnio_ck_spin_unlock(lock)) > +#define VXHS_SPIN_LOCK_DESTROY(lock) \ > + (qnio_ck_destroy_lock(lock)) > + > +typedef enum { > + VXHS_IO_INPROGRESS, > + VXHS_IO_COMPLETED, > + VXHS_IO_ERROR > +} VXHSIOState; > + > + > +typedef void (*qnio_callback_t)(ssize_t retval, void *arg); > + > +#define VDISK_FD_READ 0 > +#define VDISK_FD_WRITE 1 > + > +#define QNIO_VDISK_NONE 0x00 > +#define QNIO_VDISK_CREATE 0x01 > + > +/* max IO size supported by QEMU NIO lib */ > +#define QNIO_MAX_IO_SIZE 4194304 > + > +#define VXHS_MAX_HOSTS 4 > + > +/* > + * Opcodes for making IOCTL on QEMU NIO library > + */ > +#define BASE_OPCODE_SHARED 1000 > +#define BASE_OPCODE_DAL 2000 > +#define IRP_VDISK_STAT (BASE_OPCODE_SHARED + 5) > +#define IRP_VDISK_GET_GEOMETRY (BASE_OPCODE_DAL + 17) > +#define IRP_VDISK_READ_PARTITION (BASE_OPCODE_DAL + 18) > +#define IRP_VDISK_FLUSH (BASE_OPCODE_DAL + 19) > + > +/* > + * BDRVVXHSState specific flags > + */ > +#define OF_VDISK_FLAGS_STATE_ACTIVE 0x0000000000000001 > +#define OF_VDISK_FLAGS_STATE_FAILED 0x0000000000000002 > +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS 0x0000000000000004 > + > +#define OF_VDISK_ACTIVE(s) \ > + ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE) > +#define OF_VDISK_SET_ACTIVE(s) \ > + ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE) > +#define OF_VDISK_RESET_ACTIVE(s) \ > + ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE) > + > +#define OF_VDISK_FAILED(s) \ > + ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED) > +#define OF_VDISK_SET_FAILED(s) \ > + ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED) > +#define OF_VDISK_RESET_FAILED(s) \ > + ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED) > + > +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s) \ > + ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS) > +#define 
OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s) \ > + ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS) > +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s) \ > + ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS) > + > +/* > + * VXHSAIOCB specific flags > + */ > +#define OF_ACB_QUEUED 0x00000001 > + > +#define OF_AIOCB_FLAGS_QUEUED(a) \ > + ((a)->flags & OF_ACB_QUEUED) > +#define OF_AIOCB_FLAGS_SET_QUEUED(a) \ > + ((a)->flags |= OF_ACB_QUEUED) > +#define OF_AIOCB_FLAGS_RESET_QUEUED(a) \ > + ((a)->flags &= ~OF_ACB_QUEUED) > + > +typedef struct qemu2qnio_ctx { > + uint32_t qnio_flag; > + uint64_t qnio_size; > + char *qnio_channel; > + char *target; > + qnio_callback_t qnio_cb; > +} qemu2qnio_ctx_t; > + > +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t; > + > +typedef struct LibQNIOSymbol { > + const char *name; > + gpointer *addr; > +} LibQNIOSymbol; > + > +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx, > + void *reply); > + > +/* > + * HyperScale AIO callbacks structure > + */ > +typedef struct VXHSAIOCB { > + BlockAIOCB common; > + size_t ret; > + size_t size; > + QEMUBH *bh; > + int aio_done; > + int segments; > + int flags; > + size_t io_offset; > + QEMUIOVector *qiov; > + void *buffer; > + int direction; /* IO direction (r/w) */ > + QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry; > +} VXHSAIOCB; > + > +typedef struct VXHSvDiskHostsInfo { > + int qnio_cfd; /* Channel FD */ > + int vdisk_rfd; /* vDisk remote FD */ > + char *hostip; /* Host's IP addresses */ > + int port; /* Host's port number */ > +} VXHSvDiskHostsInfo; > + > +/* > + * Structure per vDisk maintained for state > + */ > +typedef struct BDRVVXHSState { > + int fds[2]; > + int64_t vdisk_size; > + int64_t vdisk_blocks; > + int64_t vdisk_flags; > + int vdisk_aio_count; > + int event_reader_pos; > + VXHSAIOCB *qnio_event_acb; > + void *qnio_ctx; > + void *vdisk_lock; /* Lock to protect BDRVVXHSState */ > + void *vdisk_acb_lock; /* Protects ACB */ > + VXHSvDiskHostsInfo vdisk_hostinfo[VXHS_MAX_HOSTS]; /* Per host info > */ > + int vdisk_nhosts; /* Total number of hosts */ > + int vdisk_cur_host_idx; /* IOs are being shipped to > */ > + int vdisk_ask_failover_idx; /*asking permsn to ship > io*/ > + QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq; > + int vdisk_aio_retry_qd; /* Currently for debugging */ > + char *vdisk_guid; > +} BDRVVXHSState; > + > +void bdrv_vxhs_init(void); > +void *vxhs_setup_qnio(void); > +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m); > +void vxhs_aio_event_reader(void *opaque); > +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s); > +int vxhs_aio_flush_cb(void *opaque); > +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s); > +int vxhs_open(BlockDriverState *bs, QDict *options, > + int bdrv_flags, Error **errp); > +void vxhs_close(BlockDriverState *bs); > +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num, > + QEMUIOVector *qiov, int nb_sectors, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num, > + QEMUIOVector *qiov, int nb_sectors, > + BlockCompletionFunc *cb, > + void *opaque); > +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs); > +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num, > + QEMUIOVector *qiov, int nb_sectors, > + BlockCompletionFunc *cb, > + void *opaque, int write); > +int vxhs_co_flush(BlockDriverState *bs); > +int64_t vxhs_getlength(BlockDriverState *bs); > +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta); > +void 
vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta); > +uint32_t vxhs_get_vdisk_iocount(void *ptr); > +void vxhs_inc_acb_segment_count(void *ptr, int count); > +void vxhs_dec_acb_segment_count(void *ptr, int count); > +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count); > +void vxhs_set_acb_buffer(void *ptr, void *buffer); > +int vxhs_failover_io(BDRVVXHSState *s); > +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index); > +int vxhs_switch_storage_agent(BDRVVXHSState *s); > +int vxhs_handle_queued_ios(BDRVVXHSState *s); > +int vxhs_restart_aio(VXHSAIOCB *acb); > +void vxhs_fail_aio(VXHSAIOCB *acb, int err); > +void vxhs_failover_ioctl_cb(int res, void *ctx); > + > + > +#endif > diff --git a/configure b/configure > index 5a9bda1..340824c 100755 > --- a/configure > +++ b/configure > @@ -320,6 +320,7 @@ vhdx="" > numa="" > tcmalloc="no" > jemalloc="no" > +vxhs="" > > # parse CC options first > for opt do > @@ -1150,6 +1151,11 @@ for opt do > ;; > --enable-jemalloc) jemalloc="yes" > ;; > + --disable-vxhs) vxhs="no" > + ;; > + --enable-vxhs) vxhs="yes" > + ;; > + > *) > echo "ERROR: unknown option $opt" > echo "Try '$0 --help' for more information" > @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if > available: > numa libnuma support > tcmalloc tcmalloc support > jemalloc jemalloc support > + vxhs Veritas HyperScale vDisk backend support > > NOTE: The object files are built at the place where configure is launched > EOF > @@ -4555,6 +4562,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO > $TMPO; then > fi > > ########################################## > +# Veritas HyperScale block driver VxHS > +# Check if libqnio is installed > +if test "$vxhs" != "no" ; then > + cat > $TMPC <<EOF > +#include <stdio.h> > +#include <qnio/qnio_api.h> > + > +void vxhs_inc_acb_segment_count(void *acb, int count); > +void vxhs_dec_acb_segment_count(void *acb, int count); > +void vxhs_set_acb_buffer(void *ptr, void *buffer); > + > +void vxhs_inc_acb_segment_count(void *ptr, int count) > +{ > +} > +void vxhs_dec_acb_segment_count(void *ptr, int count) > +{ > +} > +void vxhs_set_acb_buffer(void *ptr, void *buffer) > +{ > +} > +int main(void) { > + qnio_ck_initialize_lock(); > + return 0; > +} > +EOF > + vxhs_libs="-lqnioshim -lqnio" > + if compile_prog "" "$vxhs_libs" ; then > + vxhs=yes > + else > + if test "$vxhs" = "yes" ; then > + feature_not_found "vxhs block device" "Install libqnio. See github" > + fi > + vxhs=no > + fi > +fi > + > +########################################## > # End of CC checks > # After here, no more $cc or $ld runs > > @@ -4920,6 +4964,7 @@ echo "NUMA host support $numa" > echo "tcmalloc support $tcmalloc" > echo "jemalloc support $jemalloc" > echo "avx2 optimization $avx2_opt" > +echo "VxHS block device $vxhs" > > if test "$sdl_too_old" = "yes"; then > echo "-> Your SDL version is too old - please upgrade to have SDL support" > @@ -5507,6 +5552,12 @@ if test "$pthread_setname_np" = "yes" ; then > echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak > fi > > +if test "$vxhs" = "yes" ; then > + echo "CONFIG_VXHS=y" >> $config_host_mak > + echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak > + echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak > +fi > + > if test "$tcg_interpreter" = "yes"; then > QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" > elif test "$ARCH" = "sparc64" ; then >
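One last nit on the configure hunk: VXHS_CFLAGS=$vxhs_cflags is written
to config_host_mak, but vxhs_cflags is never assigned anywhere in the
script, so the make variable is always empty; either drop that line or
set vxhs_cflags next to vxhs_libs. Presumably the stub vxhs_* definitions
in the test program are there because libqnioshim expects the application
to provide those callbacks; a brief comment in configure saying so would
help whoever touches this check next.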