Module Name: src Committed By: snj Date: Mon Sep 28 01:25:22 UTC 2009
Modified Files: src/sys/arch/xen/xen [netbsd-5]: xbd_xenbus.c xbdback_xenbus.c Log Message: Pull up following revision(s) (requested by bouyer in ticket #1026): sys/arch/xen/xen/xbd_xenbus.c: revision 1.43 via patch sys/arch/xen/xen/xbdback_xenbus.c: revision 1.25 xbdback: implement and publish "feature-flush-cache". xbd: if feature-flush-cache is present, use it for DIOCCACHESYNC. If not present, make DIOCCACHESYNC return EOPNOTSUPP and warn on first call. Should improve WAPBL reliability of Xen guests on a NetBSD dom0. Unfortunately not all linux guests seem to support this feature, and using feature-write-barrier would require a B_BARRIER flag in the buffer. To generate a diff of this commit: cvs rdiff -u -r1.34.2.2 -r1.34.2.3 src/sys/arch/xen/xen/xbd_xenbus.c cvs rdiff -u -r1.20 -r1.20.4.1 src/sys/arch/xen/xen/xbdback_xenbus.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/xen/xen/xbd_xenbus.c diff -u src/sys/arch/xen/xen/xbd_xenbus.c:1.34.2.2 src/sys/arch/xen/xen/xbd_xenbus.c:1.34.2.3 --- src/sys/arch/xen/xen/xbd_xenbus.c:1.34.2.2 Mon Sep 28 00:42:34 2009 +++ src/sys/arch/xen/xen/xbd_xenbus.c Mon Sep 28 01:25:22 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: xbd_xenbus.c,v 1.34.2.2 2009/09/28 00:42:34 snj Exp $ */ +/* $NetBSD: xbd_xenbus.c,v 1.34.2.3 2009/09/28 01:25:22 snj Exp $ */ /* * Copyright (c) 2006 Manuel Bouyer. @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.34.2.2 2009/09/28 00:42:34 snj Exp $"); +__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.34.2.3 2009/09/28 01:25:22 snj Exp $"); #include "opt_xen.h" #include "rnd.h" @@ -85,11 +85,24 @@ struct xbd_req { SLIST_ENTRY(xbd_req) req_next; uint16_t req_id; /* ID passed to backend */ - grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int req_nr_segments; /* number of segments in this request */ - struct buf *req_bp; /* buffer associated with this request */ - void *req_data; /* pointer to the data buffer */ + union { + struct { + grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int req_nr_segments; /* number of segments in this request */ + struct buf *req_bp; /* buffer associated with this request */ + void *req_data; /* pointer to the data buffer */ + } req_rw; + struct { + int s_error; + volatile int s_done; + } req_sync; + } u; }; +#define req_gntref u.req_rw.req_gntref +#define req_nr_segments u.req_rw.req_nr_segments +#define req_bp u.req_rw.req_bp +#define req_data u.req_rw.req_data +#define req_sync u.req_sync struct xbd_xenbus_softc { device_t sc_dev; @@ -105,6 +118,7 @@ struct xbd_req sc_reqs[XBD_RING_SIZE]; SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */ + bool sc_xbdreq_wait; /* special waiting on xbd_req */ int sc_backend_status; /* our status with backend */ #define BLKIF_STATE_DISCONNECTED 0 @@ -117,6 +131,7 @@ uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */ 
u_long sc_info; /* VDISK_* */ u_long sc_handle; /* from backend */ + int sc_cache_flush; /* backend supports BLKIF_OP_FLUSH_DISKCACHE */ #if NRND > 0 rndsource_element_t sc_rnd_source; #endif @@ -494,6 +509,7 @@ { int err; unsigned long long sectors; + u_long cache_flush; err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10); @@ -517,6 +533,14 @@ if (err) panic("%s: can't read number from %s/sector-size\n", device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend); + err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend, + "feature-flush-cache", &cache_flush, 10); + if (err) + cache_flush = 0; + if (cache_flush > 0) + sc->sc_cache_flush = 1; + else + sc->sc_cache_flush = 0; xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected); } @@ -540,9 +564,16 @@ for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) { blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i); struct xbd_req *xbdreq = &sc->sc_reqs[rep->id]; - bp = xbdreq->req_bp; DPRINTF(("xbd_handler(%p): b_bcount = %ld\n", - bp, (long)bp->b_bcount)); + xbdreq->req_bp, (long)bp->b_bcount)); + bp = xbdreq->req_bp; + if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) { + xbdreq->req_sync.s_error = rep->status; + xbdreq->req_sync.s_done = 1; + wakeup(xbdreq); + /* caller will free the req */ + continue; + } for (seg = xbdreq->req_nr_segments - 1; seg >= 0; seg--) { if (__predict_false( xengnt_status(xbdreq->req_gntref[seg]))) { @@ -584,13 +615,15 @@ biodone(bp); SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next); } +done: x86_lfence(); sc->sc_ring.rsp_cons = i; RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do); if (more_to_do) goto again; -done: dk_iodone(sc->sc_di, &sc->sc_dksc); + if (sc->sc_xbdreq_wait) + wakeup(&sc->sc_xbdreq_wait); return 1; } @@ -690,6 +723,10 @@ struct dk_softc *dksc; int error; struct disk *dk; + int s; + struct xbd_req *xbdreq; + blkif_request_t *req; + int notify; DPRINTF(("xbdioctl(%d, %08lx, %p, %d, %p)\n", dev, cmd, data, flag, 
l)); @@ -704,6 +741,57 @@ case DIOCSSTRATEGY: error = EOPNOTSUPP; break; + case DIOCCACHESYNC: + if (sc->sc_cache_flush <= 0) { + if (sc->sc_cache_flush == 0) { + aprint_error_dev(sc->sc_dev, + "WARNING: cache flush not supported " + "by backend\n"); + sc->sc_cache_flush = -1; + } + return EOPNOTSUPP; + } + + s = splbio(); + + while (RING_FULL(&sc->sc_ring)) { + sc->sc_xbdreq_wait = 1; + tsleep(&sc->sc_xbdreq_wait, PRIBIO, "xbdreq", 0); + } + sc->sc_xbdreq_wait = 0; + + xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head); + if (__predict_false(xbdreq == NULL)) { + DPRINTF(("xbdioctl: no req\n")); + error = ENOMEM; + } else { + SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next); + req = RING_GET_REQUEST(&sc->sc_ring, + sc->sc_ring.req_prod_pvt); + req->id = xbdreq->req_id; + req->operation = BLKIF_OP_FLUSH_DISKCACHE; + req->handle = sc->sc_handle; + xbdreq->req_sync.s_done = 0; + sc->sc_ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, + notify); + if (notify) + hypervisor_notify_via_evtchn(sc->sc_evtchn); + /* request sent, no wait for completion */ + while (xbdreq->req_sync.s_done == 0) { + tsleep(xbdreq, PRIBIO, "xbdsync", 0); + } + if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP) + error = EOPNOTSUPP; + else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY) + error = 0; + else + error = EIO; + SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, + req_next); + } + splx(s); + break; default: error = dk_ioctl(sc->sc_di, dksc, dev, cmd, data, flag, l); break; @@ -761,7 +849,7 @@ } - if (RING_FULL(&sc->sc_ring)) { + if (RING_FULL(&sc->sc_ring) || sc->sc_xbdreq_wait) { DPRINTF(("xbdstart: ring_full\n")); ret = -1; goto out; Index: src/sys/arch/xen/xen/xbdback_xenbus.c diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.20 src/sys/arch/xen/xen/xbdback_xenbus.c:1.20.4.1 --- src/sys/arch/xen/xen/xbdback_xenbus.c:1.20 Fri Oct 24 18:02:58 2008 +++ src/sys/arch/xen/xen/xbdback_xenbus.c Mon Sep 28 01:25:22 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: xbdback_xenbus.c,v 1.20 
2008/10/24 18:02:58 jym Exp $ */ +/* $NetBSD: xbdback_xenbus.c,v 1.20.4.1 2009/09/28 01:25:22 snj Exp $ */ /* * Copyright (c) 2006 Manuel Bouyer. @@ -31,7 +31,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.20 2008/10/24 18:02:58 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.20.4.1 2009/09/28 01:25:22 snj Exp $"); #include <sys/types.h> #include <sys/param.h> @@ -90,6 +90,31 @@ * it's finished, set xbdi->xbdi_cont (see below) to NULL and the return * doesn't matter. Otherwise it's passed as the second parameter to * the new value of xbdi->xbdi_cont. + * Here's how the call graph is supposed to be for a single I/O: + * xbdback_co_main() + * | |-> xbdback_co_cache_doflush() -> stall + * | xbdback_co_cache_flush2() <- xbdback_co_flush_done() <- + * | | | + * | |-> xbdback_co_cache_flush() -> xbdback_co_flush() -- + * xbdback_co_main_loop() -> xbdback_co_main_done() -> xbdback_co_flush() + * | | | + * | xbdback_co_main_done2() <- xbdback_co_flush_done() + * | | + * | xbdback_co_main() or NULL + * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop() + * | + * xbdback_co_io_gotreq() -> xbdback_co_flush() -> xbdback_co_flush() + * | | | + * xbdback_co_io_loop() --- <---------------- xbdback_co_flush_done() + * | | + * xbdback_co_io_gotio() | + * | | + * xbdback_co_io_gotio2()<- + * | |--------> xbdback_co_io_gotfrag + * | | + * xbdback_co_io_gotfrag2() <----------| + * | |--> xbdback_co_io_loop() + * xbdback_co_main_incr() */ typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *); @@ -143,6 +168,7 @@ grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */ /* other state */ int xbdi_same_page; /* are we merging two segments on the same page? */ + uint xbdi_pendingreqs; /* number of I/O in fly */ }; /* Manipulation of the above reference count. */ /* xxx...@panix.com: not MP-safe, and move the i386 asm elsewhere. 
*/ @@ -179,16 +205,35 @@ */ struct xbdback_io { struct work xio_work; - struct buf xio_buf; /* our I/O */ /* The instance pointer is duplicated for convenience. */ struct xbdback_instance *xio_xbdi; /* our xbd instance */ - SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */ - vaddr_t xio_vaddr; /* the virtual address to map the request at */ - grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */ - grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];/* grants release */ - uint16_t xio_nrma; /* number of guest pages */ - uint16_t xio_mapped; + uint8_t xio_operation; + union { + struct { + struct buf xio_buf; /* our I/O */ + /* xbd requests involved */ + SLIST_HEAD(, xbdback_fragment) xio_rq; + /* the virtual address to map the request at */ + vaddr_t xio_vaddr; + /* grants to map */ + grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; + /* grants release */ + grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST]; + uint16_t xio_nrma; /* number of guest pages */ + uint16_t xio_mapped; + } xio_rw; + uint64_t xio_flush_id; + } u; }; +#define xio_buf u.xio_rw.xio_buf +#define xio_rq u.xio_rw.xio_rq +#define xio_vaddr u.xio_rw.xio_vaddr +#define xio_gref u.xio_rw.xio_gref +#define xio_gh u.xio_rw.xio_gh +#define xio_nrma u.xio_rw.xio_nrma +#define xio_mapped u.xio_rw.xio_mapped + +#define xio_flush_id u.xio_flush_id /* * Rather than have the xbdback_io keep an array of the @@ -235,6 +280,10 @@ static void *xbdback_co_main_done(struct xbdback_instance *, void *); static void *xbdback_co_main_done2(struct xbdback_instance *, void *); +static void *xbdback_co_cache_flush(struct xbdback_instance *, void *); +static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *); +static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *); + static void *xbdback_co_io(struct xbdback_instance *, void *); static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *); static void *xbdback_co_io_loop(struct xbdback_instance 
*, void *); @@ -739,6 +788,13 @@ xbusd->xbusd_path, err); goto abort; } + err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache", + "%u", 1); + if (err) { + printf("xbdback: failed to write %s/feature-flush-cache: %d\n", + xbusd->xbusd_path, err); + goto abort; + } err = xenbus_transaction_end(xbt, 0); if (err == EAGAIN) goto again; @@ -858,6 +914,10 @@ case BLKIF_OP_WRITE: xbdi->xbdi_cont = xbdback_co_io; break; + case BLKIF_OP_FLUSH_DISKCACHE: + xbdi_get(xbdi); + xbdi->xbdi_cont = xbdback_co_cache_flush; + break; default: printf("xbdback_evthandler domain %d: unknown " "operation %d\n", xbdi->xbdi_domid, req->operation); @@ -908,6 +968,50 @@ } static void * +xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj) +{ + (void)obj; + XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj)); + if (xbdi->xbdi_io != NULL) { + xbdi->xbdi_cont = xbdback_co_flush; + xbdi->xbdi_cont_aux = xbdback_co_cache_flush2; + } else { + xbdi->xbdi_cont = xbdback_co_cache_flush2; + } + return xbdi; +} + +static void * +xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj) +{ + (void)obj; + XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj)); + if (xbdi->xbdi_pendingreqs > 0) { + /* event or iodone will restart processing */ + xbdi->xbdi_cont = NULL; + return NULL; + } + xbdi->xbdi_cont = xbdback_co_cache_doflush; + return xbdback_pool_get(&xbdback_io_pool, xbdi); +} + +static void * +xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj) +{ + struct xbdback_io *xbd_io; + + XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj)); + xbd_io = xbdi->xbdi_io = obj; + xbd_io->xio_xbdi = xbdi; + xbd_io->xio_operation = xbdi->xbdi_xen_req.operation; + xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id; + workqueue_enqueue(xbdback_workqueue, &xbdi->xbdi_io->xio_work, NULL); + /* xbdback_do_io() will advance req pointer and restart processing */ + xbdi->xbdi_cont = xbdback_co_cache_doflush; + return NULL; +} + +static void * xbdback_co_io(struct 
xbdback_instance *xbdi, void *obj) { int error; @@ -1048,7 +1152,6 @@ if (xbdi->xbdi_io == NULL) { xbdi->xbdi_cont = xbdback_co_io_gotio; xio = xbdback_pool_get(&xbdback_io_pool, xbdi); - buf_init(&xio->xio_buf); return xio; } else { xbdi->xbdi_cont = xbdback_co_io_gotio2; @@ -1070,12 +1173,15 @@ int buf_flags; xbdi_get(xbdi); + atomic_inc_uint(&xbdi->xbdi_pendingreqs); xbd_io = xbdi->xbdi_io = obj; + buf_init(&xbd_io->xio_buf); xbd_io->xio_xbdi = xbdi; SLIST_INIT(&xbd_io->xio_rq); xbd_io->xio_nrma = 0; xbd_io->xio_mapped = 0; + xbd_io->xio_operation = xbdi->xbdi_xen_req.operation; start_offset = xbdi->xbdi_this_fs * VBD_BSIZE; @@ -1205,6 +1311,33 @@ struct xbdback_io *xbd_io = (void *)wk; KASSERT(&xbd_io->xio_work == wk); + if (xbd_io->xio_operation == BLKIF_OP_FLUSH_DISKCACHE) { + int error; + struct xbdback_instance *xbdi = xbd_io->xio_xbdi; + + error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, NULL, FWRITE, + kauth_cred_get()); + if (error) { + aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n", + xbdi->xbdi_xbusd->xbusd_path, error); + if (error == EOPNOTSUPP || error == ENOTTY) + error = BLKIF_RSP_EOPNOTSUPP; + else + error = BLKIF_RSP_ERROR; + } else + error = BLKIF_RSP_OKAY; + xbdback_send_reply(xbdi, xbd_io->xio_flush_id, + xbd_io->xio_operation, error); + xbdback_pool_put(&xbdback_io_pool, xbd_io); + xbdi_put(xbdi); + /* handle next IO */ + xbdi->xbdi_io = NULL; + xbdi->xbdi_cont = xbdback_co_main_incr; + xbdback_trampoline(xbdi, xbdi); + return; + } + + /* should be read or write */ xbd_io->xio_buf.b_data = (void *)((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr); #ifdef DIAGNOSTIC @@ -1292,8 +1425,14 @@ xbdback_pool_put(&xbdback_request_pool, xbd_req); } xbdi_put(xbdi); + atomic_dec_uint(&xbdi->xbdi_pendingreqs); buf_destroy(&xbd_io->xio_buf); xbdback_pool_put(&xbdback_io_pool, xbd_io); + if (xbdi->xbdi_cont == NULL) { + /* check if there is more work to do */ + xbdi->xbdi_cont = xbdback_co_main; + xbdback_trampoline(xbdi, xbdi); + } } /*