Author: gibbs
Date: Mon Jun 13 20:36:29 2011
New Revision: 223059
URL: http://svn.freebsd.org/changeset/base/223059

Log:
  Several enhancements to the Xen block back driver.
  
  sys/dev/xen/blkback/blkback.c:
        o Implement front-end request coalescing.  This greatly improves the
          performance of front-end clients that are unaware of the dynamic
          request-size/number of requests negotiation available in the
          FreeBSD backend driver.  This required a large restructuring of
          how this driver records in-flight transactions and how those
          transactions are mapped into kernel virtual address (KVA) space.
          For example, the driver now includes a mini "KVA manager" that
          allocates ranges of contiguous KVA to batches of requests that
          are physically contiguous in the backing store, so that a single
          bio or UIO segment can be used to represent the I/O.  (A sketch
          of the first-fit KVA scan appears right after this list.)
  
        o Refuse to open any backend files or devices if the system
          has yet to mount root.  This avoids a panic.
  
        o Properly handle "onlined" devices.  An "onlined" backend
          device stays attached to its backing store across front-end
          disconnections.  This feature is intended to reduce latency
          when a front-end does a hand-off to another driver (e.g.
          PV aware bootloader to OS kernel) or during a VM reboot.
  
        o Harden the driver against a pathological/buggy front-end
          by carefully vetting front-end XenStore data such as the
          front-end state.
  
        o Add sysctls that report the negotiated number of
          segments per request and the number of requests that
          can be in flight concurrently.  (A sketch of the
          sysctl registration follows the log message.)
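
  The mini "KVA manager" described in the first item is implemented in the
  diff below as a first-fit scan over a bitstring of free KVA pages (see
  xbb_get_kva()).  The following stand-alone C sketch mirrors that scan
  using a plain boolean array instead of sys/bitstring.h; the names
  kva_page_alloc and POOL_PAGES are illustrative and do not appear in the
  driver.

  /*
   * Userland sketch of the first-fit free-page scan used by xbb_get_kva().
   * Illustrative only: the driver operates on a bitstring and returns KVA,
   * not a page index.
   */
  #include <stdbool.h>
  #include <stdio.h>

  #define POOL_PAGES 16

  static bool page_used[POOL_PAGES];

  /* Return the index of the first run of nr_pages free pages, or -1. */
  static int
  kva_page_alloc(int nr_pages)
  {
          int first_clear = -1;
          int num_clear = 0;
          int i, j;

          for (i = 0; i < POOL_PAGES; i++) {
                  if (page_used[i]) {
                          /* Run broken; restart the search past this page. */
                          num_clear = 0;
                          first_clear = -1;
                          continue;
                  }
                  if (first_clear == -1)
                          first_clear = i;
                  if (++num_clear == nr_pages) {
                          /* Mark the winning run as allocated. */
                          for (j = first_clear; j < first_clear + nr_pages; j++)
                                  page_used[j] = true;
                          return (first_clear);
                  }
          }
          return (-1);
  }

  int
  main(void)
  {
          page_used[2] = true;    /* Simulate an existing allocation. */
          printf("4 pages at index %d\n", kva_page_alloc(4));
          printf("4 more pages at index %d\n", kva_page_alloc(4));
          return (0);
  }

  The driver's version additionally sets XBBF_RESOURCE_SHORTAGE and counts a
  KVA shortage when no sufficiently large run is found, so the work thread
  can be woken once KVA is returned to the pool.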
  
  Submitted by: kdm
  Reviewed by:  gibbs
  Sponsored by: Spectra Logic Corporation
  MFC after:    1 week
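
  The sysctls mentioned in the last log item are not visible in the
  truncated diff below.  The following is a sketch of how such read-only
  nodes are commonly registered via sysctl(9); the function name
  xbb_setup_sysctl, the node names, and the assumption that the softc keeps
  its device_t in xbb->dev are illustrative rather than taken verbatim from
  the commit.

  /*
   * Sketch: export the negotiated limits as read-only sysctl nodes under
   * the device's sysctl tree.  Assumes the driver's existing includes
   * (sys/param.h, sys/bus.h, sys/sysctl.h) and a device_t in xbb->dev.
   */
  static void
  xbb_setup_sysctl(struct xbb_softc *xbb)
  {
          struct sysctl_ctx_list *sysctl_ctx;
          struct sysctl_oid      *sysctl_tree;

          sysctl_ctx  = device_get_sysctl_ctx(xbb->dev);
          sysctl_tree = device_get_sysctl_tree(xbb->dev);

          SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
                          "max_requests", CTLFLAG_RD,
                          &xbb->max_requests, 0,
                          "maximum number of requests in flight (negotiated)");

          SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
                          "max_request_segments", CTLFLAG_RD,
                          &xbb->max_request_segments, 0,
                          "maximum number of segments per request (negotiated)");
  }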

Modified:
  head/sys/dev/xen/blkback/blkback.c

Modified: head/sys/dev/xen/blkback/blkback.c
==============================================================================
--- head/sys/dev/xen/blkback/blkback.c  Mon Jun 13 20:34:12 2011        (r223058)
+++ head/sys/dev/xen/blkback/blkback.c  Mon Jun 13 20:36:29 2011        (r223059)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2009-2010 Spectra Logic Corporation
+ * Copyright (c) 2009-2011 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -61,6 +61,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/bitstring.h>
 
 #include <geom/geom.h>
 
@@ -153,9 +155,19 @@ MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "X
 #define        XBB_MAX_RING_PAGES                                                 \
        BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
                       * XBB_MAX_REQUESTS)
+/**
+ * The maximum number of segments that we can allow per request list.
+ * We limit this to the maximum number of segments per request, because
+ * that is already a reasonable number of segments to aggregate.  This
+ * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
+ * because that would leave situations where we can't dispatch even one
+ * large request.
+ */
+#define        XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
 
 /*--------------------------- Forward Declarations ---------------------------*/
 struct xbb_softc;
+struct xbb_xen_req;
 
 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
                              ...) __attribute__((format(printf, 3, 4)));
@@ -163,16 +175,15 @@ static int  xbb_shutdown(struct xbb_soft
 static int  xbb_detach(device_t dev);
 
 /*------------------------------ Data Structures -----------------------------*/
-/**
- * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
- */
-struct xbb_xen_req {
-       /**
-        * Linked list links used to aggregate idle request in the
-        * request free pool (xbb->request_free_slist).
-        */
-       SLIST_ENTRY(xbb_xen_req) links;
 
+STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
+
+typedef enum {
+       XBB_REQLIST_NONE        = 0x00,
+       XBB_REQLIST_MAPPED      = 0x01
+} xbb_reqlist_flags;
+
+struct xbb_xen_reqlist {
        /**
         * Back reference to the parent block back instance for this
         * request.  Used during bio_done handling.
@@ -180,17 +191,71 @@ struct xbb_xen_req {
        struct xbb_softc        *xbb;
 
        /**
-        * The remote domain's identifier for this I/O request.
+        * BLKIF_OP code for this request.
+        */
+       int                      operation;
+
+       /**
+        * Set to BLKIF_RSP_* to indicate request status.
+        *
+        * This field allows an error status to be recorded even if the
+        * delivery of this status must be deferred.  Deferred reporting
+        * is necessary, for example, when an error is detected during
+        * completion processing of one bio when other bios for this
+        * request are still outstanding.
+        */
+       int                      status;
+
+       /**
+        * Number of 512 byte sectors not transferred.
+        */
+       int                      residual_512b_sectors;
+
+       /**
+        * Starting sector number of the first request in the list.
+        */
+       off_t                    starting_sector_number;
+
+       /**
+        * If we're going to coalesce, the next contiguous sector would be
+        * this one.
+        */
+       off_t                    next_contig_sector;
+
+       /**
+        * Number of child requests in the list.
         */
-       uint64_t                 id;
+       int                      num_children;
+
+       /**
+        * Number of I/O requests dispatched to the backend.
+        */
+       int                      pendcnt;
+
+       /**
+        * Total number of segments for requests in the list.
+        */
+       int                      nr_segments;
+
+       /**
+        * Flags for this particular request list.
+        */
+       xbb_reqlist_flags        flags;
 
        /**
         * Kernel virtual address space reserved for this request
-        * structure and used to map the remote domain's pages for
+        * list structure and used to map the remote domain's pages for
         * this I/O, into our domain's address space.
         */
        uint8_t                 *kva;
 
+       /**
+        * Base, pseudo-physical address, corresponding to the start
+        * of this request's kva region.
+        */
+       uint64_t                 gnt_base;
+
+
 #ifdef XBB_USE_BOUNCE_BUFFERS
        /**
         * Pre-allocated domain local memory used to proxy remote
@@ -200,53 +265,91 @@ struct xbb_xen_req {
 #endif
 
        /**
-        * Base, psuedo-physical address, corresponding to the start
-        * of this request's kva region.
+        * Array of grant handles (one per page) used to map this request.
         */
-       uint64_t                 gnt_base;
+       grant_handle_t          *gnt_handles;
+
+       /**
+        * Device statistics request ordering type (ordered or simple).
+        */
+       devstat_tag_type         ds_tag_type;
+
+       /**
+        * Device statistics request type (read, write, no_data).
+        */
+       devstat_trans_flags      ds_trans_type;
+
+       /**
+        * The start time for this request.
+        */
+       struct bintime           ds_t0;
+
+       /**
+        * Linked list of contiguous requests with the same operation type.
+        */
+       struct xbb_xen_req_list  contig_req_list;
+
+       /**
+        * Linked list links used to aggregate idle requests in the
+        * request list free pool (xbb->reqlist_free_stailq) and pending
+        * requests waiting for execution (xbb->reqlist_pending_stailq).
+        */
+       STAILQ_ENTRY(xbb_xen_reqlist) links;
+};
+
+STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
+
+/**
+ * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
+ */
+struct xbb_xen_req {
+       /**
+        * Linked list links used to aggregate requests into a reqlist
+        * and to store them in the request free pool.
+        */
+       STAILQ_ENTRY(xbb_xen_req) links;
+
+       /**
+        * The remote domain's identifier for this I/O request.
+        */
+       uint64_t                  id;
 
        /**
         * The number of pages currently mapped for this request.
         */
-       int                      nr_pages;
+       int                       nr_pages;
 
        /**
         * The number of 512 byte sectors comprising this requests.
         */
-       int                      nr_512b_sectors;
+       int                       nr_512b_sectors;
 
        /**
         * The number of struct bio requests still outstanding for this
         * request on the backend device.  This field is only used for  
         * device (rather than file) backed I/O.
         */
-       int                      pendcnt;
+       int                       pendcnt;
 
        /**
         * BLKIF_OP code for this request.
         */
-       int                      operation;
+       int                       operation;
 
        /**
-        * BLKIF_RSP status code for this request.
-        *
-        * This field allows an error status to be recorded even if the
-        * delivery of this status must be deferred.  Deferred reporting
-        * is necessary, for example, when an error is detected during
-        * completion processing of one bio when other bios for this
-        * request are still outstanding.
+        * Storage used for non-native ring requests.
         */
-       int                      status;
+       blkif_request_t          ring_req_storage;
 
        /**
-        * Device statistics request ordering type (ordered or simple).
+        * Pointer to the Xen request in the ring.
         */
-       devstat_tag_type         ds_tag_type;
+       blkif_request_t         *ring_req;
 
        /**
-        * Device statistics request type (read, write, no_data).
+        * Consumer index for this request.
         */
-       devstat_trans_flags      ds_trans_type;
+       RING_IDX                 req_ring_idx;
 
        /**
         * The start time for this request.
@@ -254,9 +357,9 @@ struct xbb_xen_req {
        struct bintime           ds_t0;
 
        /**
-        * Array of grant handles (one per page) used to map this request.
+        * Pointer back to our parent request list.
         */
-       grant_handle_t          *gnt_handles;
+       struct xbb_xen_reqlist  *reqlist;
 };
 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
 
@@ -321,7 +424,10 @@ typedef enum
        XBBF_RESOURCE_SHORTAGE = 0x04,
 
        /** Connection teardown in progress. */
-       XBBF_SHUTDOWN          = 0x08
+       XBBF_SHUTDOWN          = 0x08,
+
+       /** A thread is already performing shutdown processing. */
+       XBBF_IN_SHUTDOWN       = 0x10
 } xbb_flag_t;
 
 /** Backend device type.  */
@@ -399,7 +505,7 @@ struct xbb_file_data {
         * Only a single file based request is outstanding per-xbb instance,
         * so we only need one of these.
         */
-       struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+       struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 #ifdef XBB_USE_BOUNCE_BUFFERS
 
        /**
@@ -411,7 +517,7 @@ struct xbb_file_data {
         * bounce-out the read data.  This array serves as the temporary
         * storage for this saved data.
         */
-       struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+       struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
 
        /**
         * \brief Array of memoized bounce buffer kva offsets used
@@ -422,7 +528,7 @@ struct xbb_file_data {
         * the request sg elements is unavoidable. We memoize the computed
         * bounce address here to reduce the cost of the second walk.
         */
-       void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST];
+       void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
 #endif /* XBB_USE_BOUNCE_BUFFERS */
 };
 
@@ -437,9 +543,9 @@ union xbb_backend_data {
 /**
  * Function signature of backend specific I/O handlers.
  */
-typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req,
-                             struct xbb_xen_req *req, int nseg,
-                             int operation, int flags);
+typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
+                             struct xbb_xen_reqlist *reqlist, int operation,
+                             int flags);
 
 /**
  * Per-instance configuration data.
@@ -467,14 +573,23 @@ struct xbb_softc {
        xbb_dispatch_t            dispatch_io;
 
        /** The number of requests outstanding on the backend device/file. */
-       u_int                     active_request_count;
+       int                       active_request_count;
 
        /** Free pool of request tracking structures. */
-       struct xbb_xen_req_slist  request_free_slist;
+       struct xbb_xen_req_list   request_free_stailq;
 
        /** Array, sized at connection time, of request tracking structures. */
        struct xbb_xen_req       *requests;
 
+       /** Free pool of request list structures. */
+       struct xbb_xen_reqlist_list reqlist_free_stailq;
+
+       /** List of pending request lists awaiting execution. */
+       struct xbb_xen_reqlist_list reqlist_pending_stailq;
+
+       /** Array, sized at connection time, of request list structures. */
+       struct xbb_xen_reqlist   *request_lists;
+
        /**
         * Global pool of kva used for mapping remote domain ring
         * and I/O transaction data.
@@ -487,6 +602,15 @@ struct xbb_softc {
        /** The size of the global kva pool. */
        int                       kva_size;
 
+       /** The size of the KVA area used for request lists. */
+       int                       reqlist_kva_size;
+
+       /** The number of pages of KVA used for request lists */
+       int                       reqlist_kva_pages;
+
+       /** Bitmap of free KVA pages */
+       bitstr_t                 *kva_free;
+
        /**
         * \brief Cached value of the front-end's domain id.
         * 
@@ -508,12 +632,12 @@ struct xbb_softc {
        int                       abi;
 
        /**
-        * \brief The maximum number of requests allowed to be in
-        *        flight at a time.
+        * \brief The maximum number of requests and request lists allowed
+        *        to be in flight at a time.
         *
         * This value is negotiated via the XenStore.
         */
-       uint32_t                  max_requests;
+       u_int                     max_requests;
 
        /**
         * \brief The maximum number of segments (1 page per segment)
@@ -521,7 +645,15 @@ struct xbb_softc {
         *
         * This value is negotiated via the XenStore.
         */
-       uint32_t                  max_request_segments;
+       u_int                     max_request_segments;
+
+       /**
+        * \brief Maximum number of segments per request list.
+        *
+        * This value is derived from and will generally be larger than
+        * max_request_segments.
+        */
+       u_int                     max_reqlist_segments;
 
        /**
         * The maximum size of any request to this back-end
@@ -529,7 +661,13 @@ struct xbb_softc {
         *
         * This value is negotiated via the XenStore.
         */
-       uint32_t                  max_request_size;
+       u_int                     max_request_size;
+
+       /**
+        * The maximum size of any request list.  This is derived directly
+        * from max_reqlist_segments.
+        */
+       u_int                     max_reqlist_size;
 
        /** Various configuration and state bit flags. */
        xbb_flag_t                flags;
@@ -574,6 +712,7 @@ struct xbb_softc {
        struct vnode             *vn;
 
        union xbb_backend_data    backend;
+
        /** The native sector size of the backend. */
        u_int                     sector_size;
 
@@ -598,7 +737,14 @@ struct xbb_softc {
         *
         * Ring processing is serialized so we only need one of these.
         */
-       struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST];
+       struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
+
+       /**
+        * Temporary grant table map used in xbb_dispatch_io().  When
+        * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
+        * stack could cause a stack overflow.
+        */
+       struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
 
        /** Mutex protecting per-instance data. */
        struct mtx                lock;
@@ -614,8 +760,51 @@ struct xbb_softc {
        int                       pseudo_phys_res_id;
 #endif
 
-       /** I/O statistics. */
+       /**
+        * I/O statistics from BlockBack dispatch down.  These are
+        * coalesced requests, and we start them right before execution.
+        */
        struct devstat           *xbb_stats;
+
+       /**
+        * I/O statistics coming into BlockBack.  These are the requests as
+        * we get them from BlockFront.  They are started as soon as we
+        * receive a request, and completed when the I/O is complete.
+        */
+       struct devstat           *xbb_stats_in;
+
+       /** Disable sending flush to the backend */
+       int                       disable_flush;
+
+       /** Send a real flush for every N flush requests */
+       int                       flush_interval;
+
+       /** Count of flush requests in the interval */
+       int                       flush_count;
+
+       /** Don't coalesce requests if this is set */
+       int                       no_coalesce_reqs;
+
+       /** Number of requests we have received */
+       uint64_t                  reqs_received;
+
+       /** Number of requests we have completed */
+       uint64_t                  reqs_completed;
+
+       /** How many forced dispatches (i.e. without coalescing) have happened */
+       uint64_t                  forced_dispatch;
+
+       /** How many normal dispatches have happened */
+       uint64_t                  normal_dispatch;
+
+       /** How many total dispatches have happened */
+       uint64_t                  total_dispatch;
+
+       /** How many times we have run out of KVA */
+       uint64_t                  kva_shortages;
+
+       /** How many times we have run out of request structures */
+       uint64_t                  request_shortages;
 };
 
 /*---------------------------- Request Processing ----------------------------*/
@@ -633,21 +822,14 @@ xbb_get_req(struct xbb_softc *xbb)
        struct xbb_xen_req *req;
 
        req = NULL;
-       mtx_lock(&xbb->lock);
 
-       /*
-        * Do not allow new requests to be allocated while we
-        * are shutting down.
-        */
-       if ((xbb->flags & XBBF_SHUTDOWN) == 0) {
-               if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) {
-                       SLIST_REMOVE_HEAD(&xbb->request_free_slist, links);
-                       xbb->active_request_count++;
-               } else {
-                       xbb->flags |= XBBF_RESOURCE_SHORTAGE;
-               }
+       mtx_assert(&xbb->lock, MA_OWNED);
+
+       if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
+               STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
+               xbb->active_request_count++;
        }
-       mtx_unlock(&xbb->lock);
+
        return (req);
 }
 
@@ -660,34 +842,40 @@ xbb_get_req(struct xbb_softc *xbb)
 static inline void
 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
 {
-       int wake_thread;
+       mtx_assert(&xbb->lock, MA_OWNED);
 
-       mtx_lock(&xbb->lock);
-       wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE;
-       xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
-       SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+       STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
        xbb->active_request_count--;
 
-       if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
-               /*
-                * Shutdown is in progress.  See if we can
-                * progress further now that one more request
-                * has completed and been returned to the
-                * free pool.
-                */
-               xbb_shutdown(xbb);
-       }
-       mtx_unlock(&xbb->lock);
+       KASSERT(xbb->active_request_count >= 0,
+               ("xbb_release_req: negative active count"));
+}
 
-       if (wake_thread != 0)
-               taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
+/**
+ * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
+ *
+ * \param xbb      Per-instance xbb configuration structure.
+ * \param req_list  The list of requests to free.
+ * \param nreqs            The number of items in the list.
+ */
+static inline void
+xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
+                int nreqs)
+{
+       mtx_assert(&xbb->lock, MA_OWNED);
+
+       STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
+       xbb->active_request_count -= nreqs;
+
+       KASSERT(xbb->active_request_count >= 0,
+               ("xbb_release_reqs: negative active count"));
 }
 
 /**
  * Given a page index and 512b sector offset within that page,
  * calculate an offset into a request's kva region.
  *
- * \param req     The request structure whose kva region will be accessed.
+ * \param reqlist The request structure whose kva region will be accessed.
  * \param pagenr  The page index used to compute the kva offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                kva offset.
@@ -695,9 +883,9 @@ xbb_release_req(struct xbb_softc *xbb, s
  * \return  The computed global KVA offset.
  */
 static inline uint8_t *
-xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
-       return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
+       return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 
 #ifdef XBB_USE_BOUNCE_BUFFERS
@@ -705,7 +893,7 @@ xbb_req_vaddr(struct xbb_xen_req *req, i
  * Given a page index and 512b sector offset within that page,
  * calculate an offset into a request's local bounce memory region.
  *
- * \param req     The request structure whose bounce region will be accessed.
+ * \param reqlist The request structure whose bounce region will be accessed.
  * \param pagenr  The page index used to compute the bounce offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                bounce offset.
@@ -713,9 +901,9 @@ xbb_req_vaddr(struct xbb_xen_req *req, i
  * \return  The computed global bounce buffer address.
  */
 static inline uint8_t *
-xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
-       return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
+       return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 #endif
 
@@ -724,7 +912,7 @@ xbb_req_bounce_addr(struct xbb_xen_req *
  * calculate an offset into the request's memory region that the
  * underlying backend device/file should use for I/O.
  *
- * \param req     The request structure whose I/O region will be accessed.
+ * \param reqlist The request structure whose I/O region will be accessed.
  * \param pagenr  The page index used to compute the I/O offset.
  * \param sector  The 512b sector index used to compute the page relative
  *                I/O offset.
@@ -736,12 +924,12 @@ xbb_req_bounce_addr(struct xbb_xen_req *
  * this request.
  */
 static inline uint8_t *
-xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
 #ifdef XBB_USE_BOUNCE_BUFFERS
-       return (xbb_req_bounce_addr(req, pagenr, sector));
+       return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
 #else
-       return (xbb_req_vaddr(req, pagenr, sector));
+       return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
 #endif
 }
 
@@ -750,7 +938,7 @@ xbb_req_ioaddr(struct xbb_xen_req *req, 
  * an offset into the local psuedo-physical address space used to map a
  * front-end's request data into a request.
  *
- * \param req     The request structure whose pseudo-physical region
+ * \param reqlist The request list structure whose pseudo-physical region
  *                will be accessed.
  * \param pagenr  The page index used to compute the pseudo-physical offset.
  * \param sector  The 512b sector index used to compute the page relative
@@ -763,10 +951,126 @@ xbb_req_ioaddr(struct xbb_xen_req *req, 
  * this request.
  */
 static inline uintptr_t
-xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
+xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
+{
+       struct xbb_softc *xbb;
+
+       xbb = reqlist->xbb;
+
+       return ((uintptr_t)(xbb->gnt_base_addr +
+               (uintptr_t)(reqlist->kva - xbb->kva) +
+               (PAGE_SIZE * pagenr) + (sector << 9)));
+}
+
+/**
+ * Get Kernel Virtual Address space for mapping requests.
+ *
+ * \param xbb         Per-instance xbb configuration structure.
+ * \param nr_pages    Number of pages needed.
+ *
+ * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
+ *
+ * Note:  This should be unnecessary once we have either chaining or
+ * scatter/gather support for struct bio.  At that point we'll be able to
+ * put multiple addresses and lengths in one bio/bio chain and won't need
+ * to map everything into one virtual segment.
+ */
+static uint8_t *
+xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
+{
+       intptr_t first_clear, num_clear;
+       uint8_t *free_kva;
+       int i;
+
+       KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
+
+       first_clear = 0;
+       free_kva = NULL;
+
+       mtx_lock(&xbb->lock);
+
+       /*
+        * Look for the first available page.  If there are none, we're done.
+        */
+       bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
+
+       if (first_clear == -1)
+               goto bailout;
+
+       /*
+        * Starting at the first available page, look for consecutive free
+        * pages that will satisfy the user's request.
+        */
+       for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
+               /*
+                * If this is true, the page is used, so we have to reset
+                * the number of clear pages and the first clear page
+                * (since it pointed to a region with an insufficient number
+                * of clear pages).
+                */
+               if (bit_test(xbb->kva_free, i)) {
+                       num_clear = 0;
+                       first_clear = -1;
+                       continue;
+               }
+
+               if (first_clear == -1)
+                       first_clear = i;
+
+               /*
+                * If this is true, we've found a large enough free region
+                * to satisfy the request.
+                */
+               if (++num_clear == nr_pages) {
+
+                       bit_nset(xbb->kva_free, first_clear,
+                                first_clear + nr_pages - 1);
+
+                       free_kva = xbb->kva +
+                               (uint8_t *)(first_clear * PAGE_SIZE);
+
+                       KASSERT(free_kva >= (uint8_t *)xbb->kva &&
+                               free_kva + (nr_pages * PAGE_SIZE) <=
+                               (uint8_t *)xbb->ring_config.va,
+                               ("Free KVA %p len %d out of range, "
+                                "kva = %#jx, ring VA = %#jx\n", free_kva,
+                                nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
+                                (uintmax_t)xbb->ring_config.va));
+                       break;
+               }
+       }
+
+bailout:
+
+       if (free_kva == NULL) {
+               xbb->flags |= XBBF_RESOURCE_SHORTAGE;
+               xbb->kva_shortages++;
+       }
+
+       mtx_unlock(&xbb->lock);
+
+       return (free_kva);
+}
+
+/**
+ * Free allocated KVA.
+ *
+ * \param xbb      Per-instance xbb configuration structure.
+ * \param kva_ptr   Pointer to allocated KVA region.  
+ * \param nr_pages  Number of pages in the KVA region.
+ */
+static void
+xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
 {
-       return ((uintptr_t)(req->gnt_base
-                         + (PAGE_SIZE * pagenr) + (sector << 9)));
+       intptr_t start_page;
+
+       mtx_assert(&xbb->lock, MA_OWNED);
+
+       start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
+       bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
+
 }
 
 /**
@@ -775,23 +1079,23 @@ xbb_req_gntaddr(struct xbb_xen_req *req,
  * \param req  The request structure to unmap.
  */
 static void
-xbb_unmap_req(struct xbb_xen_req *req)
+xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
 {
-       struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST];
+       struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
        u_int                         i;
        u_int                         invcount;
        int                           error;
 
        invcount = 0;
-       for (i = 0; i < req->nr_pages; i++) {
+       for (i = 0; i < reqlist->nr_segments; i++) {
 
-               if (req->gnt_handles[i] == GRANT_REF_INVALID)
+               if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
                        continue;
 
-               unmap[invcount].host_addr    = xbb_req_gntaddr(req, i, 0);
+               unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
                unmap[invcount].dev_bus_addr = 0;
-               unmap[invcount].handle       = req->gnt_handles[i];
-               req->gnt_handles[i]          = GRANT_REF_INVALID;
+               unmap[invcount].handle       = reqlist->gnt_handles[i];
+               reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
                invcount++;
        }
 
@@ -801,6 +1105,175 @@ xbb_unmap_req(struct xbb_xen_req *req)
 }
 
 /**
+ * Allocate an internal transaction tracking structure from the free pool.
+ *
+ * \param xbb  Per-instance xbb configuration structure.
+ *
+ * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
+ *          Otherwise NULL.
+ */
+static inline struct xbb_xen_reqlist *
+xbb_get_reqlist(struct xbb_softc *xbb)
+{
+       struct xbb_xen_reqlist *reqlist;
+
+       reqlist = NULL;
+
+       mtx_assert(&xbb->lock, MA_OWNED);
+
+       if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
+
+               STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
+               reqlist->flags = XBB_REQLIST_NONE;
+               reqlist->kva = NULL;
+               reqlist->status = BLKIF_RSP_OKAY;
+               reqlist->residual_512b_sectors = 0;
+               reqlist->num_children = 0;
+               reqlist->nr_segments = 0;
+               STAILQ_INIT(&reqlist->contig_req_list);
+       }
+
+       return (reqlist);
+}
+
+/**
+ * Return an allocated transaction tracking structure to the free pool.
+ *
+ * \param xbb        Per-instance xbb configuration structure.
+ * \param req        The request list structure to free.
+ * \param wakeup     If set, wake up the work thread if freeing this reqlist
+ *                   during a resource shortage condition.
+ */
+static inline void
+xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
+                   int wakeup)
+{
+
+       mtx_lock(&xbb->lock);
+
+       if (wakeup) {
+               wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
+               xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
+       }
+
+       if (reqlist->kva != NULL)
+               xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
+
+       xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
+
+       STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
+
+       if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+               /*
+                * Shutdown is in progress.  See if we can
+                * progress further now that one more request
+                * has completed and been returned to the
+                * free pool.
+                */
+               xbb_shutdown(xbb);
+       }
+
+       mtx_unlock(&xbb->lock);
+
+       if (wakeup != 0)
+               taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
+}
+
+/**
+ * Request resources and do basic request setup.
+ *
+ * \param xbb          Per-instance xbb configuration structure.
+ * \param reqlist      Pointer to reqlist pointer.
+ * \param ring_req     Pointer to a block ring request.
+ * \param ring_index   The ring index of this request.
+ *
+ * \return  0 for success, non-zero for failure.
+ */
+static int
+xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
+                 blkif_request_t *ring_req, RING_IDX ring_idx)
+{
+       struct xbb_xen_reqlist *nreqlist;
+       struct xbb_xen_req     *nreq;
+
+       nreqlist = NULL;
+       nreq     = NULL;
+
+       mtx_lock(&xbb->lock);
+
+       /*
+        * We don't allow new resources to be allocated if we're in the
+        * process of shutting down.
+        */
+       if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+               mtx_unlock(&xbb->lock);
+               return (1);
+       }
+
+       /*
+        * Allocate a reqlist if the caller doesn't have one already.
+        */
+       if (*reqlist == NULL) {
+               nreqlist = xbb_get_reqlist(xbb);
+               if (nreqlist == NULL)
+                       goto bailout_error;
+       }
+
+       /* We always allocate a request. */
+       nreq = xbb_get_req(xbb);
+       if (nreq == NULL)
+               goto bailout_error;
+
+       mtx_unlock(&xbb->lock);
+
+       if (*reqlist == NULL) {
+               *reqlist = nreqlist;
+               nreqlist->operation = ring_req->operation;
+               nreqlist->starting_sector_number = ring_req->sector_number;
+               STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
+                                  links);
+       }
+
+       nreq->reqlist = *reqlist;
+       nreq->req_ring_idx = ring_idx;
+
+       if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
+               bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
+               nreq->ring_req = &nreq->ring_req_storage;
+       } else {
+               nreq->ring_req = ring_req;
+       }
+
+       binuptime(&nreq->ds_t0);
+       devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
+       STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
+       (*reqlist)->num_children++;
+       (*reqlist)->nr_segments += ring_req->nr_segments;
+
+       return (0);
+
+bailout_error:
+
+       /*
+        * We're out of resources, so set the shortage flag.  The next time
+        * a request is released, we'll try waking up the work thread to
+        * see if we can allocate more resources.
+        */
+       xbb->flags |= XBBF_RESOURCE_SHORTAGE;
+       xbb->request_shortages++;
+
+       if (nreq != NULL)
+               xbb_release_req(xbb, nreq);
+
+       mtx_unlock(&xbb->lock);
+
+       if (nreqlist != NULL)
+               xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
+
+       return (1);
+}
+
+/**
  * Create and transmit a response to a blkif request.
  * 
  * \param xbb     Per-instance xbb configuration structure.
@@ -862,6 +1335,8 @@ xbb_send_response(struct xbb_softc *xbb,
                more_to_do = 1;
        }
 
+       xbb->reqs_completed++;
+
        mtx_unlock(&xbb->lock);
 
        if (more_to_do)
@@ -872,6 +1347,70 @@ xbb_send_response(struct xbb_softc *xbb,
 }
 
 /**
+ * Complete a request list.
+ *
+ * \param xbb        Per-instance xbb configuration structure.
+ * \param reqlist    Allocated internal request list structure.
+ */
+static void
+xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
+{
+       struct xbb_xen_req *nreq;
+       off_t               sectors_sent;
+
+       sectors_sent = 0;
+
+       if (reqlist->flags & XBB_REQLIST_MAPPED)
+               xbb_unmap_reqlist(reqlist);
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***