On Sun, May 28, 2017 at 06:49:53PM +1000, Jonathan Matthew wrote:
> This adds a side effect free io path to nvme(4) allowing it to be used for
> hibernate.  I've only tested this in a qemu vm, which successfully hibernates
> but falls over weirdly trying to resume.

This version has survived contact with actual hardware, which cares about the
io queue being page aligned (as required by the nvme spec), whereas qemu
doesn't enforce this.  I got around it by preallocating the queue during
nvme_attach.

Once this is in, I'll make get_hibernate_io_function a bit more readable.

Index: conf/files
===================================================================
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.645
diff -u -p -r1.645 files
--- conf/files  15 May 2017 11:23:25 -0000      1.645
+++ conf/files  29 May 2017 09:47:30 -0000
@@ -193,7 +193,7 @@ file        dev/ic/ahci.c                   ahci | (ahci_pci | 
 
 # NVM Express Controller
 device nvme: scsi
-file   dev/ic/nvme.c                   nvme
+file   dev/ic/nvme.c                   nvme needs-flag
 
 # LSI Logic Fusion-MPT Message Passing Interface
 device mpi: scsi
Index: arch/amd64/amd64/hibernate_machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/hibernate_machdep.c,v
retrieving revision 1.38
diff -u -p -r1.38 hibernate_machdep.c
--- arch/amd64/amd64/hibernate_machdep.c        21 Aug 2015 07:01:38 -0000      1.38
+++ arch/amd64/amd64/hibernate_machdep.c        29 May 2017 09:47:30 -0000
@@ -47,6 +47,7 @@
 #include "ahci.h"
 #include "softraid.h"
 #include "sd.h"
+#include "nvme.h"
 
 /* Hibernate support */
 void    hibernate_enter_resume_4k_pte(vaddr_t, paddr_t);
@@ -89,6 +90,8 @@ get_hibernate_io_function(dev_t dev)
                extern struct cfdriver sd_cd;
                extern int ahci_hibernate_io(dev_t dev, daddr_t blkno,
                    vaddr_t addr, size_t size, int op, void *page);
+               extern int nvme_hibernate_io(dev_t dev, daddr_t blkno,
+                   vaddr_t addr, size_t size, int op, void *page);
                extern int sr_hibernate_io(dev_t dev, daddr_t blkno,
                    vaddr_t addr, size_t size, int op, void *page);
                struct device *dv = disk_lookup(&sd_cd, DISKUNIT(dev));
@@ -98,6 +101,12 @@ get_hibernate_io_function(dev_t dev)
                    
strcmp(dv->dv_parent->dv_parent->dv_cfdata->cf_driver->cd_name,
                    "ahci") == 0)
                        return ahci_hibernate_io;
+#endif
+#if NNVME > 0
+               if (dv && dv->dv_parent && dv->dv_parent->dv_parent &&
+                   strcmp(dv->dv_parent->dv_parent->dv_cfdata->cf_driver->cd_name,
+                   "nvme") == 0)
+                       return nvme_hibernate_io;
 #endif
 #if NSOFTRAID > 0
                if (dv && dv->dv_parent && dv->dv_parent->dv_parent &&
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.10
diff -u -p -r1.10 nvmevar.h
--- dev/ic/nvmevar.h    27 May 2017 12:40:51 -0000      1.10
+++ dev/ic/nvmevar.h    29 May 2017 09:47:30 -0000
@@ -16,6 +16,9 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#define NVME_IO_Q      1
+#define NVME_HIB_Q     2
+
 struct nvme_dmamem {
        bus_dmamap_t            ndm_map;
        bus_dma_segment_t       ndm_seg;
@@ -90,6 +93,7 @@ struct nvme_softc {
 
        struct nvme_queue       *sc_admin_q;
        struct nvme_queue       *sc_q;
+       struct nvme_queue       *sc_hib_q;
 
        struct mutex            sc_ccb_mtx;
        struct nvme_ccb         *sc_ccbs;
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvme.c,v
retrieving revision 1.56
diff -u -p -r1.56 nvme.c
--- dev/ic/nvme.c       27 May 2017 12:40:51 -0000      1.56
+++ dev/ic/nvme.c       29 May 2017 09:47:30 -0000
@@ -26,6 +26,8 @@
 #include <sys/mutex.h>
 #include <sys/pool.h>
 
+#include <sys/atomic.h>
+
 #include <machine/bus.h>
 
 #include <scsi/scsi_all.h>
@@ -86,6 +88,15 @@ void nvme_scsi_cmd(struct scsi_xfer *);
 int    nvme_scsi_probe(struct scsi_link *);
 void   nvme_scsi_free(struct scsi_link *);
 
+#ifdef HIBERNATE
+#include <uvm/uvm_extern.h>
+#include <sys/hibernate.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+
+int    nvme_hibernate_io(dev_t, daddr_t, vaddr_t, size_t, int, void *);
+#endif
+
 struct scsi_adapter nvme_switch = {
        nvme_scsi_cmd,          /* cmd */
        scsi_minphys,           /* minphys */
@@ -332,7 +343,7 @@ nvme_attach(struct nvme_softc *sc)
                goto free_admin_q;
        }
 
-       sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+       sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
        if (sc->sc_q == NULL) {
                printf("%s: unable to allocate io q\n", DEVNAME(sc));
                goto disable;
@@ -343,6 +354,12 @@ nvme_attach(struct nvme_softc *sc)
                goto free_q;
        }
 
+       sc->sc_hib_q = nvme_q_alloc(sc, NVME_HIB_Q, 4, sc->sc_dstrd);
+       if (sc->sc_hib_q == NULL) {
+               printf("%s: unable to allocate hibernate io queue\n", DEVNAME(sc));
+               goto free_q;
+       }
+
        nvme_write4(sc, NVME_INTMC, 1);
 
        sc->sc_namespaces = mallocarray(sc->sc_nn, sizeof(*sc->sc_namespaces),
@@ -364,6 +381,8 @@ nvme_attach(struct nvme_softc *sc)
 
        return (0);
 
+free_hib_q:
+       nvme_q_free(sc, sc->sc_hib_q);
 free_q:
        nvme_q_free(sc, sc->sc_q);
 disable:
@@ -394,7 +413,7 @@ nvme_resume(struct nvme_softc *sc)
                return (1);
        }
 
-       sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+       sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
        if (sc->sc_q == NULL) {
                printf("%s: unable to allocate io q\n", DEVNAME(sc));
                goto disable;
@@ -1393,3 +1412,193 @@ nvme_dmamem_free(struct nvme_softc *sc, 
        free(ndm, M_DEVBUF, sizeof *ndm);
 }
 
+#ifdef HIBERNATE
+
+int
+nvme_hibernate_admin_cmd(struct nvme_softc *sc, struct nvme_sqe *sqe, struct nvme_cqe *cqe,
+    int cid)
+{
+       struct nvme_sqe *asqe = NVME_DMA_KVA(sc->sc_admin_q->q_sq_dmamem);
+       struct nvme_cqe *acqe = NVME_DMA_KVA(sc->sc_admin_q->q_cq_dmamem);
+       struct nvme_queue *q = sc->sc_admin_q;
+       int tail;
+       u_int16_t flags;
+
+       /* submit command */
+       tail = q->q_sq_tail;
+       if (++q->q_sq_tail >= q->q_entries)
+               q->q_sq_tail = 0;
+
+       asqe += tail;
+       bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
+           sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_POSTWRITE);
+       *asqe = *sqe;
+       asqe->cid = cid;
+       bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
+           sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_PREWRITE);
+
+       nvme_write4(sc, q->q_sqtdbl, q->q_sq_tail);
+
+       /* wait for completion */
+       acqe += q->q_cq_head;
+       for (;;) {
+               nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
+               flags = lemtoh16(&acqe->flags);
+               if ((flags & NVME_CQE_PHASE) == q->q_cq_phase)
+                       break;
+       
+               delay(10);
+       }
+
+       if (++q->q_cq_head >= q->q_entries) {
+               q->q_cq_head = 0;
+               q->q_cq_phase ^= NVME_CQE_PHASE;
+       }
+       nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
+       if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) || (acqe->cid != cid))
+               return (EIO);
+
+       return (0);
+}
+
+int
+nvme_hibernate_io(dev_t dev, daddr_t blkno, vaddr_t addr, size_t size,
+    int op, void *page)
+{
+       struct nvme_hibernate_page {
+               u_int64_t               prpl[MAXPHYS / PAGE_SIZE];
+
+               struct nvme_softc       *sc;
+               int                     nsid;
+               int                     sq_tail;
+               int                     cq_head;
+               int                     cqe_phase;
+
+               daddr_t                 poffset;
+               size_t                  psize;
+       } *my = page;
+       struct nvme_sqe_io *isqe;
+       struct nvme_cqe *icqe;
+       paddr_t data_phys, page_phys;
+       u_int64_t data_bus_phys, page_bus_phys;
+       u_int16_t flags;
+       int i;
+
+       if (op == HIB_INIT) {
+               struct device *disk;
+               struct device *scsibus;
+               extern struct cfdriver sd_cd;
+               struct scsi_link *link;
+               struct scsibus_softc *bus_sc;
+               struct nvme_sqe_q qsqe;
+               struct nvme_cqe qcqe;
+
+               /* find nvme softc */
+               disk = disk_lookup(&sd_cd, DISKUNIT(dev));
+               scsibus = disk->dv_parent;
+               my->sc = (struct nvme_softc *)disk->dv_parent->dv_parent;
+
+               /* find scsi_link, which tells us the target */
+               my->nsid = 0;
+               bus_sc = (struct scsibus_softc *)scsibus;
+               SLIST_FOREACH(link, &bus_sc->sc_link_list, bus_list) {
+                       if (link->device_softc == disk) {
+                               my->nsid = link->target + 1;
+                               break;
+                       }
+               }
+               if (my->nsid == 0)
+                       return (EIO);
+               
+               my->poffset = blkno;
+               my->psize = size;
+
+               memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem), 0,
+                   my->sc->sc_hib_q->q_entries * sizeof(struct nvme_cqe));
+               memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem), 0,
+                   my->sc->sc_hib_q->q_entries * sizeof(struct nvme_sqe));
+               
+               my->sq_tail = 0;
+               my->cq_head = 0;
+               my->cqe_phase = NVME_CQE_PHASE;
+
+               pmap_extract(pmap_kernel(), (vaddr_t)page, &page_phys);
+
+               memset(&qsqe, 0, sizeof(qsqe));
+               qsqe.opcode = NVM_ADMIN_ADD_IOCQ;
+               htolem64(&qsqe.prp1, NVME_DMA_DVA(my->sc->sc_hib_q->q_cq_dmamem));
+               htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
+               htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
+               qsqe.qflags = NVM_SQE_CQ_IEN | NVM_SQE_Q_PC;
+               if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe, &qcqe, 1) != 0)
+                       return (EIO);
+
+               memset(&qsqe, 0, sizeof(qsqe));
+               qsqe.opcode = NVM_ADMIN_ADD_IOSQ;
+               htolem64(&qsqe.prp1, NVME_DMA_DVA(my->sc->sc_hib_q->q_sq_dmamem));
+               htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
+               htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
+               htolem16(&qsqe.cqid, my->sc->sc_hib_q->q_id);
+               qsqe.qflags = NVM_SQE_Q_PC;
+               if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe, &qcqe, 2) != 0)
+                       return (EIO);
+
+               return (0);
+       }
+
+       if (op != HIB_W)
+               return (0);
+
+       isqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem);
+       isqe += my->sq_tail;
+       if (++my->sq_tail == my->sc->sc_hib_q->q_entries)
+               my->sq_tail = 0;
+
+       memset(isqe, 0, sizeof(*isqe));
+       isqe->opcode = NVM_CMD_WRITE;
+       htolem32(&isqe->nsid, my->nsid);
+
+       pmap_extract(pmap_kernel(), addr, &data_phys);
+       data_bus_phys = data_phys;
+       htolem64(&isqe->entry.prp[0], data_bus_phys);
+       if ((size > my->sc->sc_mps) && (size <= my->sc->sc_mps * 2)) {
+               htolem64(&isqe->entry.prp[1], data_bus_phys + my->sc->sc_mps);
+       } else if (size > my->sc->sc_mps * 2) {
+               pmap_extract(pmap_kernel(), (vaddr_t)page, &page_phys);
+               page_bus_phys = page_phys;
+               htolem64(&isqe->entry.prp[1], page_bus_phys + 
+                   offsetof(struct nvme_hibernate_page, prpl));
+               for (i = 1; i < (size / my->sc->sc_mps); i++) {
+                       htolem64(&my->prpl[i - 1], data_bus_phys +
+                           (i * my->sc->sc_mps));
+               }
+       }
+
+       isqe->slba = blkno + my->poffset;
+       isqe->nlb = (size / DEV_BSIZE) - 1;
+       isqe->cid = blkno % 0xffff;
+
+       nvme_write4(my->sc, NVME_SQTDBL(NVME_HIB_Q, my->sc->sc_dstrd), my->sq_tail);
+
+       icqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem);
+       icqe += my->cq_head;
+       for (;;) {
+               flags = lemtoh16(&icqe->flags);
+               if ((flags & NVME_CQE_PHASE) == my->cqe_phase)
+                       break;
+       
+               delay(10);
+       }
+
+       if (++my->cq_head == my->sc->sc_hib_q->q_entries) {
+               my->cq_head = 0;
+               my->cqe_phase ^= NVME_CQE_PHASE;
+       }
+       nvme_write4(my->sc, NVME_CQHDBL(NVME_HIB_Q, my->sc->sc_dstrd), my->cq_head);
+       if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) || (icqe->cid != blkno % 0xffff))
+               return (EIO);
+
+       return (0);
+}
+
+#endif

Reply via email to