On Sun, May 28, 2017 at 06:49:53PM +1000, Jonathan Matthew wrote:
> This adds a side effect free io path to nvme(4) allowing it to be used for
> hibernate. I've only tested this in a qemu vm, which successfully hibernates
> but falls over weirdly trying to resume.

This version has survived contact with actual hardware, which cares about
the io queue being page-aligned (as required by the nvme spec), whereas
qemu doesn't.  I got around this by preallocating the queue during
nvme_attach.

Once this is in, I'll make get_hibernate_io_function a bit more readable.
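
The write path itself is the normal NVMe ring protocol with the interrupt
and ccb machinery stripped out: write an sqe at the submission queue tail,
ring the doorbell, then spin until the device flips the phase bit in the
next cqe.  Below is a small stand-alone sketch of just that phase-bit
polling, with invented names (hib_cqe, hib_queue, post_completion,
poll_completion) purely for illustration; it is not driver code.

/*
 * Toy model of phase-bit completion polling.  The real queues live in
 * DMA memory and the driver delays 10us per spin; the head advance and
 * phase flip on wrap are the same as in the diff below.
 */
#include <stdint.h>
#include <stdio.h>

#define Q_ENTRIES	4
#define PHASE_BIT	1

struct hib_cqe {
	uint16_t	cid;		/* command id, echoed by the "device" */
	uint16_t	flags;		/* low bit is the phase bit */
};

struct hib_queue {
	struct hib_cqe	cqe[Q_ENTRIES];	/* zeroed, so initial phase is 0 */
	int		cq_head;
	uint16_t	cq_phase;	/* phase value marking fresh entries */
};

/* the "device" completes a command by writing a cqe with the current phase */
static void
post_completion(struct hib_queue *q, int slot, uint16_t phase, uint16_t cid)
{
	q->cqe[slot].cid = cid;
	q->cqe[slot].flags = phase;
}

/* the host spins until the head entry's phase matches, then consumes it */
static uint16_t
poll_completion(struct hib_queue *q)
{
	struct hib_cqe *cqe = &q->cqe[q->cq_head];

	while ((cqe->flags & PHASE_BIT) != q->cq_phase)
		;

	if (++q->cq_head == Q_ENTRIES) {
		q->cq_head = 0;
		/* wrapped: the device writes the opposite phase next pass */
		q->cq_phase ^= PHASE_BIT;
	}
	return (cqe->cid);
}

int
main(void)
{
	struct hib_queue q = { .cq_phase = PHASE_BIT };

	post_completion(&q, 0, PHASE_BIT, 7);
	printf("completed cid %d\n", (int)poll_completion(&q));
	return (0);
}
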
Index: conf/files
===================================================================
RCS file: /cvs/src/sys/conf/files,v
retrieving revision 1.645
diff -u -p -r1.645 files
--- conf/files 15 May 2017 11:23:25 -0000 1.645
+++ conf/files 29 May 2017 09:47:30 -0000
@@ -193,7 +193,7 @@ file	dev/ic/ahci.c		ahci | (ahci_pci |
 
 # NVM Express Controller
 device	nvme: scsi
-file	dev/ic/nvme.c		nvme
+file	dev/ic/nvme.c		nvme	needs-flag
 
 # LSI Logic Fusion-MPT Message Passing Interface
 device	mpi: scsi
Index: arch/amd64/amd64/hibernate_machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/hibernate_machdep.c,v
retrieving revision 1.38
diff -u -p -r1.38 hibernate_machdep.c
--- arch/amd64/amd64/hibernate_machdep.c	21 Aug 2015 07:01:38 -0000	1.38
+++ arch/amd64/amd64/hibernate_machdep.c	29 May 2017 09:47:30 -0000
@@ -47,6 +47,7 @@
#include "ahci.h"
#include "softraid.h"
#include "sd.h"
+#include "nvme.h"
/* Hibernate support */
void hibernate_enter_resume_4k_pte(vaddr_t, paddr_t);
@@ -89,6 +90,8 @@ get_hibernate_io_function(dev_t dev)
 	extern struct cfdriver sd_cd;
 	extern int ahci_hibernate_io(dev_t dev, daddr_t blkno,
 	    vaddr_t addr, size_t size, int op, void *page);
+	extern int nvme_hibernate_io(dev_t dev, daddr_t blkno,
+	    vaddr_t addr, size_t size, int op, void *page);
 	extern int sr_hibernate_io(dev_t dev, daddr_t blkno,
 	    vaddr_t addr, size_t size, int op, void *page);
 	struct device *dv = disk_lookup(&sd_cd, DISKUNIT(dev));
@@ -98,6 +101,12 @@ get_hibernate_io_function(dev_t dev)
 	    strcmp(dv->dv_parent->dv_parent->dv_cfdata->cf_driver->cd_name,
 	    "ahci") == 0)
 		return ahci_hibernate_io;
+#endif
+#if NNVME > 0
+	if (dv && dv->dv_parent && dv->dv_parent->dv_parent &&
+	    strcmp(dv->dv_parent->dv_parent->dv_cfdata->cf_driver->cd_name,
+	    "nvme") == 0)
+		return nvme_hibernate_io;
 #endif
 #if NSOFTRAID > 0
 	if (dv && dv->dv_parent && dv->dv_parent->dv_parent &&
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.10
diff -u -p -r1.10 nvmevar.h
--- dev/ic/nvmevar.h 27 May 2017 12:40:51 -0000 1.10
+++ dev/ic/nvmevar.h 29 May 2017 09:47:30 -0000
@@ -16,6 +16,9 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#define NVME_IO_Q	1
+#define NVME_HIB_Q	2
+
 struct nvme_dmamem {
 	bus_dmamap_t		ndm_map;
 	bus_dma_segment_t	ndm_seg;
@@ -90,6 +93,7 @@ struct nvme_softc {
 
 	struct nvme_queue	*sc_admin_q;
 	struct nvme_queue	*sc_q;
+	struct nvme_queue	*sc_hib_q;
 
 	struct mutex		sc_ccb_mtx;
 	struct nvme_ccb		*sc_ccbs;
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvs/src/sys/dev/ic/nvme.c,v
retrieving revision 1.56
diff -u -p -r1.56 nvme.c
--- dev/ic/nvme.c 27 May 2017 12:40:51 -0000 1.56
+++ dev/ic/nvme.c 29 May 2017 09:47:30 -0000
@@ -26,6 +26,8 @@
 #include <sys/mutex.h>
 #include <sys/pool.h>
 
+#include <sys/atomic.h>
+
 #include <machine/bus.h>
 
 #include <scsi/scsi_all.h>
@@ -86,6 +88,15 @@ void nvme_scsi_cmd(struct scsi_xfer *);
 int	nvme_scsi_probe(struct scsi_link *);
 void	nvme_scsi_free(struct scsi_link *);
 
+#ifdef HIBERNATE
+#include <uvm/uvm_extern.h>
+#include <sys/hibernate.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+
+int	nvme_hibernate_io(dev_t, daddr_t, vaddr_t, size_t, int, void *);
+#endif
+
 struct scsi_adapter nvme_switch = {
 	nvme_scsi_cmd,		/* cmd */
 	scsi_minphys,		/* minphys */
@@ -332,7 +343,7 @@ nvme_attach(struct nvme_softc *sc)
 		goto free_admin_q;
 	}
 
-	sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+	sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
 	if (sc->sc_q == NULL) {
 		printf("%s: unable to allocate io q\n", DEVNAME(sc));
 		goto disable;
@@ -343,6 +354,12 @@ nvme_attach(struct nvme_softc *sc)
 		goto free_q;
 	}
 
+	sc->sc_hib_q = nvme_q_alloc(sc, NVME_HIB_Q, 4, sc->sc_dstrd);
+	if (sc->sc_hib_q == NULL) {
+		printf("%s: unable to allocate hibernate io queue\n", DEVNAME(sc));
+		goto free_q;
+	}
+
 	nvme_write4(sc, NVME_INTMC, 1);
 
 	sc->sc_namespaces = mallocarray(sc->sc_nn, sizeof(*sc->sc_namespaces),
@@ -394,7 +411,7 @@ nvme_resume(struct nvme_softc *sc)
 		return (1);
 	}
 
-	sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+	sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
 	if (sc->sc_q == NULL) {
 		printf("%s: unable to allocate io q\n", DEVNAME(sc));
 		goto disable;
@@ -1393,3 +1410,193 @@ nvme_dmamem_free(struct nvme_softc *sc, 
 	free(ndm, M_DEVBUF, sizeof *ndm);
 }
 
+#ifdef HIBERNATE
+
+int
+nvme_hibernate_admin_cmd(struct nvme_softc *sc, struct nvme_sqe *sqe,
+    struct nvme_cqe *cqe, int cid)
+{
+	struct nvme_sqe *asqe = NVME_DMA_KVA(sc->sc_admin_q->q_sq_dmamem);
+	struct nvme_cqe *acqe = NVME_DMA_KVA(sc->sc_admin_q->q_cq_dmamem);
+	struct nvme_queue *q = sc->sc_admin_q;
+	int tail;
+	u_int16_t flags;
+
+	/* submit command */
+	tail = q->q_sq_tail;
+	if (++q->q_sq_tail >= q->q_entries)
+		q->q_sq_tail = 0;
+
+	asqe += tail;
+	bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
+	    sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_POSTWRITE);
+	*asqe = *sqe;
+	asqe->cid = cid;
+	bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
+	    sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_PREWRITE);
+
+	nvme_write4(sc, q->q_sqtdbl, q->q_sq_tail);
+
+	/* wait for completion */
+	acqe += q->q_cq_head;
+	for (;;) {
+		nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
+		flags = lemtoh16(&acqe->flags);
+		if ((flags & NVME_CQE_PHASE) == q->q_cq_phase)
+			break;
+
+		delay(10);
+	}
+
+	if (++q->q_cq_head >= q->q_entries) {
+		q->q_cq_head = 0;
+		q->q_cq_phase ^= NVME_CQE_PHASE;
+	}
+	nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
+	if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) || (acqe->cid != cid))
+		return (EIO);
+
+	return (0);
+}
+
+int
+nvme_hibernate_io(dev_t dev, daddr_t blkno, vaddr_t addr, size_t size,
+    int op, void *page)
+{
+	struct nvme_hibernate_page {
+		u_int64_t		prpl[MAXPHYS / PAGE_SIZE];
+
+		struct nvme_softc	*sc;
+		int			nsid;
+		int			sq_tail;
+		int			cq_head;
+		int			cqe_phase;
+
+		daddr_t			poffset;
+		size_t			psize;
+	} *my = page;
+	struct nvme_sqe_io *isqe;
+	struct nvme_cqe *icqe;
+	paddr_t data_phys, page_phys;
+	u_int64_t data_bus_phys, page_bus_phys;
+	u_int16_t flags;
+	int i;
+
+	if (op == HIB_INIT) {
+		struct device *disk;
+		struct device *scsibus;
+		extern struct cfdriver sd_cd;
+		struct scsi_link *link;
+		struct scsibus_softc *bus_sc;
+		struct nvme_sqe_q qsqe;
+		struct nvme_cqe qcqe;
+
+		/* find nvme softc */
+		disk = disk_lookup(&sd_cd, DISKUNIT(dev));
+		scsibus = disk->dv_parent;
+		my->sc = (struct nvme_softc *)disk->dv_parent->dv_parent;
+
+		/* find scsi_link, which tells us the target */
+		my->nsid = 0;
+		bus_sc = (struct scsibus_softc *)scsibus;
+		SLIST_FOREACH(link, &bus_sc->sc_link_list, bus_list) {
+			if (link->device_softc == disk) {
+				my->nsid = link->target + 1;
+				break;
+			}
+		}
+		if (my->nsid == 0)
+			return (EIO);
+
+		my->poffset = blkno;
+		my->psize = size;
+
+		memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem), 0,
+		    my->sc->sc_hib_q->q_entries * sizeof(struct nvme_cqe));
+		memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem), 0,
+		    my->sc->sc_hib_q->q_entries * sizeof(struct nvme_sqe));
+
+		my->sq_tail = 0;
+		my->cq_head = 0;
+		my->cqe_phase = NVME_CQE_PHASE;
+
+		pmap_extract(pmap_kernel(), (vaddr_t)page, &page_phys);
+
+		memset(&qsqe, 0, sizeof(qsqe));
+		qsqe.opcode = NVM_ADMIN_ADD_IOCQ;
+		htolem64(&qsqe.prp1, NVME_DMA_DVA(my->sc->sc_hib_q->q_cq_dmamem));
+		htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
+		htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
+		qsqe.qflags = NVM_SQE_CQ_IEN | NVM_SQE_Q_PC;
+		if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe, &qcqe, 1) != 0)
+			return (EIO);
+
+		memset(&qsqe, 0, sizeof(qsqe));
+		qsqe.opcode = NVM_ADMIN_ADD_IOSQ;
+		htolem64(&qsqe.prp1, NVME_DMA_DVA(my->sc->sc_hib_q->q_sq_dmamem));
+		htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
+		htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
+		htolem16(&qsqe.cqid, my->sc->sc_hib_q->q_id);
+		qsqe.qflags = NVM_SQE_Q_PC;
+		if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe, &qcqe, 2) != 0)
+			return (EIO);
+
+		return (0);
+	}
+
+	if (op != HIB_W)
+		return (0);
+
+	isqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem);
+	isqe += my->sq_tail;
+	if (++my->sq_tail == my->sc->sc_hib_q->q_entries)
+		my->sq_tail = 0;
+
+	memset(isqe, 0, sizeof(*isqe));
+	isqe->opcode = NVM_CMD_WRITE;
+	htolem32(&isqe->nsid, my->nsid);
+
+	pmap_extract(pmap_kernel(), addr, &data_phys);
+	data_bus_phys = data_phys;
+	htolem64(&isqe->entry.prp[0], data_bus_phys);
+	if ((size > my->sc->sc_mps) && (size <= my->sc->sc_mps * 2)) {
+		htolem64(&isqe->entry.prp[1], data_bus_phys + my->sc->sc_mps);
+	} else if (size > my->sc->sc_mps * 2) {
+		pmap_extract(pmap_kernel(), (vaddr_t)page, &page_phys);
+		page_bus_phys = page_phys;
+		htolem64(&isqe->entry.prp[1], page_bus_phys +
+		    offsetof(struct nvme_hibernate_page, prpl));
+		for (i = 1; i < (size / my->sc->sc_mps); i++) {
+			htolem64(&my->prpl[i - 1], data_bus_phys +
+			    (i * my->sc->sc_mps));
+		}
+	}
+
+	isqe->slba = blkno + my->poffset;
+	isqe->nlb = (size / DEV_BSIZE) - 1;
+	isqe->cid = blkno % 0xffff;
+
+	nvme_write4(my->sc, NVME_SQTDBL(NVME_HIB_Q, my->sc->sc_dstrd), my->sq_tail);
+
+	icqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem);
+	icqe += my->cq_head;
+	for (;;) {
+		flags = lemtoh16(&icqe->flags);
+		if ((flags & NVME_CQE_PHASE) == my->cqe_phase)
+			break;
+
+		delay(10);
+	}
+
+	if (++my->cq_head == my->sc->sc_hib_q->q_entries) {
+		my->cq_head = 0;
+		my->cqe_phase ^= NVME_CQE_PHASE;
+	}
+	nvme_write4(my->sc, NVME_CQHDBL(NVME_HIB_Q, my->sc->sc_dstrd), my->cq_head);
+	if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) || (icqe->cid != blkno % 0xffff))
+		return (EIO);
+
+	return (0);
+}
+
+#endif
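
For reference, the PRP handling in the HIB_W path boils down to the
following stand-alone sketch.  Names (prp_setup, MPS, list_addr) are
invented for the example, and it assumes the data buffer is physically
contiguous, which is what the driver code assumes too when it computes
data_bus_phys + i * sc_mps for the list entries.

/*
 * One data pointer for a single page, a second pointer for two-page
 * transfers, and a PRP list covering pages 2..n for anything larger.
 */
#include <stdint.h>
#include <stdio.h>

#define MPS	4096	/* device page size, sc_mps in the driver */

struct prp_cmd {
	uint64_t prp[2];	/* prp[1] is addr #2 or the list pointer */
};

static void
prp_setup(struct prp_cmd *cmd, uint64_t data, size_t size,
    uint64_t list_addr, uint64_t *list)
{
	size_t i;

	cmd->prp[0] = data;
	if (size > MPS && size <= 2 * MPS) {
		/* exactly two pages: second pointer, no list */
		cmd->prp[1] = data + MPS;
	} else if (size > 2 * MPS) {
		/* point prp[1] at the list; list holds pages 2..n */
		cmd->prp[1] = list_addr;
		for (i = 1; i < size / MPS; i++)
			list[i - 1] = data + i * MPS;
	}
}

int
main(void)
{
	struct prp_cmd cmd;
	uint64_t list[16];

	prp_setup(&cmd, 0x100000, 4 * MPS, 0x200000, list);
	printf("prp0 %llx prp1 %llx first list entry %llx\n",
	    (unsigned long long)cmd.prp[0], (unsigned long long)cmd.prp[1],
	    (unsigned long long)list[0]);
	return (0);
}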