From: Waldemar Kozaczuk <[email protected]> Committer: WALDEMAR KOZACZUK <[email protected]> Branch: master
Implement NVMe driver This patch implements the NVMe block device driver. It is largely based on the pull request submitted by Jan Braunwarth (see https://github.com/cloudius-systems/osv/pull/1284), so most of the credit goes to Jan. As his PR explains, OSv can be started with an emulated NVMe disk on QEMU like so: ./scripts/run.py --nvme Compared to Jan's PR, this patch is different in the following ways: - removes all non-NVMe changes (various bug fixes or ioctl enhancements are part of separate PRs) - replaces most of the heap allocations with stack allocations, which should reduce some contention - tweaks PRP-handling code to use a lock-less ring buffer, which should further reduce contention when allocating memory - fixes a bug in I/O queue CQ handling to correctly determine if the SQ is not full - assumes a single namespace - 1 (most logic to deal with more has been preserved) - reduces I/O queue size to 64 instead of 256 - makes code a little more DRY Please note that, as Jan points out, the block cache logic of splitting reads and writes into 512-byte requests causes very poor performance when stress testing at the devfs level. However, this behavior is not NVMe-specific and does not affect most applications that go through a VFS and filesystem driver (ZFS, EXT, ROFS), which use the strategy() method that does not use the block cache. Based on my tests, the NVMe read performance (IOPs and bytes/s) is 60-70% of virtio-blk on QEMU. I am not sure how much of that is due to this implementation of the NVMe driver, or whether it is because virtio-blk is by design much faster than anything emulated, including NVMe. 
Closes #1203 Signed-off-by: Jan Braunwarth <[email protected]> Signed-off-by: Waldemar Kozaczuk <[email protected]> --- diff --git a/Makefile b/Makefile --- a/Makefile +++ b/Makefile @@ -898,6 +898,10 @@ drivers += drivers/virtio-vring.o ifeq ($(conf_drivers_mmio),1) drivers += drivers/virtio-mmio.o endif +ifeq ($(conf_drivers_nvme),1) +drivers += drivers/nvme.o +drivers += drivers/nvme-queue.o +endif drivers += drivers/virtio-net.o drivers += drivers/virtio-blk.o drivers += drivers/virtio-scsi.o diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc --- a/arch/x64/arch-setup.cc +++ b/arch/x64/arch-setup.cc @@ -313,6 +313,9 @@ void arch_init_premain() #if CONF_drivers_ena #include "drivers/ena.hh" #endif +#if CONF_drivers_nvme +#include "drivers/nvme.hh" +#endif extern bool opt_pci_disabled; void arch_init_drivers() @@ -370,6 +373,9 @@ void arch_init_drivers() #endif #if CONF_drivers_ena drvman->register_driver(aws::ena::probe); +#endif +#if CONF_drivers_nvme + drvman->register_driver(nvme::driver::probe); #endif boot_time.event("drivers probe"); drvman->load_all(); diff --git a/conf/profiles/x64/all.mk b/conf/profiles/x64/all.mk --- a/conf/profiles/x64/all.mk +++ b/conf/profiles/x64/all.mk @@ -5,5 +5,6 @@ include conf/profiles/$(arch)/virtio-pci.mk include conf/profiles/$(arch)/vmware.mk include conf/profiles/$(arch)/xen.mk include conf/profiles/$(arch)/aws.mk +include conf/profiles/$(arch)/nvme.mk conf_drivers_vga?=1 diff --git a/conf/profiles/x64/base.mk b/conf/profiles/x64/base.mk --- a/conf/profiles/x64/base.mk +++ b/conf/profiles/x64/base.mk @@ -38,6 +38,11 @@ export conf_drivers_pci?=1 export conf_drivers_scsi?=1 endif +export conf_drivers_nvme?=0 +ifeq ($(conf_drivers_nvme),1) +export conf_drivers_pci?=1 +endif + export conf_drivers_vmxnet3?=0 ifeq ($(conf_drivers_vmxnet3),1) export conf_drivers_pci?=1 diff --git a/conf/profiles/x64/nvme.mk b/conf/profiles/x64/nvme.mk --- a/conf/profiles/x64/nvme.mk +++ b/conf/profiles/x64/nvme.mk @@ -0,0 +1,3 @@ 
+conf_drivers_pci?=1 + +conf_drivers_nvme?=1 diff --git a/core/debug.cc b/core/debug.cc --- a/core/debug.cc +++ b/core/debug.cc @@ -48,6 +48,7 @@ bool logger::parse_configuration(void) add_tag("dhcp", logger_info); add_tag("acpi", logger_error); add_tag("ena", logger_debug); + add_tag("nvme", logger_debug); return (true); } diff --git a/drivers/nvme-queue.cc b/drivers/nvme-queue.cc --- a/drivers/nvme-queue.cc +++ b/drivers/nvme-queue.cc @@ -0,0 +1,473 @@ +/* + * Copyright (C) 2023 Jan Braunwarth + * Copyright (C) 2024 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include <sys/cdefs.h> + +#include <vector> +#include <memory> + +#include <osv/contiguous_alloc.hh> +#include <osv/bio.h> +#include <osv/trace.hh> +#include <osv/mempool.hh> +#include <osv/align.hh> + +#include "nvme-queue.hh" + +TRACEPOINT(trace_nvme_cq_wait, "nvme%d qid=%d, cq_head=%d", int, int, int); +TRACEPOINT(trace_nvme_cq_woken, "nvme%d qid=%d, have_elements=%d", int, int, bool); +TRACEPOINT(trace_nvme_cq_not_empty, "nvme%d qid=%d, not_empty=%d", int, int, bool); +TRACEPOINT(trace_nvme_cq_head_advance, "nvme%d qid=%d cq_head=%d", int, int, int); +TRACEPOINT(trace_nvme_cq_new_entry, "nvme%d qid=%d sqhd=%d", int, int, int); + +TRACEPOINT(trace_nvme_enable_interrupts, "nvme%d qid=%d", int, int); +TRACEPOINT(trace_nvme_disable_interrupts, "nvme%d qid=%d", int, int); + +TRACEPOINT(trace_nvme_req_done_error, "nvme%d qid=%d, cid=%d, status type=%#x, status code=%#x, bio=%p", int, int, u16, u8, u8, bio*); +TRACEPOINT(trace_nvme_req_done_success, "nvme%d qid=%d, cid=%d, bio=%p", int, int, u16, bio*); + +TRACEPOINT(trace_nvme_admin_cmd_submit, "nvme%d qid=%d, cid=%d, opc=%d", int, int, int, u8); +TRACEPOINT(trace_nvme_read_write_cmd_submit, "nvme%d qid=%d cid=%d, bio=%p, slba=%d, nlb=%d, write=%d", int, int, u16, void*, u64, u32, bool); + +TRACEPOINT(trace_nvme_sq_tail_advance, 
"nvme%d qid=%d, sq_tail=%d, sq_head=%d, depth=%d, full=%d", int, int, int, int, int, bool); +TRACEPOINT(trace_nvme_sq_full_wait, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int); +TRACEPOINT(trace_nvme_sq_full_wake, "nvme%d qid=%d, sq_tail=%d, sq_head=%d", int, int, int, int); + +TRACEPOINT(trace_nvme_cid_conflict, "nvme%d qid=%d, cid=%d", int, int, int); + +TRACEPOINT(trace_nvme_prp_alloc, "nvme%d qid=%d, prp=%p", int, int, void*); +TRACEPOINT(trace_nvme_prp_free, "nvme%d qid=%d, prp=%p", int, int, void*); + +using namespace memory; + +namespace nvme { + +queue_pair::queue_pair( + int did, + u32 id, + int qsize, + pci::device &dev, + u32* sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns) + : _id(id) + ,_driver_id(did) + ,_qsize(qsize) + ,_dev(&dev) + ,_sq(sq_doorbell) + ,_sq_full(false) + ,_cq(cq_doorbell) + ,_cq_phase_tag(1) + ,_ns(ns) +{ + size_t sq_buf_size = qsize * sizeof(nvme_sq_entry_t); + _sq._addr = (nvme_sq_entry_t*) alloc_phys_contiguous_aligned(sq_buf_size, mmu::page_size); + assert(_sq._addr); + memset(_sq._addr, 0, sq_buf_size); + + size_t cq_buf_size = qsize * sizeof(nvme_cq_entry_t); + _cq._addr = (nvme_cq_entry_t*) alloc_phys_contiguous_aligned(cq_buf_size, mmu::page_size); + assert(_cq._addr); + memset(_cq._addr, 0, cq_buf_size); + + assert(!completion_queue_not_empty()); +} + +queue_pair::~queue_pair() +{ + u64* free_prp; + while (_free_prp_lists.pop(free_prp)) + free_page((void*)free_prp); + + free_phys_contiguous_aligned(_sq._addr); + free_phys_contiguous_aligned(_cq._addr); +} + +inline void queue_pair::advance_sq_tail() +{ + _sq._tail = (_sq._tail + 1) % _qsize; + if (((_sq._tail + 1) % _qsize) == _sq._head) { + _sq_full = true; + } + trace_nvme_sq_tail_advance(_driver_id, _id, _sq._tail, _sq._head, + (_sq._tail >= _sq._head) ? 
_sq._tail - _sq._head : _sq._tail + (_qsize - _sq._head), + _sq_full); +} + +u16 queue_pair::submit_cmd(nvme_sq_entry_t* cmd) +{ + _sq._addr[_sq._tail] = *cmd; + advance_sq_tail(); + mmio_setl(_sq._doorbell, _sq._tail); + return _sq._tail; +} + +void queue_pair::wait_for_completion_queue_entries() +{ + trace_nvme_cq_wait(_driver_id, _id, _cq._head); + sched::thread::wait_until([this] { + bool have_elements = this->completion_queue_not_empty(); + if (!have_elements) { + this->enable_interrupts(); + //check if we got a new cqe between completion_queue_not_empty() + //and enable_interrupts() + have_elements = this->completion_queue_not_empty(); + if (have_elements) { + this->disable_interrupts(); + } + } + + trace_nvme_cq_woken(_driver_id, _id, have_elements); + return have_elements; + }); +} + +void queue_pair::map_prps(nvme_sq_entry_t* cmd, struct bio* bio, u64 datasize) +{ + void* data = (void*)mmu::virt_to_phys(bio->bio_data); + bio->bio_private = nullptr; + + // Depending on the datasize, we map PRPs (Physical Region Page) as follows: + // 0. We always set the prp1 field to the beginning of the data + // 1. If data falls within single 4K page then we simply set prp2 to 0 + // 2. If data falls within 2 pages then set prp2 to the second 4K-aligned part of data + // 3. Otherwise, allocate a physically contigous array long enough to hold addresses + // of remaining 4K pages of data + u64 addr = (u64) data; + cmd->rw.common.prp1 = addr; + cmd->rw.common.prp2 = 0; + + // Calculate number of 4K pages and therefore number of entries in the PRP + // list. 
The 1st entry rw.common.prp1 can be misaligned but every + // other one needs to be 4K-aligned + u64 first_page_start = align_down(addr, NVME_PAGESIZE); + u64 last_page_end = align_up(addr + datasize, NVME_PAGESIZE); + int num_of_pages = (last_page_end - first_page_start) / NVME_PAGESIZE; + + if (num_of_pages == 2) { + cmd->rw.common.prp2 = first_page_start + NVME_PAGESIZE; //2nd page start + } else if (num_of_pages > 2) { + // Allocate PRP list as the request is larger than 8K + // For now we can only accomodate datasize <= 2MB so single page + // should be exactly enough to map up to 512 pages of the request data + assert(num_of_pages / 512 == 0); + u64* prp_list = nullptr; + _free_prp_lists.pop(prp_list); + if (!prp_list) { // No free pre-allocated ones, so allocate new one + prp_list = (u64*) alloc_page(); + trace_nvme_prp_alloc(_driver_id, _id, prp_list); + } + + assert(prp_list != nullptr); + cmd->rw.common.prp2 = mmu::virt_to_phys(prp_list); + + // Save PRP list in bio so it can be de-allocated later + bio->bio_private = prp_list; + + // Fill in the PRP list with address of subsequent 4K pages + addr = first_page_start + NVME_PAGESIZE; //2nd page start + prp_list[0] = addr; + + for (int i = 1; i < num_of_pages - 1; i++) { + addr += NVME_PAGESIZE; + prp_list[i] = addr; + } + } +} + +nvme_cq_entry_t* queue_pair::get_completion_queue_entry() +{ + if (!completion_queue_not_empty()) { + return nullptr; + } + + auto* cqe = &_cq._addr[_cq._head]; + assert(cqe->p == _cq_phase_tag); + + trace_nvme_cq_new_entry(_driver_id, _id, cqe->sqhd); + return cqe; +} + +inline void queue_pair::advance_cq_head() +{ + trace_nvme_cq_head_advance(_driver_id, _id, _cq._head); + if (++_cq._head == _qsize) { + _cq._head = 0; + _cq_phase_tag = _cq_phase_tag ? 
0 : 1; + } +} + +bool queue_pair::completion_queue_not_empty() const +{ + bool a = reinterpret_cast<volatile nvme_cq_entry_t*>(&_cq._addr[_cq._head])->p == _cq_phase_tag; + trace_nvme_cq_not_empty(_driver_id, _id, a); + return a; +} + +void queue_pair::enable_interrupts() +{ + _dev->msix_unmask_entry(_id); + trace_nvme_enable_interrupts(_driver_id, _id); +} + +void queue_pair::disable_interrupts() +{ + _dev->msix_mask_entry(_id); + trace_nvme_disable_interrupts(_driver_id, _id); +} + +io_queue_pair::io_queue_pair( + int driver_id, + int id, + int qsize, + pci::device& dev, + u32* sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns + ) : queue_pair( + driver_id, + id, + qsize, + dev, + sq_doorbell, + cq_doorbell, + ns + ) +{ + init_pending_bios(0); +} + +io_queue_pair::~io_queue_pair() +{ + for (auto bios : _pending_bios) { + if (bios) { + free(bios); + } + } +} + +void io_queue_pair::init_pending_bios(u32 level) +{ + _pending_bios[level] = (std::atomic<struct bio*> *) malloc(sizeof(std::atomic<struct bio*>) * _qsize); + for (u32 idx = 0; idx < _qsize; idx++) { + _pending_bios[level][idx] = {}; + } +} + +int io_queue_pair::make_request(struct bio* bio, u32 nsid = 1) +{ + u64 slba = bio->bio_offset; + u32 nlb = bio->bio_bcount; //do the blockshift in nvme_driver + + SCOPE_LOCK(_lock); + if (_sq_full) { + //Wait for free entries + _sq_full_waiter.reset(*sched::thread::current()); + trace_nvme_sq_full_wait(_driver_id, _id, _sq._tail, _sq._head); + sched::thread::wait_until([this] { return !(this->_sq_full); }); + _sq_full_waiter.clear(); + } + assert((((_sq._tail + 1) % _qsize) != _sq._head)); + // + // We need to check if there is an outstanding command that uses + // _sq._tail as command id. + // This happens if: + // 1. The SQ is full. Then we just have to wait for an open slot (see above) + // 2. The Controller already read a SQE but didnt post a CQE yet. + // This means we could post the command but need a different cid. 
To still + // use the cid as index to find the corresponding bios we use a matrix + // adding columns if we need them + u16 cid = _sq._tail; + while (_pending_bios[cid_to_row(cid)][cid_to_col(cid)].load()) { + trace_nvme_cid_conflict(_driver_id, _id, cid); + cid += _qsize; + auto level = cid_to_row(cid); + assert(level < max_pending_levels); + // Allocate next row of _pending_bios if needed + if (!_pending_bios[cid_to_row(cid)]) { + init_pending_bios(level); + } + } + //Save bio + _pending_bios[cid_to_row(cid)][cid_to_col(cid)] = bio; + + switch (bio->bio_cmd) { + case BIO_READ: + trace_nvme_read_write_cmd_submit(_driver_id, _id, cid, bio, slba, nlb, false); + submit_read_write_cmd(cid, nsid, NVME_CMD_READ, slba, nlb, bio); + break; + + case BIO_WRITE: + trace_nvme_read_write_cmd_submit(_driver_id, _id, cid, bio, slba, nlb, true); + submit_read_write_cmd(cid, nsid, NVME_CMD_WRITE, slba, nlb, bio); + break; + + case BIO_FLUSH: + submit_flush_cmd(cid, nsid); + break; + + default: + NVME_ERROR("Operation not implemented\n"); + return ENOTBLK; + } + return 0; +} + +void io_queue_pair::req_done() +{ + nvme_cq_entry_t* cqep = nullptr; + while (true) + { + wait_for_completion_queue_entries(); + while ((cqep = get_completion_queue_entry())) { + // Read full CQ entry onto stack so we can advance CQ head ASAP + // and release the CQ slot + nvme_cq_entry_t cqe = *cqep; + advance_cq_head(); + mmio_setl(_cq._doorbell, _cq._head); + // + // Wake up the requesting thread in case the submission queue was full before + auto old_sq_head = _sq._head.exchange(cqe.sqhd); //update sq_head + if (old_sq_head != cqe.sqhd && _sq_full) { + _sq_full = false; + if (_sq_full_waiter) { + trace_nvme_sq_full_wake(_driver_id, _id, _sq._tail, _sq._head); + _sq_full_waiter.wake_from_kernel_or_with_irq_disabled(); + } + } + // + // Read cid and release it + u16 cid = cqe.cid; + auto pending_bio = _pending_bios[cid_to_row(cid)][cid_to_col(cid)].exchange(nullptr); + assert(pending_bio); + // + // Save 
for future re-use or free PRP list saved under bio_private if any + if (pending_bio->bio_private) { + if (!_free_prp_lists.push((u64*)pending_bio->bio_private)) { + free_page(pending_bio->bio_private); //_free_prp_lists is full so free the page + trace_nvme_prp_free(_driver_id, _id, pending_bio->bio_private); + } + } + // Call biodone + if (cqe.sct != 0 || cqe.sc != 0) { + trace_nvme_req_done_error(_driver_id, _id, cid, cqe.sct, cqe.sc, pending_bio); + biodone(pending_bio, false); + NVME_ERROR("I/O queue: cid=%d, sct=%#x, sc=%#x, bio=%#x, slba=%llu, nlb=%llu\n", + cqe.cid, cqe.sct, cqe.sc, pending_bio, + pending_bio ? pending_bio->bio_offset : 0, + pending_bio ? pending_bio->bio_bcount : 0); + } else { + trace_nvme_req_done_success(_driver_id, _id, cid, pending_bio); + biodone(pending_bio, true); + } + } + } +} + +u16 io_queue_pair::submit_read_write_cmd(u16 cid, u32 nsid, int opc, u64 slba, u32 nlb, struct bio* bio) +{ + nvme_sq_entry_t cmd; + memset(&cmd, 0, sizeof(cmd)); + + cmd.rw.common.cid = cid; + cmd.rw.common.opc = opc; + cmd.rw.common.nsid = nsid; + cmd.rw.slba = slba; + cmd.rw.nlb = nlb - 1; + + u32 datasize = nlb << _ns[nsid]->blockshift; + map_prps(&cmd, bio, datasize); + + return submit_cmd(&cmd); +} + +u16 io_queue_pair::submit_flush_cmd(u16 cid, u32 nsid) +{ + nvme_sq_entry_t cmd; + memset(&cmd, 0, sizeof(cmd)); + + cmd.vs.common.opc = NVME_CMD_FLUSH; + cmd.vs.common.nsid = nsid; + cmd.vs.common.cid = cid; + + return submit_cmd(&cmd); +} + +admin_queue_pair::admin_queue_pair( + int driver_id, + int id, + int qsize, + pci::device& dev, + u32* sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns + ) : queue_pair( + driver_id, + id, + qsize, + dev, + sq_doorbell, + cq_doorbell, + ns +) {} + +void admin_queue_pair::req_done() +{ + nvme_cq_entry_t* cqe = nullptr; + while (true) + { + wait_for_completion_queue_entries(); + while ((cqe = get_completion_queue_entry())) { + u16 cid = cqe->cid; + if (cqe->sct != 0 || cqe->sc != 0) { + 
trace_nvme_req_done_error(_driver_id, _id, cid, cqe->sct, cqe->sc, nullptr); + NVME_ERROR("Admin queue cid=%d, sct=%#x, sc=%#x\n",cid,cqe->sct,cqe->sc); + } else { + trace_nvme_req_done_success(_driver_id, _id, cid, nullptr); + } + + _sq._head = cqe->sqhd; //Update sq_head + _req_res = *cqe; //Save the cqe so that the requesting thread can return it + + advance_cq_head(); + } + mmio_setl(_cq._doorbell, _cq._head); + + //Wake up the thread that requested the admin command + new_cq = true; + _req_waiter.wake_from_kernel_or_with_irq_disabled(); + } +} + +nvme_cq_entry_t +admin_queue_pair::submit_and_return_on_completion(nvme_sq_entry_t* cmd, void* data, unsigned int datasize) +{ + SCOPE_LOCK(_lock); + + _req_waiter.reset(*sched::thread::current()); + + //for now admin cid = sq_tail + u16 cid = _sq._tail; + cmd->rw.common.cid = cid; + + if (data != nullptr && datasize > 0) { + cmd->rw.common.prp1 = (u64)data; + cmd->rw.common.prp2 = 0; + } + + trace_nvme_admin_cmd_submit(_driver_id, _id, cid, cmd->set_features.common.opc); + submit_cmd(cmd); + + sched::thread::wait_until([this] { return this->new_cq; }); + _req_waiter.clear(); + + new_cq = false; + + return _req_res; +} +} diff --git a/drivers/nvme-queue.hh b/drivers/nvme-queue.hh --- a/drivers/nvme-queue.hh +++ b/drivers/nvme-queue.hh @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2023 Jan Braunwarth + * Copyright (C) 2024 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#ifndef NVME_QUEUE_H +#define NVME_QUEUE_H + +#include "drivers/pci-device.hh" +#include "drivers/nvme-structs.h" + +#include <osv/bio.h> +#include <lockfree/ring.hh> + +#define nvme_tag "nvme" +#define nvme_d(...) tprintf_d(nvme_tag, __VA_ARGS__) +#define nvme_i(...) tprintf_i(nvme_tag, __VA_ARGS__) +#define nvme_w(...) tprintf_w(nvme_tag, __VA_ARGS__) +#define nvme_e(...) 
tprintf_e(nvme_tag, __VA_ARGS__) + +#define NVME_ERROR(...) nvme_e(__VA_ARGS__) + +#define NVME_PAGESIZE mmu::page_size +#define NVME_PAGESHIFT 12 + +namespace nvme { + +// Template to specify common elements of the submission and completion +// queue as described in the chapter 4.1 of the NVMe specification (see +// "https://www.nvmexpress.org/wp-content/uploads/NVM-Express-1_1a.pdf") +// The type T argument would be either nvme_sq_entry_t or nvme_cq_entry_t. +// +// The _tail, used by the producer, specifies the 0-based index of +// the next free slot to place new entry into the array _addr. After +// placing new entry, the _tail should be incremented - if it exceeds +// queue size, the it should roll to 0. +// +// The _head, used by the consumer, specifies the 0-based index of +// the entry to be fetched of the queue _addr. Likewise, the _head is +// incremented after, and if exceeds queue size, it should roll to 0. +// +// The queue is considered empty, if _head == _tail. +// The queue is considered full, if _head == (_tail + 1) +// +// The _doorbell points to the address where _tail of the submission +// queue is written to. For completion queue, it points to the address +// where the _head value is written to. +template<typename T> +struct queue { + queue(u32* doorbell) : + _addr(nullptr), _doorbell(doorbell), _head(0), _tail(0) {} + T* _addr; + volatile u32* _doorbell; + std::atomic<u32> _head; + u32 _tail; +}; + +// Pair of submission queue and completion queue - SQ and CQ. +// They work in tandem and share the same size. 
+class queue_pair +{ +public: + queue_pair( + int driver_id, + u32 id, + int qsize, + pci::device& dev, + u32* sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns + ); + + ~queue_pair(); + + u64 sq_phys_addr() { return (u64) mmu::virt_to_phys((void*) _sq._addr); } + u64 cq_phys_addr() { return (u64) mmu::virt_to_phys((void*) _cq._addr); } + + virtual void req_done() {}; + void wait_for_completion_queue_entries(); + bool completion_queue_not_empty() const; + + void enable_interrupts(); + void disable_interrupts(); + + u32 _id; +protected: + void advance_sq_tail(); + void advance_cq_head(); + + // PRP stands for Physical Region Page and is used to specify locations in + // physical memory for data tranfers. In essence, they are arrays of physical + // addresses of pages to read from or write to data. + void map_prps(nvme_sq_entry_t* cmd, struct bio* bio, u64 datasize); + + u16 submit_cmd(nvme_sq_entry_t* cmd); + + nvme_cq_entry_t* get_completion_queue_entry(); + + int _driver_id; + + // Length of the CQ and SQ + // Admin queue is 8 entries long, therefore occupies 640 bytes (8 * (64 + 16)) + // I/O queue is normally 64 entries long, therefore occupies 5K (64 * (64 + 16)) + u32 _qsize; + + pci::device* _dev; + + // Submission Queue (SQ) - each entry is 64 bytes in size + queue<nvme_sq_entry_t> _sq; + std::atomic<bool> _sq_full; + + // Completion Queue (CQ) - each entry is 16 bytes in size + queue<nvme_cq_entry_t> _cq; + u16 _cq_phase_tag; + + // Map of namespaces (for now there would normally be one entry keyed by 1) + std::map<u32, nvme_ns_t*> _ns; + + static constexpr size_t max_pending_levels = 4; + + // Let us hold to allocated PRP pages but also limit to up 16 ones + ring_spsc<u64*, unsigned, 16> _free_prp_lists; + + mutex _lock; +}; + +// Pair of SQ and CQ queues used for reading from and writing to (I/O) +class io_queue_pair : public queue_pair { +public: + io_queue_pair( + int driver_id, + int id, + int qsize, + pci::device& dev, + u32* 
sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns + ); + ~io_queue_pair(); + + int make_request(struct bio* bio, u32 nsid); + void req_done(); +private: + void init_pending_bios(u32 level); + + inline u16 cid_to_row(u16 cid) { return cid / _qsize; } + inline u16 cid_to_col(u16 cid) { return cid % _qsize; } + + u16 submit_read_write_cmd(u16 cid, u32 nsid, int opc, u64 slba, u32 nlb, struct bio* bio); + u16 submit_flush_cmd(u16 cid, u32 nsid); + + sched::thread_handle _sq_full_waiter; + + // Vector of arrays of pointers to struct bio used to track bio associated + // with given command. The scheme to generate 16-bit 'cid' is - + // _sq._tail + N * qsize - where N is typically 0 and is equal + // to a row in _pending_bios and _sq._tail is equal to a column. + // Given cid, we can easily identify a pending bio by calculating + // the row - cid / _qsize and column - cid % _qsize + std::atomic<struct bio*>* _pending_bios[max_pending_levels] = {}; +}; + +// Pair of SQ and CQ queues used for setting up/configuring controller +// like creating I/O queues +class admin_queue_pair : public queue_pair { +public: + admin_queue_pair( + int driver_id, + int id, + int qsize, + pci::device& dev, + u32* sq_doorbell, + u32* cq_doorbell, + std::map<u32, nvme_ns_t*>& ns + ); + + void req_done(); + nvme_cq_entry_t submit_and_return_on_completion(nvme_sq_entry_t* cmd, void* data = nullptr, unsigned int datasize = 0); +private: + sched::thread_handle _req_waiter; + nvme_cq_entry_t _req_res; + volatile bool new_cq; +}; + +} + +#endif diff --git a/drivers/nvme-structs.h b/drivers/nvme-structs.h --- a/drivers/nvme-structs.h +++ b/drivers/nvme-structs.h @@ -0,0 +1,647 @@ +/** + * Copyright (c) 2015-2016, Micron Technology, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file + * @brief NVMe header file + */ + +#ifndef NVME_STRUCTS_H +#define NVME_STRUCTS_H + +#include <stdint.h> + +__BEGIN_DECLS + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + #pragma error "only support little endian CPU architecture" +#endif + +#ifndef _U_TYPE +#define _U_TYPE ///< bit size data types +typedef int8_t s8; ///< 8-bit signed +typedef int16_t s16; ///< 16-bit signed +typedef int32_t s32; ///< 32-bit signed +typedef int64_t s64; ///< 64-bit signed +typedef uint8_t u8; ///< 8-bit unsigned +typedef uint16_t u16; ///< 16-bit unsigned +typedef uint32_t u32; ///< 32-bit unsigned +typedef uint64_t u64; ///< 64-bit unsigned +#endif // _U_TYPE + +/// NVMe command op code +enum { + NVME_CMD_FLUSH = 0x0, ///< flush + NVME_CMD_WRITE = 0x1, ///< write + NVME_CMD_READ = 0x2, ///< read + NVME_CMD_WRITE_UNCOR = 0x4, ///< write uncorrectable + NVME_CMD_COMPARE = 0x5, ///< compare + NVME_CMD_DS_MGMT = 0x9, ///< dataset management +}; + +/// NVMe admin command op code +enum { + NVME_ACMD_DELETE_SQ = 0x0, ///< delete io submission queue + NVME_ACMD_CREATE_SQ = 0x1, ///< create io submission queue + NVME_ACMD_GET_LOG_PAGE = 0x2, ///< get log page + NVME_ACMD_DELETE_CQ = 0x4, ///< delete io completion queue + NVME_ACMD_CREATE_CQ = 0x5, ///< create io completion queue + NVME_ACMD_IDENTIFY = 0x6, ///< identify + NVME_ACMD_ABORT = 0x8, ///< abort + NVME_ACMD_SET_FEATURES = 0x9, ///< set features + NVME_ACMD_GET_FEATURES = 0xA, ///< get features + NVME_ACMD_ASYNC_EVENT = 0xC, ///< asynchronous event + NVME_ACMD_FW_ACTIVATE = 0x10, ///< firmware activate + NVME_ACMD_FW_DOWNLOAD = 0x11, ///< firmware image download +}; + +/// NVMe feature identifiers +enum { + NVME_FEATURE_ARBITRATION = 0x1, ///< arbitration + NVME_FEATURE_POWER_MGMT = 0x2, ///< power management + NVME_FEATURE_LBA_RANGE = 0x3, ///< LBA range type + NVME_FEATURE_TEMP_THRESHOLD = 0x4, ///< temperature threshold + NVME_FEATURE_ERROR_RECOVERY = 0x5, ///< error recovery + NVME_FEATURE_WRITE_CACHE 
= 0x6, ///< volatile write cache + NVME_FEATURE_NUM_QUEUES = 0x7, ///< number of queues + NVME_FEATURE_INT_COALESCING = 0x8, ///< interrupt coalescing + NVME_FEATURE_INT_VECTOR = 0x9, ///< interrupt vector config + NVME_FEATURE_WRITE_ATOMICITY = 0xA, ///< write atomicity + NVME_FEATURE_ASYNC_EVENT = 0xB, ///< async event config +}; + +/// Version +typedef union _nvme_version { + u32 val; ///< whole value + struct { + u8 rsvd; ///< reserved + u8 mnr; ///< minor version number + u16 mjr; ///< major version number + }; +} nvme_version_t; + +/// Admin queue attributes +typedef union _nvme_adminq_attr { + u32 val; ///< whole value + struct { + u16 asqs; ///< admin submission queue size + u16 acqs; ///< admin completion queue size + }; +} nvme_adminq_attr_t; + +/// Controller capabilities +typedef union _nvme_controller_cap { + u64 val; ///< whole value + struct { + u16 mqes; ///< max queue entries supported + u8 cqr : 1; ///< contiguous queues required + u8 ams : 2; ///< arbitration mechanism supported + u8 rsvd : 5; ///< reserved + u8 to; ///< timeout + + u32 dstrd : 4; ///< doorbell stride + u32 nssrs : 1; ///< NVM subsystem reset supported + u32 css : 8; ///< command set supported + u32 rsvd2 : 3; ///< reserved + u32 mpsmin : 4; ///< memory page size minimum + u32 mpsmax : 4; ///< memory page size maximum + u32 rsvd3 : 8; ///< reserved + }; +} nvme_controller_cap_t; + +/// Controller configuration register +typedef union _nvme_controller_config { + u32 val; ///< whole value + struct { + u32 en : 1; ///< enable + u32 rsvd : 3; ///< reserved + u32 css : 3; ///< I/O command set selected + u32 mps : 4; ///< memory page size + u32 ams : 3; ///< arbitration mechanism selected + u32 shn : 2; ///< shutdown notification + u32 iosqes : 4; ///< I/O submission queue entry size + u32 iocqes : 4; ///< I/O completion queue entry size + u32 rsvd2 : 8; ///< reserved + }; +} nvme_controller_config_t; + +/// Controller status register +typedef union _nvme_controller_status { + u32 val; 
///< whole value + struct { + u32 rdy : 1; ///< ready + u32 cfs : 1; ///< controller fatal status + u32 shst : 2; ///< shutdown status + u32 rsvd : 28; ///< reserved + }; +} nvme_controller_status_t; + +/// Controller memory buffer location register +typedef union _nvme_cmbloc { + u32 val; ///< whole value + struct { + u32 bir : 3; ///< base indicator register + u32 rsvd : 9; ///< reserved + u32 ofst : 20; ///< offset (in cmbsz units) + }; +} nvme_cmbloc_t; + +/// Controller memory buffer size register +typedef union _nvme_cmbsz { + u32 val; ///< whole value + struct { + u32 sqs : 1; ///< submission queue support + u32 cqs : 1; ///< completion queue support + u32 lists : 1; ///< PRP SGL list support + u32 rds : 1; ///< read data support + u32 wds : 1; ///< write data support + u32 rsvd : 3; ///< reserved + u32 szu : 4; ///< size units (0=4K,1=64K,2=1M,3=16M,4=256M,5=4G,6=64G) + u32 sz : 20; ///< size (in cmbsz units) + }; +} nvme_cmbsz_t; + + + +enum nvme_sgl_descriptor_type { + NVME_SGL_DATA_BLOCK_TYPE = 0x0, + NVME_SGL_BIT_BUCKET_TYPE = 0x1, + NVME_SGL_SEGMENT_TYPE = 0x2, + NVME_SGL_LAST_SEGMENT_TYPE = 0x3, + NVME_SGL_KEYED_DATA_BLOCK_TYPE = 0x4, + NVME_SGL_TRANSPORT_DATA_BLOCK_TYPE = 0x5, + /* + *0x6 - 0xE reserved + */ + + NVME_SGL_VENDOR_SPECIFIC_TYPE = 0xF, +}; + +enum nvme_sgl_descriptor_subtype { + NVME_SGL_ADDRESS_SUBTYPE = 0x0, + NVME_SGL_OFFSET_SUBTYPE = 0x1, + //0xA - 0xF Nvme transport specific +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_unkeyed { + u64 addr; + u32 length; + u8 reserved[3]; + u8 subtype:4; + u8 type:4; +}; + +struct __attribute__((packed)) nvme_sgl_descriptor_keyed { + u64 addr; + u64 length:24; + u64 key:32; + u64 subtype:4; + u64 type:4; +}; +union nvme_sgl_descriptor { + nvme_sgl_descriptor_keyed keyed; + nvme_sgl_descriptor_unkeyed unkeyed; +}; + +static_assert(sizeof(nvme_sgl_descriptor)==16); + + + + +/// Controller register (bar 0) +typedef struct _nvme_controller_reg { + nvme_controller_cap_t cap; ///< controller 
capabilities + nvme_version_t vs; ///< version + u32 intms; ///< interrupt mask set + u32 intmc; ///< interrupt mask clear + nvme_controller_config_t cc; ///< controller configuration + u32 rsvd; ///< reserved + nvme_controller_status_t csts; ///< controller status + u32 nssr; ///< NVM subsystem reset + nvme_adminq_attr_t aqa; ///< admin queue attributes + u64 asq; ///< admin submission queue base address + u64 acq; ///< admin completion queue base address + nvme_cmbloc_t cmbloc; ///< controller memory buffer location + nvme_cmbsz_t cmbsz; ///< controller memory buffer size + u32 rcss[1008]; ///< reserved and command set specific + u32 sq0tdbl[1024]; ///< sq0 tail doorbell at 0x1000 +} nvme_controller_reg_t; + +/// Common command header (cdw 0-9) +typedef struct _nvme_command_common { + u8 opc; ///< opcode + u8 fuse : 2; ///< fuse + u8 rsvd : 4; ///< reserved + u8 psdt : 2; ///< PRP or SGL for data transfer + u16 cid; ///< command id + u32 nsid; ///< namespace id + u64 cdw2_3; ///< reserved (cdw 2-3) + u64 mptr; ///< metadata pointer + union { + struct { + u64 prp1; ///< PRP entry 1 + u64 prp2; ///< PRP entry 2 + }; + nvme_sgl_descriptor sgl1; ///<SGL1 entry + + }; +} nvme_command_common_t; + +/// NVMe command: Read & Write +typedef struct _nvme_command_rw { + nvme_command_common_t common; ///< common cdw 0 + u64 slba; ///< starting LBA (cdw 10) + u16 nlb; ///< number of logical blocks + u16 rsvd12 : 10; ///< reserved (in cdw 12) + u16 prinfo : 4; ///< protection information field + u16 fua : 1; ///< force unit access + u16 lr : 1; ///< limited retry + u8 dsm; ///< dataset management + u8 rsvd13[3]; ///< reserved (in cdw 13) + u32 eilbrt; ///< exp initial block reference tag + u16 elbat; ///< exp logical block app tag + u16 elbatm; ///< exp logical block app tag mask +} nvme_command_rw_t; + +static_assert(sizeof(nvme_command_rw_t)==64); + +/// Admin and NVM Vendor Specific Command +typedef struct _nvme_command_vs { + nvme_command_common_t common; ///< common cdw 0 
+ union { + struct { + u32 ndt; ///< number of dwords data transfer + u32 ndm; ///< number of dwords metadata transfer + u32 cdw12_15[4]; ///< vendor specific (cdw 12-15) + }; + u32 cdw10_15[6]; ///< vendor specific (cdw 10-15) + }; +} nvme_command_vs_t; + +/// Admin command: Delete I/O Submission & Completion Queue +typedef struct _nvme_acmd_delete_ioq { + nvme_command_common_t common; ///< common cdw 0 + u16 qid; ///< queue id (cdw 10) + u16 rsvd10; ///< reserved (in cdw 10) + u32 cdw11_15[5]; ///< reserved (cdw 11-15) +} nvme_acmd_delete_ioq_t; + +/// Admin command: Create I/O Submission Queue +typedef struct _nvme_acmd_create_sq { + nvme_command_common_t common; ///< common cdw 0 + u16 qid; ///< queue id (cdw 10) + u16 qsize; ///< queue size + u16 pc : 1; ///< physically contiguous + u16 qprio : 2; ///< interrupt enabled + u16 rsvd11 : 13; ///< reserved (in cdw 11) + u16 cqid; ///< associated completion queue id + u32 cdw12_15[4]; ///< reserved (cdw 12-15) +} nvme_acmd_create_sq_t; + +/// Admin command: Get Log Page +typedef struct _nvme_acmd_get_log_page { + nvme_command_common_t common; ///< common cdw 0 + u8 lid; ///< log page id (cdw 10) + u8 rsvd10a; ///< reserved (in cdw 10) + u16 numd : 12; ///< number of dwords + u16 rsvd10b : 4; ///< reserved (in cdw 10) + u32 rsvd11[5]; ///< reserved (cdw 11-15) +} nvme_acmd_get_log_page_t; + +/// Admin command: Create I/O Completion Queue +typedef struct _nvme_acmd_create_cq { + nvme_command_common_t common; ///< common cdw 0 + u16 qid; ///< queue id (cdw 10) + u16 qsize; ///< queue size + u16 pc : 1; ///< physically contiguous + u16 ien : 1; ///< interrupt enabled + u16 rsvd11 : 14; ///< reserved (in cdw 11) + u16 iv; ///< interrupt vector + u32 cdw12_15[4]; ///< reserved (cdw 12-15) +} nvme_acmd_create_cq_t; + +/// Admin command: Identify +typedef struct _nvme_acmd_identify { + nvme_command_common_t common; ///< common cdw 0 + u32 cns; ///< controller or namespace (cdw 10) + u32 cdw11_15[5]; ///< reserved (cdw 
11-15) +} nvme_acmd_identify_t; + +/// Admin command: Abort +typedef struct _nvme_acmd_abort { + nvme_command_common_t common; ///< common cdw 0 + u16 sqid; ///< submission queue id (cdw 10) + u16 cid; ///< command id + u32 cdw11_15[5]; ///< reserved (cdw 11-15) +} nvme_acmd_abort_t; + +struct nvme_sgls { + u32 reserved:2; + u32 tdbd_supp:1; ///< Transport Data Block descriptor supported + u32 offset_supp:1; ///< Offset Subtype supported + u32 sgl_mtpt_supp:1;///<SGL descriptor in Metadata pointer supported + u32 reserved2:25; + u32 sgl_supp:2; ///<SGL Support + +}; + +static_assert(sizeof(nvme_sgls)==4); + +/// Admin data: Identify Controller Data +typedef struct _nvme_identify_ctlr { + u16 vid; ///< PCI vendor id + u16 ssvid; ///< PCI subsystem vendor id + char sn[20]; ///< serial number + char mn[40]; ///< model number + char fr[8]; ///< firmware revision + u8 rab; ///< recommended arbitration burst + u8 ieee[3]; ///< IEEE OUI identifier + u8 mic; ///< multi-interface capabilities + u8 mdts; ///< max data transfer size + u8 rsvd78[178]; ///< reserved (78-255) + u16 oacs; ///< optional admin command support + u8 acl; ///< abort command limit + u8 aerl; ///< async event request limit + u8 frmw; ///< firmware updates + u8 lpa; ///< log page attributes + u8 elpe; ///< error log page entries + u8 npss; ///< number of power states support + u8 avscc; ///< admin vendor specific config + u8 rsvd265[247]; ///< reserved (265-511) + u8 sqes; ///< submission queue entry size + u8 cqes; ///< completion queue entry size + u8 rsvd514[2]; ///< reserved (514-515) + u32 nn; ///< number of namespaces + u16 oncs; ///< optional NVM command support + u16 fuses; ///< fused operation support + u8 fna; ///< format NVM attributes + u8 vwc; ///< volatile write cache + u16 awun; ///< atomic write unit normal + u16 awupf; ///< atomic write unit power fail + u8 nvscc; ///< NVM vendor specific config + u8 rsvd531[5]; ///< reserved (531-535) + nvme_sgls sgls; ///< SGL support + u8 
rsvd540[164]; ///< reserved (540-703) + u8 rsvd704[1344]; ///< reserved (704-2047) + u8 psd[1024]; ///< power state 0-31 descriptors + u8 vs[1024]; ///< vendor specific +} nvme_identify_ctlr_t; + +/// Admin data: Identify Namespace - LBA Format Data +typedef struct _nvme_lba_format { + u16 ms; ///< metadata size + u8 lbads; ///< LBA data size + u8 rp : 2; ///< relative performance + u8 rsvd : 6; ///< reserved +} nvme_lba_format_t; + +/// Admin data: Identify Namespace Data +typedef struct _nvme_identify_ns { + u64 nsze; ///< namespace size + u64 ncap; ///< namespace capacity + u64 nuse; ///< namespace utilization + u8 nsfeat; ///< namespace features + u8 nlbaf; ///< number of LBA formats + u8 flbas; ///< formatted LBA size + u8 mc; ///< metadata capabilities + u8 dpc; ///< data protection capabilities + u8 dps; ///< data protection settings + u8 rsvd30[98]; ///< reserved (30-127) + nvme_lba_format_t lbaf[16]; ///< lba format support + u8 rsvd192[192]; ///< reserved (383-192) + u8 vs[3712]; ///< vendor specific +} nvme_identify_ns_t; + +/// Admin data: Get Log Page - Error Information +typedef struct _nvme_log_page_error { + u64 count; ///< error count + u16 sqid; ///< submission queue id + u16 cid; ///< command id + u16 sf; ///< status field + u8 byte; ///< parameter byte error location + u8 bit: 3; ///< parameter bit error location + u8 rsvd : 5; ///< reserved + u64 lba; ///< logical block address + u32 ns; ///< name space + u8 vspec; ///< vendor specific infomation + u8 rsvd29[35]; ///< reserved (29-63) +} nvme_log_page_error_t; + +/// Admin data: Get Log Page - SMART / Health Information +typedef struct _nvme_log_page_health { + u8 warn; ///< critical warning + u16 temp; ///< temperature + u8 avspare; ///< available spare + u8 avsparethresh; ///< available spare threshold + u8 used; ///< percentage used + u8 rsvd6[26]; ///< reserved (6-31) + u64 dur[2]; ///< data units read + u64 duw[2]; ///< data units written + u64 hrc[2]; ///< number of host read commands + 
u64 hwc[2]; ///< number of host write commands + u64 cbt[2]; ///< controller busy time + u64 pcycles[2]; ///< number of power cycles + u64 phours[2]; ///< power on hours + u64 unsafeshut[2]; ///< unsafe shutdowns + u64 merrors[2]; ///< media errors + u64 errlogs[2]; ///< number of error log entries + u64 rsvd192[320]; ///< reserved (192-511) +} nvme_log_page_health_t; + +/// Admin data: Get Log Page - Firmware Slot Information +typedef struct _nvme_log_page_fw { + u8 afi; ///< active firmware info + u8 rsvd1[7]; ///< reserved (1-7) + u64 fr[7]; ///< firmware revision for slot 1-7 + u8 rsvd64[448]; ///< reserved (64-511) +} nvme_log_page_fw_t; + +/// Admin feature: Arbitration +typedef struct _nvme_feature_arbitration { + u8 ab: 3; ///< arbitration burst + u8 rsvd: 5; ///< reserved + u8 lpw; ///< low priority weight + u8 mpw; ///< medium priority weight + u8 hpw; ///< high priority weight +} nvme_feature_arbitration_t; + +/// Admin feature: Power Management +typedef struct _nvme_feature_power_mgmt { + u32 ps: 5; ///< power state + u32 rsvd: 27; ///< reserved +} nvme_feature_power_mgmt_t; + +/// Admin feature: LBA Range Type Data +typedef struct _nvme_feature_lba_data { + struct { + u8 type; ///< type + u8 attributes; ///< attributes + u8 rsvd[14]; ///< reserved + u64 slba; ///< starting LBA + u64 nlb; ///< number of logical blocks + u8 guid[16]; ///< unique id + u8 rsvd48[16]; ///< reserved + } entry[64]; ///< LBA data entry +} nvme_feature_lba_data_t; + +/// Admin feature: LBA Range Type +typedef struct _nvme_feature_lba_range { + u32 num: 6; ///< number of LBA ranges + u32 rsvd: 26; ///< reserved +} nvme_feature_lba_range_t; + +/// Admin feature: Temperature Threshold +typedef struct _nvme_feature_temp_threshold { + u16 tmpth; ///< temperature threshold + u16 rsvd; ///< reserved +} nvme_feature_temp_threshold_t; + +/// Admin feature: Error Recovery +typedef struct _nvme_feature_error_recovery { + u16 tler; ///< time limited error recovery + u16 rsvd; ///< reserved 
+} nvme_feature_error_recovery_t; + +/// Admin feature: Volatile Write Cache +typedef struct _nvme_feature_write_cache { + u32 wce: 1; ///< volatile write cache + u32 rsvd: 31; ///< reserved +} nvme_feature_write_cache_t; + +/// Admin feature: Number of Queues +typedef struct _nvme_feature_num_queues { + u16 nsq; ///< numer of submission queues + u16 ncq; ///< numer of completion queues +} nvme_feature_num_queues_t; + +/// Admin feature: Interrupt Coalescing +typedef struct _nvme_feature_int_coalescing { + u8 thr; ///< aggregation threshold + u8 time; ///< aggregation time + u16 rsvd; ///< reserved +} nvme_feature_int_coalescing_t; + +/// Admin feature: Interrupt Vector Configuration +typedef struct _nvme_feature_int_vector { + u16 iv; ///< interrupt vector + u16 cd: 1; ///< coalescing disable + u16 rsvd: 15; ///< reserved +} nvme_feature_int_vector_t; + +/// Admin feature: Write Atomicity +typedef struct _nvme_feature_write_atomicity { + u32 dn: 1; ///< disable normal + u32 rsvd: 31; ///< reserved +} nvme_feature_write_atomicity_t; + +/// Admin feature: Async Event Configuration +typedef struct _nvme_feature_async_event { + u8 smart; ///< SMART / health critical warnings + u8 rsvd[3]; ///< reserved +} nvme_feature_async_event_t; + +/// Admin command: Get Feature +typedef struct _nvme_acmd_get_features { + nvme_command_common_t common; ///< common cdw 0 + u8 fid; ///< feature id (cdw 10:0-7) + u8 rsvd10[3]; ///< reserved (cdw 10:8-31) +} nvme_acmd_get_features_t; + +/// Admin command: Set Feature +typedef struct _nvme_acmd_set_features { + nvme_command_common_t common; ///< common cdw 0 + u8 fid; ///< feature id (cdw 10:0-7) + u8 rsvd10[3]; ///< reserved (cdw 10:8-31) + u32 val; ///< cdw 11 +} nvme_acmd_set_features_t; + +/// Submission queue entry +typedef union _nvme_sq_entry { + nvme_command_rw_t rw; ///< read/write command + nvme_command_vs_t vs; ///< admin and vendor specific command + + nvme_acmd_abort_t abort; ///< admin abort command + nvme_acmd_create_cq_t 
create_cq; ///< admin create IO completion queue + nvme_acmd_create_sq_t create_sq; ///< admin create IO submission queue + nvme_acmd_delete_ioq_t delete_ioq; ///< admin delete IO queue + nvme_acmd_identify_t identify; ///< admin identify command + nvme_acmd_get_log_page_t get_log_page; ///< get log page command + nvme_acmd_get_features_t get_features; ///< get feature + nvme_acmd_set_features_t set_features; ///< set feature +} nvme_sq_entry_t; + +/// Completion queue entry +typedef struct _nvme_cq_entry { + u32 cs; ///< command specific + u32 rsvd; ///< reserved + u16 sqhd; ///< submission queue head + u16 sqid; ///< submission queue id + u16 cid; ///< command id + union { + u16 psf; ///< phase bit and status field + struct { + u16 p : 1; ///< phase tag id + u16 sc : 8; ///< status code + u16 sct : 3; ///< status code type + u16 rsvd3 : 2; ///< reserved + u16 m : 1; ///< more + u16 dnr : 1; ///< do not retry + }; + }; +} nvme_cq_entry_t; + +typedef union _nvme_psf { + u16 psf; ///< phase bit and status field + struct { + u16 p : 1; ///< phase tag id + u16 sc : 8; ///< status code + u16 sct : 3; ///< status code type + u16 rsvd3 : 2; ///< reserved + u16 m : 1; ///< more + u16 dnr : 1; ///< do not retry + }; +} nvme_psf_t; + +struct _nvme_device; + +/// Namespace attributes structure +typedef struct _nvme_ns { + u32 id; ///< namespace id + u64 blockcount; ///< total number of available blocks + u16 blocksize; ///< logical block size + u16 blockshift; ///< block size shift value + u16 bpshift; ///< block to page shift +} nvme_ns_t; + +__END_DECLS + +#endif // NVME_STRUCTS_H + diff --git a/drivers/nvme.cc b/drivers/nvme.cc --- a/drivers/nvme.cc +++ b/drivers/nvme.cc @@ -0,0 +1,570 @@ +/* + * Copyright (C) 2023 Jan Braunwarth + * Copyright (C) 2024 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. 
+ */ + +#include <sys/cdefs.h> + +#include "drivers/nvme.hh" +#include "drivers/pci-device.hh" +#include <osv/interrupt.hh> + +#include <cassert> +#include <sstream> +#include <string> +#include <string.h> +#include <map> +#include <errno.h> +#include <osv/debug.h> + +#include <osv/sched.hh> +#include <osv/trace.hh> +#include <osv/aligned_new.hh> + +#include <osv/device.h> +#include <osv/bio.h> +#include <osv/ioctl.h> +#include <osv/contiguous_alloc.hh> +#include <osv/aligned_new.hh> + +using namespace memory; + +#include <sys/mman.h> +#include <sys/refcount.h> + +#include <osv/drivers_config.h> + +TRACEPOINT(trace_nvme_strategy, "bio=%p, bcount=%lu", struct bio*, size_t); + +#define QEMU_VID 0x1b36 + +namespace nvme { + +int driver::_disk_idx = 0; +int driver::_instance = 0; + +struct nvme_priv { + devop_strategy_t strategy; + driver* drv; + u32 nsid; +}; + +static void nvme_strategy(struct bio* bio) { + auto* prv = reinterpret_cast<struct nvme_priv*>(bio->bio_dev->private_data); + trace_nvme_strategy(bio, bio->bio_bcount); + prv->drv->make_request(bio); +} + +static int +nvme_read(struct device *dev, struct uio *uio, int io_flags) +{ + return bdev_read(dev, uio, io_flags); +} + +static int +nvme_write(struct device *dev, struct uio *uio, int io_flags) +{ + return bdev_write(dev, uio, io_flags); +} + +static int +nvme_open(struct device *dev, int ioflags) +{ + return 0; +} + +static struct devops nvme_devops { + nvme_open, + no_close, + nvme_read, + nvme_write, + no_ioctl, + no_devctl, + multiplex_strategy, +}; + +struct ::driver _driver = { + "nvme", + &nvme_devops, + sizeof(struct nvme_priv), +}; + +static void setup_features_cmd(nvme_sq_entry_t* cmd, u8 feature_id, u32 val) +{ + memset(cmd, 0, sizeof(nvme_sq_entry_t)); + cmd->set_features.common.opc = NVME_ACMD_SET_FEATURES; + cmd->set_features.fid = feature_id; + cmd->set_features.val = val; +} + +enum CMD_IDENTIFY_CNS { + CMD_IDENTIFY_NAMESPACE = 0, + CMD_IDENTIFY_CONTROLLER = 1, +}; + +#define 
NVME_NAMESPACE_DEFAULT_NS 1 + +static void setup_identify_cmd(nvme_sq_entry_t* cmd, u32 namespace_id, u32 cns) +{ + memset(cmd, 0, sizeof(nvme_sq_entry_t)); + cmd->identify.common.opc = NVME_ACMD_IDENTIFY; + cmd->identify.common.nsid = namespace_id; + cmd->identify.cns = cns; +} + +driver::driver(pci::device &pci_dev) + : _dev(pci_dev) + , _msi(&pci_dev) +{ + auto parse_ok = parse_pci_config(); + assert(parse_ok); + + enable_msix(); + + _id = _instance++; + + _doorbell_stride = 1 << (2 + _control_reg->cap.dstrd); + + //Wait for controller to become ready + assert(wait_for_controller_ready_change(1) == 0); + + //Disable controller + assert(enable_disable_controller(false) == 0); + + init_controller_config(); + + create_admin_queue(); + + //Enable controller + assert(enable_disable_controller(true) == 0); + + assert(identify_controller() == 0); + + assert(identify_namespace(NVME_NAMESPACE_DEFAULT_NS) == 0); + + //Enable write cache if available + if (_identify_controller->vwc & 0x1 && NVME_VWC_ENABLED) { + enable_write_cache(); + } + + //Create IO queues + create_io_queues(); + + if (_identify_controller->vid != QEMU_VID) { + set_interrupt_coalescing(20, 2); + } + + std::string dev_name("vblk"); + dev_name += std::to_string(_disk_idx++); + + struct device* dev = device_create(&_driver, dev_name.c_str(), D_BLK); + struct nvme_priv* prv = reinterpret_cast<struct nvme_priv*>(dev->private_data); + + unsigned int nsid = NVME_NAMESPACE_DEFAULT_NS; + const auto& ns = _ns_data[nsid]; + off_t size = ((off_t) ns->blockcount) << ns->blockshift; + + prv->strategy = nvme_strategy; + prv->drv = this; + prv->nsid = nsid; + dev->size = size; + //IO size greater than 4096 << 9 would mean we need + //more than 1 page for the prplist which is not implemented + dev->max_io_size = mmu::page_size << ((9 < _identify_controller->mdts)? 
9 : _identify_controller->mdts); + + read_partition_table(dev); + + debugf("nvme: Add device instances %d as %s, devsize=%lld, serial number:%s\n", + _id, dev_name.c_str(), dev->size, _identify_controller->sn); +} + +int driver::set_number_of_queues(u16 num, u16* ret) +{ + nvme_sq_entry_t cmd; + setup_features_cmd(&cmd, NVME_FEATURE_NUM_QUEUES, (num << 16) | num); + auto res = _admin_queue->submit_and_return_on_completion(&cmd); + + u16 cq_num = res.cs >> 16; + u16 sq_num = res.cs & 0xffff; + + nvme_d("Queues supported: CQ num=%d, SQ num=%d, MSI/X entries=%d", + res.cs >> 16, res.cs & 0xffff, _dev.msix_get_num_entries()); + + if (res.sct != 0 || res.sc != 0) + return EIO; + + if (num > cq_num || num > sq_num) { + *ret = (cq_num > sq_num) ? cq_num : sq_num; + } else { + *ret = num; + } + return 0; +} + +int driver::set_interrupt_coalescing(u8 threshold, u8 time) +{ + nvme_sq_entry_t cmd; + setup_features_cmd(&cmd, NVME_FEATURE_INT_COALESCING, threshold | (time << 8)); + auto res = _admin_queue->submit_and_return_on_completion(&cmd); + + if (res.sct != 0 || res.sc != 0) { + nvme_e("Failed to enable interrupt coalescing: sc=%#x sct=%#x", res.sc, res.sct); + return EIO; + } else { + nvme_i("Enabled interrupt coalescing"); + return 0; + } +} + +void driver::enable_write_cache() +{ + nvme_sq_entry_t cmd; + setup_features_cmd(&cmd, NVME_FEATURE_WRITE_CACHE, 1); + auto res = _admin_queue->submit_and_return_on_completion(&cmd); + if (res.sct != 0 || res.sc != 0) { + nvme_e("Failed to enable write cache: sc=%#x sct=%#x", res.sc, res.sct); + } else { + nvme_i("Enabled write cache"); + } +} + +void driver::create_io_queues() +{ + u16 ret; + if (NVME_QUEUE_PER_CPU_ENABLED) { + set_number_of_queues(sched::cpus.size(), &ret); + } else { + set_number_of_queues(1, &ret); + } + assert(ret >= 1); + + int qsize = (NVME_IO_QUEUE_SIZE < _control_reg->cap.mqes) ? 
NVME_IO_QUEUE_SIZE : _control_reg->cap.mqes + 1; + if (NVME_QUEUE_PER_CPU_ENABLED) { + for(sched::cpu* cpu : sched::cpus) { + int qid = cpu->id + 1; + create_io_queue(qid, qsize, cpu); + } + } else { + create_io_queue(1, qsize); + } +} + +enum NVME_CONTROLLER_EN { + CTRL_EN_DISABLE = 0, + CTRL_EN_ENABLE = 1, +}; + +int driver::enable_disable_controller(bool enable) +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc); + + u32 expected_en = enable ? CTRL_EN_DISABLE : CTRL_EN_ENABLE; + u32 new_en = enable ? CTRL_EN_ENABLE : CTRL_EN_DISABLE; + + assert(cc.en == expected_en); //check current status + cc.en = new_en; + + mmio_setl(&_control_reg->cc, cc.val); + return wait_for_controller_ready_change(new_en); +} + +int driver::wait_for_controller_ready_change(int ready) +{ + int timeout = mmio_getb(&_control_reg->cap.to) * 10000; // timeout in 0.05ms steps + nvme_controller_status_t csts; + for (int i = 0; i < timeout; i++) { + csts.val = mmio_getl(&_control_reg->csts); + if (csts.rdy == ready) return 0; + usleep(50); + } + NVME_ERROR("timeout=%d waiting for ready %d", timeout, ready); + return ETIME; +} + +#define NVME_CTRL_CONFIG_IO_CQ_ENTRY_SIZE_16_BYTES 4 +#define NVME_CTRL_CONFIG_IO_SQ_ENTRY_SIZE_64_BYTES 6 +#define NVME_CTRL_CONFIG_PAGE_SIZE_4K 0 + +void driver::init_controller_config() +{ + nvme_controller_config_t cc; + cc.val = mmio_getl(&_control_reg->cc.val); + cc.iocqes = NVME_CTRL_CONFIG_IO_CQ_ENTRY_SIZE_16_BYTES; + cc.iosqes = NVME_CTRL_CONFIG_IO_SQ_ENTRY_SIZE_64_BYTES; + cc.mps = NVME_CTRL_CONFIG_PAGE_SIZE_4K; + + mmio_setl(&_control_reg->cc, cc.val); +} + +void driver::create_admin_queue() +{ + u32* sq_doorbell = _control_reg->sq0tdbl; + u32* cq_doorbell = (u32*) ((u64)sq_doorbell + _doorbell_stride); + + int qsize = NVME_ADMIN_QUEUE_SIZE; + _admin_queue = std::unique_ptr<admin_queue_pair>( + aligned_new<admin_queue_pair>(_id, 0, qsize, _dev, sq_doorbell, cq_doorbell, _ns_data)); + + register_admin_interrupt(); + + nvme_adminq_attr_t 
aqa; + aqa.val = 0; + aqa.asqs = aqa.acqs = qsize - 1; + + mmio_setl(&_control_reg->aqa, aqa.val); + mmio_setq(&_control_reg->asq, _admin_queue->sq_phys_addr()); + mmio_setq(&_control_reg->acq, _admin_queue->cq_phys_addr()); +} + +template<typename Q> +void setup_create_io_queue_cmd(Q* create_queue_cmd, int qid, int qsize, u8 command_opcode, u64 queue_addr) +{ + assert(create_queue_cmd); + memset(create_queue_cmd, 0, sizeof (*create_queue_cmd)); + + create_queue_cmd->common.opc = command_opcode; + create_queue_cmd->common.prp1 = queue_addr; + create_queue_cmd->qid = qid; + create_queue_cmd->qsize = qsize - 1; + create_queue_cmd->pc = 1; +} + +int driver::create_io_queue(int qid, int qsize, sched::cpu* cpu, int qprio) +{ + int iv = qid; + + u32* sq_doorbell = (u32*) ((u64) _control_reg->sq0tdbl + 2 * _doorbell_stride * qid); + u32* cq_doorbell = (u32*) ((u64) sq_doorbell + _doorbell_stride); + + // create queue pair with allocated SQ and CQ ring buffers + auto queue = std::unique_ptr<io_queue_pair>( + aligned_new<io_queue_pair>(_id, iv, qsize, _dev, sq_doorbell, cq_doorbell, _ns_data)); + + // create completion queue command + nvme_acmd_create_cq_t cmd_cq; + setup_create_io_queue_cmd<nvme_acmd_create_cq_t>( + &cmd_cq, qid, qsize, NVME_ACMD_CREATE_CQ, queue->cq_phys_addr()); + + cmd_cq.iv = iv; + cmd_cq.ien = 1; + + // create submission queue command + nvme_acmd_create_sq_t cmd_sq; + setup_create_io_queue_cmd<nvme_acmd_create_sq_t>( + &cmd_sq, qid, qsize, NVME_ACMD_CREATE_SQ, queue->sq_phys_addr()); + + cmd_sq.qprio = qprio; + cmd_sq.cqid = qid; + + _io_queues.push_back(std::move(queue)); + + register_io_interrupt(iv, qid - 1, cpu); + + //According to the NVMe spec, the completion queue (CQ) needs to be created before the submission queue (SQ) + _admin_queue->submit_and_return_on_completion((nvme_sq_entry_t*)&cmd_cq); + _admin_queue->submit_and_return_on_completion((nvme_sq_entry_t*)&cmd_sq); + + debugf("nvme: Created I/O queue pair for qid:%d with size:%d\n", qid, 
qsize); + + return 0; +} + +int driver::identify_controller() +{ + assert(_admin_queue); + nvme_sq_entry_t cmd; + setup_identify_cmd(&cmd, 0, CMD_IDENTIFY_CONTROLLER); + auto data = new nvme_identify_ctlr_t; + auto res = _admin_queue->submit_and_return_on_completion(&cmd, (void*) mmu::virt_to_phys(data), mmu::page_size); + + if (res.sc != 0 || res.sct != 0) { + NVME_ERROR("Identify controller failed nvme%d, sct=%d, sc=%d", _id, res.sct, res.sc); + return EIO; + } + + _identify_controller.reset(data); + return 0; +} + +int driver::identify_namespace(u32 nsid) +{ + assert(_admin_queue); + nvme_sq_entry_t cmd; + setup_identify_cmd(&cmd, nsid, CMD_IDENTIFY_NAMESPACE); + auto data = std::unique_ptr<nvme_identify_ns_t>(new nvme_identify_ns_t); + auto res = _admin_queue->submit_and_return_on_completion(&cmd, (void*) mmu::virt_to_phys(data.get()), mmu::page_size); + if (res.sc != 0 || res.sct != 0) { + NVME_ERROR("Identify namespace failed nvme%d nsid=%d, sct=%d, sc=%d", _id, nsid, res.sct, res.sc); + return EIO; + } + + _ns_data.insert(std::make_pair(nsid, new nvme_ns_t)); + _ns_data[nsid]->blockcount = data->ncap; + _ns_data[nsid]->blockshift = data->lbaf[data->flbas & 0xF].lbads; + _ns_data[nsid]->blocksize = 1 << _ns_data[nsid]->blockshift; + _ns_data[nsid]->bpshift = NVME_PAGESHIFT - _ns_data[nsid]->blockshift; + _ns_data[nsid]->id = nsid; + + nvme_i("Identified namespace with nsid=%d, blockcount=%d, blocksize=%d", + nsid, _ns_data[nsid]->blockcount, _ns_data[nsid]->blocksize); + return 0; +} + +int driver::make_request(bio* bio, u32 nsid) +{ + if (bio->bio_bcount % _ns_data[nsid]->blocksize || bio->bio_offset % _ns_data[nsid]->blocksize) { + NVME_ERROR("bio request not block-aligned length=%d, offset=%d blocksize=%d\n",bio->bio_bcount, bio->bio_offset, _ns_data[nsid]->blocksize); + return EINVAL; + } + bio->bio_offset = bio->bio_offset >> _ns_data[nsid]->blockshift; + bio->bio_bcount = bio->bio_bcount >> _ns_data[nsid]->blockshift; + + assert((bio->bio_offset + 
bio->bio_bcount) <= _ns_data[nsid]->blockcount); + + if (bio->bio_cmd == BIO_FLUSH && (_identify_controller->vwc == 0 || !NVME_VWC_ENABLED )) { + biodone(bio, true); + return 0; + } + + unsigned int qidx = sched::current_cpu->id % _io_queues.size(); + return _io_queues[qidx]->make_request(bio, nsid); +} + +void driver::register_admin_interrupt() +{ + sched::thread* aq_thread = sched::thread::make([this] { this->_admin_queue->req_done(); }, + sched::thread::attr().name("nvme" + std::to_string(_id) + "_aq_req_done")); + aq_thread->start(); + + assert(msix_register(0, [this] { this->_admin_queue->disable_interrupts(); }, aq_thread)); +} + +void driver::enable_msix() +{ + _dev.set_bus_master(true); + _dev.msix_enable(); + assert(_dev.is_msix()); + + unsigned int vectors_num = 1; //at least for admin + if (NVME_QUEUE_PER_CPU_ENABLED) { + vectors_num += sched::cpus.size(); + } else { + vectors_num += 1; + } + + assert(vectors_num <= _dev.msix_get_num_entries()); + _msix_vectors = std::vector<std::unique_ptr<msix_vector>>(vectors_num); +} + +bool driver::msix_register(unsigned iv, + // high priority ISR + std::function<void ()> isr, + // bottom half + sched::thread *t, + bool assign_affinity) +{ + //Mask all interrupts... + _dev.msix_mask_all(); + _dev.msix_mask_entry(iv); + + auto vec = std::unique_ptr<msix_vector>(new msix_vector(&_dev)); + _msi.assign_isr(vec.get(), + [=]() mutable { + isr(); + t->wake_with_irq_disabled(); + }); + + if (!_msi.setup_entry(iv, vec.get())) { + return false; + } + + if (assign_affinity && t) { + vec->set_affinity(t->get_cpu()->arch.apic_id); + } + + if (iv < _msix_vectors.size()) { + _msix_vectors[iv] = std::move(vec); + } else { + NVME_ERROR("binding_entry %d registration failed\n",iv); + return false; + } + _msix_vectors[iv]->msix_unmask_entries(); + + _dev.msix_unmask_all(); + _dev.msix_unmask_entry(iv); + return true; +} + +//qid should be the index that corresponds to the queue in _io_queues. 
+//In general qid = iv - 1 +bool driver::register_io_interrupt(unsigned int iv, unsigned int qid, sched::cpu* cpu) +{ + sched::thread* t; + bool ok; + + if (_io_queues.size() <= qid) { + NVME_ERROR("queue %d not initialized\n",qid); + return false; + } + + if (_io_queues[qid]->_id != iv) + nvme_w("Queue %d ->_id = %d != iv %d\n", qid, _io_queues[qid]->_id, iv); + + t = sched::thread::make([this,qid] { this->_io_queues[qid]->req_done(); }, + sched::thread::attr().name("nvme" + std::to_string(_id) + "_ioq" + std::to_string(qid) + "_iv" + std::to_string(iv))); + t->start(); + + // If cpu specified, let us pin the worker thread to this cpu + bool pin = cpu != nullptr; + if (pin) { + sched::thread::pin(t, cpu); + } + + ok = msix_register(iv, [this,qid] { this->_io_queues[qid]->disable_interrupts(); }, t, pin); + if (not ok) + NVME_ERROR("Interrupt registration failed: queue=%d interruptvector=%d\n", qid, iv); + return ok; +} + +void driver::dump_config(void) +{ + u8 B, D, F; + _dev.get_bdf(B, D, F); + + _dev.dump_config(); + nvme_d("%s [%x:%x.%x] vid:id= %x:%x", get_name().c_str(), + (u16)B, (u16)D, (u16)F, + _dev.get_vendor_id(), + _dev.get_device_id()); +} + +bool driver::parse_pci_config() +{ + _bar0 = _dev.get_bar(1); + if (_bar0 == nullptr) { + return false; + } + _bar0->map(); + if (!_bar0->is_mapped()) { + return false; + } + _control_reg = (nvme_controller_reg_t*) _bar0->get_mmio(); + return true; +} + +hw_driver* driver::probe(hw_device* dev) +{ + if (auto pci_dev = dynamic_cast<pci::device*>(dev)) { + if ((pci_dev->get_base_class_code() == pci::function::PCI_CLASS_STORAGE) && + (pci_dev->get_sub_class_code() == pci::function::PCI_SUB_CLASS_STORAGE_NVMC) && + (pci_dev->get_programming_interface() == 2))// detect NVMe device + return aligned_new<driver>(*pci_dev); + } + return nullptr; +} + +} diff --git a/drivers/nvme.hh b/drivers/nvme.hh --- a/drivers/nvme.hh +++ b/drivers/nvme.hh @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2023 Jan Braunwarth + * Copyright (C) 
2024 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#ifndef NVME_DRIVER_H +#define NVME_DRIVER_H + +#include "drivers/nvme-structs.h" +#include "drivers/driver.hh" +#include "drivers/pci-device.hh" +#include <osv/mempool.hh> +#include <osv/interrupt.hh> +#include <osv/msi.hh> +#include "drivers/nvme-queue.hh" +#include <vector> +#include <memory> +#include <map> + +#define NVME_QUEUE_PER_CPU_ENABLED 1 + +//Volatile Write Cache +#define NVME_VWC_ENABLED 1 + +#define NVME_ADMIN_QUEUE_SIZE 8 + +//Will be lower if the device doesnt support the specified queue size +#define NVME_IO_QUEUE_SIZE 64 + +namespace nvme { + +enum NVME_IO_QUEUE_PRIORITY { + NVME_IO_QUEUE_PRIORITY_URGENT = 0, + NVME_IO_QUEUE_PRIORITY_HIGH = 1, + NVME_IO_QUEUE_PRIORITY_MEDIUM = 2, + NVME_IO_QUEUE_PRIORITY_LOW = 3, +}; + +class driver : public hw_driver { +public: + explicit driver(pci::device& dev); + virtual ~driver() {}; + + virtual std::string get_name() const { return "nvme"; } + + virtual void dump_config(); + + int make_request(struct bio* bio, u32 nsid = 1); + static hw_driver* probe(hw_device* dev); + + std::map<u32, nvme_ns_t*> _ns_data; + +private: + int identify_controller(); + int identify_namespace(u32 ns); + + void create_admin_queue(); + void register_admin_interrupt(); + + void create_io_queues(); + int create_io_queue(int qid, int qsize = NVME_IO_QUEUE_SIZE, + sched::cpu* cpu = nullptr, int qprio = NVME_IO_QUEUE_PRIORITY_HIGH); + bool register_io_interrupt(unsigned int iv, unsigned int qid, + sched::cpu* cpu = nullptr); + + void init_controller_config(); + + int enable_disable_controller(bool enable); + int wait_for_controller_ready_change(int ready); + + int set_number_of_queues(u16 num, u16* ret); + int set_interrupt_coalescing(u8 threshold, u8 time); + + bool parse_pci_config(); + void enable_msix(); + + void enable_write_cache(); + + bool 
msix_register(unsigned iv, + // high priority ISR + std::function<void ()> isr, + // bottom half + sched::thread *t, + // set affinity of the vector to the cpu running t + bool assign_affinity = false); + + //Maintains the nvme instance number for multiple adapters + static int _instance; + int _id; + + //Disk index number + static int _disk_idx; + + std::vector<std::unique_ptr<msix_vector>> _msix_vectors; + + std::unique_ptr<admin_queue_pair> _admin_queue; + + std::vector<std::unique_ptr<io_queue_pair>> _io_queues; + u32 _doorbell_stride; + + std::unique_ptr<nvme_identify_ctlr_t> _identify_controller; + nvme_controller_reg_t* _control_reg = nullptr; + + pci::device& _dev; + interrupt_manager _msi; + + pci::bar *_bar0 = nullptr; +}; + +} +#endif diff --git a/scripts/run.py b/scripts/run.py --- a/scripts/run.py +++ b/scripts/run.py @@ -172,6 +172,10 @@ def start_osv_qemu(options): "-device", "virtio-scsi-pci,id=scsi0%s" % options.virtio_device_suffix, "-drive", "file=%s,if=none,id=hd0,media=disk,%s" % (options.image_file, aio), "-device", "scsi-hd,bus=scsi0.0,drive=hd0,scsi-id=1,lun=0%s" % boot_index] + elif options.nvme: + args += [ + "-device", "nvme,serial=deadbeef,drive=nvm%s" % (boot_index), + "-drive", "file=%s,if=none,id=nvm,%s" % (options.image_file, aio)] elif options.ide: args += [ "-hda", options.image_file] @@ -198,6 +202,15 @@ def start_osv_qemu(options): "-object", "memory-backend-file,id=mem,size=%s,mem-path=/dev/shm,share=on" % options.memsize, "-numa", "node,memdev=mem"] + if options.second_nvme_image: + args += [ + "-drive", "file=%s,if=none,id=nvm1" % (options.second_nvme_image), + "-device", "nvme,serial=deadbeef,drive=nvm1,"] + + if options.pass_pci: + args += [ + "-device", "vfio-pci,host=%s" % (options.pass_pci)] + if options.no_shutdown: args += ["-no-reboot", "-no-shutdown"] @@ -532,6 +545,8 @@ def main(options): help="don't start OSv till otherwise specified, e.g. 
through the QEMU monitor or a remote gdb") parser.add_argument("-i", "--image", action="store", default=None, metavar="IMAGE", help="path to disk image file. defaults to build/$mode/usr.img") + parser.add_argument("-N", "--nvme",action="store_true", default=False, + help="use NVMe instead of virtio-blk") parser.add_argument("-S", "--scsi", action="store_true", default=False, help="use virtio-scsi instead of virtio-blk") parser.add_argument("-A", "--sata", action="store_true", default=False, @@ -626,6 +641,10 @@ def main(options): help="static ip addresses (forwarded to respective kernel command line option)") parser.add_argument("--bootchart", action="store_true", help="bootchart mode (forwarded to respective kernel command line option") + parser.add_argument("--second-nvme-image", action="store", + help="Path to an optional disk image that should be attached to the instance as NVMe device") + parser.add_argument("--pass-pci", action="store", + help="passthrough a pci device in given slot if bound to vfio driver") cmdargs = parser.parse_args() cmdargs.opt_path = "debug" if cmdargs.debug else "release" if cmdargs.release else "last" -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/000000000000502767061b82cc4e%40google.com.
