Re: [PATCH] hw/nvme: fix mo field in io mgnt send
On May 8 09:36, Vincent Fu wrote: > On 5/7/24 10:05, Vincent Fu wrote: > > On 5/6/24 04:06, Klaus Jensen wrote: > > > The Management Operation field of I/O Management Send is only 8 bits, > > > not 16. > > > > > > Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") > > > Signed-off-by: Klaus Jensen > > > --- > > > hw/nvme/ctrl.c | 2 +- > > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > > index 9e7bbebc8bb0..ede5f281dd7c 100644 > > > --- a/hw/nvme/ctrl.c > > > +++ b/hw/nvme/ctrl.c > > > @@ -4387,7 +4387,7 @@ static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, > > > NvmeRequest *req) > > > { > > > NvmeCmd *cmd = >cmd; > > > uint32_t cdw10 = le32_to_cpu(cmd->cdw10); > > > - uint8_t mo = (cdw10 & 0xff); > > > + uint8_t mo = cdw10 & 0xf; > > > switch (mo) { > > > case NVME_IOMS_MO_NOP: > > > > > > --- > > > base-commit: 84b0eb1826f690aa8d51984644318ee6c810f5bf > > > change-id: 20240506-fix-ioms-mo-97098c6c5396 > > > > > > Best regards, > > > > Reviewed-by: Vincent Fu > > Klaus, upon taking a second look, the original code is correct. The proposed > change would only keep the least significant 4 bits of the MO field. The > original code gives you the 8 bits needed. > > Let me withdraw my Reviewed-by. > > Vincent That was embarrasing. Thanks for catching that Vincent :) signature.asc Description: PGP signature
Re: [PATCH v3 10/11] hw/nvme: add reservation protocal command
tpl, > + ignore_key, nvme_misc_cb, > + req); > +break; > +case NVME_RESV_REGISTER_ACTION_UNREGISTER: > +req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, 0, > + 0, aptpl, ignore_key, > + nvme_misc_cb, req); > +break; > +case NVME_RESV_REGISTER_ACTION_REPLACE: > +req->aiocb = blk_aio_pr_register(ns->blkconf.blk, key_info.cr_key, > + key_info.nr_key, 0, aptpl, > ignore_key, > + nvme_misc_cb, req); > +break; > +default: > +return NVME_INVALID_FIELD; > +} > + > +return NVME_NO_COMPLETE; > +} > + > +static uint16_t nvme_resv_release(NvmeCtrl *n, NvmeRequest *req) > +{ > +int ret; > +uint64_t cr_key; > +NvmeNamespace *ns = req->ns; > +uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); > +uint8_t action = cdw10 & 0x7; > +NVMEResvType type = cdw10 >> 8 & 0xff; > + > +ret = nvme_h2c(n, (uint8_t *)_key, sizeof(cr_key), req); > +if (ret) { > +return ret; > +} > + > +switch (action) { > +case NVME_RESV_RELEASE_ACTION_RELEASE: > +req->aiocb = blk_aio_pr_release(ns->blkconf.blk, cr_key, > +nvme_pr_type_to_block(type), > +nvme_misc_cb, req); > +break; > +case NVME_RESV_RELEASE_ACTION_CLEAR: > +req->aiocb = blk_aio_pr_clear(ns->blkconf.blk, cr_key, > + nvme_misc_cb, req); > +break; > +default: > +return NVME_INVALID_FIELD; > +} > + > +return NVME_NO_COMPLETE; > +} > + > +static uint16_t nvme_resv_acquire(NvmeCtrl *n, NvmeRequest *req) > +{ > +int ret; > +NvmeKeyInfo key_info; > +NvmeNamespace *ns = req->ns; > +uint32_t cdw10 = le32_to_cpu(req->cmd.cdw10); > +uint8_t action = cdw10 & 0x7; > +NVMEResvType type = cdw10 >> 8 & 0xff; > + > +ret = nvme_h2c(n, (uint8_t *)_info, sizeof(NvmeKeyInfo), req); > +if (ret) { > +return ret; > +} > + > +switch (action) { > +case NVME_RESV_ACQUIRE_ACTION_ACQUIRE: > +req->aiocb = blk_aio_pr_reserve(ns->blkconf.blk, key_info.cr_key, > +nvme_pr_type_to_block(type), > +nvme_misc_cb, req); > +break; > +case NVME_RESV_ACQUIRE_ACTION_PREEMPT: > +req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, > + key_info.cr_key, key_info.nr_key, > + nvme_pr_type_to_block(type), > + false, nvme_misc_cb, req); > +break; > +case NVME_RESV_ACQUIRE_ACTION_PREEMPT_AND_ABORT: > +req->aiocb = blk_aio_pr_preempt(ns->blkconf.blk, key_info.cr_key, > +key_info.nr_key, type, true, > +nvme_misc_cb, req); > +break; > +default: > +return NVME_INVALID_FIELD; > +} > + > +return NVME_NO_COMPLETE; > +} > + > +typedef struct NvmeResvKeys { > +uint32_t generation; > +uint32_t num_keys; > +uint64_t *keys; > +NvmeRequest *req; > +} NvmeResvKeys; > + > +typedef struct NvmeReadReservation { > +uint32_t generation; > +uint64_t key; > +BlockPrType type; > +NvmeRequest *req; > +NvmeResvKeys *keys_info; > +} NvmeReadReservation; > + > +static int _nvme_resv_read_reservation_cb(NvmeReadReservation *reservation) Nit: you can drop the leading underscore. But I have no problems introducing this to hw/nvme, so Acked-by: Klaus Jensen I will give this a proper review once reviews trickle in on the core block layer changes (since this obviously depends on that). signature.asc Description: PGP signature
Re: [PATCH v3 09/11] hw/nvme: enable namespace rescap function
On May 17 17:52, Changqi Lu wrote: > This commit enables the rescap function in the > namespace by detecting the supported reservation > function in the backend driver. > > Signed-off-by: Changqi Lu > Signed-off-by: zhenwei pi > --- > hw/nvme/ns.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > index ea8db175db..bb09117f4b 100644 > --- a/hw/nvme/ns.c > +++ b/hw/nvme/ns.c > @@ -20,6 +20,7 @@ > #include "qemu/bitops.h" > #include "sysemu/sysemu.h" > #include "sysemu/block-backend.h" > +#include "block/block_int.h" > > #include "nvme.h" > #include "trace.h" > @@ -55,6 +56,13 @@ void nvme_ns_init_format(NvmeNamespace *ns) > } > > id_ns->npda = id_ns->npdg = npdg - 1; > + > +/* > + * The persistent reservation capacities of block > + * and nvme are currently defined the same. > + * If there are subsequent changes, this part needs to be changed. > + */ > +id_ns->rescap = blk_bs(ns->blkconf.blk)->file->bs->bl.pr_cap; This is very brittle. I see that you have an enum for both th eblock layer and nvme. It is tricky to remember to update this if it changes in the block layer. > } > > static int nvme_ns_init(NvmeNamespace *ns, Error **errp) > -- > 2.20.1 > -- One of us - No more doubt, silence or taboo about mental illness. signature.asc Description: PGP signature
Re: [PATCH v3 06/11] block/nvme: add reservation command protocol constants
On May 17 17:52, Changqi Lu wrote: > Add constants for the NVMe persistent command protocol. > The constants include the reservation command opcode and > reservation type values defined in section 7 of the NVMe > 2.0 specification. > > Signed-off-by: Changqi Lu > Signed-off-by: zhenwei pi > --- > include/block/nvme.h | 61 > 1 file changed, 61 insertions(+) > > diff --git a/include/block/nvme.h b/include/block/nvme.h > index bb231d0b9a..84e2b2e401 100644 > --- a/include/block/nvme.h > +++ b/include/block/nvme.h > @@ -633,6 +633,10 @@ enum NvmeIoCommands { > NVME_CMD_WRITE_ZEROES = 0x08, > NVME_CMD_DSM= 0x09, > NVME_CMD_VERIFY = 0x0c, > +NVME_CMD_RESV_REGISTER = 0x0d, > +NVME_CMD_RESV_REPORT= 0x0e, > +NVME_CMD_RESV_ACQUIRE = 0x11, > +NVME_CMD_RESV_RELEASE = 0x15, > NVME_CMD_IO_MGMT_RECV = 0x12, > NVME_CMD_COPY = 0x19, > NVME_CMD_IO_MGMT_SEND = 0x1d, > @@ -641,6 +645,63 @@ enum NvmeIoCommands { > NVME_CMD_ZONE_APPEND= 0x7d, > }; > > +typedef enum { > +NVME_RESV_REGISTER_ACTION_REGISTER = 0x00, > +NVME_RESV_REGISTER_ACTION_UNREGISTER= 0x01, > +NVME_RESV_REGISTER_ACTION_REPLACE = 0x02, > +} NVME_RESV_REGISTER_ACTION; Existing style would name this `NvmeReservationRegisterAction`. signature.asc Description: PGP signature
Re: [PATCH v3 08/11] hw/nvme: enable ONCS reservations
On May 17 17:52, Changqi Lu wrote: > This commit enables ONCS to support the reservation > function at the controller level. It also lays the > groundwork for detecting and enabling the reservation > function on a per-namespace basis in RESCAP. > > Signed-off-by: Changqi Lu > Signed-off-by: zhenwei pi > --- > hw/nvme/ctrl.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 127c3d2383..182307a48b 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -8248,7 +8248,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice > *pci_dev) > id->nn = cpu_to_le32(NVME_MAX_NAMESPACES); > id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | > NVME_ONCS_FEATURES | NVME_ONCS_DSM | > - NVME_ONCS_COMPARE | NVME_ONCS_COPY); > + NVME_ONCS_COMPARE | NVME_ONCS_COPY | > + NVME_ONCS_RESRVATIONS); > > /* > * NOTE: If this device ever supports a command set that does NOT use 0x0 > -- > 2.20.1 > Should be merged with patch 10. signature.asc Description: PGP signature
Re: [PATCH v3 09/11] hw/nvme: enable namespace rescap function
On May 17 17:52, Changqi Lu wrote: > This commit enables the rescap function in the > namespace by detecting the supported reservation > function in the backend driver. > > Signed-off-by: Changqi Lu > Signed-off-by: zhenwei pi > --- > hw/nvme/ns.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > index ea8db175db..bb09117f4b 100644 > --- a/hw/nvme/ns.c > +++ b/hw/nvme/ns.c > @@ -20,6 +20,7 @@ > #include "qemu/bitops.h" > #include "sysemu/sysemu.h" > #include "sysemu/block-backend.h" > +#include "block/block_int.h" > > #include "nvme.h" > #include "trace.h" > @@ -55,6 +56,13 @@ void nvme_ns_init_format(NvmeNamespace *ns) > } > > id_ns->npda = id_ns->npdg = npdg - 1; > + > +/* > + * The persistent reservation capacities of block > + * and nvme are currently defined the same. > + * If there are subsequent changes, this part needs to be changed. > + */ > +id_ns->rescap = blk_bs(ns->blkconf.blk)->file->bs->bl.pr_cap; > } > > static int nvme_ns_init(NvmeNamespace *ns, Error **errp) > -- > 2.20.1 > This should probably be merged with path 10. I don't think it make sense on it's own? signature.asc Description: PGP signature
Re: [PATCH 8/9] hw/nvme: add reservation protocal command
On May 8 17:36, Changqi Lu wrote: > Add reservation acquire, reservation register, > reservation release and reservation report commands > in the nvme device layer. > > By introducing these commands, this enables the nvme > device to perform reservation-related tasks, including > querying keys, querying reservation status, registering > reservation keys, initiating and releasing reservations, > as well as clearing and preempting reservations held by > other keys. > > These commands are crucial for management and control of > shared storage resources in a persistent manner. > > Signed-off-by: Changqi Lu > Signed-off-by: zhenwei pi > --- > hw/nvme/ctrl.c | 304 ++- > hw/nvme/nvme.h | 4 + > include/block/nvme.h | 37 ++ > 3 files changed, 344 insertions(+), 1 deletion(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 127c3d2383..1f881fc44c 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -294,6 +294,10 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { > [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, > [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP, > [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, > +[NVME_CMD_RESV_REGISTER]= NVME_CMD_EFF_CSUPP, > +[NVME_CMD_RESV_REPORT] = NVME_CMD_EFF_CSUPP, > +[NVME_CMD_RESV_ACQUIRE] = NVME_CMD_EFF_CSUPP, > +[NVME_CMD_RESV_RELEASE] = NVME_CMD_EFF_CSUPP, > }; We need to indicate support for these commands in ONCS, right? And that should depend on if or not the underlying block device supports it. signature.asc Description: PGP signature
Re: [RFC 0/1] hw/nvme: add atomic write support
On Apr 15 16:46, Alan Adamson wrote: > Since there is discussion in the Linux NVMe Driver community to add NVMe > Atomic Write > support, it would be desirable to test it with qemu nvme emulation. > > Initially, this RFC will focus on supporting NVMe controller atomic write > parameters(AWUN, > AWUPF, and ACWU) but will be extended to support Namespace parameters (NAWUN, > NAWUPF > and NACWU). > > Atomic Write Parameters for NVMe QEMU > - > New NVMe QEMU Parameters (See NVMe Specification for details): > atomic.dn (default off) - Set the value of Disable Normal. > atomic.awun=UINT16 (default: 0) > atomic.awupf=UINT16 (default: 0) > atomic.acwu=UINT16 (default: 0) > > qemu command line example: > qemu-system-x86_64 -cpu host --enable-kvm -smp cpus=4 -no-reboot -m > 8192M -drive file=./disk.img,if=ide \ > -boot c -device e1000,netdev=net0,mac=DE:CC:CC:EF:99:88 -netdev > tap,id=net0 \ > -device > nvme,id=nvme-ctrl-0,serial=nvme-1,atomic.dn=off,atomic.awun=63,atomic.awupf=63,atomic.acwu=0 > \ > -drive file=./nvme.img,if=none,id=nvm-1 -device > nvme-ns,drive=nvm-1,bus=nvme-ctrl-0 nvme-ns,drive=nvm-1,bus=nvme-ctrl-0 > > Making Writes Atomic: > - > - Prior to a command being pulled off the SQ and executed, a check is made to > see if it > conflicts "atomically" with a currently executing command. > - All currently executing commands on the same namespace, across all SQs need > to be checked. > - If an atomic conflict is detected, the command is not started and remains > on the queue. > > Testing > --- > NVMe QEMU Parameters used: > atomic.dn=off,atomic.awun=63,atomic.awupf=63,atomic.acwu=0 > > # nvme id-ctrl /dev/nvme0 | grep awun > awun : 63 > # nvme id-ctrl /dev/nvme0 | grep awupf > awupf : 63 > # nvme id-ctrl /dev/nvme0 | grep acwu > acwu : 0< Since qemu-nvme doesn't support Compare and Write, this is > always zero > # nvme get-feature /dev/nvme0 -f 0xa > get-feature:0x0a (Write Atomicity Normal), Current value: > # > > # fio --filename=/dev/nvme0n1 --direct=1 --rw=randwrite --bs=32k > --iodepth=256 --name=iops --numjobs=50 --verify=crc64 --verify_fatal=1 > --ioengine=libaio > > When executed without atomic write support, eventually the following error > will be > observed: > > crc64: verify failed at file /dev/nvme0n1 offset 857669632, length > 32768 > (requested block: offset=857669632, length=32768, flags=88) > Expected CRC: 9c87d3539dafdca0 > Received CRC: d521f7ea3b69d2ee > > When executed with atomic write support, this error no longer happens. > > Questions > - > AWUN vs AWUPF - Does the nvme emulation need to do treat these differently? > Currently the > larger of the two will be used as the max atomic write size. > > Future Work > --- > - Namespace support (NAWUN, NAWUPF and NACWU) > - Namespace Boundary support (NABSN, NABO, and NABSPF) > - Atomic Compare and Write Unit (ACWU) > > Alan Adamson (1): > nvme: add atomic write support > > hw/nvme/ctrl.c | 147 - > hw/nvme/nvme.h | 17 ++ > 2 files changed, 163 insertions(+), 1 deletion(-) > > -- > 2.39.3 > Hi Alan, I have no obvious qualms about this. It is clearly useful for driver testing and verification and does not negatively impact the performance when this "faked" feature is not enabled. Acked-by: Klaus Jensen signature.asc Description: PGP signature
[PATCH] hw/nvme: fix mo field in io mgnt send
From: Klaus Jensen The Management Operation field of I/O Management Send is only 8 bits, not 16. Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 9e7bbebc8bb0..ede5f281dd7c 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4387,7 +4387,7 @@ static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req) { NvmeCmd *cmd = >cmd; uint32_t cdw10 = le32_to_cpu(cmd->cdw10); -uint8_t mo = (cdw10 & 0xff); +uint8_t mo = cdw10 & 0xf; switch (mo) { case NVME_IOMS_MO_NOP: --- base-commit: 84b0eb1826f690aa8d51984644318ee6c810f5bf change-id: 20240506-fix-ioms-mo-97098c6c5396 Best regards, -- Klaus Jensen
Re: [PATCH v2 3/4] hw/nvme: Support SR-IOV VFs more than 127
On Apr 1 04:30, Minwoo Im wrote: > From: Minwoo Im > > The number of virtual functions(VFs) supported in SR-IOV is 64k as per > spec. To test a large number of MSI-X vectors mapping to CPU matrix in > the QEMU system, we need much more than 127 VFs. This patch made > support for 256 VFs per a physical function(PF). > With patch 2 in place, shouldn't it be relatively straight forward to convert the static array to be dynamic and just use numvfs to size it? Then we won't have to add another patch when someone comes around and wants to bump this again ;) signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: Add support for setting the MQES for the NVMe emulation
On May 1 12:27, Berg, John wrote: > On Thu, 2024-04-04 at 15:01 +0200, Klaus Jensen wrote: > > On Apr 4 13:04, John Berg wrote: > > > From: John Berg > > > > > > The MQES field in the CAP register describes the Maximum Queue > > > Entries > > > Supported for the IO queues of an NVMe controller. Adding a +1 to > > > the > > > value in this field results in the total queue size. A full queue > > > is > > > when a queue of size N contains N - 1 entries, and the minimum > > > queue > > > size is 2. Thus the lowest MQES value is 1. > > > > > > This patch adds the new mqes property to the NVMe emulation which > > > allows > > > a user to specify the maximum queue size by setting this property. > > > This > > > is useful as it enables testing of NVMe controller where the MQES > > > is > > > relatively small. The smallest NVMe queue size supported in NVMe is > > > 2 > > > submission and completion entries, which means that the smallest > > > legal > > > mqes value is 1. > > > > > > The following example shows how the mqes can be set for a the NVMe > > > emulation: > > > > > > -drive id=nvme0,if=none,file=nvme.img,format=raw > > > -device nvme,drive=nvme0,serial=foo,mqes=1 > > > > > > If the mqes property is not provided then the default mqes will > > > still be > > > 0x7ff (the queue size is 2048 entries). > > > > > > Signed-off-by: John Berg > > > --- > > > hw/nvme/ctrl.c | 9 - > > > hw/nvme/nvme.h | 1 + > > > 2 files changed, 9 insertions(+), 1 deletion(-) > > > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > > index 127c3d2383..86cda9bc73 100644 > > > --- a/hw/nvme/ctrl.c > > > +++ b/hw/nvme/ctrl.c > > > @@ -7805,6 +7805,12 @@ static bool nvme_check_params(NvmeCtrl *n, > > > Error **errp) > > > return false; > > > } > > > > > > + if (params->mqes < 1) > > > + { > > > > Please keep the `{` on the same line as the `if`. I think > > checkpatch.pl > > should catch this. > > > > No need to send a v2, I'll fix it up when I apply it to nvme-next :) > > > > Thanks! > > > Hello > > Sorry for chasing. I was just wondering when this patch will be > applied. I can send a second revision if that helps. > No need for the sorry. My apologies. It feel off my radar, so thanks for the bump. I've queued this on nvme-next, will send a pull for master ASAP. signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: Add support for setting the MQES for the NVMe emulation
On Apr 4 13:04, John Berg wrote: > From: John Berg > > The MQES field in the CAP register describes the Maximum Queue Entries > Supported for the IO queues of an NVMe controller. Adding a +1 to the > value in this field results in the total queue size. A full queue is > when a queue of size N contains N - 1 entries, and the minimum queue > size is 2. Thus the lowest MQES value is 1. > > This patch adds the new mqes property to the NVMe emulation which allows > a user to specify the maximum queue size by setting this property. This > is useful as it enables testing of NVMe controller where the MQES is > relatively small. The smallest NVMe queue size supported in NVMe is 2 > submission and completion entries, which means that the smallest legal > mqes value is 1. > > The following example shows how the mqes can be set for a the NVMe > emulation: > > -drive id=nvme0,if=none,file=nvme.img,format=raw > -device nvme,drive=nvme0,serial=foo,mqes=1 > > If the mqes property is not provided then the default mqes will still be > 0x7ff (the queue size is 2048 entries). > > Signed-off-by: John Berg > --- > hw/nvme/ctrl.c | 9 - > hw/nvme/nvme.h | 1 + > 2 files changed, 9 insertions(+), 1 deletion(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 127c3d2383..86cda9bc73 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -7805,6 +7805,12 @@ static bool nvme_check_params(NvmeCtrl *n, Error > **errp) > return false; > } > > +if (params->mqes < 1) > +{ Please keep the `{` on the same line as the `if`. I think checkpatch.pl should catch this. No need to send a v2, I'll fix it up when I apply it to nvme-next :) Thanks! signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: Add support for setting the MQES for the NVMe emulation
On Apr 4 13:04, John Berg wrote: > From: John Berg > > The MQES field in the CAP register describes the Maximum Queue Entries > Supported for the IO queues of an NVMe controller. Adding a +1 to the > value in this field results in the total queue size. A full queue is > when a queue of size N contains N - 1 entries, and the minimum queue > size is 2. Thus the lowest MQES value is 1. > > This patch adds the new mqes property to the NVMe emulation which allows > a user to specify the maximum queue size by setting this property. This > is useful as it enables testing of NVMe controller where the MQES is > relatively small. The smallest NVMe queue size supported in NVMe is 2 > submission and completion entries, which means that the smallest legal > mqes value is 1. > > The following example shows how the mqes can be set for a the NVMe > emulation: > > -drive id=nvme0,if=none,file=nvme.img,format=raw > -device nvme,drive=nvme0,serial=foo,mqes=1 > > If the mqes property is not provided then the default mqes will still be > 0x7ff (the queue size is 2048 entries). > > Signed-off-by: John Berg LGTM, Reviewed-by: Klaus Jensen signature.asc Description: PGP signature
Re: [PATCH 17/19] hw/nvme: fix -Werror=maybe-uninitialized
On Mar 28 14:20, marcandre.lur...@redhat.com wrote: > From: Marc-André Lureau > > ../hw/nvme/ctrl.c:6081:21: error: ‘result’ may be used uninitialized > [-Werror=maybe-uninitialized] > > It's not obvious that 'result' is set in all code paths. When is > a returned argument, it's even less clear. > > Looking at various assignments, 0 seems to be a suitable default value. > > Signed-off-by: Marc-André Lureau > --- > hw/nvme/ctrl.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index c2b17de987..127c3d2383 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -5894,7 +5894,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, > NvmeRequest *req) > uint32_t dw10 = le32_to_cpu(cmd->cdw10); > uint32_t dw11 = le32_to_cpu(cmd->cdw11); > uint32_t nsid = le32_to_cpu(cmd->nsid); > -uint32_t result; > +uint32_t result = 0; > uint8_t fid = NVME_GETSETFEAT_FID(dw10); > NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); > uint16_t iv; > -- > 2.44.0 > Thanks! Reviewed-by: Klaus Jensen signature.asc Description: PGP signature
[PULL v2 3/6] MAINTAINERS: add Jesper as reviewer on hw/nvme
From: Klaus Jensen My colleague, Jesper, will be assiting with hw/nvme related reviews. Add him with R: so he gets automatically bugged going forward. Cc: Jesper Devantier Acked-by: Jesper Devantier Signed-off-by: Klaus Jensen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 12f5e47a11f4..7f96ce857486 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2407,6 +2407,7 @@ F: docs/system/devices/virtio-snd.rst nvme M: Keith Busch M: Klaus Jensen +R: Jesper Devantier L: qemu-block@nongnu.org S: Supported F: hw/nvme/* -- 2.44.0
[PULL v2 4/6] hw/nvme: Add NVMe NGUID property
From: Roque Arcudia Hernandez This patch adds a way to specify an NGUID for a given NVMe Namespace using a string of hexadecimal digits with an optional '-' separator to group bytes. For instance: -device nvme-ns,nguid="e9accd3b83904e13167cf0593437f57d" If provided, the NGUID will be part of the Namespace Identification Descriptor list and the Identify Namespace data. Signed-off-by: Roque Arcudia Hernandez Signed-off-by: Nabih Estefan Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- docs/system/devices/nvme.rst | 7 ++ hw/nvme/ctrl.c | 12 +++ hw/nvme/meson.build | 2 +- hw/nvme/nguid.c | 187 +++ hw/nvme/ns.c | 2 + hw/nvme/nvme.h | 26 +++-- 6 files changed, 229 insertions(+), 7 deletions(-) create mode 100644 hw/nvme/nguid.c diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst index 4ea957baed10..d2b1ca96455f 100644 --- a/docs/system/devices/nvme.rst +++ b/docs/system/devices/nvme.rst @@ -81,6 +81,13 @@ There are a number of parameters available: Set the UUID of the namespace. This will be reported as a "Namespace UUID" descriptor in the Namespace Identification Descriptor List. +``nguid`` + Set the NGUID of the namespace. This will be reported as a "Namespace Globally + Unique Identifier" descriptor in the Namespace Identification Descriptor List. + It is specified as a string of hexadecimal digits containing exactly 16 bytes + or "auto" for a random value. An optional '-' separator could be used to group + bytes. If not specified the NGUID will remain all zeros. + ``eui64`` Set the EUI-64 of the namespace. This will be reported as a "IEEE Extended Unique Identifier" descriptor in the Namespace Identification Descriptor List. diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index abc0387f2ca8..6c5a2b875da8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5640,6 +5640,10 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) NvmeIdNsDescr hdr; uint8_t v[NVME_NIDL_UUID]; } QEMU_PACKED uuid = {}; +struct { +NvmeIdNsDescr hdr; +uint8_t v[NVME_NIDL_NGUID]; +} QEMU_PACKED nguid = {}; struct { NvmeIdNsDescr hdr; uint64_t v; @@ -5668,6 +5672,14 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) pos += sizeof(uuid); } +if (!nvme_nguid_is_null(>params.nguid)) { +nguid.hdr.nidt = NVME_NIDT_NGUID; +nguid.hdr.nidl = NVME_NIDL_NGUID; +memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID); +memcpy(pos, , sizeof(nguid)); +pos += sizeof(nguid); +} + if (ns->params.eui64) { eui64.hdr.nidt = NVME_NIDT_EUI64; eui64.hdr.nidl = NVME_NIDL_EUI64; diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7d5caa53c280 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1 @@ -system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c', 'nguid.c')) \ No newline at end of file diff --git a/hw/nvme/nguid.c b/hw/nvme/nguid.c new file mode 100644 index ..829832bd9f41 --- /dev/null +++ b/hw/nvme/nguid.c @@ -0,0 +1,187 @@ +/* + * QEMU NVMe NGUID functions + * + * Copyright 2024 Google LLC + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include "qemu/osdep.h" +#include "qapi/visitor.h" +#include "qemu/ctype.h" +#include "nvme.h" + +#define NGUID_SEPARATOR '-' + +#define NGUID_VALUE_AUTO "auto" + +#define NGUID_FMT \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" + +#define NGUID_STR_LEN (2 * NGUID_LEN + 1) + +bool nvme_nguid_is_null(const NvmeNGUID *nguid) +{ +static NvmeNGUID null_nguid; +return memcmp(nguid, _nguid, sizeof(NvmeNGUID)) == 0; +} + +static void nvme_nguid_generate(NvmeNGUID *out) +{ +int i; +uint32_t x; + +QEMU_BUILD_BUG_ON((NGUID_LEN % sizeof(x)) != 0); + +for (i = 0; i < NGUID_LEN; i += sizeof(x)) { +x = g_random_int(); +memcpy(>data[i], , sizeof(x)); +} +} + +/* + * The Linux Kernel typically prints the NGUID of a
[PULL v2 5/6] hw/nvme: generalize the mbar size helper
From: Klaus Jensen Generalize the mbar size helper such that it can handle cases where the MSI-X table and PBA are expected to be in an exclusive bar. Cc: qemu-sta...@nongnu.org Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 6c5a2b875da8..efcfd7171066 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8015,13 +8015,18 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) memory_region_set_enabled(>pmr.dev->mr, false); } -static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, - unsigned *msix_table_offset, - unsigned *msix_pba_offset) +static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs, + unsigned *msix_table_offset, + unsigned *msix_pba_offset) { -uint64_t bar_size, msix_table_size, msix_pba_size; +uint64_t bar_size, msix_table_size; bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE; + +if (total_irqs == 0) { +goto out; +} + bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); if (msix_table_offset) { @@ -8036,11 +8041,10 @@ static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, *msix_pba_offset = bar_size; } -msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size += msix_pba_size; +bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size = pow2ceil(bar_size); -return bar_size; +out: +return pow2ceil(bar_size); } static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) @@ -8048,7 +8052,7 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) uint16_t vf_dev_id = n->params.use_intel_id ? PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME; NvmePriCtrlCap *cap = >pri_ctrl_cap; -uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm), +uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), le16_to_cpu(cap->vifrsm), NULL, NULL); @@ -8087,7 +8091,7 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) ERRP_GUARD(); uint8_t *pci_conf = pci_dev->config; uint64_t bar_size; -unsigned msix_table_offset, msix_pba_offset; +unsigned msix_table_offset = 0, msix_pba_offset = 0; int ret; pci_conf[PCI_INTERRUPT_PIN] = 1; @@ -8110,8 +8114,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } /* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, + _table_offset, _pba_offset); memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", -- 2.44.0
[PULL v2 0/6] hw/nvme updates
From: Klaus Jensen Hi, Sorry about the compilation error in v1. Did a full CI run for v2. https://gitlab.com/birkelund/qemu/-/pipelines/1210559370 The following changes since commit 8f3f329f5e0117bd1a23a79ab751f8a7d3471e4b: Merge tag 'migration-20240311-pull-request' of https://gitlab.com/peterx/qemu into staging (2024-03-12 11:35:41 +) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to fa905f65c5549703279f68c253914799b10ada47: hw/nvme: add machine compatibility parameter to enable msix exclusive bar (2024-03-12 16:05:53 +0100) hw/nvme updates -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmXwj+wACgkQTeGvMW1P DelOsAf+Jg51zf3vtWpe4MS/WtULjSr5GtnXMJ5hkHS0WdKOiLW3P+pUZXbsohmh faVlYeCWptF1CFGfxBf4Trc7XzJy8J6W1YJEofs/07hIAnazo9pwk5shoVu4oiex HVsBg7/9y7DuiEEg1MRvVvW895cP60WmG1AqU63SYwrVgxZ51ZH0XNuyRhQeYC/6 OSXJ3FDYu2iJQ58uEzGEwv8vhskIpEFTdz0J6gQVxIdzFBbuk87VgZo6pqwgfMBm /65K85TgFBT4SASc7a2iSUv+iAqSCA6Jdy0VWxCYCikiv5nuPCMCrlbvqcVp+i2B GKtgfFXhtgepxx6jmYd03EkRjCrxUA== =W3gg -END PGP SIGNATURE- Klaus Jensen (4): hw/nvme: fix invalid check on mcl MAINTAINERS: add Jesper as reviewer on hw/nvme hw/nvme: generalize the mbar size helper hw/nvme: add machine compatibility parameter to enable msix exclusive bar Minwoo Im (1): hw/nvme: separate 'serial' property for VFs Roque Arcudia Hernandez (1): hw/nvme: Add NVMe NGUID property MAINTAINERS | 1 + docs/system/devices/nvme.rst | 7 ++ hw/core/machine.c| 1 + hw/nvme/ctrl.c | 99 +-- hw/nvme/meson.build | 2 +- hw/nvme/nguid.c | 187 +++ hw/nvme/ns.c | 2 + hw/nvme/nvme.h | 27 +++-- 8 files changed, 291 insertions(+), 35 deletions(-) create mode 100644 hw/nvme/nguid.c -- 2.44.0
[PULL v2 6/6] hw/nvme: add machine compatibility parameter to enable msix exclusive bar
From: Klaus Jensen Commit 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0") moved the MSI-X table and PBA to BAR 0 to make room for enabling CMR and PMR at the same time. As reported by Julien Grall in #2184, this breaks migration through system hibernation. Add a machine compatibility parameter and set it on machines pre 6.0 to enable the old behavior automatically, restoring the hibernation migration support. Cc: qemu-sta...@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2184 Fixes: 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0") Reported-by: Julien Grall jul...@xen.org Tested-by: Julien Grall jul...@xen.org Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/core/machine.c | 1 + hw/nvme/ctrl.c| 53 +-- hw/nvme/nvme.h| 1 + 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 0e9d646b61e1..37e5d4c8dfd5 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -102,6 +102,7 @@ GlobalProperty hw_compat_5_2[] = { { "PIIX4_PM", "smm-compat", "on"}, { "virtio-blk-device", "report-discard-granularity", "off" }, { "virtio-net-pci-base", "vectors", "3"}, +{ "nvme", "msix-exclusive-bar", "on"}, }; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index efcfd7171066..036b15403a02 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7810,6 +7810,11 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp) } if (n->pmr.dev) { +if (params->msix_exclusive_bar) { +error_setg(errp, "not enough BARs available to enable PMR"); +return false; +} + if (host_memory_backend_is_mapped(n->pmr.dev)) { error_setg(errp, "can't use already busy memdev: %s", object_get_canonical_path_component(OBJECT(n->pmr.dev))); @@ -8113,24 +8118,38 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pcie_ari_init(pci_dev, 0x100); } -/* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); - -memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); -memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", - msix_table_offset); -memory_region_add_subregion(>bar0, 0, >iomem); - -if (pci_is_vf(pci_dev)) { -pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); -} else { +if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) { +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + bar_size); pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); + PCI_BASE_ADDRESS_MEM_TYPE_64, >iomem); +ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp); +} else { +assert(n->params.msix_qsize >= 1); + +/* add one to max_ioqpairs to account for the admin queue pair */ +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, + n->params.msix_qsize, _table_offset, + _pba_offset); + +memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + msix_table_offset); +memory_region_add_subregion(>bar0, 0, >iomem); + +if (pci_is_vf(pci_dev)) { +pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); +} else { +pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); +} + +ret = msix_init(pci_dev, n->params.msix_qsize, +>bar0, 0, msix_table_offset, +>bar0, 0, msix_pba_offset, 0, errp); } -ret = msix_init(pci_dev, n->params.msix_qsize, ->bar0, 0, msix_table_offset, ->bar0, 0, msix_pba_offset, 0, errp); + if (ret == -ENOTSUP) { /* report that msix is not supported, but do not error out */ warn_report_err(*errp); @@ -8434,6 +8453,8 @@ static Property nvme_props[] = { params.sriov_max_vi_per_vf, 0), DEFINE_PROP_UINT8("sriov_m
[PULL v2 1/6] hw/nvme: separate 'serial' property for VFs
From: Minwoo Im Currently, when a VF is created, it uses the 'params' object of the PF as it is. In other words, the 'params.serial' string memory area is also shared. In this situation, if the VF is removed from the system, the PF's 'params.serial' object is released with object_finalize() followed by object_property_del_all() which release the memory for 'serial' property. If that happens, the next VF created will inherit a serial from a corrupted memory area. If this happens, an error will occur when comparing subsys->serial and n->params.serial in the nvme_subsys_register_ctrl() function. Cc: qemu-sta...@nongnu.org Fixes: 44c2c09488db ("hw/nvme: Add support for SR-IOV") Signed-off-by: Minwoo Im Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 76fe0397045b..94ef63945725 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8309,9 +8309,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) if (pci_is_vf(pci_dev)) { /* * VFs derive settings from the parent. PF's lifespan exceeds - * that of VF's, so it's safe to share params.serial. + * that of VF's. */ memcpy(>params, >params, sizeof(NvmeParams)); + +/* + * Set PF's serial value to a new string memory to prevent 'serial' + * property object release of PF when a VF is removed from the system. + */ +n->params.serial = g_strdup(pn->params.serial); n->subsys = pn->subsys; } -- 2.44.0
[PULL v2 2/6] hw/nvme: fix invalid check on mcl
From: Klaus Jensen The number of logical blocks within a source range is converted into a 1s based number at the time of parsing. However, when verifying the copy length we add one again, causing the check against MCL to fail in error. Cc: qemu-sta...@nongnu.org Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 94ef63945725..abc0387f2ca8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -2855,7 +2855,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, , NULL, NULL, NULL); -copy_len += nlb + 1; +copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { -- 2.44.0
Re: [PULL 0/6] hw/nvme updates
On Mar 12 11:34, Peter Maydell wrote: > On Mon, 11 Mar 2024 at 19:11, Klaus Jensen wrote: > > > > From: Klaus Jensen > > > > Hi, > > > > The following changes since commit 7489f7f3f81dcb776df8c1b9a9db281fc21bf05f: > > > > Merge tag 'hw-misc-20240309' of https://github.com/philmd/qemu into > > staging (2024-03-09 20:12:21 +) > > > > are available in the Git repository at: > > > > https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request > > > > for you to fetch changes up to a1505d799232939bf90c1b3e1fc20e81cd398404: > > > > hw/nvme: add machine compatibility parameter to enable msix exclusive bar > > (2024-03-11 20:07:41 +0100) > > > > > > hw/nvme updates > > -BEGIN PGP SIGNATURE- > > > > iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmXvVsYACgkQTeGvMW1P > > DemWtwf9HU3cjtvCp8AeHGoPFTwp8/Vx3cQlQ6ilADKSDm44up2+M504xE/Mdviv > > 6y3PTPe1yiEpg/MbjWTX/df5lo+VdNoCuCyjph9mea0s1QAjCfVpl+KLMUVF/Oj5 > > y1Iz9PQqOVDJ3O4xlgmPTfd8NXE/frNJaiXAjFuBxF2+4lilD5kMxpyu7DXbLiy2 > > Szd1I3DhFAEOLEbrSSRDI3Fpy0KBdRzdKuUfmRdrHzbmhzHJefW7wnZ3aAiDboaD > > Ny7y/aovmjGymMp9GrBKWhUFPfSUtJ8l8j4Z7acQs+VDxg8lcAHCJKOyqCBTspUL > > PSnDe6E/CRyjrG2fUVXTLb6YW1eibQ== > > =Ld7a > > -END PGP SIGNATURE- > > Hi; I'm afraid this fails to build for some jobs, eg > https://gitlab.com/qemu-project/qemu/-/jobs/6373091994 > https://gitlab.com/qemu-project/qemu/-/jobs/6373091978 > https://gitlab.com/qemu-project/qemu/-/jobs/6373091975 > > ../hw/nvme/ctrl.c: In function ‘nvme_realize’: > ../hw/nvme/ctrl.c:8146:15: error: ‘msix_pba_offset’ may be used > uninitialized in this function [-Werror=maybe-uninitialized] > 8146 | ret = msix_init(pci_dev, n->params.msix_qsize, > | ^~~~ > 8147 | >bar0, 0, msix_table_offset, > | ~~~ > 8148 | >bar0, 0, msix_pba_offset, 0, errp); > | ~~ > ../hw/nvme/ctrl.c:8099:33: note: ‘msix_pba_offset’ was declared here > 8099 | unsigned msix_table_offset, msix_pba_offset; > | ^~~ > ../hw/nvme/ctrl.c:8135:9: error: ‘msix_table_offset’ may be used > uninitialized in this function [-Werror=maybe-uninitialized] > 8135 | memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", > | ^~ > 8136 | msix_table_offset); > | ~~ > ../hw/nvme/ctrl.c:8099:14: note: ‘msix_table_offset’ was declared here > 8099 | unsigned msix_table_offset, msix_pba_offset; > | ^ > cc1: all warnings being treated as errors > > > I think this is because the compiler notices that nvme_mbar_size() has > an early-exit code path which never initializes *msix_table_offset > and *msix-pba_offset. > Crap, yeah. I'll fix it up. Thanks! signature.asc Description: PGP signature
[PULL 2/6] hw/nvme: fix invalid check on mcl
From: Klaus Jensen The number of logical blocks within a source range is converted into a 1s based number at the time of parsing. However, when verifying the copy length we add one again, causing the check against MCL to fail in error. Cc: qemu-sta...@nongnu.org Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 94ef63945725..abc0387f2ca8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -2855,7 +2855,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, , NULL, NULL, NULL); -copy_len += nlb + 1; +copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { -- 2.44.0
[PULL 6/6] hw/nvme: add machine compatibility parameter to enable msix exclusive bar
From: Klaus Jensen Commit 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0") moved the MSI-X table and PBA to BAR 0 to make room for enabling CMR and PMR at the same time. As reported by Julien Grall in #2184, this breaks migration through system hibernation. Add a machine compatibility parameter and set it on machines pre 6.0 to enable the old behavior automatically, restoring the hibernation migration support. Cc: qemu-sta...@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2184 Fixes: 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0") Reported-by: Julien Grall jul...@xen.org Tested-by: Julien Grall jul...@xen.org Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/core/machine.c | 1 + hw/nvme/ctrl.c| 51 --- hw/nvme/nvme.h| 1 + 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 9ac5d5389a6c..f3012bca1370 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -100,6 +100,7 @@ GlobalProperty hw_compat_5_2[] = { { "PIIX4_PM", "smm-compat", "on"}, { "virtio-blk-device", "report-discard-granularity", "off" }, { "virtio-net-pci-base", "vectors", "3"}, +{ "nvme", "msix-exclusive-bar", "on"}, }; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 5ee8deda22a4..6210b7098845 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7810,6 +7810,11 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp) } if (n->pmr.dev) { +if (params->msix_exclusive_bar) { +error_setg(errp, "not enough BARs available to enable PMR"); +return false; +} + if (host_memory_backend_is_mapped(n->pmr.dev)) { error_setg(errp, "can't use already busy memdev: %s", object_get_canonical_path_component(OBJECT(n->pmr.dev))); @@ -8113,24 +8118,36 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pcie_ari_init(pci_dev, 0x100); } -/* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); - -memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); -memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", - msix_table_offset); -memory_region_add_subregion(>bar0, 0, >iomem); - -if (pci_is_vf(pci_dev)) { -pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); -} else { +if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) { +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + bar_size); pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); + PCI_BASE_ADDRESS_MEM_TYPE_64, >iomem); +ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp); +} else { +/* add one to max_ioqpairs to account for the admin queue pair */ +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, + n->params.msix_qsize, _table_offset, + _pba_offset); + +memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + msix_table_offset); +memory_region_add_subregion(>bar0, 0, >iomem); + +if (pci_is_vf(pci_dev)) { +pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); +} else { +pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); +} + +ret = msix_init(pci_dev, n->params.msix_qsize, +>bar0, 0, msix_table_offset, +>bar0, 0, msix_pba_offset, 0, errp); } -ret = msix_init(pci_dev, n->params.msix_qsize, ->bar0, 0, msix_table_offset, ->bar0, 0, msix_pba_offset, 0, errp); + if (ret == -ENOTSUP) { /* report that msix is not supported, but do not error out */ warn_report_err(*errp); @@ -8434,6 +8451,8 @@ static Property nvme_props[] = { params.sriov_max_vi_per_vf, 0), DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
[PULL 5/6] hw/nvme: generalize the mbar size helper
From: Klaus Jensen Generalize the mbar size helper such that it can handle cases where the MSI-X table and PBA are expected to be in an exclusive bar. Cc: qemu-sta...@nongnu.org Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 6c5a2b875da8..5ee8deda22a4 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8015,13 +8015,18 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) memory_region_set_enabled(>pmr.dev->mr, false); } -static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, - unsigned *msix_table_offset, - unsigned *msix_pba_offset) +static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs, + unsigned *msix_table_offset, + unsigned *msix_pba_offset) { -uint64_t bar_size, msix_table_size, msix_pba_size; +uint64_t bar_size, msix_table_size; bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE; + +if (total_irqs == 0) { +goto out; +} + bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); if (msix_table_offset) { @@ -8036,11 +8041,10 @@ static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, *msix_pba_offset = bar_size; } -msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size += msix_pba_size; +bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size = pow2ceil(bar_size); -return bar_size; +out: +return pow2ceil(bar_size); } static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) @@ -8048,7 +8052,7 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) uint16_t vf_dev_id = n->params.use_intel_id ? PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME; NvmePriCtrlCap *cap = >pri_ctrl_cap; -uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm), +uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), le16_to_cpu(cap->vifrsm), NULL, NULL); @@ -8110,8 +8114,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } /* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, + _table_offset, _pba_offset); memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", -- 2.44.0
[PULL 3/6] MAINTAINERS: add Jesper as reviewer on hw/nvme
From: Klaus Jensen My colleague, Jesper, will be assiting with hw/nvme related reviews. Add him with R: so he gets automatically bugged going forward. Cc: Jesper Devantier Acked-by: Jesper Devantier Signed-off-by: Klaus Jensen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4d96f855de5c..e21e18e93c63 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2407,6 +2407,7 @@ F: docs/system/devices/virtio-snd.rst nvme M: Keith Busch M: Klaus Jensen +R: Jesper Devantier L: qemu-block@nongnu.org S: Supported F: hw/nvme/* -- 2.44.0
[PULL 4/6] hw/nvme: Add NVMe NGUID property
From: Roque Arcudia Hernandez This patch adds a way to specify an NGUID for a given NVMe Namespace using a string of hexadecimal digits with an optional '-' separator to group bytes. For instance: -device nvme-ns,nguid="e9accd3b83904e13167cf0593437f57d" If provided, the NGUID will be part of the Namespace Identification Descriptor list and the Identify Namespace data. Signed-off-by: Roque Arcudia Hernandez Signed-off-by: Nabih Estefan Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- docs/system/devices/nvme.rst | 7 ++ hw/nvme/ctrl.c | 12 +++ hw/nvme/meson.build | 2 +- hw/nvme/nguid.c | 187 +++ hw/nvme/ns.c | 2 + hw/nvme/nvme.h | 26 +++-- 6 files changed, 229 insertions(+), 7 deletions(-) create mode 100644 hw/nvme/nguid.c diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst index 4ea957baed10..d2b1ca96455f 100644 --- a/docs/system/devices/nvme.rst +++ b/docs/system/devices/nvme.rst @@ -81,6 +81,13 @@ There are a number of parameters available: Set the UUID of the namespace. This will be reported as a "Namespace UUID" descriptor in the Namespace Identification Descriptor List. +``nguid`` + Set the NGUID of the namespace. This will be reported as a "Namespace Globally + Unique Identifier" descriptor in the Namespace Identification Descriptor List. + It is specified as a string of hexadecimal digits containing exactly 16 bytes + or "auto" for a random value. An optional '-' separator could be used to group + bytes. If not specified the NGUID will remain all zeros. + ``eui64`` Set the EUI-64 of the namespace. This will be reported as a "IEEE Extended Unique Identifier" descriptor in the Namespace Identification Descriptor List. diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index abc0387f2ca8..6c5a2b875da8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5640,6 +5640,10 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) NvmeIdNsDescr hdr; uint8_t v[NVME_NIDL_UUID]; } QEMU_PACKED uuid = {}; +struct { +NvmeIdNsDescr hdr; +uint8_t v[NVME_NIDL_NGUID]; +} QEMU_PACKED nguid = {}; struct { NvmeIdNsDescr hdr; uint64_t v; @@ -5668,6 +5672,14 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) pos += sizeof(uuid); } +if (!nvme_nguid_is_null(>params.nguid)) { +nguid.hdr.nidt = NVME_NIDT_NGUID; +nguid.hdr.nidl = NVME_NIDL_NGUID; +memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID); +memcpy(pos, , sizeof(nguid)); +pos += sizeof(nguid); +} + if (ns->params.eui64) { eui64.hdr.nidt = NVME_NIDT_EUI64; eui64.hdr.nidl = NVME_NIDL_EUI64; diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7d5caa53c280 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1 @@ -system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c', 'nguid.c')) \ No newline at end of file diff --git a/hw/nvme/nguid.c b/hw/nvme/nguid.c new file mode 100644 index ..829832bd9f41 --- /dev/null +++ b/hw/nvme/nguid.c @@ -0,0 +1,187 @@ +/* + * QEMU NVMe NGUID functions + * + * Copyright 2024 Google LLC + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include "qemu/osdep.h" +#include "qapi/visitor.h" +#include "qemu/ctype.h" +#include "nvme.h" + +#define NGUID_SEPARATOR '-' + +#define NGUID_VALUE_AUTO "auto" + +#define NGUID_FMT \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" \ +"%02hhx%02hhx%02hhx%02hhx" + +#define NGUID_STR_LEN (2 * NGUID_LEN + 1) + +bool nvme_nguid_is_null(const NvmeNGUID *nguid) +{ +static NvmeNGUID null_nguid; +return memcmp(nguid, _nguid, sizeof(NvmeNGUID)) == 0; +} + +static void nvme_nguid_generate(NvmeNGUID *out) +{ +int i; +uint32_t x; + +QEMU_BUILD_BUG_ON((NGUID_LEN % sizeof(x)) != 0); + +for (i = 0; i < NGUID_LEN; i += sizeof(x)) { +x = g_random_int(); +memcpy(>data[i], , sizeof(x)); +} +} + +/* + * The Linux Kernel typically prints the NGUID of a
[PULL 0/6] hw/nvme updates
From: Klaus Jensen Hi, The following changes since commit 7489f7f3f81dcb776df8c1b9a9db281fc21bf05f: Merge tag 'hw-misc-20240309' of https://github.com/philmd/qemu into staging (2024-03-09 20:12:21 +) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to a1505d799232939bf90c1b3e1fc20e81cd398404: hw/nvme: add machine compatibility parameter to enable msix exclusive bar (2024-03-11 20:07:41 +0100) hw/nvme updates -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmXvVsYACgkQTeGvMW1P DemWtwf9HU3cjtvCp8AeHGoPFTwp8/Vx3cQlQ6ilADKSDm44up2+M504xE/Mdviv 6y3PTPe1yiEpg/MbjWTX/df5lo+VdNoCuCyjph9mea0s1QAjCfVpl+KLMUVF/Oj5 y1Iz9PQqOVDJ3O4xlgmPTfd8NXE/frNJaiXAjFuBxF2+4lilD5kMxpyu7DXbLiy2 Szd1I3DhFAEOLEbrSSRDI3Fpy0KBdRzdKuUfmRdrHzbmhzHJefW7wnZ3aAiDboaD Ny7y/aovmjGymMp9GrBKWhUFPfSUtJ8l8j4Z7acQs+VDxg8lcAHCJKOyqCBTspUL PSnDe6E/CRyjrG2fUVXTLb6YW1eibQ== =Ld7a -END PGP SIGNATURE- Klaus Jensen (4): hw/nvme: fix invalid check on mcl MAINTAINERS: add Jesper as reviewer on hw/nvme hw/nvme: generalize the mbar size helper hw/nvme: add machine compatibility parameter to enable msix exclusive bar Minwoo Im (1): hw/nvme: separate 'serial' property for VFs Roque Arcudia Hernandez (1): hw/nvme: Add NVMe NGUID property MAINTAINERS | 1 + docs/system/devices/nvme.rst | 7 ++ hw/core/machine.c| 1 + hw/nvme/ctrl.c | 95 +- hw/nvme/meson.build | 2 +- hw/nvme/nguid.c | 187 +++ hw/nvme/ns.c | 2 + hw/nvme/nvme.h | 27 +++-- 8 files changed, 288 insertions(+), 34 deletions(-) create mode 100644 hw/nvme/nguid.c -- 2.44.0
[PULL 1/6] hw/nvme: separate 'serial' property for VFs
From: Minwoo Im Currently, when a VF is created, it uses the 'params' object of the PF as it is. In other words, the 'params.serial' string memory area is also shared. In this situation, if the VF is removed from the system, the PF's 'params.serial' object is released with object_finalize() followed by object_property_del_all() which release the memory for 'serial' property. If that happens, the next VF created will inherit a serial from a corrupted memory area. If this happens, an error will occur when comparing subsys->serial and n->params.serial in the nvme_subsys_register_ctrl() function. Cc: qemu-sta...@nongnu.org Fixes: 44c2c09488db ("hw/nvme: Add support for SR-IOV") Signed-off-by: Minwoo Im Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 76fe0397045b..94ef63945725 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8309,9 +8309,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) if (pci_is_vf(pci_dev)) { /* * VFs derive settings from the parent. PF's lifespan exceeds - * that of VF's, so it's safe to share params.serial. + * that of VF's. */ memcpy(>params, >params, sizeof(NvmeParams)); + +/* + * Set PF's serial value to a new string memory to prevent 'serial' + * property object release of PF when a VF is removed from the system. + */ +n->params.serial = g_strdup(pn->params.serial); n->subsys = pn->subsys; } -- 2.44.0
Re: [PATCH 0/2] hw/nvme: fix hibernation-based migration
On Mar 10 12:07, Klaus Jensen wrote: > Julien Grall, in #2184, reported that hibernation-based migration with > hw/nvme is broken when suspending on a pre 6.0 QEMU and resuming on a > more recent one. This is because the BAR layout was changed in 6.0. > > Fix this by adding a machine compatibility parameter that restores the > old behavior. > > Signed-off-by: Klaus Jensen > --- > Klaus Jensen (2): > hw/nvme: generalize the mbar size helper > hw/nvme: add machine compatibility parameter to enable msix exclusive > bar > > hw/core/machine.c | 1 + > hw/nvme/ctrl.c| 73 > --- > hw/nvme/nvme.h| 1 + > 3 files changed, 50 insertions(+), 25 deletions(-) > --- > base-commit: f901bf11b3ddf852e591593b09b8aa7a177f9a0b > change-id: 20240310-fix-msix-exclusive-bar-d65564414a2c > > Best regards, > -- > Klaus Jensen > Whoops, forgot Fixes: and Resolves: tags. Will add that on the pull. signature.asc Description: PGP signature
[PATCH 0/2] hw/nvme: fix hibernation-based migration
Julien Grall, in #2184, reported that hibernation-based migration with hw/nvme is broken when suspending on a pre 6.0 QEMU and resuming on a more recent one. This is because the BAR layout was changed in 6.0. Fix this by adding a machine compatibility parameter that restores the old behavior. Signed-off-by: Klaus Jensen --- Klaus Jensen (2): hw/nvme: generalize the mbar size helper hw/nvme: add machine compatibility parameter to enable msix exclusive bar hw/core/machine.c | 1 + hw/nvme/ctrl.c| 73 --- hw/nvme/nvme.h| 1 + 3 files changed, 50 insertions(+), 25 deletions(-) --- base-commit: f901bf11b3ddf852e591593b09b8aa7a177f9a0b change-id: 20240310-fix-msix-exclusive-bar-d65564414a2c Best regards, -- Klaus Jensen
[PATCH 1/2] hw/nvme: generalize the mbar size helper
From: Klaus Jensen Generalize the mbar size helper such that it can handle cases where the MSI-X table and PBA are expected to be in an exclusive bar. Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 76fe0397045b..8cca8a762124 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8003,13 +8003,18 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) memory_region_set_enabled(>pmr.dev->mr, false); } -static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, - unsigned *msix_table_offset, - unsigned *msix_pba_offset) +static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs, + unsigned *msix_table_offset, + unsigned *msix_pba_offset) { -uint64_t bar_size, msix_table_size, msix_pba_size; +uint64_t bar_size, msix_table_size; bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE; + +if (total_irqs == 0) { +goto out; +} + bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); if (msix_table_offset) { @@ -8024,11 +8029,10 @@ static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs, *msix_pba_offset = bar_size; } -msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size += msix_pba_size; +bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8; -bar_size = pow2ceil(bar_size); -return bar_size; +out: +return pow2ceil(bar_size); } static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) @@ -8036,7 +8040,7 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset) uint16_t vf_dev_id = n->params.use_intel_id ? PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME; NvmePriCtrlCap *cap = >pri_ctrl_cap; -uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm), +uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), le16_to_cpu(cap->vifrsm), NULL, NULL); @@ -8098,8 +8102,8 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } /* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, + _table_offset, _pba_offset); memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", -- 2.43.0
[PATCH 2/2] hw/nvme: add machine compatibility parameter to enable msix exclusive bar
From: Klaus Jensen Commit 1901b4967c3f ("hw/block/nvme: move msix table and pba to BAR 0") moved the MSI-X table and PBA to BAR 0 to make room for enabling CMR and PMR at the same time. As reported by Julien Grall in #2184, this breaks migration through system hibernation. Add a machine compatibility parameter and set it on machines pre 6.0 to enable the old behavior automatically, restoring the hibernation migration support. Reported-by: Julien Grall Tested-by: Julien Grall Signed-off-by: Klaus Jensen --- hw/core/machine.c | 1 + hw/nvme/ctrl.c| 51 +++ hw/nvme/nvme.h| 1 + 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 9ac5d5389a6c..f3012bca1370 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -100,6 +100,7 @@ GlobalProperty hw_compat_5_2[] = { { "PIIX4_PM", "smm-compat", "on"}, { "virtio-blk-device", "report-discard-granularity", "off" }, { "virtio-net-pci-base", "vectors", "3"}, +{ "nvme", "msix-exclusive-bar", "on"}, }; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8cca8a762124..649467723e7e 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7798,6 +7798,11 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp) } if (n->pmr.dev) { +if (params->msix_exclusive_bar) { +error_setg(errp, "not enough BARs available to enable PMR"); +return false; +} + if (host_memory_backend_is_mapped(n->pmr.dev)) { error_setg(errp, "can't use already busy memdev: %s", object_get_canonical_path_component(OBJECT(n->pmr.dev))); @@ -8101,24 +8106,36 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pcie_ari_init(pci_dev, 0x100); } -/* add one to max_ioqpairs to account for the admin queue pair */ -bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize, - _table_offset, _pba_offset); - -memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); -memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", - msix_table_offset); -memory_region_add_subregion(>bar0, 0, >iomem); - -if (pci_is_vf(pci_dev)) { -pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); -} else { +if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) { +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + bar_size); pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); + PCI_BASE_ADDRESS_MEM_TYPE_64, >iomem); +ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp); +} else { +/* add one to max_ioqpairs to account for the admin queue pair */ +bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, + n->params.msix_qsize, _table_offset, + _pba_offset); + +memory_region_init(>bar0, OBJECT(n), "nvme-bar0", bar_size); +memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n, "nvme", + msix_table_offset); +memory_region_add_subregion(>bar0, 0, >iomem); + +if (pci_is_vf(pci_dev)) { +pcie_sriov_vf_register_bar(pci_dev, 0, >bar0); +} else { +pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, >bar0); +} + +ret = msix_init(pci_dev, n->params.msix_qsize, +>bar0, 0, msix_table_offset, +>bar0, 0, msix_pba_offset, 0, errp); } -ret = msix_init(pci_dev, n->params.msix_qsize, ->bar0, 0, msix_table_offset, ->bar0, 0, msix_pba_offset, 0, errp); + if (ret == -ENOTSUP) { /* report that msix is not supported, but do not error out */ warn_report_err(*errp); @@ -8416,6 +8433,8 @@ static Property nvme_props[] = { params.sriov_max_vi_per_vf, 0), DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl, params.sriov_max_vq_per_vf, 0), +DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar, + false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/nvme/nvme.h b/hw
Re: [PATCH v2] hw/nvme/ns: Add NVMe NGUID property
On Feb 22 17:50, Nabih Estefan wrote: > From: Roque Arcudia Hernandez > > This patch adds a way to specify an NGUID for a given NVMe Namespace using a > string of hexadecimal digits with an optional '-' separator to group bytes. > For > instance: > > -device nvme-ns,nguid="e9accd3b83904e13167cf0593437f57d" > > If provided, the NGUID will be part of the Namespace Identification Descriptor > list and the Identify Namespace data. > > Signed-off-by: Roque Arcudia Hernandez > Signed-off-by: Nabih Estefan > Reviewed-by: Klaus Jensen Thanks, applied to nvme-next! signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: fix invalid endian conversion
On Feb 26 09:18, Philippe Mathieu-Daudé wrote: > On 22/2/24 10:29, Klaus Jensen wrote: > > From: Klaus Jensen > > > > numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian > > hosts results in numcntl being set to 0. > > > > Fix by dropping the endian conversion. > > > > Fixes: 746d42b13368 ("hw/nvme: Initialize capability structures for > > primary/secondary controllers") > > Reported-by: Kevin Wolf > > Signed-off-by: Klaus Jensen > > --- > > hw/nvme/ctrl.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > Hi Klaus, I'm not seeing other NVMe patches on the list, > so I'll queue this on my hw-misc tree, but feel free to > object and I'll unqueue :) > > Thanks, > No, thats perfect! Thanks! :) signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: fix invalid endian conversion
On Feb 22 11:08, Philippe Mathieu-Daudé wrote: > Hi Klaus, > > On 22/2/24 10:29, Klaus Jensen wrote: > > From: Klaus Jensen > > > > numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian > > hosts results in numcntl being set to 0. > > > > Fix by dropping the endian conversion. > > > > Fixes: 746d42b13368 ("hw/nvme: Initialize capability structures for > > primary/secondary controllers") > > Isn't it commit 99f48ae7ae ("Add support for Secondary Controller > List") instead? > Thanks Philippe, Yes, that is the commit that had the bug originally. I'll fix it up. signature.asc Description: PGP signature
[PATCH] hw/nvme: fix invalid endian conversion
From: Klaus Jensen numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian hosts results in numcntl being set to 0. Fix by dropping the endian conversion. Fixes: 746d42b13368 ("hw/nvme: Initialize capability structures for primary/secondary controllers") Reported-by: Kevin Wolf Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f026245d1e9e..76fe0397045b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7924,7 +7924,7 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); QTAILQ_INIT(>aer_queue); -list->numcntl = cpu_to_le16(max_vfs); +list->numcntl = max_vfs; for (i = 0; i < max_vfs; i++) { sctrl = >sec[i]; sctrl->pcid = cpu_to_le16(n->cntlid); --- base-commit: 760b4dcdddba4a40b9fa0eb78fdfc7eda7cb83d0 change-id: 20240222-fix-sriov-numcntl-5ebe46a42176 Best regards, -- Klaus Jensen
Re: [PATCH] hw/nvme/ns: Add NVMe NGUID property
On Feb 21 17:38, Nabih Estefan wrote: > From: Roque Arcudia Hernandez > > This patch adds a way to specify an NGUID for a given NVMe Namespace using a > string of hexadecimal digits with an optional '-' separator to group bytes. > For > instance: > > -device nvme-ns,nguid="e9accd3b83904e13167cf0593437f57d" > > If provided, the NGUID will be part of the Namespace Identification Descriptor > list and the Identify Namespace data. > > Signed-off-by: Roque Arcudia Hernandez > Signed-off-by: Nabih Estefan Hi Thanks! Looks good, Reviewed-by: Klaus Jensen Only a minor nit below. > diff --git a/hw/nvme/nguid.c b/hw/nvme/nguid.c > new file mode 100644 > index 00..3e3e0567c5 > --- /dev/null > +++ b/hw/nvme/nguid.c > @@ -0,0 +1,192 @@ > +/* > + * QEMU NVMe NGUID functions > + * > + * Copyright 2024 Google LLC > + * > + * This program is free software; you can redistribute it and/or modify it > + * under the terms of the GNU General Public License as published by the > + * Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, but > WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > + * for more details. > + */ > + > +#include "qemu/osdep.h" > +#include "qapi/visitor.h" > +#include "qemu/ctype.h" > +#include "nvme.h" > + > +#define NGUID_SEPARATOR '-' > + > +#define NGUID_VALUE_AUTO "auto" > + > +#define NGUID_FMT \ > +"%02hhx%02hhx%02hhx%02hhx" \ > +"%02hhx%02hhx%02hhx%02hhx" \ > +"%02hhx%02hhx%02hhx%02hhx" \ > +"%02hhx%02hhx%02hhx%02hhx" > + > +#define NGUID_STR_LEN (2 * NGUID_LEN + 1) > + > +bool nvme_nguid_is_null(const NvmeNGUID *nguid) > +{ > +int i; > +for (i = 0; i < NGUID_LEN; i++) { > +if (nguid->data[i] != 0) { > +return false; > +} > +} > +return true; > +} Maybe just bool nvme_nguid_is_null(const NvmeNGUID *nguid) { static NvmeNGUID null_nguid; return memcmp(nguid, _nguid, sizeof(NvmeNGUID)) == 0; } signature.asc Description: PGP signature
Re: [PATCH v6 01/15] hw/nvme: Use pcie_sriov_num_vfs()
On Feb 21 00:33, Akihiko Odaki wrote: > On 2024/02/20 23:53, Kevin Wolf wrote: > > Am 20.02.2024 um 15:29 hat Kevin Wolf geschrieben: > > > Am 20.02.2024 um 13:24 hat Akihiko Odaki geschrieben: > > > > nvme_sriov_pre_write_ctrl() used to directly inspect SR-IOV > > > > configurations to know the number of VFs being disabled due to SR-IOV > > > > configuration writes, but the logic was flawed and resulted in > > > > out-of-bound memory access. > > > > > > > > It assumed PCI_SRIOV_NUM_VF always has the number of currently enabled > > > > VFs, but it actually doesn't in the following cases: > > > > - PCI_SRIOV_NUM_VF has been set but PCI_SRIOV_CTRL_VFE has never been. > > > > - PCI_SRIOV_NUM_VF was written after PCI_SRIOV_CTRL_VFE was set. > > > > - VFs were only partially enabled because of realization failure. > > > > > > > > It is a responsibility of pcie_sriov to interpret SR-IOV configurations > > > > and pcie_sriov does it correctly, so use pcie_sriov_num_vfs(), which it > > > > provides, to get the number of enabled VFs before and after SR-IOV > > > > configuration writes. > > > > > > > > Cc: qemu-sta...@nongnu.org > > > > Fixes: CVE-2024-26328 > > > > Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization > > > > Management command") > > > > Suggested-by: Michael S. Tsirkin > > > > Signed-off-by: Akihiko Odaki > > > > --- > > > > hw/nvme/ctrl.c | 26 -- > > > > 1 file changed, 8 insertions(+), 18 deletions(-) > > > > > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > > > index f026245d1e9e..7a56e7b79b4d 100644 > > > > --- a/hw/nvme/ctrl.c > > > > +++ b/hw/nvme/ctrl.c > > > > @@ -8466,36 +8466,26 @@ static void nvme_pci_reset(DeviceState *qdev) > > > > nvme_ctrl_reset(n, NVME_RESET_FUNCTION); > > > > } > > > > -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, > > > > - uint32_t val, int len) > > > > +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t > > > > old_num_vfs) > > > > { > > > > NvmeCtrl *n = NVME(dev); > > > > NvmeSecCtrlEntry *sctrl; > > > > -uint16_t sriov_cap = dev->exp.sriov_cap; > > > > -uint32_t off = address - sriov_cap; > > > > -int i, num_vfs; > > > > +int i; > > > > -if (!sriov_cap) { > > > > -return; > > > > -} > > > > - > > > > -if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { > > > > -if (!(val & PCI_SRIOV_CTRL_VFE)) { > > > > -num_vfs = pci_get_word(dev->config + sriov_cap + > > > > PCI_SRIOV_NUM_VF); > > > > -for (i = 0; i < num_vfs; i++) { > > > > -sctrl = >sec_ctrl_list.sec[i]; > > > > -nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), > > > > false); > > > > -} > > > > -} > > > > +for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { > > > > +sctrl = >sec_ctrl_list.sec[i]; > > > > +nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); > > > > } > > > > } > > > > > > Maybe I'm missing something, but if the concern is that 'i' could run > > > beyond the end of the array, I don't see anything that limits > > > pcie_sriov_num_vfs() to the static size of 127 that n->sec_ctrl_list.sec > > > has. register_vfs() seems to just take whatever 16 bit value the guest > > > wrote without imposing additional restrictions. > > > > > > If there is some mechanism that makes register_vf() fail if we exceed > > > the limit, maybe an assertion with a comment would be in order because > > > it doesn't seem obvious. I couldn't find any code that enforces it, > > > sriov_max_vfs only ever seems to be used as a hint for the guest. > > > > > > If not, do we need another check that fails gracefully in the error > > > case? > > > > Ok, I see now that patch 2 fixes this. But then the commit message is > > wrong because it implies that this patch is the only thing you need to > > fix the problem with nvme. You can't say "Fixes: CVE-2024-26328" if half > > of the fix is still missing. > > I didn't assign CVE-2024-26328 for the case that the value of > PCI_SRIOV_NUM_VF is greater than PCI_SRIOV_TOTAL_VF; it's what > CVE-2024-26327 deals with. > > The problem I dealt here is that the value of PCI_SRIOV_NUM_VF may not > represent the actual number of enabled VFs because another register > (PCI_SRIOV_CTRL_VFE) is not set, for example. > > If an assertion to be added, I think it should be in pcie_sriov_num_vfs() > and ensure the returning value is less than the value of PCI_SRIOV_TOTAL_VF > (aka sriov_max_vfs in hw/nvme/ctrl.c), but I think it's fine without it. > > > > > Maybe asserting old_num_vfs < n->sec_ctrl_list.numcntl would still be a > > good idea. But looking at this one, it seems to me that numcntl isn't > > completely correct either: > > > > list->numcntl = cpu_to_le16(max_vfs); > > > > Both list->numcntl and max_vfs are uint8_t, so I think this will always > > be 0 on big endian
Re: [PATCH v6 01/15] hw/nvme: Use pcie_sriov_num_vfs()
On Feb 20 15:29, Kevin Wolf wrote: > Am 20.02.2024 um 13:24 hat Akihiko Odaki geschrieben: > > nvme_sriov_pre_write_ctrl() used to directly inspect SR-IOV > > configurations to know the number of VFs being disabled due to SR-IOV > > configuration writes, but the logic was flawed and resulted in > > out-of-bound memory access. > > > > It assumed PCI_SRIOV_NUM_VF always has the number of currently enabled > > VFs, but it actually doesn't in the following cases: > > - PCI_SRIOV_NUM_VF has been set but PCI_SRIOV_CTRL_VFE has never been. > > - PCI_SRIOV_NUM_VF was written after PCI_SRIOV_CTRL_VFE was set. > > - VFs were only partially enabled because of realization failure. > > > > It is a responsibility of pcie_sriov to interpret SR-IOV configurations > > and pcie_sriov does it correctly, so use pcie_sriov_num_vfs(), which it > > provides, to get the number of enabled VFs before and after SR-IOV > > configuration writes. > > > > Cc: qemu-sta...@nongnu.org > > Fixes: CVE-2024-26328 > > Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization > > Management command") > > Suggested-by: Michael S. Tsirkin > > Signed-off-by: Akihiko Odaki > > --- > > hw/nvme/ctrl.c | 26 -- > > 1 file changed, 8 insertions(+), 18 deletions(-) > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > index f026245d1e9e..7a56e7b79b4d 100644 > > --- a/hw/nvme/ctrl.c > > +++ b/hw/nvme/ctrl.c > > @@ -8466,36 +8466,26 @@ static void nvme_pci_reset(DeviceState *qdev) > > nvme_ctrl_reset(n, NVME_RESET_FUNCTION); > > } > > > > -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, > > - uint32_t val, int len) > > +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t > > old_num_vfs) > > { > > NvmeCtrl *n = NVME(dev); > > NvmeSecCtrlEntry *sctrl; > > -uint16_t sriov_cap = dev->exp.sriov_cap; > > -uint32_t off = address - sriov_cap; > > -int i, num_vfs; > > +int i; > > > > -if (!sriov_cap) { > > -return; > > -} > > - > > -if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { > > -if (!(val & PCI_SRIOV_CTRL_VFE)) { > > -num_vfs = pci_get_word(dev->config + sriov_cap + > > PCI_SRIOV_NUM_VF); > > -for (i = 0; i < num_vfs; i++) { > > -sctrl = >sec_ctrl_list.sec[i]; > > -nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); > > -} > > -} > > +for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { > > +sctrl = >sec_ctrl_list.sec[i]; > > +nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); > > } > > } > > Maybe I'm missing something, but if the concern is that 'i' could run > beyond the end of the array, I don't see anything that limits > pcie_sriov_num_vfs() to the static size of 127 that n->sec_ctrl_list.sec > has. register_vfs() seems to just take whatever 16 bit value the guest > wrote without imposing additional restrictions. > Hi Kevin, Thanks for reviewing, I believe patch 2 in this series fixes that missing validation of NumVFs. signature.asc Description: PGP signature
Re: [PATCH v5 01/11] hw/nvme: Use pcie_sriov_num_vfs()
On Feb 18 13:56, Akihiko Odaki wrote: > nvme_sriov_pre_write_ctrl() used to directly inspect SR-IOV > configurations to know the number of VFs being disabled due to SR-IOV > configuration writes, but the logic was flawed and resulted in > out-of-bound memory access. > > It assumed PCI_SRIOV_NUM_VF always has the number of currently enabled > VFs, but it actually doesn't in the following cases: > - PCI_SRIOV_NUM_VF has been set but PCI_SRIOV_CTRL_VFE has never been. > - PCI_SRIOV_NUM_VF was written after PCI_SRIOV_CTRL_VFE was set. > - VFs were only partially enabled because of realization failure. > > It is a responsibility of pcie_sriov to interpret SR-IOV configurations > and pcie_sriov does it correctly, so use pcie_sriov_num_vfs(), which it > provides, to get the number of enabled VFs before and after SR-IOV > configuration writes. > > Cc: qemu-sta...@nongnu.org > Fixes: 11871f53ef8e ("hw/nvme: Add support for the Virtualization Management > command") > Suggested-by: Michael S. Tsirkin > Signed-off-by: Akihiko Odaki Thanks Akihiko, I'll pick this up for hw/nvme nvme-next as-is. Reviewed-by: Klaus Jensen > --- > hw/nvme/ctrl.c | 26 -- > 1 file changed, 8 insertions(+), 18 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index f026245d1e9e..7a56e7b79b4d 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -8466,36 +8466,26 @@ static void nvme_pci_reset(DeviceState *qdev) > nvme_ctrl_reset(n, NVME_RESET_FUNCTION); > } > > -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, > - uint32_t val, int len) > +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t > old_num_vfs) > { > NvmeCtrl *n = NVME(dev); > NvmeSecCtrlEntry *sctrl; > -uint16_t sriov_cap = dev->exp.sriov_cap; > -uint32_t off = address - sriov_cap; > -int i, num_vfs; > +int i; > > -if (!sriov_cap) { > -return; > -} > - > -if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { > -if (!(val & PCI_SRIOV_CTRL_VFE)) { > -num_vfs = pci_get_word(dev->config + sriov_cap + > PCI_SRIOV_NUM_VF); > -for (i = 0; i < num_vfs; i++) { > -sctrl = >sec_ctrl_list.sec[i]; > -nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); > -} > -} > +for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { > +sctrl = >sec_ctrl_list.sec[i]; > +nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); > } > } > > static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, >uint32_t val, int len) > { > -nvme_sriov_pre_write_ctrl(dev, address, val, len); > +uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); > + > pci_default_write_config(dev, address, val, len); > pcie_cap_flr_write_config(dev, address, val, len); > +nvme_sriov_post_write_config(dev, old_num_vfs); > } > > static const VMStateDescription nvme_vmstate = { > > -- > 2.43.1 > -- One of us - No more doubt, silence or taboo about mental illness. signature.asc Description: PGP signature
Re: [PATCH v4 0/3] Initial support for SPDM Responders
On Feb 15 14:44, Jonathan Cameron wrote: > On Tue, 13 Feb 2024 12:44:00 +1000 > Alistair Francis wrote: > > Hi All, > > Just wanted to add that back in v2 Klaus Jensen stated: > > "I have no problem with picking this up for nvme, but I'd rather not take > the full series through my tree without reviews/acks from the pci > maintainers." > > So I'd like to add my request that Michael and/or Marcell takes a look > when they have time. > > I've been carrying more or less the first 2 patches in my CXL staging > tree for a couple of years (the initial Linux Kernel support that Lukas > Wunner is now handling was developed against this) and I would love > to see this upstream. Along with PCI and CXL and NVME usecases this > is a major part of the Confidential Compute device assignment story > via PCI/TDISP and CXL equivalent. > > It's not changed in significant ways since v2 back in October last year. > Would someone be willing to sign up to maintain the spdm_socket backend? signature.asc Description: PGP signature
Re: [PATCH] hw/nvme: fix invalid check on mcl
On Feb 8 21:33, Minwoo Im wrote: > On 24-02-08 13:22:48, Klaus Jensen wrote: > > From: Klaus Jensen > > > > The number of logical blocks within a source range is converted into a > > 1s based number at the time of parsing. However, when verifying the copy > > length we add one again, causing the check against MCL to fail in error. > > > > Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") > > Signed-off-by: Klaus Jensen > > Hi Klaus, > > Reviewed-by: Minwoo Im > > Thanks! > Thanks Minwoo, applied to nvme-next! signature.asc Description: PGP signature
[PATCH] hw/nvme: fix invalid check on mcl
From: Klaus Jensen The number of logical blocks within a source range is converted into a 1s based number at the time of parsing. However, when verifying the copy length we add one again, causing the check against MCL to fail in error. Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY") Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f026245d1e9e..05c667158a3a 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -2855,7 +2855,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, , NULL, NULL, NULL); -copy_len += nlb + 1; +copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { --- base-commit: 39a6e4f87e7b75a45b08d6dc8b8b7c2954c87440 change-id: 20240208-fix-copy-mcl-check-3a6d95327154 Best regards, -- Klaus Jensen
Re: NVME hotplug support ?
On Jan 29 14:13, Damien Hedde wrote: > > > On 1/24/24 08:47, Hannes Reinecke wrote: > > On 1/24/24 07:52, Philippe Mathieu-Daudé wrote: > > > Hi Hannes, > > > > > > [+Markus as QOM/QDev rubber duck] > > > > > > On 23/1/24 13:40, Hannes Reinecke wrote: > > > > On 1/23/24 11:59, Damien Hedde wrote: > > > > > Hi all, > > > > > > > > > > We are currently looking into hotplugging nvme devices and > > > > > it is currently not possible: > > > > > When nvme was introduced 2 years ago, the feature was disabled. > > > > > > commit cc6fb6bc506e6c47ed604fcb7b7413dff0b7d845 > > > > > > Author: Klaus Jensen > > > > > > Date: Tue Jul 6 10:48:40 2021 +0200 > > > > > > > > > > > > hw/nvme: mark nvme-subsys non-hotpluggable > > > > > > We currently lack the infrastructure to handle > > > > > > subsystem hotplugging, so > > > > > > disable it. > > > > > > > > > > Do someone know what's lacking or anyone have some tips/idea > > > > > of what we should develop to add the support ? > > > > > > > > > Problem is that the object model is messed up. In qemu > > > > namespaces are attached to controllers, which in turn are > > > > children of the PCI device. > > > > There are subsystems, but these just reference the controller. > > > > > > > > So if you hotunplug the PCI device you detach/destroy the > > > > controller and detach the namespaces from the controller. > > > > But if you hotplug the PCI device again the NVMe controller will > > > > be attached to the PCI device, but the namespace are still be > > > > detached. > > > > > > > > Klaus said he was going to fix that, and I dimly remember some patches > > > > floating around. But apparently it never went anywhere. > > > > > > > > Fundamental problem is that the NVMe hierarchy as per spec is > > > > incompatible with the qemu object model; qemu requires a strict > > > > tree model where every object has exactly _one_ parent. > > > > > > The modelling problem is not clear to me. > > > Do you have an example of how the NVMe hierarchy should be? > > > > > Sure. > > > > As per NVMe spec we have this hierarchy: > > > > ---> subsys --- > > | | > > | V > > controller namespaces > > > > There can be several controllers, and several > > namespaces. > > The initiator (ie the linux 'nvme' driver) connects > > to a controller, queries the subsystem for the attached > > namespaces, and presents each namespace as a block device. > > > > For Qemu we have the problem that every device _must_ be > > a direct descendant of the parent (expressed by the fact > > that each 'parent' object is embedded in the device object). > > > > So if we were to present a NVMe PCI device, the controller > > must be derived from the PCI device: > > > > pci -> controller > > > > but now we have to express the NVMe hierarchy, too: > > > > pci -> ctrl1 -> subsys1 -> namespace1 > > > > which actually works. > > We can easily attach several namespaces: > > > > pci -> ctrl1 ->subsys1 -> namespace2 > > > > For a single controller and a single subsystem. > > However, as mentioned above, there can be _several_ > > controllers attached to the same subsystem. > > So we can express the second controller: > > > > pci -> ctrl2 > > > > but we cannot attach the controller to 'subsys1' > > as then 'subsys1' would need to be derived from > > 'ctrl2', and not (as it is now) from 'ctrl1'. > > > > The most logical step would be to have 'subsystems' > > their own entity, independent of any controllers. > > But then the block devices (which are derived from > > the namespaces) could not be traced back > > to the PCI device, and a PCI hotplug would not > > 'automatically' disconnect the nvme block devices. > > > > Plus the subsystem would be independent from the NVMe > > PCI devices, so you could have a subsystem with > > no controllers attached. And one would wonder who > > should be responsible for cleaning up that. > > > > Thanks for the details ! > > My use case is the simple one with no nvme subsystem/namespaces: > - hotplug a pci nvme device (nvme controller) as in the following CLI (which > automatically put the drive into a default namespace) > > ./qemu-system-aarch64 -nographic -M virt \ >-drive file=nvme0.disk,if=none,id=nvme-drive0 \ >-device nvme,serial=nvme0,id=nvmedev0,drive=nvme-drive0 > AFAIK, you just need a pci root port to plug the device into. -drive file=nvme0.disk,if=none,id=nvme-drive0 \ -device "pcie-root-port,id=pcie_root_port0,chassis=1,slot=0" \ -device nvme,serial=nvme0,id=nvmedev0,drive=nvme-drive0 Then, you can use the qemu monitor to `device_del nvmedev0` and add it with `device_add nvme,serial=nvme0,id=nvmedev0,drive=nvme-drive0`. The "drive" (blockdev) will stick around after the device_del. signature.asc Description: PGP signature
Re: NVME hotplug support ?
On Jan 23 13:40, Hannes Reinecke wrote: > On 1/23/24 11:59, Damien Hedde wrote: > > Hi all, > > > > We are currently looking into hotplugging nvme devices and it is currently > > not possible: > > When nvme was introduced 2 years ago, the feature was disabled. > > > commit cc6fb6bc506e6c47ed604fcb7b7413dff0b7d845 > > > Author: Klaus Jensen > > > Date: Tue Jul 6 10:48:40 2021 +0200 > > > > > > hw/nvme: mark nvme-subsys non-hotpluggable > > > We currently lack the infrastructure to handle subsystem hotplugging, > > > so > > > disable it. > > > > Do someone know what's lacking or anyone have some tips/idea of what we > > should develop to add the support ? > > > Problem is that the object model is messed up. In qemu namespaces are > attached to controllers, which in turn are children of the PCI device. > There are subsystems, but these just reference the controller. > > So if you hotunplug the PCI device you detach/destroy the controller and > detach the namespaces from the controller. > But if you hotplug the PCI device again the NVMe controller will be attached > to the PCI device, but the namespace are still be detached. > > Klaus said he was going to fix that, and I dimly remember some patches > floating around. But apparently it never went anywhere. > > Fundamental problem is that the NVMe hierarchy as per spec is incompatible > with the qemu object model; qemu requires a strict > tree model where every object has exactly _one_ parent. > A little history might help to nuance this just a bit. And to defend the current model ;) When we added support for multiple namespaces we did not consider subsystem support, so the namespaces would just be associated directly with a parent controller (in QDev terms, the parent has a bus that the namespace devices are attached to). When we added subsystems, where namespaces may be attached to several controllers, it became necessary to break the controller/namespace parent/child relationship. The problem was that removing the controller would take all the bus children with it, causing namespaces to be removed from other controllers in the subsystem. We fixed this by reparenting the namespaces to the subsystem device instead. I think this model fits the NVMe hierarchy as good as possible. Controllers and namespaces are considered children of the subsystem (as they are in NVMe). Now, the problem with namespaces not being re-attached is partly false. If the namespaces are 'shared=on', they will be automatically attached to any new controller attached to the subsystem. However, if they are private, that is is not the case. In NVMe, a private namespace just means a namespace that can only be attached to a single controller at a time. It is not entirely unlikely that you have a private namespace that you then reassign to controller B when controller A is removed. Now, what we could do is track the last controller identifier that a private namespace was attached to, and if the same controller identifier is added to the subsystem, we could reattach the private namespace. However, broadly, I think the current model does a pretty good job in supporting experimentation with hotplug, multipath and failover configurations. signature.asc Description: PGP signature
Re: NVME hotplug support ?
On Jan 23 10:59, Damien Hedde wrote: > Hi all, > > We are currently looking into hotplugging nvme devices and it is currently > not possible: > When nvme was introduced 2 years ago, the feature was disabled. > > commit cc6fb6bc506e6c47ed604fcb7b7413dff0b7d845 > > Author: Klaus Jensen > > Date: Tue Jul 6 10:48:40 2021 +0200 > > > >hw/nvme: mark nvme-subsys non-hotpluggable > > > >We currently lack the infrastructure to handle subsystem hotplugging, so > >disable it. > > Do someone know what's lacking or anyone have some tips/idea of what we > should develop to add the support ? > > Regards, > -- > Damien > That's not entirely true. The *subsystem* is non-hotpluggable, but individual controllers can be hotplugged. Even into an existing subsystem. However, you cannot hotplug pci devices unless you set up a pcie root port. Say, -device "pcie-root-port,id=pcie_root_port0,chassis=1,slot=0" -device "nvme,id=nvme0,serial=nvme0,bus=pcie_root_port0" nvme0 can then be removed with device_del and added back as a new device with device_add. signature.asc Description: PGP signature
Re: [RFC v2 0/7] Add persistence to NVMe ZNS emulation
On Nov 27 16:56, Sam Li wrote: > ZNS emulation follows NVMe ZNS spec but the state of namespace > zones does not persist accross restarts of QEMU. This patch makes the > metadata of ZNS emulation persistent by using new block layer APIs and > the qcow2 img as backing file. It is the second part after the patches > - adding full zoned storage emulation to qcow2 driver. > https://patchwork.kernel.org/project/qemu-devel/cover/20231127043703.49489-1-faithilike...@gmail.com/ > > The metadata of ZNS emulation divides into two parts, zone metadata and > zone descriptor extension data. The zone metadata is composed of zone > states, zone type, wp and zone attributes. The zone information can be > stored at an uint64_t wp to save space and easy access. The structure of > wp of each zone is as follows: > |(4)| zone type (1)| zone attr (8)| wp (51) || > > The zone descriptor extension data is relatively small comparing to the > overall size therefore we adopt the option that store zded of all zones > in an array regardless of the valid bit set. > > Creating a zns format qcow2 image file adds one more option zd_extension_size > to zoned device configurations. > > To attach this file as emulated zns drive in the command line of QEMU, use: > -drive file=${znsimg},id=nvmezns0,format=qcow2,if=none \ > -device nvme-ns,drive=nvmezns0,bus=nvme0,nsid=1,uuid=xxx \ > > Sorry, send this one more time due to network problems. > > v1->v2: > - split [v1 2/5] patch to three (doc, config, block layer API) > - adapt qcow2 v6 > > Sam Li (7): > docs/qcow2: add zd_extension_size option to the zoned format feature > qcow2: add zd_extension configurations to zoned metadata > hw/nvme: use blk_get_*() to access zone info in the block layer > hw/nvme: add blk_get_zone_extension to access zd_extensions > hw/nvme: make the metadata of ZNS emulation persistent > hw/nvme: refactor zone append write using block layer APIs > hw/nvme: make ZDED persistent > > block/block-backend.c | 88 ++ > block/qcow2.c | 119 ++- > block/qcow2.h |2 + > docs/interop/qcow2.txt|3 + > hw/nvme/ctrl.c| 1247 - > hw/nvme/ns.c | 162 +--- > hw/nvme/nvme.h| 95 +-- > include/block/block-common.h |9 + > include/block/block_int-common.h |8 + > include/sysemu/block-backend-io.h | 11 + > include/sysemu/dma.h |3 + > qapi/block-core.json |4 + > system/dma-helpers.c | 17 + > 13 files changed, 647 insertions(+), 1121 deletions(-) > > -- > 2.40.1 > Hi Sam, This is awesome. For the hw/nvme parts, Acked-by: Klaus Jensen I'll give it a proper R-b when you drop the RFC status. signature.asc Description: PGP signature
Re: [PATCH v2 3/3] hw/nvme: Add SPDM over DOE support
On Oct 17 15:21, Alistair Francis wrote: > From: Wilfred Mallawa > > Setup Data Object Exchance (DOE) as an extended capability for the NVME > controller and connect SPDM to it (CMA) to it. > > Signed-off-by: Wilfred Mallawa > Signed-off-by: Alistair Francis Acked-by: Klaus Jensen I have no problem with picking this up for nvme, but I'd rather not take the full series through my tree without reviews/acks from the pci maintainers. signature.asc Description: PGP signature
Re: [RFC PATCH v3 65/78] hw/nvme: add fallthrough pseudo-keyword
le *dif, > break; > } > > -/* fallthrough */ > +fallthrough; > case NVME_ID_NS_DPS_TYPE_1: > case NVME_ID_NS_DPS_TYPE_2: > if (be16_to_cpu(dif->g16.apptag) != 0x) { > @@ -229,7 +229,7 @@ static uint16_t nvme_dif_prchk_crc64(NvmeNamespace *ns, > NvmeDifTuple *dif, > break; > } > > -/* fallthrough */ > +fallthrough; > case NVME_ID_NS_DPS_TYPE_1: > case NVME_ID_NS_DPS_TYPE_2: > if (be16_to_cpu(dif->g64.apptag) != 0x) { > -- > 2.39.2 > > Reviewed-by: Klaus Jensen signature.asc Description: PGP signature
Re: [PATCH 3/3] hw/nvme: Add SPDM over DOE support
On Sep 15 21:27, Alistair Francis wrote: > From: Wilfred Mallawa > > Setup Data Object Exchance (DOE) as an extended capability for the NVME > controller and connect SPDM to it (CMA) to it. > > Signed-off-by: Wilfred Mallawa > Signed-off-by: Alistair Francis > --- > docs/specs/index.rst| 1 + > docs/specs/spdm.rst | 56 + > include/hw/pci/pci_device.h | 5 > include/hw/pci/pcie_doe.h | 3 ++ > hw/nvme/ctrl.c | 52 ++ > hw/nvme/trace-events| 1 + > 6 files changed, 118 insertions(+) > create mode 100644 docs/specs/spdm.rst > This looks reasonable enough, but could this not be implemented at the PCI layer? I do not see anything that is tied specifically to the nvme device, why can the spdm parameter not be a PCIDevice parameter such that all PCIDevice-derived devices gains this functionality? signature.asc Description: PGP signature
[PATCH] hw/nvme: Clean up local variable shadowing in nvme_ns_init()
From: Klaus Jensen Fix local variable shadowing in nvme_ns_init(). Reported-by: Markus Armbruster Signed-off-by: Klaus Jensen --- hw/nvme/ns.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 44aba8f4d9cf..0eabcf5cf500 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -107,7 +107,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) ns->pif = ns->params.pif; -static const NvmeLBAF lbaf[16] = { +static const NvmeLBAF defaults[16] = { [0] = { .ds = 9 }, [1] = { .ds = 9, .ms = 8 }, [2] = { .ds = 9, .ms = 16 }, @@ -120,7 +120,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) ns->nlbaf = 8; -memcpy(_ns->lbaf, , sizeof(lbaf)); +memcpy(_ns->lbaf, , sizeof(defaults)); for (i = 0; i < ns->nlbaf; i++) { NvmeLBAF *lbaf = _ns->lbaf[i]; --- base-commit: b55e4b9c0525560577384adfc6d30eb0daa8d7be change-id: 20230925-fix-local-shadowing-9606793e8ae9 Best regards, -- Klaus Jensen
Re: [PATCH v6 0/3] hw/{i2c,nvme}: mctp endpoint, nvme management interface model
On Sep 20 07:54, Corey Minyard wrote: > On Wed, Sep 20, 2023 at 12:48:03PM +0100, Jonathan Cameron via wrote: > > On Thu, 14 Sep 2023 11:53:40 +0200 > > Klaus Jensen wrote: > > > > > This adds a generic MCTP endpoint model that other devices may derive > > > from. > > > > > > Also included is a very basic implementation of an NVMe-MI device, > > > supporting only a small subset of the required commands. > > > > > > Since this all relies on i2c target mode, this can currently only be > > > used with an SoC that includes the Aspeed I2C controller. > > > > > > The easiest way to get up and running with this, is to grab my buildroot > > > overlay[1] (aspeed_ast2600evb_nmi_defconfig). It includes modified a > > > modified dts as well as a couple of required packages. > > > > > > QEMU can then be launched along these lines: > > > > > > qemu-system-arm \ > > > -nographic \ > > > -M ast2600-evb \ > > > -kernel output/images/zImage \ > > > -initrd output/images/rootfs.cpio \ > > > -dtb output/images/aspeed-ast2600-evb-nmi.dtb \ > > > -nic user,hostfwd=tcp::-:22 \ > > > -device nmi-i2c,address=0x3a \ > > > -serial mon:stdio > > > > > > From within the booted system, > > > > > > mctp addr add 8 dev mctpi2c15 > > > mctp link set mctpi2c15 up > > > mctp route add 9 via mctpi2c15 > > > mctp neigh add 9 dev mctpi2c15 lladdr 0x3a > > > mi-mctp 1 9 info > > > > > > Comments are very welcome! > > > > > > [1]: https://github.com/birkelund/hwtests/tree/main/br2-external > > > > > > Signed-off-by: Klaus Jensen > > > > Hi Klaus, > > > > Silly question, but who is likely to pick this up? + likely to be soon? > > > > I'm going to post the CXL stuff that makes use of the core support shortly > > and whilst I can point at this patch set on list, I'd keen to see it > > upstream > > to reduce the dependencies (it's got 2 sets ahead of it of CXL stuff > > anyway but that will all hopefully go through Michael Tsirkin's tree > > for PCI stuff in one go). > > I can pick it up, but he can just request a merge, too. > > I did have a question I asked earlier about tests. It would be unusual > at this point to add something like this without having some tests, > especially injecting invalid data. > Hi all, Sorry for the late reply. I'm currently at SDC, but I will write up some tests when I get back to in the office on Monday. Corey, what kinds of tests would be best here? Avocado "acceptance" tests or would you like to see something lower level? Cheers, Klaus signature.asc Description: PGP signature
[PATCH v6 3/3] hw/nvme: add nvme management interface model
From: Klaus Jensen Add the 'nmi-i2c' device that emulates an NVMe Management Interface controller. Initial support is very basic (Read NMI DS, Configuration Get). This is based on previously posted code by Padmakar Kalghatgi, Arun Kumar Agasar and Saurav Kumar. Reviewed-by: Jonathan Cameron Signed-off-by: Klaus Jensen --- hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c| 407 +++ hw/nvme/trace-events | 6 + 4 files changed, 418 insertions(+) diff --git a/hw/nvme/Kconfig b/hw/nvme/Kconfig index cfa2ab0f9d5a..e1f6360c0f4b 100644 --- a/hw/nvme/Kconfig +++ b/hw/nvme/Kconfig @@ -2,3 +2,7 @@ config NVME_PCI bool default y if PCI_DEVICES || PCIE_DEVICES depends on PCI + +config NVME_NMI_I2C +bool +default y if I2C_MCTP diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7bc85f31c149 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1,2 @@ system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_NMI_I2C', if_true: files('nmi-i2c.c')) diff --git a/hw/nvme/nmi-i2c.c b/hw/nvme/nmi-i2c.c new file mode 100644 index ..bf4648db0457 --- /dev/null +++ b/hw/nvme/nmi-i2c.c @@ -0,0 +1,407 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * + * SPDX-FileContributor: Padmakar Kalghatgi + * SPDX-FileContributor: Arun Kumar Agasar + * SPDX-FileContributor: Saurav Kumar + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/crc32c.h" +#include "hw/registerfields.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" +#include "trace.h" + +/* NVM Express Management Interface 1.2c, Section 3.1 */ +#define NMI_MAX_MESSAGE_LENGTH 4224 + +#define TYPE_NMI_I2C_DEVICE "nmi-i2c" +OBJECT_DECLARE_SIMPLE_TYPE(NMIDevice, NMI_I2C_DEVICE) + +typedef struct NMIDevice { +MCTPI2CEndpoint mctp; + +uint8_t buffer[NMI_MAX_MESSAGE_LENGTH]; +uint8_t scratch[NMI_MAX_MESSAGE_LENGTH]; + +size_t len; +int64_t pos; +} NMIDevice; + +FIELD(NMI_MCTPD, MT, 0, 7) +FIELD(NMI_MCTPD, IC, 7, 1) + +#define NMI_MCTPD_MT_NMI 0x4 +#define NMI_MCTPD_IC_ENABLED 0x1 + +FIELD(NMI_NMP, ROR, 7, 1) +FIELD(NMI_NMP, NMIMT, 3, 4) + +#define NMI_NMP_NMIMT_NVME_MI 0x1 +#define NMI_NMP_NMIMT_NVME_ADMIN 0x2 + +typedef struct NMIMessage { +uint8_t mctpd; +uint8_t nmp; +uint8_t rsvd2[2]; +uint8_t payload[]; /* includes the Message Integrity Check */ +} NMIMessage; + +typedef struct NMIRequest { + uint8_t opc; + uint8_t rsvd1[3]; + uint32_t dw0; + uint32_t dw1; + uint32_t mic; +} NMIRequest; + +FIELD(NMI_CMD_READ_NMI_DS_DW0, DTYP, 24, 8) + +typedef enum NMIReadDSType { +NMI_CMD_READ_NMI_DS_SUBSYSTEM = 0x0, +NMI_CMD_READ_NMI_DS_PORTS = 0x1, +NMI_CMD_READ_NMI_DS_CTRL_LIST = 0x2, +NMI_CMD_READ_NMI_DS_CTRL_INFO = 0x3, +NMI_CMD_READ_NMI_DS_OPT_CMD_SUPPORT = 0x4, +NMI_CMD_READ_NMI_DS_MEB_CMD_SUPPORT = 0x5, +} NMIReadDSType; + +#define NMI_STATUS_INVALID_PARAMETER 0x4 + +static void nmi_scratch_append(NMIDevice *nmi, const void *buf, size_t count) +{ +assert(nmi->pos + count <= NMI_MAX_MESSAGE_LENGTH); + +memcpy(nmi->scratch + nmi->pos, buf, count); +nmi->pos += count; +} + +static void nmi_set_parameter_error(NMIDevice *nmi, uint8_t bit, uint16_t byte) +{ +/* NVM Express Management Interface 1.2c, Figure 30 */ +struct resp { +uint8_t status; +uint8_t bit; +uint16_t byte; +}; + +struct resp buf = { +.status = NMI_STATUS_INVALID_PARAMETER, +.bit = bit & 0x3, +.byte = byte, +}; + +nmi_scratch_append(nmi, , sizeof(buf)); +} + +static void nmi_set_error(NMIDevice *nmi, uint8_t status) +{ +const uint8_t buf[4] = {status,}; + +nmi_scratch_append(nmi, buf, sizeof(buf)); +} + +static void nmi_handle_mi_read_nmi_ds(NMIDevice *nmi, NMIRequest *request) +{ +I2CSlave *i2c = I2C_SLAVE(nmi); + +uint32_t dw0 = le32_to_cpu(request->dw0); +uint8_t dtyp = FIELD_EX32(dw0, NMI_CMD_READ_NMI_DS_DW0, DTYP); + +trace_nmi_handle_mi_read_nmi_ds(dtyp); + +static const uint8_t nmi_ds_subsystem[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x00, /* number of ports */ +0x01, /* major version */ +0x01, /* minor version */ +}; + +/* + * Cannot be static (or const) since we need to patch in the i2c + * address. + */ +const uint8_t nmi_ds_ports[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x02, /* por
[PATCH v6 2/3] hw/i2c: add mctp core
From: Klaus Jensen Add an abstract MCTP over I2C endpoint model. This implements MCTP control message handling as well as handling the actual I2C transport (packetization). Devices are intended to derive from this and implement the class methods. Parts of this implementation is inspired by code[1] previously posted by Jonathan Cameron. Squashed a fix[2] from Matt Johnston. [1]: https://lore.kernel.org/qemu-devel/20220520170128.4436-1-jonathan.came...@huawei.com/ [2]: https://lore.kernel.org/qemu-devel/20221121080445.ga29...@codeconstruct.com.au/ Tested-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Signed-off-by: Klaus Jensen --- MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 432 ++ hw/i2c/meson.build| 1 + hw/i2c/trace-events | 13 ++ include/hw/i2c/mctp.h | 125 +++ include/net/mctp.h| 35 8 files changed, 618 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 00562f924f7a..3208ebb1bcde 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3404,6 +3404,13 @@ F: tests/qtest/adm1272-test.c F: tests/qtest/max34451-test.c F: tests/qtest/isl_pmbus_vr-test.c +MCTP I2C Transport +M: Klaus Jensen +S: Maintained +F: hw/i2c/mctp.c +F: include/hw/i2c/mctp.h +F: include/net/mctp.h + Firmware schema specifications M: Philippe Mathieu-Daudé R: Daniel P. Berrange diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 7e6834844051..5bcb1e0e8a6f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -541,6 +541,7 @@ config ASPEED_SOC select DS1338 select FTGMAC100 select I2C +select I2C_MCTP select DPS310 select PCA9552 select SERIAL diff --git a/hw/i2c/Kconfig b/hw/i2c/Kconfig index 14886b35dac2..2b2a50b83d1e 100644 --- a/hw/i2c/Kconfig +++ b/hw/i2c/Kconfig @@ -6,6 +6,10 @@ config I2C_DEVICES # to any board's i2c bus bool +config I2C_MCTP +bool +select I2C + config SMBUS bool select I2C diff --git a/hw/i2c/mctp.c b/hw/i2c/mctp.c new file mode 100644 index ..8d8e74567745 --- /dev/null +++ b/hw/i2c/mctp.c @@ -0,0 +1,432 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" + +#include "hw/qdev-properties.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/smbus_master.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" + +#include "trace.h" + +/* DSP0237 1.2.0, Figure 1 */ +typedef struct MCTPI2CPacketHeader { +uint8_t dest; +#define MCTP_I2C_COMMAND_CODE 0xf +uint8_t command_code; +uint8_t byte_count; +uint8_t source; +} MCTPI2CPacketHeader; + +typedef struct MCTPI2CPacket { +MCTPI2CPacketHeader i2c; +MCTPPacket mctp; +} MCTPI2CPacket; + +#define i2c_mctp_payload_offset offsetof(MCTPI2CPacket, mctp.payload) +#define i2c_mctp_payload(buf) (buf + i2c_mctp_payload_offset) + +/* DSP0236 1.3.0, Figure 20 */ +typedef struct MCTPControlMessage { +#define MCTP_MESSAGE_TYPE_CONTROL 0x0 +uint8_t type; +#define MCTP_CONTROL_FLAGS_RQ (1 << 7) +#define MCTP_CONTROL_FLAGS_D(1 << 6) +uint8_t flags; +uint8_t command_code; +uint8_t data[]; +} MCTPControlMessage; + +enum MCTPControlCommandCodes { +MCTP_CONTROL_SET_EID= 0x01, +MCTP_CONTROL_GET_EID= 0x02, +MCTP_CONTROL_GET_VERSION= 0x04, +MCTP_CONTROL_GET_MESSAGE_TYPE_SUPPORT = 0x05, +}; + +#define MCTP_CONTROL_ERROR_UNSUPPORTED_CMD 0x5 + +#define i2c_mctp_control_data_offset \ +(i2c_mctp_payload_offset + offsetof(MCTPControlMessage, data)) +#define i2c_mctp_control_data(buf) (buf + i2c_mctp_control_data_offset) + +/** + * The byte count field in the SMBUS Block Write containers the number of bytes + * *following* the field itself. + * + * This is at least 5. + * + * 1 byte for the MCTP/I2C piggy-backed I2C source address in addition to the + * size of the MCTP transport/packet header. + */ +#define MCTP_I2C_BYTE_COUNT_OFFSET (sizeof(MCTPPacketHeader) + 1) + +void i2c_mctp_schedule_send(MCTPI2CEndpoint *mctp) +{ +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(DEVICE(mctp))); + +mctp->tx.state = I2C_MCTP_STATE_TX_START_SEND; + +i2c_bus_master(i2c, mctp->tx.bh); +} + +static void i2c_mctp_tx(void *opaque) +{ +DeviceState *dev = DEVICE(opaque); +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(dev)); +I2CSlave *slave = I2C_SLAVE(dev); +MCTPI2CEndpoint *mctp = MCTP_I2C_ENDPOINT(dev); +MCTPI2CEndpointClass *mc = MCTP_I2C_ENDPOINT_GET_CLASS(mctp); +MCTPI2CPacket *pkt = (MCTPI2CPacket *)mctp->buffer; +uint8_t flags = 0; + +switch (mctp->tx.state) { +case I2C_MCTP_STAT
[PATCH v6 0/3] hw/{i2c,nvme}: mctp endpoint, nvme management interface model
This adds a generic MCTP endpoint model that other devices may derive from. Also included is a very basic implementation of an NVMe-MI device, supporting only a small subset of the required commands. Since this all relies on i2c target mode, this can currently only be used with an SoC that includes the Aspeed I2C controller. The easiest way to get up and running with this, is to grab my buildroot overlay[1] (aspeed_ast2600evb_nmi_defconfig). It includes modified a modified dts as well as a couple of required packages. QEMU can then be launched along these lines: qemu-system-arm \ -nographic \ -M ast2600-evb \ -kernel output/images/zImage \ -initrd output/images/rootfs.cpio \ -dtb output/images/aspeed-ast2600-evb-nmi.dtb \ -nic user,hostfwd=tcp::-:22 \ -device nmi-i2c,address=0x3a \ -serial mon:stdio >From within the booted system, mctp addr add 8 dev mctpi2c15 mctp link set mctpi2c15 up mctp route add 9 via mctpi2c15 mctp neigh add 9 dev mctpi2c15 lladdr 0x3a mi-mctp 1 9 info Comments are very welcome! [1]: https://github.com/birkelund/hwtests/tree/main/br2-external Signed-off-by: Klaus Jensen --- Changes in v6: - Use nmi_scratch_append() directly where it makes sense. Fixes bug observed by Andrew. - Link to v5: https://lore.kernel.org/r/20230905-nmi-i2c-v5-0-0001d372a...@samsung.com Changes in v5: - Added a nmi_scratch_append() that asserts available space in the scratch buffer. This is a similar defensive strategy as used in hw/i2c/mctp.c - Various small fixups in response to review (Jonathan) - Link to v4: https://lore.kernel.org/r/20230823-nmi-i2c-v4-0-2b0f86e5b...@samsung.com --- Klaus Jensen (3): hw/i2c: add smbus pec utility function hw/i2c: add mctp core hw/nvme: add nvme management interface model MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 432 ++ hw/i2c/meson.build| 1 + hw/i2c/smbus_master.c | 26 +++ hw/i2c/trace-events | 13 ++ hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c | 407 +++ hw/nvme/trace-events | 6 + include/hw/i2c/mctp.h | 125 include/hw/i2c/smbus_master.h | 2 + include/net/mctp.h| 35 14 files changed, 1064 insertions(+) --- base-commit: 005ad32358f12fe9313a4a01918a55e60d4f39e5 change-id: 20230822-nmi-i2c-d804ed5be7e6 Best regards, -- Klaus Jensen
[PATCH v6 1/3] hw/i2c: add smbus pec utility function
From: Klaus Jensen Add i2c_smbus_pec() to calculate the SMBus Packet Error Code for a message. Reviewed-by: Jonathan Cameron Signed-off-by: Klaus Jensen --- hw/i2c/smbus_master.c | 26 ++ include/hw/i2c/smbus_master.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/i2c/smbus_master.c b/hw/i2c/smbus_master.c index 6a53c34e70b7..01a8e4700222 100644 --- a/hw/i2c/smbus_master.c +++ b/hw/i2c/smbus_master.c @@ -15,6 +15,32 @@ #include "hw/i2c/i2c.h" #include "hw/i2c/smbus_master.h" +static uint8_t crc8(uint16_t data) +{ +int i; + +for (i = 0; i < 8; i++) { +if (data & 0x8000) { +data ^= 0x1070U << 3; +} + +data <<= 1; +} + +return (uint8_t)(data >> 8); +} + +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len) +{ +int i; + +for (i = 0; i < len; i++) { +crc = crc8((crc ^ buf[i]) << 8); +} + +return crc; +} + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read) { diff --git a/include/hw/i2c/smbus_master.h b/include/hw/i2c/smbus_master.h index bb13bc423c22..d90f81767d86 100644 --- a/include/hw/i2c/smbus_master.h +++ b/include/hw/i2c/smbus_master.h @@ -27,6 +27,8 @@ #include "hw/i2c/i2c.h" +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len); + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read); int smbus_receive_byte(I2CBus *bus, uint8_t addr); -- 2.42.0
Re: [PATCH v5 3/3] hw/nvme: add nvme management interface model
On Sep 12 13:50, Andrew Jeffery wrote: > Hi Klaus, > > On Tue, 2023-09-05 at 10:38 +0200, Klaus Jensen wrote: > > > > > > +static void nmi_handle_mi_config_get(NMIDevice *nmi, NMIRequest > > > *request) > > > +{ > > > + uint32_t dw0 = le32_to_cpu(request->dw0); > > > + uint8_t identifier = FIELD_EX32(dw0, > > > NMI_CMD_CONFIGURATION_GET_DW0, > > > + IDENTIFIER); > > > + const uint8_t *buf; > > > + > > > + static const uint8_t smbus_freq[4] = { > > > + 0x00, /* success */ > > > + 0x01, 0x00, 0x00, /* 100 kHz */ > > > + }; > > > + > > > + static const uint8_t mtu[4] = { > > > + 0x00, /* success */ > > > + 0x40, 0x00, /* 64 */ > > > + 0x00, /* reserved */ > > > + }; > > > + > > > + trace_nmi_handle_mi_config_get(identifier); > > > + > > > + switch (identifier) { > > > + case NMI_CMD_CONFIGURATION_GET_SMBUS_FREQ: > > > + buf = smbus_freq; > > > + break; > > > + > > > + case NMI_CMD_CONFIGURATION_GET_MCTP_TRANSMISSION_UNIT: > > > + buf = mtu; > > > + break; > > > + > > > + default: > > > + nmi_set_parameter_error(nmi, 0x0, offsetof(NMIRequest, > > > dw0)); > > > + return; > > > + } > > > + > > > + nmi_scratch_append(nmi, buf, sizeof(buf)); > > > +} > > When I tried to build this patch I got: > > ``` > In file included from /usr/include/string.h:535, > from /home/andrew/src/qemu.org/qemu/include/qemu/osdep.h:112, > from ../hw/nvme/nmi-i2c.c:12: > In function ‘memcpy’, > inlined from ‘nmi_scratch_append’ at ../hw/nvme/nmi-i2c.c:80:5, > inlined from ‘nmi_handle_mi_config_get’ at ../hw/nvme/nmi-i2c.c:246:5, > inlined from ‘nmi_handle_mi’ at ../hw/nvme/nmi-i2c.c:266:9, > inlined from ‘nmi_handle’ at ../hw/nvme/nmi-i2c.c:313:9: > /usr/include/x86_64-linux-gnu/bits/string_fortified.h:29:10: error: > ‘__builtin_memcpy’ forming offset [4, 7] is out of the bounds [0, 4] > [-Werror=array-bounds=] >29 | return __builtin___memcpy_chk (__dest, __src, __len, > | ^ >30 | __glibc_objsize0 (__dest)); > | ~~ > ``` > > It wasn't clear initially from the error that the source of the problem > was the size associated with the source buffer, especially as there is > some pointer arithmetic being done to derive `__dest`. > > Anyway, what we're trying to express is that the size to copy from buf > is the size of the array pointed to by buf. However, buf is declared as > a pointer to uint8_t, which loses the length information. To fix that I > think we need: > > - const uint8_t *buf; > + const uint8_t (*buf)[4]; > > and then: > > - nmi_scratch_append(nmi, buf, sizeof(buf)); > + nmi_scratch_append(nmi, buf, sizeof(*buf)); > > Andrew > Hi Andrew, Nice (and important) catch! Just curious, are you massaging QEMU's build system into adding additional checks or how did your compiler catch this? Thanks, Klaus signature.asc Description: PGP signature
Re: [PATCH v3 1/5] block: remove AIOCBInfo->get_aio_context()
On Sep 12 19:10, Stefan Hajnoczi wrote: > The synchronous bdrv_aio_cancel() function needs the acb's AioContext so > it can call aio_poll() to wait for cancellation. > > It turns out that all users run under the BQL in the main AioContext, so > this callback is not needed. > > Remove the callback, mark bdrv_aio_cancel() GLOBAL_STATE_CODE just like > its blk_aio_cancel() caller, and poll the main loop AioContext. > > The purpose of this cleanup is to identify bdrv_aio_cancel() as an API > that does not work with the multi-queue block layer. > > Signed-off-by: Stefan Hajnoczi > --- > include/block/aio.h| 1 - > include/block/block-global-state.h | 2 ++ > include/block/block-io.h | 1 - > block/block-backend.c | 17 - > block/io.c | 23 --- > hw/nvme/ctrl.c | 7 --- > softmmu/dma-helpers.c | 8 > util/thread-pool.c | 8 > 8 files changed, 10 insertions(+), 57 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 539d273553..ee7273daa1 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -2130,11 +2130,6 @@ static inline bool nvme_is_write(NvmeRequest *req) > rw->opcode == NVME_CMD_WRITE_ZEROES; > } > > -static AioContext *nvme_get_aio_context(BlockAIOCB *acb) > -{ > -return qemu_get_aio_context(); > -} > - > static void nvme_misc_cb(void *opaque, int ret) > { > NvmeRequest *req = opaque; > @@ -3302,7 +3297,6 @@ static void nvme_flush_cancel(BlockAIOCB *acb) > static const AIOCBInfo nvme_flush_aiocb_info = { > .aiocb_size = sizeof(NvmeFlushAIOCB), > .cancel_async = nvme_flush_cancel, > -.get_aio_context = nvme_get_aio_context, > }; > > static void nvme_do_flush(NvmeFlushAIOCB *iocb); > @@ -6478,7 +6472,6 @@ static void nvme_format_cancel(BlockAIOCB *aiocb) > static const AIOCBInfo nvme_format_aiocb_info = { > .aiocb_size = sizeof(NvmeFormatAIOCB), > .cancel_async = nvme_format_cancel, > -.get_aio_context = nvme_get_aio_context, > }; > > static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset, Reviewed-by: Klaus Jensen signature.asc Description: PGP signature
[PULL 0/2] hw/nvme: updates
From: Klaus Jensen Hi, The following changes since commit 9ef497755afc252fb8e060c9ea6b0987abfd20b6: Merge tag 'pull-vfio-20230911' of https://github.com/legoater/qemu into staging (2023-09-11 09:13:08 -0400) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to b3c8246750b7077add335559341268f2956f6470: hw/nvme: Avoid dynamic stack allocation (2023-09-12 16:17:05 +0200) hw/nvme updates Two fixes for dynamic array allocation. -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmUAc8AACgkQTeGvMW1P DelwhQgAxD7imw85V89Dz58LgrFoq5XZz2cq6Q5BsudyZd8FW5r7lOn9c1i0Yu2x iiP93FX0b5LPQ9/8/liz3oHu1HZ7+hX+VeDZSQ1/bugfXM/eDSPA7lf7GG1np312 9lKRs8o+T4Di7v93kdiEi6G3b0jQSmZ722aMa54isk58hy1mcUTnGxvPZpVZutTP lYhwuElQIsnnKXB0jaRlpcDkpXdHJ1wwziaYLM7pus+tElMiSkFP05j2pX9iigKu 7g+Hs+DaqrOzdoF/6uu72IKygq3/5H8iou1No/7OICWbFti5Qhhra0OKQE6nrlKd 51fnWA6VjpO5g9+diwRRYbjEiOrkqQ== =wn4B -END PGP SIGNATURE- Peter Maydell (1): hw/nvme: Avoid dynamic stack allocation Philippe Mathieu-Daudé (1): hw/nvme: Use #define to avoid variable length array hw/nvme/ctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- 2.42.0
[PULL 2/2] hw/nvme: Avoid dynamic stack allocation
From: Peter Maydell Instead of using a variable-length array in nvme_map_prp(), allocate on the stack with a g_autofree pointer. The codebase has very few VLAs, and if we can get rid of them all we can make the compiler error on new additions. This is a defensive measure against security bugs where an on-stack dynamic allocation isn't correctly size-checked (e.g. CVE-2021-3527). Signed-off-by: Peter Maydell Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index d99a6f5c9a2e..90687b168ae1 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -894,7 +894,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, len -= trans_len; if (len) { if (len > n->page_size) { -uint64_t prp_list[n->max_prp_ents]; +g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents); uint32_t nents, prp_trans; int i = 0; -- 2.42.0
[PULL 1/2] hw/nvme: Use #define to avoid variable length array
From: Philippe Mathieu-Daudé In nvme_map_sgl() we create an array segment[] whose size is the 'const int SEG_CHUNK_SIZE'. Since this is C, rather than C++, a "const int foo" is not a true constant, it's merely a variable with a constant value, and so semantically segment[] is a variable-length array. Switch SEG_CHUNK_SIZE to a #define so that we can make the segment[] array truly fixed-size, in the sense that it doesn't trigger the -Wvla warning. The codebase has very few VLAs, and if we can get rid of them all we can make the compiler error on new additions. This is a defensive measure against security bugs where an on-stack dynamic allocation isn't correctly size-checked (e.g. CVE-2021-3527). [PMM: rebased (function has moved file), expand commit message based on discussion from previous version of patch] Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 539d27355313..d99a6f5c9a2e 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1045,7 +1045,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, * descriptors and segment chain) than the command transfer size, so it is * not bounded by MDTS. */ -const int SEG_CHUNK_SIZE = 256; +#define SEG_CHUNK_SIZE 256 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; uint64_t nsgld; -- 2.42.0
Re: [PATCH 0/2] nvme: avoid dynamic stack allocations
On Sep 12 15:15, Peter Maydell wrote: > On Mon, 14 Aug 2023 at 08:09, Klaus Jensen wrote: > > > > On Aug 11 18:47, Peter Maydell wrote: > > > The QEMU codebase has very few C variable length arrays, and if we can > > > get rid of them all we can make the compiler error on new additions. > > > This is a defensive measure against security bugs where an on-stack > > > dynamic allocation isn't correctly size-checked (e.g. CVE-2021-3527). > > > > > > We last had a go at this a few years ago, when Philippe wrote > > > patches for this: > > > https://patchew.org/QEMU/20210505211047.1496765-1-phi...@redhat.com/ > > > Some of the fixes made it into the tree, but some didn't (either > > > because of lack of review or because review found some changes > > > that needed to be made). I'm going through the remainder as a > > > non-urgent Friday afternoon task... > > > > > > This patchset deals with two VLAs in the NVME code. > > > > > > thanks > > > -- PMM > > > > > > Peter Maydell (1): > > > hw/nvme: Avoid dynamic stack allocation > > > > > > Philippe Mathieu-Daudé (1): > > > hw/nvme: Use #define to avoid variable length array > > > > > > hw/nvme/ctrl.c | 4 ++-- > > > 1 file changed, 2 insertions(+), 2 deletions(-) > > > > > > -- > > > 2.34.1 > > > > > > > Thanks Peter, > > > > Applied to nvme-next! > > > Hi Klaus -- did these patches get lost? They don't seem to > have appeared in master yet. > > thanks > -- PMM Oh. I never sent the pull - I'll do that right away! Thanks for the reminder! signature.asc Description: PGP signature
[PATCH v5 2/3] hw/i2c: add mctp core
From: Klaus Jensen Add an abstract MCTP over I2C endpoint model. This implements MCTP control message handling as well as handling the actual I2C transport (packetization). Devices are intended to derive from this and implement the class methods. Parts of this implementation is inspired by code[1] previously posted by Jonathan Cameron. Squashed a fix[2] from Matt Johnston. [1]: https://lore.kernel.org/qemu-devel/20220520170128.4436-1-jonathan.came...@huawei.com/ [2]: https://lore.kernel.org/qemu-devel/20221121080445.ga29...@codeconstruct.com.au/ Tested-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Signed-off-by: Klaus Jensen --- MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 432 ++ hw/i2c/meson.build| 1 + hw/i2c/trace-events | 13 ++ include/hw/i2c/mctp.h | 125 +++ include/net/mctp.h| 35 8 files changed, 618 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6111b6b4d928..8ca71167607d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3395,6 +3395,13 @@ F: tests/qtest/adm1272-test.c F: tests/qtest/max34451-test.c F: tests/qtest/isl_pmbus_vr-test.c +MCTP I2C Transport +M: Klaus Jensen +S: Maintained +F: hw/i2c/mctp.c +F: include/hw/i2c/mctp.h +F: include/net/mctp.h + Firmware schema specifications M: Philippe Mathieu-Daudé R: Daniel P. Berrange diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 7e6834844051..5bcb1e0e8a6f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -541,6 +541,7 @@ config ASPEED_SOC select DS1338 select FTGMAC100 select I2C +select I2C_MCTP select DPS310 select PCA9552 select SERIAL diff --git a/hw/i2c/Kconfig b/hw/i2c/Kconfig index 14886b35dac2..2b2a50b83d1e 100644 --- a/hw/i2c/Kconfig +++ b/hw/i2c/Kconfig @@ -6,6 +6,10 @@ config I2C_DEVICES # to any board's i2c bus bool +config I2C_MCTP +bool +select I2C + config SMBUS bool select I2C diff --git a/hw/i2c/mctp.c b/hw/i2c/mctp.c new file mode 100644 index ..8d8e74567745 --- /dev/null +++ b/hw/i2c/mctp.c @@ -0,0 +1,432 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" + +#include "hw/qdev-properties.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/smbus_master.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" + +#include "trace.h" + +/* DSP0237 1.2.0, Figure 1 */ +typedef struct MCTPI2CPacketHeader { +uint8_t dest; +#define MCTP_I2C_COMMAND_CODE 0xf +uint8_t command_code; +uint8_t byte_count; +uint8_t source; +} MCTPI2CPacketHeader; + +typedef struct MCTPI2CPacket { +MCTPI2CPacketHeader i2c; +MCTPPacket mctp; +} MCTPI2CPacket; + +#define i2c_mctp_payload_offset offsetof(MCTPI2CPacket, mctp.payload) +#define i2c_mctp_payload(buf) (buf + i2c_mctp_payload_offset) + +/* DSP0236 1.3.0, Figure 20 */ +typedef struct MCTPControlMessage { +#define MCTP_MESSAGE_TYPE_CONTROL 0x0 +uint8_t type; +#define MCTP_CONTROL_FLAGS_RQ (1 << 7) +#define MCTP_CONTROL_FLAGS_D(1 << 6) +uint8_t flags; +uint8_t command_code; +uint8_t data[]; +} MCTPControlMessage; + +enum MCTPControlCommandCodes { +MCTP_CONTROL_SET_EID= 0x01, +MCTP_CONTROL_GET_EID= 0x02, +MCTP_CONTROL_GET_VERSION= 0x04, +MCTP_CONTROL_GET_MESSAGE_TYPE_SUPPORT = 0x05, +}; + +#define MCTP_CONTROL_ERROR_UNSUPPORTED_CMD 0x5 + +#define i2c_mctp_control_data_offset \ +(i2c_mctp_payload_offset + offsetof(MCTPControlMessage, data)) +#define i2c_mctp_control_data(buf) (buf + i2c_mctp_control_data_offset) + +/** + * The byte count field in the SMBUS Block Write containers the number of bytes + * *following* the field itself. + * + * This is at least 5. + * + * 1 byte for the MCTP/I2C piggy-backed I2C source address in addition to the + * size of the MCTP transport/packet header. + */ +#define MCTP_I2C_BYTE_COUNT_OFFSET (sizeof(MCTPPacketHeader) + 1) + +void i2c_mctp_schedule_send(MCTPI2CEndpoint *mctp) +{ +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(DEVICE(mctp))); + +mctp->tx.state = I2C_MCTP_STATE_TX_START_SEND; + +i2c_bus_master(i2c, mctp->tx.bh); +} + +static void i2c_mctp_tx(void *opaque) +{ +DeviceState *dev = DEVICE(opaque); +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(dev)); +I2CSlave *slave = I2C_SLAVE(dev); +MCTPI2CEndpoint *mctp = MCTP_I2C_ENDPOINT(dev); +MCTPI2CEndpointClass *mc = MCTP_I2C_ENDPOINT_GET_CLASS(mctp); +MCTPI2CPacket *pkt = (MCTPI2CPacket *)mctp->buffer; +uint8_t flags = 0; + +switch (mctp->tx.state) { +case I2C_MCTP_STAT
[PATCH v5 0/3] hw/{i2c,nvme}: mctp endpoint, nvme management interface model
This adds a generic MCTP endpoint model that other devices may derive from. Also included is a very basic implementation of an NVMe-MI device, supporting only a small subset of the required commands. Since this all relies on i2c target mode, this can currently only be used with an SoC that includes the Aspeed I2C controller. The easiest way to get up and running with this, is to grab my buildroot overlay[1] (aspeed_ast2600evb_nmi_defconfig). It includes modified a modified dts as well as a couple of required packages. QEMU can then be launched along these lines: qemu-system-arm \ -nographic \ -M ast2600-evb \ -kernel output/images/zImage \ -initrd output/images/rootfs.cpio \ -dtb output/images/aspeed-ast2600-evb-nmi.dtb \ -nic user,hostfwd=tcp::-:22 \ -device nmi-i2c,address=0x3a \ -serial mon:stdio >From within the booted system, mctp addr add 8 dev mctpi2c15 mctp link set mctpi2c15 up mctp route add 9 via mctpi2c15 mctp neigh add 9 dev mctpi2c15 lladdr 0x3a mi-mctp 1 9 info Comments are very welcome! [1]: https://github.com/birkelund/hwtests/tree/main/br2-external Signed-off-by: Klaus Jensen --- Changes in v5: - Added a nmi_scratch_append() that asserts available space in the scratch buffer. This is a similar defensive strategy as used in hw/i2c/mctp.c - Various small fixups in response to review (Jonathan) - Link to v4: https://lore.kernel.org/r/20230823-nmi-i2c-v4-0-2b0f86e5b...@samsung.com --- Klaus Jensen (3): hw/i2c: add smbus pec utility function hw/i2c: add mctp core hw/nvme: add nvme management interface model MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 432 ++ hw/i2c/meson.build| 1 + hw/i2c/smbus_master.c | 26 +++ hw/i2c/trace-events | 13 ++ hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c | 424 + hw/nvme/trace-events | 6 + include/hw/i2c/mctp.h | 125 include/hw/i2c/smbus_master.h | 2 + include/net/mctp.h| 35 14 files changed, 1081 insertions(+) --- base-commit: 17780edd81d27fcfdb7a802efc870a99788bd2fc change-id: 20230822-nmi-i2c-d804ed5be7e6 Best regards, -- Klaus Jensen
[PATCH v5 1/3] hw/i2c: add smbus pec utility function
From: Klaus Jensen Add i2c_smbus_pec() to calculate the SMBus Packet Error Code for a message. Reviewed-by: Jonathan Cameron Signed-off-by: Klaus Jensen --- hw/i2c/smbus_master.c | 26 ++ include/hw/i2c/smbus_master.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/i2c/smbus_master.c b/hw/i2c/smbus_master.c index 6a53c34e70b7..01a8e4700222 100644 --- a/hw/i2c/smbus_master.c +++ b/hw/i2c/smbus_master.c @@ -15,6 +15,32 @@ #include "hw/i2c/i2c.h" #include "hw/i2c/smbus_master.h" +static uint8_t crc8(uint16_t data) +{ +int i; + +for (i = 0; i < 8; i++) { +if (data & 0x8000) { +data ^= 0x1070U << 3; +} + +data <<= 1; +} + +return (uint8_t)(data >> 8); +} + +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len) +{ +int i; + +for (i = 0; i < len; i++) { +crc = crc8((crc ^ buf[i]) << 8); +} + +return crc; +} + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read) { diff --git a/include/hw/i2c/smbus_master.h b/include/hw/i2c/smbus_master.h index bb13bc423c22..d90f81767d86 100644 --- a/include/hw/i2c/smbus_master.h +++ b/include/hw/i2c/smbus_master.h @@ -27,6 +27,8 @@ #include "hw/i2c/i2c.h" +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len); + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read); int smbus_receive_byte(I2CBus *bus, uint8_t addr); -- 2.42.0
[PATCH v5 3/3] hw/nvme: add nvme management interface model
From: Klaus Jensen Add the 'nmi-i2c' device that emulates an NVMe Management Interface controller. Initial support is very basic (Read NMI DS, Configuration Get). This is based on previously posted code by Padmakar Kalghatgi, Arun Kumar Agasar and Saurav Kumar. Signed-off-by: Klaus Jensen --- hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c| 424 +++ hw/nvme/trace-events | 6 + 4 files changed, 435 insertions(+) diff --git a/hw/nvme/Kconfig b/hw/nvme/Kconfig index 8ac90942e55e..1d89a4f4ecea 100644 --- a/hw/nvme/Kconfig +++ b/hw/nvme/Kconfig @@ -2,3 +2,7 @@ config NVME_PCI bool default y if PCI_DEVICES depends on PCI + +config NVME_NMI_I2C +bool +default y if I2C_MCTP diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7bc85f31c149 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1,2 @@ system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_NMI_I2C', if_true: files('nmi-i2c.c')) diff --git a/hw/nvme/nmi-i2c.c b/hw/nvme/nmi-i2c.c new file mode 100644 index ..6e0ba7bd2a37 --- /dev/null +++ b/hw/nvme/nmi-i2c.c @@ -0,0 +1,424 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * + * SPDX-FileContributor: Padmakar Kalghatgi + * SPDX-FileContributor: Arun Kumar Agasar + * SPDX-FileContributor: Saurav Kumar + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/crc32c.h" +#include "hw/registerfields.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" +#include "trace.h" + +/* NVM Express Management Interface 1.2c, Section 3.1 */ +#define NMI_MAX_MESSAGE_LENGTH 4224 + +#define TYPE_NMI_I2C_DEVICE "nmi-i2c" +OBJECT_DECLARE_SIMPLE_TYPE(NMIDevice, NMI_I2C_DEVICE) + +typedef struct NMIDevice { +MCTPI2CEndpoint mctp; + +uint8_t buffer[NMI_MAX_MESSAGE_LENGTH]; +uint8_t scratch[NMI_MAX_MESSAGE_LENGTH]; + +size_t len; +int64_t pos; +} NMIDevice; + +FIELD(NMI_MCTPD, MT, 0, 7) +FIELD(NMI_MCTPD, IC, 7, 1) + +#define NMI_MCTPD_MT_NMI 0x4 +#define NMI_MCTPD_IC_ENABLED 0x1 + +FIELD(NMI_NMP, ROR, 7, 1) +FIELD(NMI_NMP, NMIMT, 3, 4) + +#define NMI_NMP_NMIMT_NVME_MI 0x1 +#define NMI_NMP_NMIMT_NVME_ADMIN 0x2 + +typedef struct NMIMessage { +uint8_t mctpd; +uint8_t nmp; +uint8_t rsvd2[2]; +uint8_t payload[]; /* includes the Message Integrity Check */ +} NMIMessage; + +typedef struct NMIRequest { + uint8_t opc; + uint8_t rsvd1[3]; + uint32_t dw0; + uint32_t dw1; + uint32_t mic; +} NMIRequest; + +FIELD(NMI_CMD_READ_NMI_DS_DW0, DTYP, 24, 8) + +typedef enum NMIReadDSType { +NMI_CMD_READ_NMI_DS_SUBSYSTEM = 0x0, +NMI_CMD_READ_NMI_DS_PORTS = 0x1, +NMI_CMD_READ_NMI_DS_CTRL_LIST = 0x2, +NMI_CMD_READ_NMI_DS_CTRL_INFO = 0x3, +NMI_CMD_READ_NMI_DS_OPT_CMD_SUPPORT = 0x4, +NMI_CMD_READ_NMI_DS_MEB_CMD_SUPPORT = 0x5, +} NMIReadDSType; + +#define NMI_STATUS_INVALID_PARAMETER 0x4 + +static void nmi_scratch_append(NMIDevice *nmi, const void *buf, size_t count) +{ +assert(nmi->pos + count <= NMI_MAX_MESSAGE_LENGTH); + +memcpy(nmi->scratch + nmi->pos, buf, count); +nmi->pos += count; +} + +static void nmi_set_parameter_error(NMIDevice *nmi, uint8_t bit, uint16_t byte) +{ +/* NVM Express Management Interface 1.2c, Figure 30 */ +struct resp { +uint8_t status; +uint8_t bit; +uint16_t byte; +}; + +struct resp buf = { +.status = NMI_STATUS_INVALID_PARAMETER, +.bit = bit & 0x3, +.byte = byte, +}; + +nmi_scratch_append(nmi, , sizeof(buf)); +} + +static void nmi_set_error(NMIDevice *nmi, uint8_t status) +{ +const uint8_t buf[4] = {status,}; + +nmi_scratch_append(nmi, buf, sizeof(buf)); +} + +static void nmi_handle_mi_read_nmi_ds(NMIDevice *nmi, NMIRequest *request) +{ +I2CSlave *i2c = I2C_SLAVE(nmi); + +uint32_t dw0 = le32_to_cpu(request->dw0); +uint8_t dtyp = FIELD_EX32(dw0, NMI_CMD_READ_NMI_DS_DW0, DTYP); +const uint8_t *buf; +size_t len; + +trace_nmi_handle_mi_read_nmi_ds(dtyp); + +static const uint8_t nmi_ds_subsystem[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x00, /* number of ports */ +0x01, /* major version */ +0x01, /* minor version */ +}; + +/* + * Cannot be static (or const) since we need to patch in the i2c + * address. + */ +const uint8_t nmi_ds_ports[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x02, /* port typ
Re: [PATCH v4 2/3] hw/i2c: add mctp core
On Aug 30 15:31, Jonathan Cameron wrote: > On Wed, 23 Aug 2023 11:21:59 +0200 > Klaus Jensen wrote: > > > From: Klaus Jensen > > > > Add an abstract MCTP over I2C endpoint model. This implements MCTP > > control message handling as well as handling the actual I2C transport > > (packetization). > > > > Devices are intended to derive from this and implement the class > > methods. > > > > Parts of this implementation is inspired by code[1] previously posted by > > Jonathan Cameron. > > > > Squashed a fix[2] from Matt Johnston. > > > > [1]: > > https://lore.kernel.org/qemu-devel/20220520170128.4436-1-jonathan.came...@huawei.com/ > > [2]: > > https://lore.kernel.org/qemu-devel/20221121080445.ga29...@codeconstruct.com.au/ > > > > Signed-off-by: Klaus Jensen > > I made the minor changes to the CXL FM-API PoC to use this and all works as > expected so > > Tested-by: Jonathan Cameron > > > Some minor things inline. With those tidied up or ignored with clear > reasoning. > > Reviewed-by: Jonathan Cameron > > > > diff --git a/hw/i2c/mctp.c b/hw/i2c/mctp.c > > new file mode 100644 > > index ..217073d62435 > > --- /dev/null > > +++ b/hw/i2c/mctp.c > > > ... > > > > +static int i2c_mctp_event_cb(I2CSlave *i2c, enum i2c_event event) > > +{ > > +MCTPI2CEndpoint *mctp = MCTP_I2C_ENDPOINT(i2c); > > +MCTPI2CEndpointClass *mc = MCTP_I2C_ENDPOINT_GET_CLASS(mctp); > > +MCTPI2CPacket *pkt = (MCTPI2CPacket *)mctp->buffer; > > +size_t payload_len; > > +uint8_t pec, pktseq, msgtype; > > + > > +switch (event) { > > +case I2C_START_SEND: > > +if (mctp->state == I2C_MCTP_STATE_IDLE) { > > +mctp->state = I2C_MCTP_STATE_RX_STARTED; > > +} else if (mctp->state != I2C_MCTP_STATE_RX) { > > +return -1; > > +} > > + > > +/* the i2c core eats the slave address, so put it back in */ > > +pkt->i2c.dest = i2c->address << 1; > > +mctp->len = 1; > > + > > +return 0; > > + > > +case I2C_FINISH: > > +if (mctp->len < sizeof(MCTPI2CPacket) + 1) { > > +trace_i2c_mctp_drop_short_packet(mctp->len); > > +goto drop; > > +} > > + > > +payload_len = mctp->len - (1 + offsetof(MCTPI2CPacket, > > mctp.payload)); > > + > > +if (pkt->i2c.byte_count + 3 != mctp->len - 1) { > > +trace_i2c_mctp_drop_invalid_length(pkt->i2c.byte_count + 3, > > + mctp->len - 1); > > +goto drop; > > +} > > + > > +pec = i2c_smbus_pec(0, mctp->buffer, mctp->len - 1); > > +if (mctp->buffer[mctp->len - 1] != pec) { > > +trace_i2c_mctp_drop_invalid_pec(mctp->buffer[mctp->len - 1], > > pec); > > +goto drop; > > +} > > + > > +if (!(pkt->mctp.hdr.eid.dest == mctp->my_eid || > > + pkt->mctp.hdr.eid.dest == 0)) { > > +trace_i2c_mctp_drop_invalid_eid(pkt->mctp.hdr.eid.dest, > > +mctp->my_eid); > > +goto drop; > > +} > > + > > +pktseq = FIELD_EX8(pkt->mctp.hdr.flags, MCTP_H_FLAGS, PKTSEQ); > > + > > +if (FIELD_EX8(pkt->mctp.hdr.flags, MCTP_H_FLAGS, SOM)) { > > +mctp->tx.is_control = false; > > + > > +if (mctp->state == I2C_MCTP_STATE_RX) { > > +mc->reset(mctp); > > +} > > + > > +mctp->state = I2C_MCTP_STATE_RX; > > + > > +mctp->tx.addr = pkt->i2c.source >> 1; > > +mctp->tx.eid = pkt->mctp.hdr.eid.source; > > +mctp->tx.tag = FIELD_EX8(pkt->mctp.hdr.flags, MCTP_H_FLAGS, > > TAG); > > +mctp->tx.pktseq = pktseq; > > + > > +msgtype = FIELD_EX8(pkt->mctp.payload[0], MCTP_MESSAGE_H, > > TYPE); > > + > > +if (msgtype == MCTP_MESSAGE_TYPE_CONTROL) { > > +mctp->tx.is_control = true; > > + > > +i2c_mctp_handle_control(mctp); > > + > > +return 0; > > +} > > +} else if (mctp->state == I2C_MCTP_STATE_RX_STARTED) { > > +trace_i2c_mctp_dro
Re: [PULL 1/2] hw/nvme: fix null pointer access in directive receive
On Aug 24 14:44, Philippe Mathieu-Daudé wrote: > On 9/8/23 15:39, Klaus Jensen wrote: > > From: Klaus Jensen > > > > nvme_directive_receive() does not check if an endurance group has been > > configured (set) prior to testing if flexible data placement is enabled > > or not. > > > > Fix this. > > > > Cc: qemu-sta...@nongnu.org > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1815 > > Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") > > Reviewed-by: Jesper Wendel Devantier > > Signed-off-by: Klaus Jensen > > --- > > hw/nvme/ctrl.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > index d217ae91b506..e5b5c7034d2b 100644 > > --- a/hw/nvme/ctrl.c > > +++ b/hw/nvme/ctrl.c > > @@ -6900,7 +6900,7 @@ static uint16_t nvme_directive_receive(NvmeCtrl *n, > > NvmeRequest *req) > > case NVME_DIRECTIVE_IDENTIFY: > > switch (doper) { > > case NVME_DIRECTIVE_RETURN_PARAMS: > > -if (ns->endgrp->fdp.enabled) { > > +if (ns->endgrp && ns->endgrp->fdp.enabled) { > > This patch fixes CVE-2023-40360 ("QEMU: NVMe: NULL pointer > dereference in nvme_directive_receive"). Were you aware of > the security implications? > Yes, but I was not aware of the CVE being assigned at the time. I don't think it was? But if what you are saying is that it was my responsibility as maintainer, to get that reported and assigned, then I apologies and will of course keep that in mind going forward! signature.asc Description: PGP signature
[PATCH v4 3/3] hw/nvme: add nvme management interface model
From: Klaus Jensen Add the 'nmi-i2c' device that emulates an NVMe Management Interface controller. Initial support is very basic (Read NMI DS, Configuration Get). This is based on previously posted code by Padmakar Kalghatgi, Arun Kumar Agasar and Saurav Kumar. Signed-off-by: Klaus Jensen --- hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c| 418 +++ hw/nvme/trace-events | 6 + 4 files changed, 429 insertions(+) diff --git a/hw/nvme/Kconfig b/hw/nvme/Kconfig index 8ac90942e55e..1d89a4f4ecea 100644 --- a/hw/nvme/Kconfig +++ b/hw/nvme/Kconfig @@ -2,3 +2,7 @@ config NVME_PCI bool default y if PCI_DEVICES depends on PCI + +config NVME_NMI_I2C +bool +default y if I2C_MCTP diff --git a/hw/nvme/meson.build b/hw/nvme/meson.build index 1a6a2ca2f307..7bc85f31c149 100644 --- a/hw/nvme/meson.build +++ b/hw/nvme/meson.build @@ -1 +1,2 @@ system_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('ctrl.c', 'dif.c', 'ns.c', 'subsys.c')) +system_ss.add(when: 'CONFIG_NVME_NMI_I2C', if_true: files('nmi-i2c.c')) diff --git a/hw/nvme/nmi-i2c.c b/hw/nvme/nmi-i2c.c new file mode 100644 index ..9040ba28a87c --- /dev/null +++ b/hw/nvme/nmi-i2c.c @@ -0,0 +1,418 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * + * SPDX-FileContributor: Padmakar Kalghatgi + * SPDX-FileContributor: Arun Kumar Agasar + * SPDX-FileContributor: Saurav Kumar + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/crc32c.h" +#include "hw/registerfields.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" +#include "trace.h" + +#define NMI_MAX_MESSAGE_LENGTH 4224 + +#define TYPE_NMI_I2C_DEVICE "nmi-i2c" +OBJECT_DECLARE_SIMPLE_TYPE(NMIDevice, NMI_I2C_DEVICE) + +typedef struct NMIDevice { +MCTPI2CEndpoint mctp; + +uint8_t buffer[NMI_MAX_MESSAGE_LENGTH]; +uint8_t scratch[NMI_MAX_MESSAGE_LENGTH]; + +size_t len; +int64_t pos; +} NMIDevice; + +FIELD(NMI_MCTPD, MT, 0, 7) +FIELD(NMI_MCTPD, IC, 7, 1) + +#define NMI_MCTPD_MT_NMI 0x4 +#define NMI_MCTPD_IC_ENABLED 0x1 + +FIELD(NMI_NMP, ROR, 7, 1) +FIELD(NMI_NMP, NMIMT, 3, 4) + +#define NMI_NMP_NMIMT_NVME_MI 0x1 +#define NMI_NMP_NMIMT_NVME_ADMIN 0x2 + +typedef struct NMIMessage { +uint8_t mctpd; +uint8_t nmp; +uint8_t rsvd2[2]; +uint8_t payload[]; /* includes the Message Integrity Check */ +} NMIMessage; + +typedef struct NMIRequest { + uint8_t opc; + uint8_t rsvd1[3]; + uint32_t dw0; + uint32_t dw1; + uint32_t mic; +} NMIRequest; + +typedef enum NMIReadDSType { +NMI_CMD_READ_NMI_DS_SUBSYSTEM = 0x0, +NMI_CMD_READ_NMI_DS_PORTS = 0x1, +NMI_CMD_READ_NMI_DS_CTRL_LIST = 0x2, +NMI_CMD_READ_NMI_DS_CTRL_INFO = 0x3, +NMI_CMD_READ_NMI_DS_OPT_CMD_SUPPORT = 0x4, +NMI_CMD_READ_NMI_DS_MEB_CMD_SUPPORT = 0x5, +} NMIReadDSType; + +#define NMI_STATUS_INVALID_PARAMETER 0x4 + +static void nmi_set_parameter_error(NMIDevice *nmi, uint8_t bit, uint16_t byte) +{ +/* NVM Express Management Interface 1.2c, Figure 30 */ +struct resp { +uint8_t status; +uint8_t bit; +uint16_t byte; +}; + +struct resp *buf = (struct resp *)(nmi->scratch + nmi->pos); + +buf->status = NMI_STATUS_INVALID_PARAMETER; +buf->bit = bit & 0x3; +buf->byte = byte; + +nmi->pos += sizeof(struct resp); +} + +static void nmi_set_error(NMIDevice *nmi, uint8_t status) +{ +uint8_t buf[4] = {}; + +buf[0] = status; + +memcpy(nmi->scratch + nmi->pos, buf, 4); +nmi->pos += 4; +} + +static void nmi_handle_mi_read_nmi_ds(NMIDevice *nmi, NMIRequest *request) +{ +I2CSlave *i2c = I2C_SLAVE(nmi); + +uint32_t dw0 = le32_to_cpu(request->dw0); +uint8_t dtyp = (dw0 >> 24) & 0xf; +uint8_t *buf; +size_t len; + +trace_nmi_handle_mi_read_nmi_ds(dtyp); + +static uint8_t nmi_ds_subsystem[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x00, /* number of ports */ +0x01, /* major version */ +0x01, /* minor version */ +}; + +/* cannot be static since we need to patch in the i2c address */ +uint8_t nmi_ds_ports[36] = { +0x00, /* success */ +0x20, 0x00, /* response data length */ +0x00, /* reserved */ +0x02, /* port type (smbus) */ +0x00, /* reserved */ +0x40, 0x00, /* maximum mctp transission unit size (64 bytes) */ +0x00, 0x00, 0x00, 0x00, /* management endpoint buffer size */ +0x00, /* vpd i2c address */ +0x00, /* vpd i2c frequency */ +0x00, /* management endpoi
[PATCH v4 2/3] hw/i2c: add mctp core
From: Klaus Jensen Add an abstract MCTP over I2C endpoint model. This implements MCTP control message handling as well as handling the actual I2C transport (packetization). Devices are intended to derive from this and implement the class methods. Parts of this implementation is inspired by code[1] previously posted by Jonathan Cameron. Squashed a fix[2] from Matt Johnston. [1]: https://lore.kernel.org/qemu-devel/20220520170128.4436-1-jonathan.came...@huawei.com/ [2]: https://lore.kernel.org/qemu-devel/20221121080445.ga29...@codeconstruct.com.au/ Signed-off-by: Klaus Jensen --- MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 428 ++ hw/i2c/meson.build| 1 + hw/i2c/trace-events | 13 ++ include/hw/i2c/mctp.h | 127 +++ include/net/mctp.h| 35 + 8 files changed, 616 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6111b6b4d928..8ca71167607d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3395,6 +3395,13 @@ F: tests/qtest/adm1272-test.c F: tests/qtest/max34451-test.c F: tests/qtest/isl_pmbus_vr-test.c +MCTP I2C Transport +M: Klaus Jensen +S: Maintained +F: hw/i2c/mctp.c +F: include/hw/i2c/mctp.h +F: include/net/mctp.h + Firmware schema specifications M: Philippe Mathieu-Daudé R: Daniel P. Berrange diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 7e6834844051..5bcb1e0e8a6f 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -541,6 +541,7 @@ config ASPEED_SOC select DS1338 select FTGMAC100 select I2C +select I2C_MCTP select DPS310 select PCA9552 select SERIAL diff --git a/hw/i2c/Kconfig b/hw/i2c/Kconfig index 14886b35dac2..2b2a50b83d1e 100644 --- a/hw/i2c/Kconfig +++ b/hw/i2c/Kconfig @@ -6,6 +6,10 @@ config I2C_DEVICES # to any board's i2c bus bool +config I2C_MCTP +bool +select I2C + config SMBUS bool select I2C diff --git a/hw/i2c/mctp.c b/hw/i2c/mctp.c new file mode 100644 index ..217073d62435 --- /dev/null +++ b/hw/i2c/mctp.c @@ -0,0 +1,428 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SPDX-FileCopyrightText: Copyright (c) 2023 Samsung Electronics Co., Ltd. + * SPDX-FileContributor: Klaus Jensen + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" + +#include "hw/qdev-properties.h" +#include "hw/i2c/i2c.h" +#include "hw/i2c/smbus_master.h" +#include "hw/i2c/mctp.h" +#include "net/mctp.h" + +#include "trace.h" + +/* DSP0237 1.2.0, Figure 1 */ +typedef struct MCTPI2CPacketHeader { +uint8_t dest; +#define MCTP_I2C_COMMAND_CODE 0xf +uint8_t command_code; +uint8_t byte_count; +uint8_t source; +} MCTPI2CPacketHeader; + +typedef struct MCTPI2CPacket { +MCTPI2CPacketHeader i2c; +MCTPPacket mctp; +} MCTPI2CPacket; + +#define i2c_mctp_payload_offset offsetof(MCTPI2CPacket, mctp.payload) +#define i2c_mctp_payload(buf) (buf + i2c_mctp_payload_offset) + +/* DSP0236 1.3.0, Figure 20 */ +typedef struct MCTPControlMessage { +#define MCTP_MESSAGE_TYPE_CONTROL 0x0 +uint8_t type; +#define MCTP_CONTROL_FLAGS_RQ (1 << 7) +#define MCTP_CONTROL_FLAGS_D(1 << 6) +uint8_t flags; +uint8_t command_code; +uint8_t data[]; +} MCTPControlMessage; + +enum MCTPControlCommandCodes { +MCTP_CONTROL_SET_EID= 0x01, +MCTP_CONTROL_GET_EID= 0x02, +MCTP_CONTROL_GET_VERSION= 0x04, +MCTP_CONTROL_GET_MESSAGE_TYPE_SUPPORT = 0x05, +}; + +#define MCTP_CONTROL_ERROR_UNSUPPORTED_CMD 0x5 + +#define i2c_mctp_control_data_offset \ +(i2c_mctp_payload_offset + offsetof(MCTPControlMessage, data)) +#define i2c_mctp_control_data(buf) (buf + i2c_mctp_control_data_offset) + +/** + * The byte count field in the SMBUS Block Write containers the number of bytes + * *following* the field itself. + * + * This is at least 5. + * + * 1 byte for the MCTP/I2C piggy-backed I2C source address in addition to the + * size of the MCTP transport/packet header. + */ +#define MCTP_I2C_BYTE_COUNT_OFFSET (sizeof(MCTPPacketHeader) + 1) + +void i2c_mctp_schedule_send(MCTPI2CEndpoint *mctp) +{ +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(DEVICE(mctp))); + +mctp->tx.state = I2C_MCTP_STATE_TX_START_SEND; + +i2c_bus_master(i2c, mctp->tx.bh); +} + +static void i2c_mctp_tx(void *opaque) +{ +DeviceState *dev = DEVICE(opaque); +I2CBus *i2c = I2C_BUS(qdev_get_parent_bus(dev)); +I2CSlave *slave = I2C_SLAVE(dev); +MCTPI2CEndpoint *mctp = MCTP_I2C_ENDPOINT(dev); +MCTPI2CEndpointClass *mc = MCTP_I2C_ENDPOINT_GET_CLASS(mctp); +MCTPI2CPacket *pkt = (MCTPI2CPacket *)mctp->buffer; +uint8_t flags = 0; + +switch (mctp->tx.state) { +case I2C_MCTP_STATE_TX_SEND_BYTE: +if (mctp->pos < mctp->len)
[PATCH v4 0/3] hw/{i2c,nvme}: mctp endpoint, nvme management interface model
This adds a generic MCTP endpoint model that other devices may derive from. Also included is a very basic implementation of an NVMe-MI device, supporting only a small subset of the required commands. Since this all relies on i2c target mode, this can currently only be used with an SoC that includes the Aspeed I2C controller. The easiest way to get up and running with this, is to grab my buildroot overlay[1] (aspeed_ast2600evb_nmi_defconfig). It includes modified a modified dts as well as a couple of required packages. QEMU can then be launched along these lines: qemu-system-arm \ -nographic \ -M ast2600-evb \ -kernel output/images/zImage \ -initrd output/images/rootfs.cpio \ -dtb output/images/aspeed-ast2600-evb-nmi.dtb \ -nic user,hostfwd=tcp::-:22 \ -device nmi-i2c,address=0x3a \ -serial mon:stdio >From within the booted system, mctp addr add 8 dev mctpi2c15 mctp link set mctpi2c15 up mctp route add 9 via mctpi2c15 mctp neigh add 9 dev mctpi2c15 lladdr 0x3a mi-mctp 1 9 info Comments are very welcome! [1]: https://github.com/birkelund/hwtests/tree/main/br2-external Changes since v3 - Inlined the POLY define in the crc8 function (Philippe) - Changed a bunch of fields to use registerfields.h - From Jonathan: + Added references to specs defining the structures. - From Corey: + Reworked the buffer handling (again) ;) Derived devices can now never write into the mctp core buffers. Instead, the mctp core will request a buffer pointer and copy from there. Internally, within the mctp core, writes to internal buffers are also checked. Changes since v2 - Applied a bunch of feedback from Jonathan: + Moved a lot of internally used structs out of the include headers and into the source files. + Added spec references in various places + Split the patch for i2c_smbus_pec() into its own + Fix a compile error (and bug) in nmi-i2c.c. - From Corey: + Reworked the buffer handling. The deriving devices now returns a pointer to their own buffer that the mctp core copies into. + Added a couple of extra debugging trace events. Changes since v1 - Fix SPDX-License tag for hw/nvme/nmi-i2c.c (Philippe) - Add some asserts to verify buffer indices (by request from Corey). - Drop short packets that could result in underflow (Corey) - Move i2c_smbus_pec() to smbus common code (Corey) - A couple of logic fixes (patch from Jeremy squashed in) - Added a patch to handle messages with dest eid 0 (Matt) Maybe squash this as well. Signed-off-by: Klaus Jensen --- Klaus Jensen (3): hw/i2c: add smbus pec utility function hw/i2c: add mctp core hw/nvme: add nvme management interface model MAINTAINERS | 7 + hw/arm/Kconfig| 1 + hw/i2c/Kconfig| 4 + hw/i2c/mctp.c | 428 ++ hw/i2c/meson.build| 1 + hw/i2c/smbus_master.c | 26 +++ hw/i2c/trace-events | 13 ++ hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/nmi-i2c.c | 418 + hw/nvme/trace-events | 6 + include/hw/i2c/mctp.h | 127 + include/hw/i2c/smbus_master.h | 2 + include/net/mctp.h| 35 14 files changed, 1073 insertions(+) --- base-commit: b0dd9a7d6dd15a6898e9c585b521e6bec79b25aa change-id: 20230822-nmi-i2c-d804ed5be7e6 Best regards, -- Klaus Jensen
[PATCH v4 1/3] hw/i2c: add smbus pec utility function
From: Klaus Jensen Add i2c_smbus_pec() to calculate the SMBus Packet Error Code for a message. Signed-off-by: Klaus Jensen --- hw/i2c/smbus_master.c | 26 ++ include/hw/i2c/smbus_master.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/hw/i2c/smbus_master.c b/hw/i2c/smbus_master.c index 6a53c34e70b7..01a8e4700222 100644 --- a/hw/i2c/smbus_master.c +++ b/hw/i2c/smbus_master.c @@ -15,6 +15,32 @@ #include "hw/i2c/i2c.h" #include "hw/i2c/smbus_master.h" +static uint8_t crc8(uint16_t data) +{ +int i; + +for (i = 0; i < 8; i++) { +if (data & 0x8000) { +data ^= 0x1070U << 3; +} + +data <<= 1; +} + +return (uint8_t)(data >> 8); +} + +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len) +{ +int i; + +for (i = 0; i < len; i++) { +crc = crc8((crc ^ buf[i]) << 8); +} + +return crc; +} + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read) { diff --git a/include/hw/i2c/smbus_master.h b/include/hw/i2c/smbus_master.h index bb13bc423c22..d90f81767d86 100644 --- a/include/hw/i2c/smbus_master.h +++ b/include/hw/i2c/smbus_master.h @@ -27,6 +27,8 @@ #include "hw/i2c/i2c.h" +uint8_t i2c_smbus_pec(uint8_t crc, uint8_t *buf, size_t len); + /* Master device commands. */ int smbus_quick_command(I2CBus *bus, uint8_t addr, int read); int smbus_receive_byte(I2CBus *bus, uint8_t addr); -- 2.42.0
Re: [PATCH v2 0/4] Add full zoned storage emulation to qcow2 driver
On Aug 14 16:57, Sam Li wrote: > This patch series add a new extension - zoned format - to the > qcow2 driver thereby allowing full zoned storage emulation on > the qcow2 img file. Users can attach such a qcow2 file to the > guest as a zoned device. > > To create a qcow2 file with zoned format, use command like this: > $ qemu-img create -f qcow2 test.qcow2 -o size=768M -o > zone_size=64M -o zone_capacity=64M -o zone_nr_conv=0 -o > max_append_sectors=512 -o max_open_zones=0 -o max_active_zones=0 > -o zoned_profile=zbc > > Then add it to the QEMU command line: > -blockdev > node-name=drive1,driver=qcow2,file.driver=file,file.filename=../qemu/test.qcow2 > \ > -device virtio-blk-pci,drive=drive1 \ > > v1->v2: > - add more tests to qemu-io zoned commands > - make zone append change state to full when wp reaches end > - add documentation to qcow2 zoned extension header > - address review comments (Stefan): > * fix zoned_mata allocation size > * use bitwise or than addition > * fix wp index overflow and locking > * cleanups: comments, naming > > Sam Li (4): > docs/qcow2: add the zoned format feature > qcow2: add configurations for zoned format extension > qcow2: add zoned emulation capability > iotests: test the zoned format feature for qcow2 file > > block/qcow2.c| 799 ++- > block/qcow2.h| 23 + > docs/interop/qcow2.txt | 26 + > docs/system/qemu-block-drivers.rst.inc | 39 ++ > include/block/block-common.h | 5 + > include/block/block_int-common.h | 16 + > qapi/block-core.json | 46 +- > tests/qemu-iotests/tests/zoned-qcow2 | 135 > tests/qemu-iotests/tests/zoned-qcow2.out | 140 > 9 files changed, 1214 insertions(+), 15 deletions(-) > create mode 100755 tests/qemu-iotests/tests/zoned-qcow2 > create mode 100644 tests/qemu-iotests/tests/zoned-qcow2.out > Hi Sam, Thanks for this and for the RFC for hw/nvme - this is an awesome improvement. Can you explain the need for the zoned_profile? I understand that only ZNS requires potentially setting zone_capacity and configuring extended descriptors. When an image is hooked up to a block emulation device that doesnt understand cap < size or extended descriptors, it could just would fail on the cap < size and just ignore the extended descriptor space. Do we really need to add the complexity of the user explicitly having to set the profile? I also think it is fair for the QEMU zoned block api to accomodate both variations - if a particular configuration is supported or not is up to the emulating device. Checking the profile from hw/nvme or hw/block/virtio is the same as checking if cap < size or possibly the presence of extended descriptors. Thanks, Klaus signature.asc Description: PGP signature
Re: [PATCH 0/2] nvme: avoid dynamic stack allocations
On Aug 11 18:47, Peter Maydell wrote: > The QEMU codebase has very few C variable length arrays, and if we can > get rid of them all we can make the compiler error on new additions. > This is a defensive measure against security bugs where an on-stack > dynamic allocation isn't correctly size-checked (e.g. CVE-2021-3527). > > We last had a go at this a few years ago, when Philippe wrote > patches for this: > https://patchew.org/QEMU/20210505211047.1496765-1-phi...@redhat.com/ > Some of the fixes made it into the tree, but some didn't (either > because of lack of review or because review found some changes > that needed to be made). I'm going through the remainder as a > non-urgent Friday afternoon task... > > This patchset deals with two VLAs in the NVME code. > > thanks > -- PMM > > Peter Maydell (1): > hw/nvme: Avoid dynamic stack allocation > > Philippe Mathieu-Daudé (1): > hw/nvme: Use #define to avoid variable length array > > hw/nvme/ctrl.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > -- > 2.34.1 > Thanks Peter, Applied to nvme-next! signature.asc Description: PGP signature
[PULL 2/2] hw/nvme: fix null pointer access in ruh update
From: Klaus Jensen The Reclaim Unit Update operation in I/O Management Receive does not verify the presence of a configured endurance group prior to accessing it. Fix this. Cc: qemu-sta...@nongnu.org Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e5b5c7034d2b..539d27355313 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4361,7 +4361,13 @@ static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) uint32_t npid = (cdw10 >> 1) + 1; unsigned int i = 0; g_autofree uint16_t *pids = NULL; -uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; +uint32_t maxnpid; + +if (!ns->endgrp || !ns->endgrp->fdp.enabled) { +return NVME_FDP_DISABLED | NVME_DNR; +} + +maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) { return NVME_INVALID_FIELD | NVME_DNR; -- 2.41.0
[PULL 1/2] hw/nvme: fix null pointer access in directive receive
From: Klaus Jensen nvme_directive_receive() does not check if an endurance group has been configured (set) prior to testing if flexible data placement is enabled or not. Fix this. Cc: qemu-sta...@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1815 Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index d217ae91b506..e5b5c7034d2b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -6900,7 +6900,7 @@ static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req) case NVME_DIRECTIVE_IDENTIFY: switch (doper) { case NVME_DIRECTIVE_RETURN_PARAMS: -if (ns->endgrp->fdp.enabled) { +if (ns->endgrp && ns->endgrp->fdp.enabled) { id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; -- 2.41.0
[PULL 0/2] hw/nvme: more fixes
From: Klaus Jensen Hi, The following changes since commit a8fc5165aab02f328ccd148aafec1e59fd1426eb: Merge tag 'nvme-next-pull-request' of https://gitlab.com/birkelund/qemu into staging (2023-08-08 16:39:20 -0700) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-fixes-pull-request for you to fetch changes up to 3439ba9c5da943d96f7a3c86e0a7eb2ff48de41c: hw/nvme: fix null pointer access in ruh update (2023-08-09 15:32:32 +0200) hw/nvme: fixes -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmTTlmcACgkQTeGvMW1P DemjjggAnhEvaJ4fgS9rsvtxCwtzLNc405xMpNxh6rPaxa+sL3RXPIrW6vWG13+W VcHw8DI8EV4DzAFP919ZmTUq9/boRbgxx84bStlILUPHWol8+eGYVVfT75wFKszx d4Vi3nyPSGlrxieSrosARqimcUDtFtDGGAxjvEcKgzhkcU3a8DVYAOmx/hdlWJJQ KSk4h/E1pKItFbvv+w9yszsbToeZN65oIy7kQtFgx0JOULyWvEYSVygotw/AruF3 FPQ0nrJuZ115w3cJWDszznVJ6+3EcWbD3luQc3zE1FOPp76EkAOkcnPh1XbBJrE2 2BsCX/XnXcZT7BWSJbEzGXLsHjqsPg== =Zy0+ -END PGP SIGNATURE- Klaus Jensen (2): hw/nvme: fix null pointer access in directive receive hw/nvme: fix null pointer access in ruh update hw/nvme/ctrl.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- 2.41.0
[PATCH 2/2] hw/nvme: fix null pointer access in ruh update
From: Klaus Jensen The Reclaim Unit Update operation in I/O Management Receive does not verify the presence of a configured endurance group prior to accessing it. Fix this. Cc: qemu-sta...@nongnu.org Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e5b5c7034d2b..539d27355313 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4361,7 +4361,13 @@ static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) uint32_t npid = (cdw10 >> 1) + 1; unsigned int i = 0; g_autofree uint16_t *pids = NULL; -uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; +uint32_t maxnpid; + +if (!ns->endgrp || !ns->endgrp->fdp.enabled) { +return NVME_FDP_DISABLED | NVME_DNR; +} + +maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh; if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) { return NVME_INVALID_FIELD | NVME_DNR; -- 2.41.0
[PATCH 0/2] hw/nvme: two fixes
From: Klaus Jensen Fix two potential accesses to null pointers. Klaus Jensen (2): hw/nvme: fix null pointer access in directive receive hw/nvme: fix null pointer access in ruh update hw/nvme/ctrl.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- 2.41.0
[PATCH 1/2] hw/nvme: fix null pointer access in directive receive
From: Klaus Jensen nvme_directive_receive() does not check if an endurance group has been configured (set) prior to testing if flexible data placement is enabled or not. Fix this. Cc: qemu-sta...@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1815 Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index d217ae91b506..e5b5c7034d2b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -6900,7 +6900,7 @@ static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req) case NVME_DIRECTIVE_IDENTIFY: switch (doper) { case NVME_DIRECTIVE_RETURN_PARAMS: -if (ns->endgrp->fdp.enabled) { +if (ns->endgrp && ns->endgrp->fdp.enabled) { id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT; -- 2.41.0
[PULL v2] hw/nvme fixes
From: Klaus Jensen Hi, There was a small typo in the last pull. This replaces it. The following changes since commit 0450cf08976f9036feaded438031b4cba94f6452: Merge tag 'fixes-pull-request' of https://gitlab.com/marcandre.lureau/qemu into staging (2023-08-07 13:55:00 -0700) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to ec5a138ce63ce460575a44cf9ec3172c33eb0fd6: docs: update hw/nvme documentation for protection information (2023-08-08 15:28:05 +0200) hw/nvme fixes - fix for invalid protection information calculation -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmTSREoACgkQTeGvMW1P DekH6Qf/e3gi0KloAUpbTQvGmBA6XmkJFAtOdZn7IJXVCowjYTIKU84DrdPyT1c1 rofL4w0klKG5c4Or/Cs4dH/ASxTWaQZRlFAYxsTW3nUX74MnaFDRZcN2geb30ws7 ryejVEKeHNWH/YYY4Ny55wO3tmy2ILAKnbiadiXhj4dQfCK1GzZnrx10PWxLNlkZ KRhiXLNBHpPnDlrLq7/nLs+/0cMrrqEz6ISm/Ju4iUczAH/wmqEbR/yD3pAwmH07 PCaSeegOpwscovI5TWRelOJlzIXb6D8Xk9d3dGL5x/eeN7GlkgERX4MAcNYKwe8T JNR8y2ErTEj2nLU/juES1EpiR2gYKw== =vJlA -END PGP SIGNATURE- Ankit Kumar (2): hw/nvme: fix CRC64 for guard tag docs: update hw/nvme documentation for protection information docs/system/devices/nvme.rst | 12 +--- hw/nvme/dif.c| 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) -- 2.41.0
[PULL 1/2] hw/nvme: fix CRC64 for guard tag
From: Ankit Kumar The nvme CRC64 generator expects the caller to pass inverted seed value. Pass inverted crc value for metadata buffer. Cc: qemu-sta...@nongnu.org Fixes: 44219b6029fc ("hw/nvme: 64-bit pi support") Signed-off-by: Ankit Kumar Signed-off-by: Klaus Jensen --- hw/nvme/dif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c index 63c44c86ab55..01b19c33734e 100644 --- a/hw/nvme/dif.c +++ b/hw/nvme/dif.c @@ -115,7 +115,7 @@ static void nvme_dif_pract_generate_dif_crc64(NvmeNamespace *ns, uint8_t *buf, uint64_t crc = crc64_nvme(~0ULL, buf, ns->lbasz); if (pil) { -crc = crc64_nvme(crc, mbuf, pil); +crc = crc64_nvme(~crc, mbuf, pil); } dif->g64.guard = cpu_to_be64(crc); @@ -246,7 +246,7 @@ static uint16_t nvme_dif_prchk_crc64(NvmeNamespace *ns, NvmeDifTuple *dif, uint64_t crc = crc64_nvme(~0ULL, buf, ns->lbasz); if (pil) { -crc = crc64_nvme(crc, mbuf, pil); +crc = crc64_nvme(~crc, mbuf, pil); } trace_pci_nvme_dif_prchk_guard_crc64(be64_to_cpu(dif->g64.guard), crc); -- 2.41.0
[PULL 2/2] docs: update hw/nvme documentation for protection information
From: Ankit Kumar Add missing entry for pif ("protection information format"). Protection information size can be 8 or 16 bytes, Update the pil entry as per the NVM command set specification. Signed-off-by: Ankit Kumar Signed-off-by: Klaus Jensen --- docs/system/devices/nvme.rst | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst index 2a3af268f7a5..32ff287cd78e 100644 --- a/docs/system/devices/nvme.rst +++ b/docs/system/devices/nvme.rst @@ -271,9 +271,15 @@ The virtual namespace device supports DIF- and DIX-based protection information ``pil=UINT8`` (default: ``0``) Controls the location of the protection information within the metadata. Set - to ``1`` to transfer protection information as the first eight bytes of - metadata. Otherwise, the protection information is transferred as the last - eight bytes. + to ``1`` to transfer protection information as the first bytes of metadata. + Otherwise, the protection information is transferred as the last bytes of + metadata. + +``pif=UINT8`` (default: ``0``) + By default, the namespace device uses 16 bit guard protection information + format (``pif=0``). Set to ``2`` to enable 64 bit guard protection + information format. This requires at least 16 bytes of metadata. Note that + ``pif=2`` (32 bit guards) are currently not supported. Virtualization Enhancements and SR-IOV (Experimental Support) - -- 2.41.0
[PULL 0/2] hw/nvme late fix
From: Klaus Jensen Hi, This is a fix for hw/nvme protection information discovered by Ankit late in the cycle. This is not a regression, but a long standing bug and not critical (obviously no users of this until now, no potential for crash or similar, just plain wrong). If this can make it for -rc3 that is great, but it can easily be deferred to a stable release or -rc4 if other fixes require it. Thanks, The following changes since commit 0450cf08976f9036feaded438031b4cba94f6452: Merge tag 'fixes-pull-request' of https://gitlab.com/marcandre.lureau/qemu into staging (2023-08-07 13:55:00 -0700) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to bb223df9403beada5b2ab408d2d9a82471432a21: docs: update hw/nvme documentation for protection information (2023-08-08 08:10:15 +0200) hw/nvme late fix - late fix for nvme pi -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmTR3uMACgkQTeGvMW1P DenZzgf/T5pb1dQhYthpJ/OOLcvtttpRChQSvZJ8uxyzWoNS/9hKn5gd8buvdGwt fO1QU3Ogh6ArwZ9GT5OqLV05d9vMuvJlPxpbqOs8XZACobH+nb3CqXBcX1F7TxqV j9OmFH4UGPDo42hkT+jqa+kHc9hxmpwg+f6Wlpad+ZJ2UAel0/19JsQItln8JQ/I Jxd07Q5qcj06RtwcPf/0WUOs4I6sTkifu7uZIrx1YjYN4/jQmaF2L0MQjUw1ktLF hFXSW3iarDKh2fFlfR2fMkkoLLnoS6NoZnTj3fBDabcuMfpJlf7WZ5fuYpOlCKtB kpN9/WaGpZtXmWAg82R7wlgR4D9vkw== =EuEc -END PGP SIGNATURE- Ankit Kumar (2): hw/nvme: fix CRC64 for guard tag docs: update hw/nvme documentation for protection information docs/system/devices/nvme.rst | 12 +--- hw/nvme/dif.c| 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) -- 2.41.0
Re: [PATCH] hw/nvme: fix oob memory read in fdp events log
+CC qemu-stable On Aug 3 20:44, Klaus Jensen wrote: > From: Klaus Jensen > > As reported by Trend Micro's Zero Day Initiative, an oob memory read > vulnerability exists in nvme_fdp_events(). The host-provided offset is > not verified. > > Fix this. > > This is only exploitable when Flexible Data Placement mode (fdp=on) is > enabled. > > Fixes: CVE-2023-4135 > Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") > Reported-by: Trend Micro's Zero Day Initiative > Signed-off-by: Klaus Jensen > --- > hw/nvme/ctrl.c | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index f2e5a2fa737b..e9b5a55811b8 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -5120,6 +5120,11 @@ static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t > endgrpid, > } > > log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * > sizeof(NvmeFdpEvent); > + > +if (off >= log_size) { > +return NVME_INVALID_FIELD | NVME_DNR; > +} > + > trans_len = MIN(log_size - off, buf_len); > elog = g_malloc0(log_size); > elog->num_events = cpu_to_le32(ebuf->nelems); > -- > 2.41.0 > -- One of us - No more doubt, silence or taboo about mental illness. signature.asc Description: PGP signature
[PULL 0/2] hw/nvme fixes
From: Klaus Jensen Hi, The following changes since commit 9400601a689a128c25fa9c21e932562e0eeb7a26: Merge tag 'pull-tcg-20230806-3' of https://gitlab.com/rth7680/qemu into staging (2023-08-06 16:47:48 -0700) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to 6a33f2e920ec0b489a77200888e3692664077f2d: hw/nvme: fix compliance issue wrt. iosqes/iocqes (2023-08-07 12:27:24 +0200) hw/nvme fixes - two fixes for hw/nvme -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmTQ2y4ACgkQTeGvMW1P DenpWQf/WFgEljzgTcgxlfZhCyzWGwVNgKqRxlTuF6ELqm8BajCuCeA5ias6AXOr x/gZ0VqrL91L5tRIH5Q0sdC+HBFC1yMs66jopdzc1oL1eYu1HTrLIqMDtkXp/K/P PyGah2t4qEMtacSkad+hmB68ViUkkmhkxrWYIeufUQTfLNF5pBqNvB1kQON3jmXE a1jI/PabYxi8Km0rfFJD6SUGmL9+m7MY/SyZAy+4EZZ1OEnp5jb3o9lbdwbhIU5e dRX4NW4BEDiOJeIcNVDiQkXv2/Lna1B51RVMvM4owpk0eRvRXMSqs2DQ5/jp/nGb 8uChUJ0QW68I4e9ptTfxmBsr4pSktg== =0nwp -END PGP SIGNATURE- Klaus Jensen (2): hw/nvme: fix oob memory read in fdp events log hw/nvme: fix compliance issue wrt. iosqes/iocqes hw/nvme/ctrl.c | 51 +++- hw/nvme/nvme.h | 9 ++-- hw/nvme/trace-events | 1 + 3 files changed, 25 insertions(+), 36 deletions(-) -- 2.41.0
[PULL 1/2] hw/nvme: fix oob memory read in fdp events log
From: Klaus Jensen As reported by Trend Micro's Zero Day Initiative, an oob memory read vulnerability exists in nvme_fdp_events(). The host-provided offset is not verified. Fix this. This is only exploitable when Flexible Data Placement mode (fdp=on) is enabled. Fixes: CVE-2023-4135 Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Reported-by: Trend Micro's Zero Day Initiative Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 5 + 1 file changed, 5 insertions(+) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f2e5a2fa737b..e9b5a55811b8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5120,6 +5120,11 @@ static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid, } log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent); + +if (off >= log_size) { +return NVME_INVALID_FIELD | NVME_DNR; +} + trans_len = MIN(log_size - off, buf_len); elog = g_malloc0(log_size); elog->num_events = cpu_to_le32(ebuf->nelems); -- 2.41.0
[PULL 2/2] hw/nvme: fix compliance issue wrt. iosqes/iocqes
From: Klaus Jensen As of prior to this patch, the controller checks the value of CC.IOCQES and CC.IOSQES prior to enabling the controller. As reported by Ben in GitLab issue #1691, this is not spec compliant. The controller should only check these values when queues are created. This patch moves these checks to nvme_create_cq(). We do not need to check it in nvme_create_sq() since that will error out if the completion queue is not already created. Also, since the controller exclusively supports SQEs of size 64 bytes and CQEs of size 16 bytes, hard code that. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1691 Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 46 hw/nvme/nvme.h | 9 +++-- hw/nvme/trace-events | 1 + 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e9b5a55811b8..d217ae91b506 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1507,7 +1507,7 @@ static void nvme_post_cqes(void *opaque) req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); req->cqe.sq_id = cpu_to_le16(sq->sqid); req->cqe.sq_head = cpu_to_le16(sq->head); -addr = cq->dma_addr + cq->tail * n->cqe_size; +addr = cq->dma_addr + (cq->tail << NVME_CQES); ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)>cqe, sizeof(req->cqe)); if (ret) { @@ -5300,10 +5300,18 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) uint16_t qsize = le16_to_cpu(c->qsize); uint16_t qflags = le16_to_cpu(c->cq_flags); uint64_t prp1 = le64_to_cpu(c->prp1); +uint32_t cc = ldq_le_p(>bar.cc); +uint8_t iocqes = NVME_CC_IOCQES(cc); +uint8_t iosqes = NVME_CC_IOSQES(cc); trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, NVME_CQ_FLAGS_IEN(qflags) != 0); +if (iosqes != NVME_SQES || iocqes != NVME_CQES) { +trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes); +return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; +} + if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) { trace_pci_nvme_err_invalid_create_cq_cqid(cqid); return NVME_INVALID_QID | NVME_DNR; @@ -7000,7 +7008,7 @@ static void nvme_process_sq(void *opaque) } while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(>req_list))) { -addr = sq->dma_addr + sq->head * n->sqe_size; +addr = sq->dma_addr + (sq->head << NVME_SQES); if (nvme_addr_read(n, addr, (void *), sizeof(cmd))) { trace_pci_nvme_err_addr_read(addr); trace_pci_nvme_err_cfs(); @@ -7225,34 +7233,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) NVME_CAP_MPSMAX(cap)); return -1; } -if (unlikely(NVME_CC_IOCQES(cc) < - NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { -trace_pci_nvme_err_startfail_cqent_too_small( -NVME_CC_IOCQES(cc), -NVME_CTRL_CQES_MIN(cap)); -return -1; -} -if (unlikely(NVME_CC_IOCQES(cc) > - NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { -trace_pci_nvme_err_startfail_cqent_too_large( -NVME_CC_IOCQES(cc), -NVME_CTRL_CQES_MAX(cap)); -return -1; -} -if (unlikely(NVME_CC_IOSQES(cc) < - NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { -trace_pci_nvme_err_startfail_sqent_too_small( -NVME_CC_IOSQES(cc), -NVME_CTRL_SQES_MIN(cap)); -return -1; -} -if (unlikely(NVME_CC_IOSQES(cc) > - NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { -trace_pci_nvme_err_startfail_sqent_too_large( -NVME_CC_IOSQES(cc), -NVME_CTRL_SQES_MAX(cap)); -return -1; -} if (unlikely(!NVME_AQA_ASQS(aqa))) { trace_pci_nvme_err_startfail_asqent_sz_zero(); return -1; @@ -7265,8 +7245,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) n->page_bits = page_bits; n->page_size = page_size; n->max_prp_ents = n->page_size / sizeof(uint64_t); -n->cqe_size = 1 << NVME_CC_IOCQES(cc); -n->sqe_size = 1 << NVME_CC_IOSQES(cc); nvme_init_cq(>admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1); nvme_init_sq(>admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1); @@ -8235,8 +8213,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); -id->sqes = (0x6 << 4) | 0x6; -id->cqes = (0x4 << 4) | 0x4; +id->sqes = (NVME_SQES << 4) | NVME_SQES; +id->cqes = (NVME_CQES << 4) | NVME_CQE
[PATCH] hw/nvme: fix oob memory read in fdp events log
From: Klaus Jensen As reported by Trend Micro's Zero Day Initiative, an oob memory read vulnerability exists in nvme_fdp_events(). The host-provided offset is not verified. Fix this. This is only exploitable when Flexible Data Placement mode (fdp=on) is enabled. Fixes: CVE-2023-4135 Fixes: 73064edfb864 ("hw/nvme: flexible data placement emulation") Reported-by: Trend Micro's Zero Day Initiative Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 5 + 1 file changed, 5 insertions(+) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f2e5a2fa737b..e9b5a55811b8 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5120,6 +5120,11 @@ static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid, } log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent); + +if (off >= log_size) { +return NVME_INVALID_FIELD | NVME_DNR; +} + trans_len = MIN(log_size - off, buf_len); elog = g_malloc0(log_size); elog->num_events = cpu_to_le32(ebuf->nelems); -- 2.41.0
[PULL 1/1] hw/nvme: use stl/ldl pci dma api
From: Klaus Jensen Use the stl/ldl pci dma api for writing/reading doorbells. This removes the explicit endian conversions. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cédric Le Goater Tested-by: Cédric Le Goater Reviewed-by: Thomas Huth Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 42 +- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index dadc2dc7da10..f2e5a2fa737b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1468,20 +1468,16 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, static void nvme_update_cq_eventidx(const NvmeCQueue *cq) { -uint32_t v = cpu_to_le32(cq->head); - trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head); -pci_dma_write(PCI_DEVICE(cq->ctrl), cq->ei_addr, , sizeof(v)); +stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head, + MEMTXATTRS_UNSPECIFIED); } static void nvme_update_cq_head(NvmeCQueue *cq) { -uint32_t v; - -pci_dma_read(PCI_DEVICE(cq->ctrl), cq->db_addr, , sizeof(v)); - -cq->head = le32_to_cpu(v); +ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, >head, + MEMTXATTRS_UNSPECIFIED); trace_pci_nvme_update_cq_head(cq->cqid, cq->head); } @@ -6801,7 +6797,6 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) PCIDevice *pci = PCI_DEVICE(n); uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); -uint32_t v; int i; /* Address should be page aligned */ @@ -6819,8 +6814,6 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) NvmeCQueue *cq = n->cq[i]; if (sq) { -v = cpu_to_le32(sq->tail); - /* * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) * nvme_process_db() uses this hard-coded way to calculate @@ -6828,7 +6821,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) */ sq->db_addr = dbs_addr + (i << 3); sq->ei_addr = eis_addr + (i << 3); -pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); +stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED); if (n->params.ioeventfd && sq->sqid != 0) { if (!nvme_init_sq_ioeventfd(sq)) { @@ -6838,12 +6831,10 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) } if (cq) { -v = cpu_to_le32(cq->head); - /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */ cq->db_addr = dbs_addr + (i << 3) + (1 << 2); cq->ei_addr = eis_addr + (i << 3) + (1 << 2); -pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); +stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); if (n->params.ioeventfd && cq->cqid != 0) { if (!nvme_init_cq_ioeventfd(cq)) { @@ -6974,20 +6965,16 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) static void nvme_update_sq_eventidx(const NvmeSQueue *sq) { -uint32_t v = cpu_to_le32(sq->tail); - trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail); -pci_dma_write(PCI_DEVICE(sq->ctrl), sq->ei_addr, , sizeof(v)); +stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail, + MEMTXATTRS_UNSPECIFIED); } static void nvme_update_sq_tail(NvmeSQueue *sq) { -uint32_t v; - -pci_dma_read(PCI_DEVICE(sq->ctrl), sq->db_addr, , sizeof(v)); - -sq->tail = le32_to_cpu(v); +ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, >tail, + MEMTXATTRS_UNSPECIFIED); trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail); } @@ -7592,7 +7579,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) { PCIDevice *pci = PCI_DEVICE(n); -uint32_t qid, v; +uint32_t qid; if (unlikely(addr & ((1 << 2) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, @@ -7659,8 +7646,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) start_sqs = nvme_cq_full(cq) ? 1 : 0; cq->head = new_head; if (!qid && n->dbbuf_enabled) { -v = cpu_to_le32(cq->head); -pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); +stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); } if (start_sqs) { NvmeSQueue *sq; @@ -7720,8 +7706,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr add
[PULL 0/1] hw/nvme fixes
From: Klaus Jensen Hi, This should also fix coverity cid 1518067 and 1518066. The following changes since commit ccb86f079a9e4d94918086a9df18c1844347aff8: Merge tag 'pull-nbd-2023-07-28' of https://repo.or.cz/qemu/ericb into staging (2023-07-28 09:56:57 -0700) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to c1e244b6552efdff5612a33c6630aaf95964eaf5: hw/nvme: use stl/ldl pci dma api (2023-07-30 20:09:54 +0200) hw/nvme fixes - use the stl/ldl pci dma api -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmTGuc8ACgkQTeGvMW1P Dek41wgAwqgRmtUhmmaQJJpF5Pya3J7n3Zkbp+cULdnSp/su7W7yIUTcTzdbr34d 9LbNHmWerXYinlIxG08ZWw2lq0TwApKj+8gv/wf8H7dG86/pBYfoQvOlkNx2QKyR vtRNlILCEbJpbSfY3LbFNvRGOkArr6HkzT4hZprUIfCvRg58u5oIxEx/ZYa+m3WU ED0y/46e7HbVbmbwJKrn4EK3k0zGdFyeINRZ5TB5DML3lCTX6eaZTLUXGIb7LLcK Xyv6/TCkPTggDszTam24kx0A7DhC+3f2C8DsJg7H8jnWb1F+oq/2EJam/0HU22Uk n348MrWOusuF7kbHMCP9h28gYT3aWw== =KjVO -END PGP SIGNATURE- Klaus Jensen (1): hw/nvme: use stl/ldl pci dma api hw/nvme/ctrl.c | 42 +- 1 file changed, 13 insertions(+), 29 deletions(-) -- 2.41.0
[PATCH] hw/nvme: use stl/ldl pci dma api
From: Klaus Jensen Use the stl/ldl pci dma api for writing/reading doorbells. This removes the explicit endian conversions. Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 42 +- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index dadc2dc7da10..f2e5a2fa737b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1468,20 +1468,16 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, static void nvme_update_cq_eventidx(const NvmeCQueue *cq) { -uint32_t v = cpu_to_le32(cq->head); - trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head); -pci_dma_write(PCI_DEVICE(cq->ctrl), cq->ei_addr, , sizeof(v)); +stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head, + MEMTXATTRS_UNSPECIFIED); } static void nvme_update_cq_head(NvmeCQueue *cq) { -uint32_t v; - -pci_dma_read(PCI_DEVICE(cq->ctrl), cq->db_addr, , sizeof(v)); - -cq->head = le32_to_cpu(v); +ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, >head, + MEMTXATTRS_UNSPECIFIED); trace_pci_nvme_update_cq_head(cq->cqid, cq->head); } @@ -6801,7 +6797,6 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) PCIDevice *pci = PCI_DEVICE(n); uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); -uint32_t v; int i; /* Address should be page aligned */ @@ -6819,8 +6814,6 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) NvmeCQueue *cq = n->cq[i]; if (sq) { -v = cpu_to_le32(sq->tail); - /* * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) * nvme_process_db() uses this hard-coded way to calculate @@ -6828,7 +6821,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) */ sq->db_addr = dbs_addr + (i << 3); sq->ei_addr = eis_addr + (i << 3); -pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); +stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED); if (n->params.ioeventfd && sq->sqid != 0) { if (!nvme_init_sq_ioeventfd(sq)) { @@ -6838,12 +6831,10 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) } if (cq) { -v = cpu_to_le32(cq->head); - /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */ cq->db_addr = dbs_addr + (i << 3) + (1 << 2); cq->ei_addr = eis_addr + (i << 3) + (1 << 2); -pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); +stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); if (n->params.ioeventfd && cq->cqid != 0) { if (!nvme_init_cq_ioeventfd(cq)) { @@ -6974,20 +6965,16 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) static void nvme_update_sq_eventidx(const NvmeSQueue *sq) { -uint32_t v = cpu_to_le32(sq->tail); - trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail); -pci_dma_write(PCI_DEVICE(sq->ctrl), sq->ei_addr, , sizeof(v)); +stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail, + MEMTXATTRS_UNSPECIFIED); } static void nvme_update_sq_tail(NvmeSQueue *sq) { -uint32_t v; - -pci_dma_read(PCI_DEVICE(sq->ctrl), sq->db_addr, , sizeof(v)); - -sq->tail = le32_to_cpu(v); +ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, >tail, + MEMTXATTRS_UNSPECIFIED); trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail); } @@ -7592,7 +7579,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) { PCIDevice *pci = PCI_DEVICE(n); -uint32_t qid, v; +uint32_t qid; if (unlikely(addr & ((1 << 2) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, @@ -7659,8 +7646,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) start_sqs = nvme_cq_full(cq) ? 1 : 0; cq->head = new_head; if (!qid && n->dbbuf_enabled) { -v = cpu_to_le32(cq->head); -pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); +stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); } if (start_sqs) { NvmeSQueue *sq; @@ -7720,8 +7706,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) sq->tail = new_tail; if (!qid && n->dbbuf_enabled) { -v = c
Re: [PULL 1/1] hw/nvme: fix endianness issue for shadow doorbells
On Jul 20 09:51, Peter Maydell wrote: > On Thu, 20 Jul 2023 at 09:49, Klaus Jensen wrote: > > > > On Jul 20 09:43, Peter Maydell wrote: > > > On Wed, 19 Jul 2023 at 21:13, Michael Tokarev wrote: > > > > > > > > 19.07.2023 10:36, Klaus Jensen wrote: > > > > pu(req->cmd.dptr.prp2); > > > > > +uint32_t v; > > > > > > > > > if (sq) { > > > > > +v = cpu_to_le32(sq->tail); > > > > > > > > > -pci_dma_write(pci, sq->db_addr, >tail, > > > > > sizeof(sq->tail)); > > > > > +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); > > > > > > > > This and similar cases hurts my eyes. > > > > > > > > Why we pass address of v here, but use sizeof(sq->tail) ? > > > > > > > > Yes, I know both in theory should be of the same size, but heck, > > > > this is puzzling at best, and confusing in a regular case. > > > > > > > > Dunno how it slipped in the review, it instantly catched my eye > > > > in a row of applied patches.. > > > > > > > > Also, why v is computed a few lines before it is used, with > > > > some expressions between the assignment and usage? > > > > > > > > How about the following patch: > > > > > > If you're going to change this, better to take the approach > > > Philippe suggested in review of using stl_le_pci_dma(). > > > > > > https://lore.kernel.org/qemu-devel/376e5e45-a3e7-0029-603a-b7ad9673f...@linaro.org/ > > > > > > > Yup, that was my plan for next. But the original patch was already > > verified on hardware and mutiple testes, so wanted to go with that for > > the "fix". > > > > But yes, I will refactor into the much nicer stl/ldl api. > > FWIW, I don't think this bug fix was so urgent that we > needed to go with a quick fix and a followup -- we're > not yet that close to 8.1 release. > Alright, noted ;) I will spin this into the other fix I have under review. signature.asc Description: PGP signature
Re: [PULL 1/1] hw/nvme: fix endianness issue for shadow doorbells
On Jul 20 09:43, Peter Maydell wrote: > On Wed, 19 Jul 2023 at 21:13, Michael Tokarev wrote: > > > > 19.07.2023 10:36, Klaus Jensen wrote: > > pu(req->cmd.dptr.prp2); > > > +uint32_t v; > > > > > if (sq) { > > > +v = cpu_to_le32(sq->tail); > > > > > -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); > > > +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); > > > > This and similar cases hurts my eyes. > > > > Why we pass address of v here, but use sizeof(sq->tail) ? > > > > Yes, I know both in theory should be of the same size, but heck, > > this is puzzling at best, and confusing in a regular case. > > > > Dunno how it slipped in the review, it instantly catched my eye > > in a row of applied patches.. > > > > Also, why v is computed a few lines before it is used, with > > some expressions between the assignment and usage? > > > > How about the following patch: > > If you're going to change this, better to take the approach > Philippe suggested in review of using stl_le_pci_dma(). > > https://lore.kernel.org/qemu-devel/376e5e45-a3e7-0029-603a-b7ad9673f...@linaro.org/ > Yup, that was my plan for next. But the original patch was already verified on hardware and mutiple testes, so wanted to go with that for the "fix". But yes, I will refactor into the much nicer stl/ldl api. signature.asc Description: PGP signature
[PATCH for-8.1] hw/nvme: fix compliance issue wrt. iosqes/iocqes
From: Klaus Jensen As of prior to this patch, the controller checks the value of CC.IOCQES and CC.IOSQES prior to enabling the controller. As reported by Ben in GitLab issue #1691, this is not spec compliant. The controller should only check these values when queues are created. This patch moves these checks to nvme_create_cq(). We do not need to check it in nvme_create_sq() since that will error out if the completion queue is not already created. Also, since the controlle exclusively supports SQEs of size 64 bytes and CQEs of size 16 bytes, hard code that. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1691 Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 46 hw/nvme/nvme.h | 9 +++-- hw/nvme/trace-events | 1 + 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8e8e870b9a80..414e9ea60e05 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1511,7 +1511,7 @@ static void nvme_post_cqes(void *opaque) req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); req->cqe.sq_id = cpu_to_le16(sq->sqid); req->cqe.sq_head = cpu_to_le16(sq->head); -addr = cq->dma_addr + cq->tail * n->cqe_size; +addr = cq->dma_addr + (cq->tail << NVME_CQES); ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)>cqe, sizeof(req->cqe)); if (ret) { @@ -5299,10 +5299,18 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) uint16_t qsize = le16_to_cpu(c->qsize); uint16_t qflags = le16_to_cpu(c->cq_flags); uint64_t prp1 = le64_to_cpu(c->prp1); +uint32_t cc = ldq_le_p(>bar.cc); +uint8_t iocqes = NVME_CC_IOCQES(cc); +uint8_t iosqes = NVME_CC_IOSQES(cc); trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, NVME_CQ_FLAGS_IEN(qflags) != 0); +if (iosqes != NVME_SQES || iocqes != NVME_CQES) { +trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes); +return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; +} + if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) { trace_pci_nvme_err_invalid_create_cq_cqid(cqid); return NVME_INVALID_QID | NVME_DNR; @@ -7003,7 +7011,7 @@ static void nvme_process_sq(void *opaque) } while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(>req_list))) { -addr = sq->dma_addr + sq->head * n->sqe_size; +addr = sq->dma_addr + (sq->head << NVME_SQES); if (nvme_addr_read(n, addr, (void *), sizeof(cmd))) { trace_pci_nvme_err_addr_read(addr); trace_pci_nvme_err_cfs(); @@ -7228,34 +7236,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) NVME_CAP_MPSMAX(cap)); return -1; } -if (unlikely(NVME_CC_IOCQES(cc) < - NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) { -trace_pci_nvme_err_startfail_cqent_too_small( -NVME_CC_IOCQES(cc), -NVME_CTRL_CQES_MIN(cap)); -return -1; -} -if (unlikely(NVME_CC_IOCQES(cc) > - NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) { -trace_pci_nvme_err_startfail_cqent_too_large( -NVME_CC_IOCQES(cc), -NVME_CTRL_CQES_MAX(cap)); -return -1; -} -if (unlikely(NVME_CC_IOSQES(cc) < - NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) { -trace_pci_nvme_err_startfail_sqent_too_small( -NVME_CC_IOSQES(cc), -NVME_CTRL_SQES_MIN(cap)); -return -1; -} -if (unlikely(NVME_CC_IOSQES(cc) > - NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) { -trace_pci_nvme_err_startfail_sqent_too_large( -NVME_CC_IOSQES(cc), -NVME_CTRL_SQES_MAX(cap)); -return -1; -} if (unlikely(!NVME_AQA_ASQS(aqa))) { trace_pci_nvme_err_startfail_asqent_sz_zero(); return -1; @@ -7268,8 +7248,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) n->page_bits = page_bits; n->page_size = page_size; n->max_prp_ents = n->page_size / sizeof(uint64_t); -n->cqe_size = 1 << NVME_CC_IOCQES(cc); -n->sqe_size = 1 << NVME_CC_IOSQES(cc); nvme_init_cq(>admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1); nvme_init_sq(>admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1); @@ -8238,8 +8216,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); -id->sqes = (0x6 << 4) | 0x6; -id->cqes = (0x4 << 4) | 0x4; +id->sqes = (NVME_SQES << 4) | NVME_SQES; +id->cqes = (NVME_CQES << 4) | NVME_CQES;
[PULL 1/1] hw/nvme: fix endianness issue for shadow doorbells
From: Klaus Jensen In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for doorbell buffers"), we fixed shadow doorbells for big-endian guests running on little endian hosts. But I did not fix little-endian guests on big-endian hosts. Fix this. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1765 Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support") Cc: qemu-sta...@nongnu.org Reported-by: Thomas Huth Tested-by: Cédric Le Goater Tested-by: Thomas Huth Reviewed-by: Keith Busch Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8e8e870b9a80..dadc2dc7da10 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) PCIDevice *pci = PCI_DEVICE(n); uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); +uint32_t v; int i; /* Address should be page aligned */ @@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) NvmeCQueue *cq = n->cq[i]; if (sq) { +v = cpu_to_le32(sq->tail); + /* * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) * nvme_process_db() uses this hard-coded way to calculate @@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) */ sq->db_addr = dbs_addr + (i << 3); sq->ei_addr = eis_addr + (i << 3); -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); if (n->params.ioeventfd && sq->sqid != 0) { if (!nvme_init_sq_ioeventfd(sq)) { @@ -6835,10 +6838,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) } if (cq) { +v = cpu_to_le32(cq->head); + /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */ cq->db_addr = dbs_addr + (i << 3) + (1 << 2); cq->ei_addr = eis_addr + (i << 3) + (1 << 2); -pci_dma_write(pci, cq->db_addr, >head, sizeof(cq->head)); +pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); if (n->params.ioeventfd && cq->cqid != 0) { if (!nvme_init_cq_ioeventfd(cq)) { @@ -7587,7 +7592,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) { PCIDevice *pci = PCI_DEVICE(n); -uint32_t qid; +uint32_t qid, v; if (unlikely(addr & ((1 << 2) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, @@ -7654,7 +7659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) start_sqs = nvme_cq_full(cq) ? 1 : 0; cq->head = new_head; if (!qid && n->dbbuf_enabled) { -pci_dma_write(pci, cq->db_addr, >head, sizeof(cq->head)); +v = cpu_to_le32(cq->head); +pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); } if (start_sqs) { NvmeSQueue *sq; @@ -7714,6 +7720,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) sq->tail = new_tail; if (!qid && n->dbbuf_enabled) { +v = cpu_to_le32(sq->tail); + /* * The spec states "the host shall also update the controller's * corresponding doorbell property to match the value of that entry @@ -7727,7 +7735,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) * including ones that run on Linux, are not updating Admin Queues, * so we can't trust reading it for an appropriate sq tail. */ -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); } qemu_bh_schedule(sq->bh); -- 2.41.0
[PULL 0/1] hw/nvme fixes
From: Klaus Jensen Hi, The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7: Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging (2023-07-17 15:49:27 +0100) are available in the Git repository at: https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request for you to fetch changes up to ea3c76f1494d0c75873c3b470e6e048202661ad8: hw/nvme: fix endianness issue for shadow doorbells (2023-07-19 09:33:54 +0200) hw/nvme fixes * fix shadow doorbell endian issue -BEGIN PGP SIGNATURE- iQEzBAABCgAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmS3kkAACgkQTeGvMW1P DenG1ggArIHi1dQQBIG1ubzHx/C+93cybpKwT73/5wfO7BT8CCh1v+qrH/6SsYUT 5O7y1MaCLDV4ocf5dRQseXFK0tpjo7EqDnr25UhcSunQ+d2Tn7MAIuubQOFD+Axh 5gIwOEJbKqw9apJgnVWnInTBd//ManOgh6OyC1uJ+DEJE7ISJzLlJeWaBekiWpAA hNL1zsR5+eTcwnewDRmMs4FlKBlSfgcNgNYnz8tfpnW0DzXKuiY4ITnk6kX9eMAM kDlbjFjlgoTPZ8IsYcyhVCJMcH8jqY/LuZcaF7XHHsdX7fa5p17C6rR1hxVyDs+E rydOtWetQDhXlyakE+Jp2RB3HLcSmg== =j1TL -END PGP SIGNATURE- Klaus Jensen (1): hw/nvme: fix endianness issue for shadow doorbells hw/nvme/ctrl.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) -- 2.41.0
Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells
On Jul 18 13:18, Philippe Mathieu-Daudé wrote: > On 18/7/23 12:35, Klaus Jensen wrote: > > From: Klaus Jensen > > > > In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for > > doorbell buffers"), we fixed shadow doorbells for big-endian guests > > running on little endian hosts. But I did not fix little-endian guests > > on big-endian hosts. Fix this. > > > > Solves issue #1765. > > > > Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support") > > Cc: qemu-sta...@nongnu.org > > Reported-by: Thomas Huth > > Signed-off-by: Klaus Jensen > > --- > > hw/nvme/ctrl.c | 18 +- > > 1 file changed, 13 insertions(+), 5 deletions(-) > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > index 8e8e870b9a80..dadc2dc7da10 100644 > > --- a/hw/nvme/ctrl.c > > +++ b/hw/nvme/ctrl.c > > @@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const > > NvmeRequest *req) > > PCIDevice *pci = PCI_DEVICE(n); > > uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); > > uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); > > +uint32_t v; > > int i; > > /* Address should be page aligned */ > > @@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const > > NvmeRequest *req) > > NvmeCQueue *cq = n->cq[i]; > > if (sq) { > > +v = cpu_to_le32(sq->tail); > > + > > /* > >* CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) > >* nvme_process_db() uses this hard-coded way to calculate > > @@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const > > NvmeRequest *req) > >*/ > > sq->db_addr = dbs_addr + (i << 3); > > sq->ei_addr = eis_addr + (i << 3); > > -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); > > +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); > > Or use the PCI DMA ldst API which does the swapping for you: > > stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED); > Nice! That's definitely preferable. I'll queue that up for next, but leave this patch as-is since it's been tested on a real big-endian host and gotten its tags ;) signature.asc Description: PGP signature
[PATCH] hw/nvme: fix endianness issue for shadow doorbells
From: Klaus Jensen In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for doorbell buffers"), we fixed shadow doorbells for big-endian guests running on little endian hosts. But I did not fix little-endian guests on big-endian hosts. Fix this. Solves issue #1765. Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support") Cc: qemu-sta...@nongnu.org Reported-by: Thomas Huth Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8e8e870b9a80..dadc2dc7da10 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) PCIDevice *pci = PCI_DEVICE(n); uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); +uint32_t v; int i; /* Address should be page aligned */ @@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) NvmeCQueue *cq = n->cq[i]; if (sq) { +v = cpu_to_le32(sq->tail); + /* * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3) * nvme_process_db() uses this hard-coded way to calculate @@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) */ sq->db_addr = dbs_addr + (i << 3); sq->ei_addr = eis_addr + (i << 3); -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); if (n->params.ioeventfd && sq->sqid != 0) { if (!nvme_init_sq_ioeventfd(sq)) { @@ -6835,10 +6838,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) } if (cq) { +v = cpu_to_le32(cq->head); + /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */ cq->db_addr = dbs_addr + (i << 3) + (1 << 2); cq->ei_addr = eis_addr + (i << 3) + (1 << 2); -pci_dma_write(pci, cq->db_addr, >head, sizeof(cq->head)); +pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); if (n->params.ioeventfd && cq->cqid != 0) { if (!nvme_init_cq_ioeventfd(cq)) { @@ -7587,7 +7592,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) { PCIDevice *pci = PCI_DEVICE(n); -uint32_t qid; +uint32_t qid, v; if (unlikely(addr & ((1 << 2) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned, @@ -7654,7 +7659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) start_sqs = nvme_cq_full(cq) ? 1 : 0; cq->head = new_head; if (!qid && n->dbbuf_enabled) { -pci_dma_write(pci, cq->db_addr, >head, sizeof(cq->head)); +v = cpu_to_le32(cq->head); +pci_dma_write(pci, cq->db_addr, , sizeof(cq->head)); } if (start_sqs) { NvmeSQueue *sq; @@ -7714,6 +7720,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) sq->tail = new_tail; if (!qid && n->dbbuf_enabled) { +v = cpu_to_le32(sq->tail); + /* * The spec states "the host shall also update the controller's * corresponding doorbell property to match the value of that entry @@ -7727,7 +7735,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) * including ones that run on Linux, are not updating Admin Queues, * so we can't trust reading it for an appropriate sq tail. */ -pci_dma_write(pci, sq->db_addr, >tail, sizeof(sq->tail)); +pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail)); } qemu_bh_schedule(sq->bh); -- 2.41.0