Re: [Qemu-devel] [PATCH v3 11/23] hw/pvrdma: Add support to allow guest to configure GID table
On Sat, Nov 17, 2018 at 02:48:34PM +0200, Marcel Apfelbaum wrote: > > > On 11/13/18 9:13 AM, Yuval Shaia wrote: > > The control over the RDMA device's GID table is done by updating the > > device's Ethernet function addresses. > > Usually the first GID entry is determine by the MAC address, the second > > s/determine/determined > > > by the first IPv6 address and the third by the IPv4 address. Other > > entries can be added by adding more IP addresses. The opposite is the > > same, i.e. whenever an address is removed, the corresponding GID entry > > is removed. > > > > The process is done by the network and RDMA stacks. Whenever an address > > is added the ib_core driver is notified and calls the device driver > > add_gid function which in turn update the device. > > > > To support this in pvrdma device we need to hook into the create_bind > > and destroy_bind HW commands triggered by pvrdma driver in guest. > > Whenever a changed is made to the pvrdma device's GID table a special > without 'a' > > > QMP messages is sent to be processed by libvirt to update the address of > > the backend Ethernet device. > > So the device can't be used without libvirt? How can we > use it anyway only with QEMU ? We can't. (actually we can but this involve a hack since we have to know in advance the IP addresses that will be assigned to the net device in the guest) > > > > > Signed-off-by: Yuval Shaia > > --- > > hw/rdma/rdma_backend.c | 243 +++- > > rdma_backend.c is gettting larger... 1261 lines only, let's revisit it on next major change. > > > hw/rdma/rdma_backend.h | 22 ++-- > > hw/rdma/rdma_backend_defs.h | 3 +- > > hw/rdma/rdma_rm.c | 104 ++- > > hw/rdma/rdma_rm.h | 17 ++- > > hw/rdma/rdma_rm_defs.h | 9 +- > > hw/rdma/rdma_utils.h| 15 +++ > > hw/rdma/vmw/pvrdma.h| 2 +- > > hw/rdma/vmw/pvrdma_cmd.c| 55 > > hw/rdma/vmw/pvrdma_main.c | 25 +--- > > hw/rdma/vmw/pvrdma_qp_ops.c | 20 +++ > > 11 files changed, 370 insertions(+), 145 deletions(-) > > > > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c > > index 3eb0099f8d..5675504165 100644 > > --- a/hw/rdma/rdma_backend.c > > +++ b/hw/rdma/rdma_backend.c > > @@ -18,12 +18,14 @@ > > #include "qapi/error.h" > > #include "qapi/qmp/qlist.h" > > #include "qapi/qmp/qnum.h" > > +#include "qapi/qapi-events-rdma.h" > > #include > > #include > > #include > > #include > > +#include "contrib/rdmacm-mux/rdmacm-mux.h" > > #include "trace.h" > > #include "rdma_utils.h" > > #include "rdma_rm.h" > > @@ -300,11 +302,11 @@ static int build_host_sge_array(RdmaDeviceResources > > *rdma_dev_res, > > return 0; > > } > > -static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, > > -uint32_t num_sge) > > +static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx, > > +union ibv_gid *sgid, struct ibv_sge *sge, uint32_t > > num_sge) > > { > > -struct backend_umad umad = {0}; > > -char *hdr, *msg; > > +RdmaCmMuxMsg msg = {0}; > > +char *hdr, *data; > > int ret; > > pr_dbg("num_sge=%d\n", num_sge); > > @@ -313,41 +315,50 @@ static int mad_send(RdmaBackendDev *backend_dev, > > struct ibv_sge *sge, > > return -EINVAL; > > } > > -umad.hdr.length = sge[0].length + sge[1].length; > > -pr_dbg("msg_len=%d\n", umad.hdr.length); > > +msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD; > > +memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid)); > > -if (umad.hdr.length > sizeof(umad.mad)) { > > +msg.umad_len = sge[0].length + sge[1].length; > > +pr_dbg("umad_len=%d\n", msg.umad_len); > > + > > +if (msg.umad_len > sizeof(msg.umad.mad)) { > > return -ENOMEM; > > } > > -umad.hdr.addr.qpn = htobe32(1); > > -umad.hdr.addr.grh_present = 1; > > -umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; > > -memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, > > sizeof(umad.hdr.addr.gid)); > > -umad.hdr.addr.hop_limit = 1; > > +msg.umad.hdr.addr.qpn = htobe32(1); > > +msg.umad.hdr.addr.grh_present = 1; > > +pr_dbg("sgid_idx=%d\n", sgid_idx); > > +pr_dbg("sgid=0x%llx\n", sgid->global.interface_id); > > +msg.umad.hdr.addr.gid_index = sgid_idx; > > +memcpy(msg.umad.hdr.addr.gid, sgid->raw, > > sizeof(msg.umad.hdr.addr.gid)); > > +msg.umad.hdr.addr.hop_limit = 1; > > hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); > > -msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); > > +data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); > > + > > +pr_dbg_buf("mad_hdr", hdr, sge[0].length); > > +pr_dbg_buf("mad_data", data, sge[1].length); > > -memcpy([0], hdr, sge[0].length); > > -memcpy([sge[0].length], msg, sge[1].length); > > +memcpy([0],
Re: [Qemu-devel] [PATCH v2 13/22] hw/pvrdma: Make sure PCI function 0 is vmxnet3
On Sat, Nov 17, 2018 at 01:41:41PM +0200, Marcel Apfelbaum wrote: > > > On 11/11/18 9:45 AM, Yuval Shaia wrote: > > On Sat, Nov 10, 2018 at 08:27:44PM +0200, Marcel Apfelbaum wrote: > > > > > > On 11/8/18 6:08 PM, Yuval Shaia wrote: > > > > Guest driver enforces it, we should also. > > > > > > > > Signed-off-by: Yuval Shaia > > > > --- > > > >hw/rdma/vmw/pvrdma.h | 2 ++ > > > >hw/rdma/vmw/pvrdma_main.c | 3 +++ > > > >2 files changed, 5 insertions(+) > > > > > > > > diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h > > > > index b019cb843a..10a3c4fb7c 100644 > > > > --- a/hw/rdma/vmw/pvrdma.h > > > > +++ b/hw/rdma/vmw/pvrdma.h > > > > @@ -20,6 +20,7 @@ > > > >#include "hw/pci/pci.h" > > > >#include "hw/pci/msix.h" > > > >#include "chardev/char-fe.h" > > > > +#include "hw/net/vmxnet3_defs.h" > > > >#include "../rdma_backend_defs.h" > > > >#include "../rdma_rm_defs.h" > > > > @@ -85,6 +86,7 @@ typedef struct PVRDMADev { > > > >RdmaBackendDev backend_dev; > > > >RdmaDeviceResources rdma_dev_res; > > > >CharBackend mad_chr; > > > > +VMXNET3State *func0; > > > >} PVRDMADev; > > > >#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), > > > > PVRDMA_HW_NAME) > > > > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c > > > > index ac8c092db0..fa6468d221 100644 > > > > --- a/hw/rdma/vmw/pvrdma_main.c > > > > +++ b/hw/rdma/vmw/pvrdma_main.c > > > > @@ -576,6 +576,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error > > > > **errp) > > > >return; > > > >} > > > > +/* Break if not vmxnet3 device in slot 0 */ > > > > +dev->func0 = VMXNET3(pci_get_function_0(pdev)); > > > > + > > > I don't see the error code flow in case VMXNET3 is not func 0. > > > Am I missing something? > > Yes, this is a dynamic cast that will break the process when fail to cast. > > > > This is the error message that you will get in case that device on function > > 0 is not vmxnet3: > > > > pvrdma_main.c:589:pvrdma_realize: Object 0x557b959841a0 is not an instance > > of type vmxnet3 > > I am not sure we will see this error if QEMU is compiled in Release mode. > I think object_dynamic_cast_assert throws this error only if > CONFIG_QOM_CAST_DEBUG > is set, and is possible the mentioned flag is not set in Release. > > Thanks, > Marcel Done. Thanks. > > > > > > > > > Thanks, > > > Marcel > > > > > > >memdev_root = object_resolve_path("/objects", NULL); > > > >if (memdev_root) { > > > >object_child_foreach(memdev_root, pvrdma_check_ram_shared, > > > > _shared); >
Re: [Qemu-devel] [PATCH v3 05/23] hw/rdma: Add support for MAD packets
On Sat, Nov 17, 2018 at 02:06:48PM +0200, Marcel Apfelbaum wrote: > Hi Yuval, > > On 11/13/18 9:12 AM, Yuval Shaia wrote: > > MAD (Management Datagram) packets are widely used by various modules > > Please add a link to Spec, I sent it in the V1 mail-thread > Please add it also as a comment in the code. I know MADs > are a complicated matter, but if somebody wants to have a look... Added in v4, thanks. > > > both in kernel and in user space for example the rdma_* API which is > > used to create and maintain "connection" layer on top of RDMA uses > > several types of MAD packets. > > To support MAD packets the device uses an external utility > > (contrib/rdmacm-mux) to relay packets from and to the guest driver. > > > > Signed-off-by: Yuval Shaia > > --- > > hw/rdma/rdma_backend.c | 263 +++- > > hw/rdma/rdma_backend.h | 4 +- > > hw/rdma/rdma_backend_defs.h | 10 +- > > hw/rdma/vmw/pvrdma.h| 2 + > > hw/rdma/vmw/pvrdma_main.c | 4 +- > > 5 files changed, 273 insertions(+), 10 deletions(-) > > > > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c > > index 1e148398a2..3eb0099f8d 100644 > > --- a/hw/rdma/rdma_backend.c > > +++ b/hw/rdma/rdma_backend.c > > rdma_backend is getting huge, have you consider taking out > mad related code? 1261 lines only, is that so big? comparing to other files under hw/ i don't see much difference here. > > @@ -16,8 +16,13 @@ > > #include "qemu/osdep.h" > > #include "qemu/error-report.h" > > #include "qapi/error.h" > > +#include "qapi/qmp/qlist.h" > > +#include "qapi/qmp/qnum.h" > > #include > > +#include > > +#include > > +#include > > #include "trace.h" > > #include "rdma_utils.h" > > @@ -33,16 +38,25 @@ > > #define VENDOR_ERR_MAD_SEND 0x206 > > #define VENDOR_ERR_INVLKEY 0x207 > > #define VENDOR_ERR_MR_SMALL 0x208 > > +#define VENDOR_ERR_INV_MAD_BUFF 0x209 > > +#define VENDOR_ERR_INV_NUM_SGE 0x210 > > #define THR_NAME_LEN 16 > > #define THR_POLL_TO 5000 > > +#define MAD_HDR_SIZE sizeof(struct ibv_grh) > > + > > typedef struct BackendCtx { > > -uint64_t req_id; > > void *up_ctx; > > bool is_tx_req; > > +struct ibv_sge sge; /* Used to save MAD recv buffer */ > > } BackendCtx; > > +struct backend_umad { > > +struct ib_user_mad hdr; > > +char mad[RDMA_MAX_PRIVATE_DATA]; > > +}; > > + > > static void (*comp_handler)(int status, unsigned int vendor_err, void > > *ctx); > > static void dummy_comp_handler(int status, unsigned int vendor_err, void > > *ctx) > > @@ -286,6 +300,49 @@ static int build_host_sge_array(RdmaDeviceResources > > *rdma_dev_res, > > return 0; > > } > > +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, > > +uint32_t num_sge) > > +{ > > +struct backend_umad umad = {0}; > > +char *hdr, *msg; > > +int ret; > > + > > +pr_dbg("num_sge=%d\n", num_sge); > > + > > +if (num_sge != 2) { > > +return -EINVAL; > > +} > > + > > +umad.hdr.length = sge[0].length + sge[1].length; > > +pr_dbg("msg_len=%d\n", umad.hdr.length); > > + > > +if (umad.hdr.length > sizeof(umad.mad)) { > > +return -ENOMEM; > > +} > > + > > +umad.hdr.addr.qpn = htobe32(1); > > +umad.hdr.addr.grh_present = 1; > > +umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; > > +memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, > > sizeof(umad.hdr.addr.gid)); > > +umad.hdr.addr.hop_limit = 1; > > + > > +hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); > > +msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); > > + > > If rdma_pci_dma_map fails it will return NULL Done. > > > +memcpy([0], hdr, sge[0].length); > > +memcpy([sge[0].length], msg, sge[1].length); > > + > > ... and here we access a NULL pointer. > Maybe is possible to return some error here. > > +rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length); > > +rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); > > + > > +ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t > > *), > > +sizeof(umad)); > > + > > +pr_dbg("qemu_chr_fe_write=%d\n", ret); > > + > > +return (ret != sizeof(umad)); > > +} > > + > > void rdma_backend_post_send(RdmaBackendDev *backend_dev, > > RdmaBackendQP *qp, uint8_t qp_type, > > struct ibv_sge *sge, uint32_t num_sge, > > @@ -304,9 +361,13 @@ void rdma_backend_post_send(RdmaBackendDev > > *backend_dev, > > comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); > > } else if (qp_type == IBV_QPT_GSI) { > > pr_dbg("QP1\n"); > > -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); > > +rc = mad_send(backend_dev, sge, num_sge); > > +if (rc) { > > +
Re: [Qemu-devel] [PATCH 2/2] scsi-disk: fix rerror/werror=ignore
On Sat, Oct 13, 2018 at 11:53:23AM +0200, Paolo Bonzini wrote: > rerror=ignore was returning true from scsi_handle_rw_error but the callers > were not > calling scsi_req_complete when rerror=ignore returns true (this is the > correct thing > to do when true is returned after executing a passthrough command). Fix this > by > calling it in scsi_handle_rw_error. > > Signed-off-by: Paolo Bonzini > --- > hw/scsi/scsi-disk.c | 7 ++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c > index 4074d7c..e2c5408 100644 > --- a/hw/scsi/scsi-disk.c > +++ b/hw/scsi/scsi-disk.c > @@ -473,10 +473,15 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int > error, bool acct_failed) > } > > blk_error_action(s->qdev.conf.blk, action, is_read, error); > +if (action == BLOCK_ERROR_ACTION_IGNORE) { > +scsi_req_complete(>req, 0); > +return true; > +} > + > if (action == BLOCK_ERROR_ACTION_STOP) { > scsi_req_retry(>req); > } > -return action != BLOCK_ERROR_ACTION_IGNORE; > +return false; > } This commit seems to cause an assertion failure in qemu which is trivial to reproduce: (1) Create an NBD disk which returns EIO for every request: $ nbdkit -f -v --filter=error memory size=64M error-rate=100% (2) Attach the disk to qemu as a virtio-scsi device: $ x86_64-softmmu/qemu-system-x86_64 -device virtio-scsi,id=scsi -drive file=nbd:localhost:10809,format=raw,id=hd0,if=none -device scsi-hd,drive=hd0 qemu-system-x86_64: hw/scsi/scsi-bus.c:1374: scsi_req_complete: Assertion `req->status == -1' failed. More details including a stack trace here: https://bugzilla.redhat.com/show_bug.cgi?id=1650975 Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-builder quickly builds VMs from scratch http://libguestfs.org/virt-builder.1.html
[Qemu-devel] [PATCH v4 16/23] hw/pvrdma: Fill all CQE fields
Add ability to pass specific WC attributes to CQE such as GRH_BIT flag. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 59 +++-- hw/rdma/rdma_backend.h | 4 +-- hw/rdma/vmw/pvrdma_qp_ops.c | 31 +++ 3 files changed, 58 insertions(+), 36 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index db78009bbb..b1259012e5 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -59,13 +59,24 @@ struct backend_umad { char mad[RDMA_MAX_PRIVATE_DATA]; }; -static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); +static void (*comp_handler)(void *ctx, struct ibv_wc *wc); -static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) +static void dummy_comp_handler(void *ctx, struct ibv_wc *wc) { pr_err("No completion handler is registered\n"); } +static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err, + void *ctx) +{ +struct ibv_wc wc = {0}; + +wc.status = status; +wc.vendor_err = vendor_err; + +comp_handler(ctx, ); +} + static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq) { int i, ne; @@ -90,7 +101,7 @@ static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq) } pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv"); -comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx); +comp_handler(bctx->up_ctx, [i]); rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id); g_free(bctx); @@ -184,8 +195,8 @@ static void start_comp_thread(RdmaBackendDev *backend_dev) comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED); } -void rdma_backend_register_comp_handler(void (*handler)(int status, -unsigned int vendor_err, void *ctx)) +void rdma_backend_register_comp_handler(void (*handler)(void *ctx, + struct ibv_wc *wc)) { comp_handler = handler; } @@ -412,14 +423,14 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */ if (qp_type == IBV_QPT_SMI) { pr_dbg("QP0 unsupported\n"); -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); } else if (qp_type == IBV_QPT_GSI) { pr_dbg("QP1\n"); rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge); if (rc) { -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); } else { -comp_handler(IBV_WC_SUCCESS, 0, ctx); +complete_work(IBV_WC_SUCCESS, 0, ctx); } } return; @@ -428,7 +439,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, pr_dbg("num_sge=%d\n", num_sge); if (!num_sge) { pr_dbg("num_sge=0\n"); -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx); return; } @@ -439,21 +450,21 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx); if (unlikely(rc)) { pr_dbg("Failed to allocate cqe_ctx\n"); -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx); goto out_free_bctx; } rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge); if (rc) { pr_dbg("Error: Failed to build host SGE array\n"); -comp_handler(IBV_WC_GENERAL_ERR, rc, ctx); +complete_work(IBV_WC_GENERAL_ERR, rc, ctx); goto out_dealloc_cqe_ctx; } if (qp_type == IBV_QPT_UD) { wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid); if (!wr.wr.ud.ah) { -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); goto out_dealloc_cqe_ctx; } wr.wr.ud.remote_qpn = dqpn; @@ -471,7 +482,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, if (rc) { pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno, qp->ibqp->qp_num); -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); +complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); goto out_dealloc_cqe_ctx; } @@ -540,13 +551,13 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev, if (!qp->ibqp) { /* This field does not get
[Qemu-devel] [PATCH v4 19/23] vl: Introduce shutdown_notifiers
Notifier will be used for signaling shutdown event to inform system is shutdown. This will allow devices and other component to run some cleanup code needed before VM is shutdown. Signed-off-by: Yuval Shaia Reviewed-by: Cornelia Huck --- include/sysemu/sysemu.h | 1 + vl.c| 15 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 8d6095d98b..0d15f16492 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -80,6 +80,7 @@ void qemu_register_wakeup_notifier(Notifier *notifier); void qemu_system_shutdown_request(ShutdownCause reason); void qemu_system_powerdown_request(void); void qemu_register_powerdown_notifier(Notifier *notifier); +void qemu_register_shutdown_notifier(Notifier *notifier); void qemu_system_debug_request(void); void qemu_system_vmstop_request(RunState reason); void qemu_system_vmstop_request_prepare(void); diff --git a/vl.c b/vl.c index 1fcacc5caa..d33d52522c 100644 --- a/vl.c +++ b/vl.c @@ -1578,6 +1578,8 @@ static NotifierList suspend_notifiers = NOTIFIER_LIST_INITIALIZER(suspend_notifiers); static NotifierList wakeup_notifiers = NOTIFIER_LIST_INITIALIZER(wakeup_notifiers); +static NotifierList shutdown_notifiers = +NOTIFIER_LIST_INITIALIZER(shutdown_notifiers); static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE); ShutdownCause qemu_shutdown_requested_get(void) @@ -1809,6 +1811,12 @@ static void qemu_system_powerdown(void) notifier_list_notify(_notifiers, NULL); } +static void qemu_system_shutdown(ShutdownCause cause) +{ +qapi_event_send_shutdown(shutdown_caused_by_guest(cause)); +notifier_list_notify(_notifiers, ); +} + void qemu_system_powerdown_request(void) { trace_qemu_system_powerdown_request(); @@ -1821,6 +1829,11 @@ void qemu_register_powerdown_notifier(Notifier *notifier) notifier_list_add(_notifiers, notifier); } +void qemu_register_shutdown_notifier(Notifier *notifier) +{ +notifier_list_add(_notifiers, notifier); +} + void qemu_system_debug_request(void) { debug_requested = 1; @@ -1848,7 +1861,7 @@ static bool main_loop_should_exit(void) request = qemu_shutdown_requested(); if (request) { qemu_kill_report(); -qapi_event_send_shutdown(shutdown_caused_by_guest(request)); +qemu_system_shutdown(request); if (no_shutdown) { vm_stop(RUN_STATE_SHUTDOWN); } else { -- 2.17.2
[Qemu-devel] [PATCH v4 02/23] hw/rdma: Add ability to force notification without re-arm
Upon completion of incoming packet the device pushes CQE to driver's RX ring and notify the driver (msix). While for data-path incoming packets the driver needs the ability to control whether it wished to receive interrupts or not, for control-path packets such as incoming MAD the driver needs to be notified anyway, it even do not need to re-arm the notification bit. Enhance the notification field to support this. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 12 ++-- hw/rdma/rdma_rm_defs.h | 8 +++- hw/rdma/vmw/pvrdma_qp_ops.c | 6 -- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 8d59a42cd1..4f10fcabcc 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -263,7 +263,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, } cq->opaque = opaque; -cq->notify = false; +cq->notify = CNT_CLEAR; rc = rdma_backend_create_cq(backend_dev, >backend_cq, cqe); if (rc) { @@ -291,7 +291,10 @@ void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle, return; } -cq->notify = notify; +if (cq->notify != CNT_SET) { +cq->notify = notify ? CNT_ARM : CNT_CLEAR; +} + pr_dbg("notify=%d\n", cq->notify); } @@ -349,6 +352,11 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle, return -EINVAL; } +if (qp_type == IBV_QPT_GSI) { +scq->notify = CNT_SET; +rcq->notify = CNT_SET; +} + qp = res_tbl_alloc(_res->qp_tbl, _qpn); if (!qp) { return -ENOMEM; diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h index 7228151239..9b399063d3 100644 --- a/hw/rdma/rdma_rm_defs.h +++ b/hw/rdma/rdma_rm_defs.h @@ -49,10 +49,16 @@ typedef struct RdmaRmPD { uint32_t ctx_handle; } RdmaRmPD; +typedef enum CQNotificationType { +CNT_CLEAR, +CNT_ARM, +CNT_SET, +} CQNotificationType; + typedef struct RdmaRmCQ { RdmaBackendCQ backend_cq; void *opaque; -bool notify; +CQNotificationType notify; } RdmaRmCQ; /* MR (DMA region) */ diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index c668afd0ed..762700a205 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -89,8 +89,10 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle, pvrdma_ring_write_inc(>dsr_info.cq); pr_dbg("cq->notify=%d\n", cq->notify); -if (cq->notify) { -cq->notify = false; +if (cq->notify != CNT_CLEAR) { +if (cq->notify == CNT_ARM) { +cq->notify = CNT_CLEAR; +} post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q); } -- 2.17.2
Re: [Qemu-devel] [PATCH-for-3.1] ps2kbd: default to scan enabled after reset
Ping again. v3.0 didn't contain 143c04c7e0639e53086519592ead15d2556bfbf2, so this commit fixes a regression. Le 10/11/2018 à 21:53, Hervé Poussineau a écrit : Ping. Le 21/10/2018 à 21:07, Hervé Poussineau a écrit : A check for scan_enabled has been added to ps2_keyboard_event in commit 143c04c7e0639e53086519592ead15d2556bfbf2 to prevent stream corruption. This works well as long as operating system is resetting keyboard, or enabling it. This fixes IBM 40p firmware, which doesn't bother sending KBD_CMD_RESET, KBD_CMD_ENABLE or KBD_CMD_RESET_ENABLE before trying to use the keyboard. Signed-off-by: Hervé Poussineau --- hw/input/ps2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/input/ps2.c b/hw/input/ps2.c index fdfcadf9a1..eded4f0f8d 100644 --- a/hw/input/ps2.c +++ b/hw/input/ps2.c @@ -938,7 +938,7 @@ static void ps2_kbd_reset(void *opaque) trace_ps2_kbd_reset(opaque); ps2_common_reset(>common); - s->scan_enabled = 0; + s->scan_enabled = 1; s->translate = 0; s->scancode_set = 2; s->modifiers = 0;
[Qemu-devel] [PATCH v4 17/23] hw/pvrdma: Fill error code in command's response
Driver checks error code let's set it. Signed-off-by: Yuval Shaia --- hw/rdma/vmw/pvrdma_cmd.c | 67 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index 0d3c818c20..a326c5d470 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req, if (rdma_backend_query_port(>backend_dev, (struct ibv_port_attr *))) { -return -ENOMEM; +resp->hdr.err = -ENOMEM; +goto out; } memset(resp, 0, sizeof(*resp)); @@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req, resp->attrs.active_width = 1; resp->attrs.active_speed = 1; -return 0; +out: +pr_dbg("ret=%d\n", resp->hdr.err); +return resp->hdr.err; } static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req, resp->pkey = PVRDMA_PKEY; pr_dbg("pkey=0x%x\n", resp->pkey); -return 0; +return resp->hdr.err; } static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req *req, rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle); -return 0; +rsp->hdr.err = 0; + +return rsp->hdr.err; } static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req *req, rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle); -return 0; +rsp->hdr.err = 0; + +return rsp->hdr.err; } static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring, @@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req, cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle); if (!cq) { pr_dbg("Invalid CQ handle\n"); -return -EINVAL; +rsp->hdr.err = -EINVAL; +goto out; } ring = (PvrdmaRing *)cq->opaque; @@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req, rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle); -return 0; +rsp->hdr.err = 0; + +out: +pr_dbg("ret=%d\n", rsp->hdr.err); +return rsp->hdr.err; } static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma, @@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle); if (!qp) { pr_dbg("Invalid QP handle\n"); -return -EINVAL; +rsp->hdr.err = -EINVAL; +goto out; } rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle); @@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE); g_free(ring); -return 0; +rsp->hdr.err = 0; + +out: +pr_dbg("ret=%d\n", rsp->hdr.err); +return rsp->hdr.err; } static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, pr_dbg("index=%d\n", cmd->index); if (cmd->index >= MAX_PORT_GIDS) { -return -EINVAL; +rsp->hdr.err = -EINVAL; +goto out; } pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index, @@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev, dev->backend_eth_device_name, gid, cmd->index); if (rc < 0) { -return -EINVAL; +rsp->hdr.err = rc; +goto out; } -return 0; +rsp->hdr.err = 0; + +out: +pr_dbg("ret=%d\n", rsp->hdr.err); +return rsp->hdr.err; } static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -606,7 +629,8 @@ static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, pr_dbg("index=%d\n", cmd->index); if (cmd->index >= MAX_PORT_GIDS) { -return -EINVAL; +rsp->hdr.err = -EINVAL; +goto out; } rc = rdma_rm_del_gid(>rdma_dev_res, >backend_dev, @@ -617,7 +641,11 @@ static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, goto out; } -return 0; +rsp->hdr.err = 0; + +out: +pr_dbg("ret=%d\n", rsp->hdr.err); +return rsp->hdr.err; } static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req, @@ -634,9 +662,8 @@ static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req, resp->hdr.err = rdma_rm_alloc_uc(>rdma_dev_res, cmd->pfn, >ctx_handle); -pr_dbg("ret=%d\n", resp->hdr.err); - -return 0; +pr_dbg("ret=%d\n", rsp->hdr.err); +return rsp->hdr.err; } static int destroy_uc(PVRDMADev *dev, union
[Qemu-devel] [Bug 1803872] [NEW] gcc 8.2 reports stringop-truncation when building qemu
Public bug reported: QEMU 3.0 block/sheepdog.c: In function 'find_vdi_name': block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); ^~ If this is the intended behavior, please suppress the warning. For example: #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); #pragma GCC diagnostic pop ** Affects: qemu Importance: Undecided Status: New -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1803872 Title: gcc 8.2 reports stringop-truncation when building qemu Status in QEMU: New Bug description: QEMU 3.0 block/sheepdog.c: In function 'find_vdi_name': block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); ^~ If this is the intended behavior, please suppress the warning. For example: #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); #pragma GCC diagnostic pop To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1803872/+subscriptions
Re: [Qemu-devel] [PATCH v3 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid
On Sat, Nov 17, 2018 at 02:25:55PM +0200, Marcel Apfelbaum wrote: > > > On 11/13/18 9:13 AM, Yuval Shaia wrote: > > When device goes down the function fini_ports loops over all entries in > > gid table regardless of the fact whether entry is valid or not. In case > > that entry is not valid we'd like to skip from any further processing in > > backend device. > > > > Signed-off-by: Yuval Shaia > > --- > > hw/rdma/rdma_rm.c | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c > > index 35a96d9a64..e3f6b2f6ea 100644 > > --- a/hw/rdma/rdma_rm.c > > +++ b/hw/rdma/rdma_rm.c > > @@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, > > RdmaBackendDev *backend_dev, > > { > > int rc; > > +if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) { > > +return 0; > > +} > > + > > rc = rdma_backend_del_gid(backend_dev, ifname, > > _res->port.gid_tbl[gid_idx].gid); > > if (rc < 0) { > > Reviewed-by: Marcel Apfelbaum There seems to be a missing space separator between "Apfelbaum" and "<". Is that ok? > > Thanks, > Marcel >
[Qemu-devel] [PATCH v4 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer
RDMA MAD kernel module (ibcm) disallow more than one MAD-agent for a given MAD class. This does not go hand-by-hand with qemu pvrdma device's requirements where each VM is MAD agent. Fix it by adding implementation of RDMA MAD multiplexer service which on one hand register as a sole MAD agent with the kernel module and on the other hand gives service to more than one VM. Design Overview: A server process is registered to UMAD framework (for this to work the rdma_cm kernel module needs to be unloaded) and creates a unix socket to listen to incoming request from clients. A client process (such as QEMU) connects to this unix socket and registers with its own GID. TX: --- When client needs to send rdma_cm MAD message it construct it the same way as without this multiplexer, i.e. creates a umad packet but this time it writes its content to the socket instead of calling umad_send(). The server, upon receiving such a message fetch local_comm_id from it so a context for this session can be maintain and relay the message to UMAD layer by calling umad_send(). RX: --- The server creates a worker thread to process incoming rdma_cm MAD messages. When an incoming message arrived (umad_recv()) the server, depending on the message type (attr_id) looks for target client by either searching in gid->fd table or in local_comm_id->fd table. With the extracted fd the server relays to incoming message to the client. Signed-off-by: Yuval Shaia Reviewed-by: Shamir Rabinovitch --- MAINTAINERS | 1 + Makefile | 3 + Makefile.objs| 1 + contrib/rdmacm-mux/Makefile.objs | 4 + contrib/rdmacm-mux/main.c| 780 +++ contrib/rdmacm-mux/rdmacm-mux.h | 56 +++ 6 files changed, 845 insertions(+) create mode 100644 contrib/rdmacm-mux/Makefile.objs create mode 100644 contrib/rdmacm-mux/main.c create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h diff --git a/MAINTAINERS b/MAINTAINERS index 98a1856afc..e087d58ac6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2231,6 +2231,7 @@ S: Maintained F: hw/rdma/* F: hw/rdma/vmw/* F: docs/pvrdma.txt +F: contrib/rdmacm-mux/* Build and test automation - diff --git a/Makefile b/Makefile index f2947186a4..94072776ff 100644 --- a/Makefile +++ b/Makefile @@ -418,6 +418,7 @@ dummy := $(call unnest-vars,, \ elf2dmp-obj-y \ ivshmem-client-obj-y \ ivshmem-server-obj-y \ +rdmacm-mux-obj-y \ libvhost-user-obj-y \ vhost-user-scsi-obj-y \ vhost-user-blk-obj-y \ @@ -725,6 +726,8 @@ vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) libvhost-user.a $(call LINK, $^) vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a $(call LINK, $^) +rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) + $(call LINK, $^) module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak $(call quiet-command,$(PYTHON) $< $@ \ diff --git a/Makefile.objs b/Makefile.objs index 1e1ff387d7..cc7df3ad80 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -194,6 +194,7 @@ vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS) vhost-user-scsi.o-libs := $(LIBISCSI_LIBS) vhost-user-scsi-obj-y = contrib/vhost-user-scsi/ vhost-user-blk-obj-y = contrib/vhost-user-blk/ +rdmacm-mux-obj-y = contrib/rdmacm-mux/ ## trace-events-subdirs = diff --git a/contrib/rdmacm-mux/Makefile.objs b/contrib/rdmacm-mux/Makefile.objs new file mode 100644 index 00..be3eacb6f7 --- /dev/null +++ b/contrib/rdmacm-mux/Makefile.objs @@ -0,0 +1,4 @@ +ifdef CONFIG_PVRDMA +CFLAGS += -libumad -Wno-format-truncation +rdmacm-mux-obj-y = main.o +endif diff --git a/contrib/rdmacm-mux/main.c b/contrib/rdmacm-mux/main.c new file mode 100644 index 00..98de95ee64 --- /dev/null +++ b/contrib/rdmacm-mux/main.c @@ -0,0 +1,780 @@ +/* + * QEMU paravirtual RDMA - rdmacm-mux implementation + * + * Copyright (C) 2018 Oracle + * Copyright (C) 2018 Red Hat Inc + * + * Authors: + * Yuval Shaia + * Marcel Apfelbaum + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "sys/poll.h" +#include "sys/ioctl.h" +#include "pthread.h" +#include "syslog.h" + +#include "infiniband/verbs.h" +#include "infiniband/umad.h" +#include "infiniband/umad_types.h" +#include "infiniband/umad_sa.h" +#include "infiniband/umad_cm.h" + +#include "rdmacm-mux.h" + +#define SCALE_US 1000 +#define COMMID_TTL 2 /* How many SCALE_US a context of MAD session is saved */ +#define SLEEP_SECS 5 /* This is used both in poll() and thread */ +#define SERVER_LISTEN_BACKLOG 10 +#define MAX_CLIENTS 4096 +#define MAD_RMPP_VERSION 0 +#define MAD_METHOD_MASK0 0x8 + +#define
[Qemu-devel] [PATCH v4 12/23] vmxnet3: Move some definitions to header file
pvrdma setup requires vmxnet3 device on PCI function 0 and PVRDMA device on PCI function 1. pvrdma device needs to access vmxnet3 device object for several reasons: 1. Make sure PCI function 0 is vmxnet3. 2. To monitor vmxnet3 device state. 3. To configure node_guid accoring to vmxnet3 device's MAC address. To be able to access vmxnet3 device the definition of VMXNET3State is moved to a new header file. Signed-off-by: Yuval Shaia Reviewed-by: Dmitry Fleytman --- hw/net/vmxnet3.c | 116 +--- hw/net/vmxnet3_defs.h | 133 ++ 2 files changed, 134 insertions(+), 115 deletions(-) create mode 100644 hw/net/vmxnet3_defs.h diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 3648630386..54746a4030 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -18,7 +18,6 @@ #include "qemu/osdep.h" #include "hw/hw.h" #include "hw/pci/pci.h" -#include "net/net.h" #include "net/tap.h" #include "net/checksum.h" #include "sysemu/sysemu.h" @@ -29,6 +28,7 @@ #include "migration/register.h" #include "vmxnet3.h" +#include "vmxnet3_defs.h" #include "vmxnet_debug.h" #include "vmware_utils.h" #include "net_tx_pkt.h" @@ -131,23 +131,11 @@ typedef struct VMXNET3Class { DeviceRealize parent_dc_realize; } VMXNET3Class; -#define TYPE_VMXNET3 "vmxnet3" -#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3) - #define VMXNET3_DEVICE_CLASS(klass) \ OBJECT_CLASS_CHECK(VMXNET3Class, (klass), TYPE_VMXNET3) #define VMXNET3_DEVICE_GET_CLASS(obj) \ OBJECT_GET_CLASS(VMXNET3Class, (obj), TYPE_VMXNET3) -/* Cyclic ring abstraction */ -typedef struct { -hwaddr pa; -uint32_t size; -uint32_t cell_size; -uint32_t next; -uint8_t gen; -} Vmxnet3Ring; - static inline void vmxnet3_ring_init(PCIDevice *d, Vmxnet3Ring *ring, hwaddr pa, @@ -245,108 +233,6 @@ vmxnet3_dump_rx_descr(struct Vmxnet3_RxDesc *descr) descr->rsvd, descr->dtype, descr->ext1, descr->btype); } -/* Device state and helper functions */ -#define VMXNET3_RX_RINGS_PER_QUEUE (2) - -typedef struct { -Vmxnet3Ring tx_ring; -Vmxnet3Ring comp_ring; - -uint8_t intr_idx; -hwaddr tx_stats_pa; -struct UPT1_TxStats txq_stats; -} Vmxnet3TxqDescr; - -typedef struct { -Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE]; -Vmxnet3Ring comp_ring; -uint8_t intr_idx; -hwaddr rx_stats_pa; -struct UPT1_RxStats rxq_stats; -} Vmxnet3RxqDescr; - -typedef struct { -bool is_masked; -bool is_pending; -bool is_asserted; -} Vmxnet3IntState; - -typedef struct { -PCIDevice parent_obj; -NICState *nic; -NICConf conf; -MemoryRegion bar0; -MemoryRegion bar1; -MemoryRegion msix_bar; - -Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES]; -Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES]; - -/* Whether MSI-X support was installed successfully */ -bool msix_used; -hwaddr drv_shmem; -hwaddr temp_shared_guest_driver_memory; - -uint8_t txq_num; - -/* This boolean tells whether RX packet being indicated has to */ -/* be split into head and body chunks from different RX rings */ -bool rx_packets_compound; - -bool rx_vlan_stripping; -bool lro_supported; - -uint8_t rxq_num; - -/* Network MTU */ -uint32_t mtu; - -/* Maximum number of fragments for indicated TX packets */ -uint32_t max_tx_frags; - -/* Maximum number of fragments for indicated RX packets */ -uint16_t max_rx_frags; - -/* Index for events interrupt */ -uint8_t event_int_idx; - -/* Whether automatic interrupts masking enabled */ -bool auto_int_masking; - -bool peer_has_vhdr; - -/* TX packets to QEMU interface */ -struct NetTxPkt *tx_pkt; -uint32_t offload_mode; -uint32_t cso_or_gso_size; -uint16_t tci; -bool needs_vlan; - -struct NetRxPkt *rx_pkt; - -bool tx_sop; -bool skip_current_tx_pkt; - -uint32_t device_active; -uint32_t last_command; - -uint32_t link_status_and_speed; - -Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS]; - -uint32_t temp_mac; /* To store the low part first */ - -MACAddr perm_mac; -uint32_t vlan_table[VMXNET3_VFT_SIZE]; -uint32_t rx_mode; -MACAddr *mcast_list; -uint32_t mcast_list_len; -uint32_t mcast_list_buff_size; /* needed for live migration. */ - -/* Compatibility flags for migration */ -uint32_t compat_flags; -} VMXNET3State; - /* Interrupt management */ /* diff --git a/hw/net/vmxnet3_defs.h b/hw/net/vmxnet3_defs.h new file mode 100644 index 00..6c19d29b12 --- /dev/null +++ b/hw/net/vmxnet3_defs.h @@ -0,0 +1,133 @@ +/* + *
[Qemu-devel] [PATCH v4 14/23] hw/rdma: Initialize node_guid from vmxnet3 mac address
node_guid should be set once device is load. Make node_guid be GID format (32 bit) of PCI function 0 vmxnet3 device's MAC. A new function was added to do the conversion. So for example the MAC 56:b6:44:e9:62:dc will be converted to GID 54b6:44ff:fee9:62dc. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_utils.h | 9 + hw/rdma/vmw/pvrdma_cmd.c | 10 -- hw/rdma/vmw/pvrdma_main.c | 5 - 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h index 989db249ef..202abb3366 100644 --- a/hw/rdma/rdma_utils.h +++ b/hw/rdma/rdma_utils.h @@ -63,4 +63,13 @@ extern unsigned long pr_dbg_cnt; void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen); void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len); +static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr) +{ +memcpy(eui, addr, 3); +eui[3] = 0xFF; +eui[4] = 0xFE; +memcpy(eui + 5, addr + 3, 3); +eui[0] ^= 2; +} + #endif diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index a334f6205e..2979582fac 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -592,16 +592,6 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, return -EINVAL; } -/* TODO: Since drivers stores node_guid at load_dsr phase then this - * assignment is not relevant, i need to figure out a way how to - * retrieve MAC of our netdev */ -if (!cmd->index) { -dev->node_guid = -dev->rdma_dev_res.ports[0].gid_tbl[0].gid.global.interface_id; -pr_dbg("dev->node_guid=0x%llx\n", - (long long unsigned int)be64_to_cpu(dev->node_guid)); -} - return 0; } diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index b35b5dc5f0..150404dfa6 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -264,7 +264,7 @@ static void init_dsr_dev_caps(PVRDMADev *dev) dsr->caps.sys_image_guid = 0; pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid); -dsr->caps.node_guid = cpu_to_be64(dev->node_guid); +dsr->caps.node_guid = dev->node_guid; pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid)); dsr->caps.phys_port_cnt = MAX_PORTS; @@ -588,6 +588,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) } dev->func0 = VMXNET3(func0); +addrconf_addr_eui48((unsigned char *)>node_guid, +(const char *)>func0->conf.macaddr.a); + memdev_root = object_resolve_path("/objects", NULL); if (memdev_root) { object_child_foreach(memdev_root, pvrdma_check_ram_shared, _shared); -- 2.17.2
[Qemu-devel] [PATCH v4 15/23] hw/pvrdma: Make device state depend on Ethernet function state
User should be able to control the device by changing Ethernet function state so if user runs 'ifconfig ens3 down' the PVRDMA function should be down as well. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_cmd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index 2979582fac..0d3c818c20 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -139,7 +139,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req, resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP; resp->hdr.err = 0; -resp->attrs.state = attrs.state; +resp->attrs.state = dev->func0->device_active ? attrs.state : +PVRDMA_PORT_DOWN; resp->attrs.max_mtu = attrs.max_mtu; resp->attrs.active_mtu = attrs.active_mtu; resp->attrs.phys_state = attrs.phys_state; -- 2.17.2
[Qemu-devel] [PATCH v4 21/23] hw/rdma: Do not use bitmap_zero_extend to free bitmap
bitmap_zero_extend is designed to work for extending, not for shrinking. Using g_free instead. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index b7d4ebe972..ca127c8c26 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -43,7 +43,7 @@ static inline void res_tbl_free(RdmaRmResTbl *tbl) { qemu_mutex_destroy(>lock); g_free(tbl->tbl); -bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0); +g_free(tbl->bitmap); } static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle) -- 2.17.2
Re: [Qemu-devel] [PATCH v3 17/23] hw/pvrdma: Fill error code in command's response
On Sat, Nov 17, 2018 at 02:22:31PM +0200, Marcel Apfelbaum wrote: > > > On 11/13/18 9:13 AM, Yuval Shaia wrote: > > Driver checks error code let's set it. > > > > Signed-off-by: Yuval Shaia > > --- > > hw/rdma/vmw/pvrdma_cmd.c | 67 > > 1 file changed, 48 insertions(+), 19 deletions(-) > > > > diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c > > index 0d3c818c20..a326c5d470 100644 > > --- a/hw/rdma/vmw/pvrdma_cmd.c > > +++ b/hw/rdma/vmw/pvrdma_cmd.c > > @@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > if (rdma_backend_query_port(>backend_dev, > > (struct ibv_port_attr *))) { > > -return -ENOMEM; > > +resp->hdr.err = -ENOMEM; > > +goto out; > > } > > memset(resp, 0, sizeof(*resp)); > > @@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > resp->attrs.active_width = 1; > > resp->attrs.active_speed = 1; > > -return 0; > > +out: > > +pr_dbg("ret=%d\n", resp->hdr.err); > > +return resp->hdr.err; > > } > > static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req, > > @@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > resp->pkey = PVRDMA_PKEY; > > pr_dbg("pkey=0x%x\n", resp->pkey); > > -return 0; > > +return resp->hdr.err; > > } > > static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req, > > @@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle); > > -return 0; > > +rsp->hdr.err = 0; > > Is it possible to ensure err is 0 by default during hdr creation > instead of manually setting it every time? Yes we can but since these handlers any fills some other fields in the response i thought it will be clean if they will fill the op-status as well. I believe filling it at "handler" level is more modular. Do you think filling it outside will make the code cleaner? > > Thanks, > Marcel > > > + > > +return rsp->hdr.err; > > } > > static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req, > > @@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle); > > -return 0; > > +rsp->hdr.err = 0; > > + > > +return rsp->hdr.err; > > } > > static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring, > > @@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle); > > if (!cq) { > > pr_dbg("Invalid CQ handle\n"); > > -return -EINVAL; > > +rsp->hdr.err = -EINVAL; > > +goto out; > > } > > ring = (PvrdmaRing *)cq->opaque; > > @@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle); > > -return 0; > > +rsp->hdr.err = 0; > > + > > +out: > > +pr_dbg("ret=%d\n", rsp->hdr.err); > > +return rsp->hdr.err; > > } > > static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma, > > @@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle); > > if (!qp) { > > pr_dbg("Invalid QP handle\n"); > > -return -EINVAL; > > +rsp->hdr.err = -EINVAL; > > +goto out; > > } > > rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle); > > @@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, > > TARGET_PAGE_SIZE); > > g_free(ring); > > -return 0; > > +rsp->hdr.err = 0; > > + > > +out: > > +pr_dbg("ret=%d\n", rsp->hdr.err); > > +return rsp->hdr.err; > > } > > static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req, > > @@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > pr_dbg("index=%d\n", cmd->index); > > if (cmd->index >= MAX_PORT_GIDS) { > > -return -EINVAL; > > +rsp->hdr.err = -EINVAL; > > +goto out; > > } > > pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index, > > @@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union > > pvrdma_cmd_req *req, > > rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev, > >dev->backend_eth_device_name, gid, cmd->index); > > if (rc < 0) { > > -return -EINVAL; > > +rsp->hdr.err = rc; > > +goto out; > > } > > -return 0; > > +rsp->hdr.err = 0; > > + > > +out: > > +pr_dbg("ret=%d\n", rsp->hdr.err); > > +return rsp->hdr.err; > > } > > static int destroy_bind(PVRDMADev
[Qemu-devel] [PATCH v4 10/23] json: Define new QMP message for pvrdma
pvrdma requires that the same GID attached to it will be attached to the backend device in the host. A new QMP messages is defined so pvrdma device can broadcast any change made to its GID table. This event is captured by libvirt which in turn will update the GID table in the backend device. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- MAINTAINERS | 1 + Makefile | 3 ++- Makefile.objs | 4 qapi/qapi-schema.json | 1 + qapi/rdma.json| 38 ++ 5 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 qapi/rdma.json diff --git a/MAINTAINERS b/MAINTAINERS index e087d58ac6..a149f68a8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2232,6 +2232,7 @@ F: hw/rdma/* F: hw/rdma/vmw/* F: docs/pvrdma.txt F: contrib/rdmacm-mux/* +F: qapi/rdma.json Build and test automation - diff --git a/Makefile b/Makefile index 94072776ff..db4ce60ee5 100644 --- a/Makefile +++ b/Makefile @@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json $(SRC_PATH)/qapi/common.json \ $(SRC_PATH)/qapi/tpm.json \ $(SRC_PATH)/qapi/trace.json \ $(SRC_PATH)/qapi/transaction.json \ - $(SRC_PATH)/qapi/ui.json + $(SRC_PATH)/qapi/ui.json \ + $(SRC_PATH)/qapi/rdma.json qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \ qapi/qapi-types.c qapi/qapi-types.h \ diff --git a/Makefile.objs b/Makefile.objs index cc7df3ad80..76d8028f2f 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o util-obj-y += qapi/qapi-types-trace.o util-obj-y += qapi/qapi-types-transaction.o util-obj-y += qapi/qapi-types-ui.o +util-obj-y += qapi/qapi-types-rdma.o util-obj-y += qapi/qapi-builtin-visit.o util-obj-y += qapi/qapi-visit.o util-obj-y += qapi/qapi-visit-block-core.o @@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o util-obj-y += qapi/qapi-visit-trace.o util-obj-y += qapi/qapi-visit-transaction.o util-obj-y += qapi/qapi-visit-ui.o +util-obj-y += qapi/qapi-visit-rdma.o util-obj-y += qapi/qapi-events.o util-obj-y += qapi/qapi-events-block-core.o util-obj-y += qapi/qapi-events-block.o @@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o util-obj-y += qapi/qapi-events-trace.o util-obj-y += qapi/qapi-events-transaction.o util-obj-y += qapi/qapi-events-ui.o +util-obj-y += qapi/qapi-events-rdma.o util-obj-y += qapi/qapi-introspect.o chardev-obj-y = chardev/ @@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o common-obj-y += qapi/qapi-commands-trace.o common-obj-y += qapi/qapi-commands-transaction.o common-obj-y += qapi/qapi-commands-ui.o +common-obj-y += qapi/qapi-commands-rdma.o common-obj-y += qapi/qapi-introspect.o common-obj-y += qmp.o hmp.o endif diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json index 65b6dc2f6f..a650d80f83 100644 --- a/qapi/qapi-schema.json +++ b/qapi/qapi-schema.json @@ -94,3 +94,4 @@ { 'include': 'trace.json' } { 'include': 'introspect.json' } { 'include': 'misc.json' } +{ 'include': 'rdma.json' } diff --git a/qapi/rdma.json b/qapi/rdma.json new file mode 100644 index 00..804c68ab36 --- /dev/null +++ b/qapi/rdma.json @@ -0,0 +1,38 @@ +# -*- Mode: Python -*- +# + +## +# = RDMA device +## + +## +# @RDMA_GID_STATUS_CHANGED: +# +# Emitted when guest driver adds/deletes GID to/from device +# +# @netdev: RoCE Network Device name - char * +# +# @gid-status: Add or delete indication - bool +# +# @subnet-prefix: Subnet Prefix - uint64 +# +# @interface-id : Interface ID - uint64 +# +# Since: 3.2 +# +# Example: +# +# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760}, +# "event": "RDMA_GID_STATUS_CHANGED", +# "data": +# {"netdev": "bridge0", +# "interface-id": 15880512517475447892, +# "gid-status": true, +# "subnet-prefix": 33022}} +# +## +{ 'event': 'RDMA_GID_STATUS_CHANGED', + 'data': { 'netdev': 'str', +'gid-status': 'bool', +'subnet-prefix' : 'uint64', +'interface-id' : 'uint64' } } -- 2.17.2
[Qemu-devel] [PATCH v4 08/23] hw/pvrdma: Set the correct opcode for recv completion
The function pvrdma_post_cqe populates CQE entry with opcode from the given completion element. For receive operation value was not set. Fix it by setting it to IBV_WC_RECV. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_qp_ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index 762700a205..7b0f440fda 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -196,8 +196,9 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle) comp_ctx = g_malloc(sizeof(CompHandlerCtx)); comp_ctx->dev = dev; comp_ctx->cq_handle = qp->recv_cq_handle; -comp_ctx->cqe.qp = qp_handle; comp_ctx->cqe.wr_id = wqe->hdr.wr_id; +comp_ctx->cqe.qp = qp_handle; +comp_ctx->cqe.opcode = IBV_WC_RECV; rdma_backend_post_recv(>backend_dev, >rdma_dev_res, >backend_qp, qp->qp_type, -- 2.17.2
[Qemu-devel] [PATCH v4 04/23] hw/rdma: Abort send-op if fail to create addr handler
Function create_ah might return NULL, let's exit with an error. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 4 1 file changed, 4 insertions(+) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index d7a4bbd91f..1e148398a2 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -338,6 +338,10 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, if (qp_type == IBV_QPT_UD) { wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, backend_dev->backend_gid_idx, dgid); +if (!wr.wr.ud.ah) { +comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx); +goto out_dealloc_cqe_ctx; +} wr.wr.ud.remote_qpn = dqpn; wr.wr.ud.remote_qkey = dqkey; } -- 2.17.2
[Qemu-devel] [PATCH v4 03/23] hw/rdma: Return qpn 1 if ibqp is NULL
Device is not supporting QP0, only QP1. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h index 86e8fe8ab6..3ccc9a2494 100644 --- a/hw/rdma/rdma_backend.h +++ b/hw/rdma/rdma_backend.h @@ -33,7 +33,7 @@ static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev *dev) static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp) { -return qp->ibqp ? qp->ibqp->qp_num : 0; +return qp->ibqp ? qp->ibqp->qp_num : 1; } static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr) -- 2.17.2
[Qemu-devel] [PATCH v4 13/23] hw/pvrdma: Make sure PCI function 0 is vmxnet3
Guest driver enforces it, we should also. Signed-off-by: Yuval Shaia --- hw/rdma/vmw/pvrdma.h | 2 ++ hw/rdma/vmw/pvrdma_main.c | 12 2 files changed, 14 insertions(+) diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h index b019cb843a..10a3c4fb7c 100644 --- a/hw/rdma/vmw/pvrdma.h +++ b/hw/rdma/vmw/pvrdma.h @@ -20,6 +20,7 @@ #include "hw/pci/pci.h" #include "hw/pci/msix.h" #include "chardev/char-fe.h" +#include "hw/net/vmxnet3_defs.h" #include "../rdma_backend_defs.h" #include "../rdma_rm_defs.h" @@ -85,6 +86,7 @@ typedef struct PVRDMADev { RdmaBackendDev backend_dev; RdmaDeviceResources rdma_dev_res; CharBackend mad_chr; +VMXNET3State *func0; } PVRDMADev; #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index ac8c092db0..b35b5dc5f0 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -565,6 +565,7 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) PVRDMADev *dev = PVRDMA_DEV(pdev); Object *memdev_root; bool ram_shared = false; +PCIDevice *func0; init_pr_dbg(); @@ -576,6 +577,17 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) return; } +func0 = pci_get_function_0(pdev); +/* Break if not vmxnet3 device in slot 0 */ +if (strcmp(object_get_typename(>qdev.parent_obj), TYPE_VMXNET3)) { +pr_dbg("func0 type is %s\n", + object_get_typename(>qdev.parent_obj)); +error_setg(errp, "Device on %x.0 must be %s", PCI_SLOT(pdev->devfn), + TYPE_VMXNET3); +return; +} +dev->func0 = VMXNET3(func0); + memdev_root = object_resolve_path("/objects", NULL); if (memdev_root) { object_child_foreach(memdev_root, pvrdma_check_ram_shared, _shared); -- 2.17.2
Re: [Qemu-devel] [PATCH v3 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer
On Sat, Nov 17, 2018 at 07:27:43PM +0200, Shamir Rabinovitch wrote: > On Tue, Nov 13, 2018 at 09:13:14AM +0200, Yuval Shaia wrote: > > RDMA MAD kernel module (ibcm) disallow more than one MAD-agent for a > > given MAD class. > > This does not go hand-by-hand with qemu pvrdma device's requirements > > where each VM is MAD agent. > > Fix it by adding implementation of RDMA MAD multiplexer service which on > > one hand register as a sole MAD agent with the kernel module and on the > > other hand gives service to more than one VM. > > > > Design Overview: > > > > A server process is registered to UMAD framework (for this to work the > > rdma_cm kernel module needs to be unloaded) and creates a unix socket to > > listen to incoming request from clients. > > A client process (such as QEMU) connects to this unix socket and > > registers with its own GID. > > > > TX: > > --- > > When client needs to send rdma_cm MAD message it construct it the same > > way as without this multiplexer, i.e. creates a umad packet but this > > time it writes its content to the socket instead of calling umad_send(). > > The server, upon receiving such a message fetch local_comm_id from it so > > a context for this session can be maintain and relay the message to UMAD > > layer by calling umad_send(). > > > > RX: > > --- > > The server creates a worker thread to process incoming rdma_cm MAD > > messages. When an incoming message arrived (umad_recv()) the server, > > depending on the message type (attr_id) looks for target client by > > either searching in gid->fd table or in local_comm_id->fd table. With > > the extracted fd the server relays to incoming message to the client. > > > > Signed-off-by: Yuval Shaia > > --- > > MAINTAINERS | 1 + > > Makefile | 3 + > > Makefile.objs| 1 + > > contrib/rdmacm-mux/Makefile.objs | 4 + > > contrib/rdmacm-mux/main.c| 771 +++ > > contrib/rdmacm-mux/rdmacm-mux.h | 56 +++ > > 6 files changed, 836 insertions(+) > > create mode 100644 contrib/rdmacm-mux/Makefile.objs > > create mode 100644 contrib/rdmacm-mux/main.c > > create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h > > > > Reviewed-by: Shamir Rabinovitch Thanks Shamir! Thanks a lot also for all the MAD related tips and comments you gave off-list, it make the code much more mature and correct. >
[Qemu-devel] [PATCH v4 06/23] hw/pvrdma: Make function reset_device return void
This function cannot fail - fix it to return void Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 6c8c0154fa..fc2abd34af 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -369,13 +369,11 @@ static int unquiesce_device(PVRDMADev *dev) return 0; } -static int reset_device(PVRDMADev *dev) +static void reset_device(PVRDMADev *dev) { pvrdma_stop(dev); pr_dbg("Device reset complete\n"); - -return 0; } static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size) -- 2.17.2
[Qemu-devel] [PATCH v4 05/23] hw/rdma: Add support for MAD packets
MAD (Management Datagram) packets are widely used by various modules both in kernel and in user space for example the rdma_* API which is used to create and maintain "connection" layer on top of RDMA uses several types of MAD packets. For more information please refer to chapter 13.4 in Volume 1 Architecture Specification, Release 1.1 available here: https://www.infinibandta.org/ibta-specifications-download/ To support MAD packets the device uses an external utility (contrib/rdmacm-mux) to relay packets from and to the guest driver. Signed-off-by: Yuval Shaia --- hw/rdma/rdma_backend.c | 275 +++- hw/rdma/rdma_backend.h | 4 +- hw/rdma/rdma_backend_defs.h | 10 +- hw/rdma/vmw/pvrdma.h| 2 + hw/rdma/vmw/pvrdma_main.c | 4 +- 5 files changed, 285 insertions(+), 10 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index 1e148398a2..7c220a5798 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -16,8 +16,13 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qapi/error.h" +#include "qapi/qmp/qlist.h" +#include "qapi/qmp/qnum.h" #include +#include +#include +#include #include "trace.h" #include "rdma_utils.h" @@ -33,16 +38,25 @@ #define VENDOR_ERR_MAD_SEND 0x206 #define VENDOR_ERR_INVLKEY 0x207 #define VENDOR_ERR_MR_SMALL 0x208 +#define VENDOR_ERR_INV_MAD_BUFF 0x209 +#define VENDOR_ERR_INV_NUM_SGE 0x210 #define THR_NAME_LEN 16 #define THR_POLL_TO 5000 +#define MAD_HDR_SIZE sizeof(struct ibv_grh) + typedef struct BackendCtx { -uint64_t req_id; void *up_ctx; bool is_tx_req; +struct ibv_sge sge; /* Used to save MAD recv buffer */ } BackendCtx; +struct backend_umad { +struct ib_user_mad hdr; +char mad[RDMA_MAX_PRIVATE_DATA]; +}; + static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx); static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx) @@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, return 0; } +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, +uint32_t num_sge) +{ +struct backend_umad umad = {0}; +char *hdr, *msg; +int ret; + +pr_dbg("num_sge=%d\n", num_sge); + +if (num_sge != 2) { +return -EINVAL; +} + +umad.hdr.length = sge[0].length + sge[1].length; +pr_dbg("msg_len=%d\n", umad.hdr.length); + +if (umad.hdr.length > sizeof(umad.mad)) { +return -ENOMEM; +} + +umad.hdr.addr.qpn = htobe32(1); +umad.hdr.addr.grh_present = 1; +umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; +memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); +umad.hdr.addr.hop_limit = 1; + +hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); +if (!hdr) { +pr_dbg("Fail to map to sge[0]\n"); +return -ENOMEM; +} +msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); +if (!msg) { +pr_dbg("Fail to map to sge[1]\n"); +rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); +return -ENOMEM; +} + +pr_dbg_buf("mad_hdr", hdr, sge[0].length); +pr_dbg_buf("mad_data", data, sge[1].length); + +memcpy([0], hdr, sge[0].length); +memcpy([sge[0].length], msg, sge[1].length); + +rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length); +rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); + +ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *), +sizeof(umad)); + +pr_dbg("qemu_chr_fe_write=%d\n", ret); + +return (ret != sizeof(umad)); +} + void rdma_backend_post_send(RdmaBackendDev *backend_dev, RdmaBackendQP *qp, uint8_t qp_type, struct ibv_sge *sge, uint32_t num_sge, @@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev, comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx); } else if (qp_type == IBV_QPT_GSI) { pr_dbg("QP1\n"); -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); +rc = mad_send(backend_dev, sge, num_sge); +if (rc) { +comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx); +} else { +comp_handler(IBV_WC_SUCCESS, 0, ctx); +} } -pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type); return; } @@ -370,6 +443,48 @@ out_free_bctx: g_free(bctx); } +static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev, + struct ibv_sge *sge, uint32_t num_sge, + void *ctx) +{ +BackendCtx *bctx; +int rc; +uint32_t bctx_id; + +if (num_sge != 1) { +pr_dbg("Invalid
[Qemu-devel] [PATCH v4 00/23] Add support for RDMA MAD
Hi all. This is a major enhancement to the pvrdma device to allow it to work with state of the art applications such as MPI. As described in patch #5, MAD packets are management packets that are used for many purposes including but not limited to communication layer above IB verbs API. Patch 1 exposes new external executable (under contrib) that aims to address a specific limitation in the RDMA usrespace MAD stack. This patch-set mainly present MAD enhancement but during the work on it i came across some bugs and enhancement needed to be implemented before doing any MAD coding. This is the role of patches 2 to 4, 7 to 9 and 15 to 17. Patches 6 and 18 are cosmetic changes while not relevant to this patchset still introduce with it since (at least for 6) hard to decouple. Patches 12 to 15 couple pvrdma device with vmxnet3 device as this is the configuration enforced by pvrdma driver in guest - a vmxnet3 device in function 0 and pvrdma device in function 1 in the same PCI slot. Patch 12 moves needed code from vmxnet3 device to a new header file that can be used by pvrdma code while Patches 13 to 15 use of it. Along with this patch-set there is a parallel patch posted to libvirt to apply the change needed there as part of the process implemented in patches 10 and 11. This change is needed so that guest would be able to configure any IP to the Ethernet function of the pvrdma device. https://www.redhat.com/archives/libvir-list/2018-November/msg00135.html Since we maintain external resources such as GIDs on host GID table we need to do some cleanup before going down. This is the job of patches 19 and 20. Patches 21 and 22 contain a fixes for bugs detected during the work on processing cleanup code during shutdown. Patch 23 fixes documentation. v1 -> v2: * Fix compilation issue detected when compiling for mingw * Address comment from Eric Blake re version of QEMU in json message * Fix example from QMP message in json file * Fix case where a VM tries to remove an invalid GID from GID table * rdmacm-mux: Cleanup entries in socket-gids table when socket is closed * Cleanup resources (GIDs, QPs etc) when VM goes down v2 -> v3: * Address comment from Cornelia Huck for patch #19 * Add some R-Bs from Marcel Apfelbaum and Dmitry Fleytman * Update docs/pvrdma.txt with the changes made by this patchset * Address comments from Shamir Rabinovitch for UMAD multiplexer v3 -> v4: * Address some comments from Marcel * Add some R-Bs from Cornelia Huck and Shamir Rabinovitch Yuval Shaia (23): contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer hw/rdma: Add ability to force notification without re-arm hw/rdma: Return qpn 1 if ibqp is NULL hw/rdma: Abort send-op if fail to create addr handler hw/rdma: Add support for MAD packets hw/pvrdma: Make function reset_device return void hw/pvrdma: Make default pkey 0x hw/pvrdma: Set the correct opcode for recv completion hw/pvrdma: Set the correct opcode for send completion json: Define new QMP message for pvrdma hw/pvrdma: Add support to allow guest to configure GID table vmxnet3: Move some definitions to header file hw/pvrdma: Make sure PCI function 0 is vmxnet3 hw/rdma: Initialize node_guid from vmxnet3 mac address hw/pvrdma: Make device state depend on Ethernet function state hw/pvrdma: Fill all CQE fields hw/pvrdma: Fill error code in command's response hw/rdma: Remove unneeded code that handles more that one port vl: Introduce shutdown_notifiers hw/pvrdma: Clean device's resource when system is shutdown hw/rdma: Do not use bitmap_zero_extend to free bitmap hw/rdma: Do not call rdma_backend_del_gid on an empty gid docs: Update pvrdma device documentation MAINTAINERS | 2 + Makefile | 6 +- Makefile.objs| 5 + contrib/rdmacm-mux/Makefile.objs | 4 + contrib/rdmacm-mux/main.c| 780 +++ contrib/rdmacm-mux/rdmacm-mux.h | 56 +++ docs/pvrdma.txt | 103 +++- hw/net/vmxnet3.c | 116 + hw/net/vmxnet3_defs.h| 133 ++ hw/rdma/rdma_backend.c | 496 +--- hw/rdma/rdma_backend.h | 28 +- hw/rdma/rdma_backend_defs.h | 13 +- hw/rdma/rdma_rm.c| 120 - hw/rdma/rdma_rm.h| 17 +- hw/rdma/rdma_rm_defs.h | 21 +- hw/rdma/rdma_utils.h | 24 + hw/rdma/vmw/pvrdma.h | 10 +- hw/rdma/vmw/pvrdma_cmd.c | 119 +++-- hw/rdma/vmw/pvrdma_main.c| 58 ++- hw/rdma/vmw/pvrdma_qp_ops.c | 62 ++- include/sysemu/sysemu.h | 1 + qapi/qapi-schema.json| 1 + qapi/rdma.json | 38 ++ vl.c | 15 +- 24 files changed, 1921 insertions(+), 307 deletions(-) create mode 100644 contrib/rdmacm-mux/Makefile.objs create mode
[Qemu-devel] [PATCH v4 18/23] hw/rdma: Remove unneeded code that handles more that one port
Device supports only one port, let's remove a dead code that handles more than one port. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 34 -- hw/rdma/rdma_rm.h | 2 +- hw/rdma/rdma_rm_defs.h | 4 ++-- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 250254561c..b7d4ebe972 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -545,7 +545,7 @@ int rdma_rm_add_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, return -EINVAL; } -memcpy(_res->ports[0].gid_tbl[gid_idx].gid, gid, sizeof(*gid)); +memcpy(_res->port.gid_tbl[gid_idx].gid, gid, sizeof(*gid)); return 0; } @@ -556,15 +556,15 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, int rc; rc = rdma_backend_del_gid(backend_dev, ifname, - _res->ports[0].gid_tbl[gid_idx].gid); + _res->port.gid_tbl[gid_idx].gid); if (rc) { pr_dbg("Fail to delete gid\n"); return -EINVAL; } -memset(dev_res->ports[0].gid_tbl[gid_idx].gid.raw, 0, - sizeof(dev_res->ports[0].gid_tbl[gid_idx].gid)); -dev_res->ports[0].gid_tbl[gid_idx].backend_gid_index = -1; +memset(dev_res->port.gid_tbl[gid_idx].gid.raw, 0, + sizeof(dev_res->port.gid_tbl[gid_idx].gid)); +dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1; return 0; } @@ -577,16 +577,16 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res, return -EINVAL; } -if (unlikely(dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index == -1)) { -dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index = +if (unlikely(dev_res->port.gid_tbl[sgid_idx].backend_gid_index == -1)) { +dev_res->port.gid_tbl[sgid_idx].backend_gid_index = rdma_backend_get_gid_index(backend_dev, - _res->ports[0].gid_tbl[sgid_idx].gid); + _res->port.gid_tbl[sgid_idx].gid); } pr_dbg("backend_gid_index=%d\n", - dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index); + dev_res->port.gid_tbl[sgid_idx].backend_gid_index); -return dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index; +return dev_res->port.gid_tbl[sgid_idx].backend_gid_index; } static void destroy_qp_hash_key(gpointer data) @@ -596,15 +596,13 @@ static void destroy_qp_hash_key(gpointer data) static void init_ports(RdmaDeviceResources *dev_res) { -int i, j; +int i; -memset(dev_res->ports, 0, sizeof(dev_res->ports)); +memset(_res->port, 0, sizeof(dev_res->port)); -for (i = 0; i < MAX_PORTS; i++) { -dev_res->ports[i].state = IBV_PORT_DOWN; -for (j = 0; j < MAX_PORT_GIDS; j++) { -dev_res->ports[i].gid_tbl[j].backend_gid_index = -1; -} +dev_res->port.state = IBV_PORT_DOWN; +for (i = 0; i < MAX_PORT_GIDS; i++) { +dev_res->port.gid_tbl[i].backend_gid_index = -1; } } @@ -613,7 +611,7 @@ static void fini_ports(RdmaDeviceResources *dev_res, { int i; -dev_res->ports[0].state = IBV_PORT_DOWN; +dev_res->port.state = IBV_PORT_DOWN; for (i = 0; i < MAX_PORT_GIDS; i++) { rdma_rm_del_gid(dev_res, backend_dev, ifname, i); } diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h index a7169b4e89..3c602c04c0 100644 --- a/hw/rdma/rdma_rm.h +++ b/hw/rdma/rdma_rm.h @@ -79,7 +79,7 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res, static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res, int sgid_idx) { -return _res->ports[0].gid_tbl[sgid_idx].gid; +return _res->port.gid_tbl[sgid_idx].gid; } #endif diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h index 7b3435f991..0ba61d1838 100644 --- a/hw/rdma/rdma_rm_defs.h +++ b/hw/rdma/rdma_rm_defs.h @@ -18,7 +18,7 @@ #include "rdma_backend_defs.h" -#define MAX_PORTS 1 +#define MAX_PORTS 1 /* Do not change - we support only one port */ #define MAX_PORT_GIDS 255 #define MAX_GIDS MAX_PORT_GIDS #define MAX_PORT_PKEYS1 @@ -97,7 +97,7 @@ typedef struct RdmaRmPort { } RdmaRmPort; typedef struct RdmaDeviceResources { -RdmaRmPort ports[MAX_PORTS]; +RdmaRmPort port; RdmaRmResTbl pd_tbl; RdmaRmResTbl mr_tbl; RdmaRmResTbl uc_tbl; -- 2.17.2
[Qemu-devel] [PATCH v4 09/23] hw/pvrdma: Set the correct opcode for send completion
opcode for WC should be set by the device and not taken from work element. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_qp_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index 7b0f440fda..3388be1926 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -154,7 +154,7 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) comp_ctx->cq_handle = qp->send_cq_handle; comp_ctx->cqe.wr_id = wqe->hdr.wr_id; comp_ctx->cqe.qp = qp_handle; -comp_ctx->cqe.opcode = wqe->hdr.opcode; +comp_ctx->cqe.opcode = IBV_WC_SEND; rdma_backend_post_send(>backend_dev, >backend_qp, qp->qp_type, (struct ibv_sge *)>sge[0], wqe->hdr.num_sge, -- 2.17.2
Re: [Qemu-devel] [PATCH for-3.1] fdc: fix segfault in fdctrl_stop_transfer() when DMA is disabled
On 13/11/2018 20:29, John Snow wrote: > On 11/13/18 8:16 AM, Kevin Wolf wrote: >> Am 12.11.2018 um 20:58 hat John Snow geschrieben: >>> >>> >>> On 11/11/18 4:40 AM, Mark Cave-Ayland wrote: Commit c8a35f1cf0f "fdc: use IsaDma interface instead of global DMA_* functions" accidentally introduced a segfault in fdctrl_stop_transfer() for non-DMA transfers. If fdctrl->dma_chann has not been configured then the fdctrl->dma interface reference isn't initialised during isabus_fdc_realize(). Unfortunately fdctrl_stop_transfer() unconditionally references the DMA interface when finishing the transfer causing a NULL pointer dereference. Fix the issue by adding a check in fdctrl_stop_transfer() so that the DMA interface reference and release method is only invoked if fdctrl->dma_chann has been set. (This issue was discovered by Martin testing a recent change in the NetBSD installer under qemu-system-sparc) Reported-by: Martin Husemann Signed-off-by: Mark Cave-Ayland --- hw/block/fdc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/fdc.c b/hw/block/fdc.c index 2e9c1e1e2f..6f19f127a5 100644 --- a/hw/block/fdc.c +++ b/hw/block/fdc.c @@ -1617,7 +1617,7 @@ static void fdctrl_stop_transfer(FDCtrl *fdctrl, uint8_t status0, fdctrl->fifo[5] = cur_drv->sect; fdctrl->fifo[6] = FD_SECTOR_SC; fdctrl->data_dir = FD_DIR_READ; -if (!(fdctrl->msr & FD_MSR_NONDMA)) { +if (fdctrl->dma_chann != -1 && !(fdctrl->msr & FD_MSR_NONDMA)) { IsaDmaClass *k = ISADMA_GET_CLASS(fdctrl->dma); k->release_DREQ(fdctrl->dma, fdctrl->dma_chann); } >>> >>> Thanks. >>> >>> Reviewed-by: John Snow >>> >>> ... Kevin, would you mind staging this one-off for the next RC? >> >> No problem, I'm applying this to my block branch. However, my pull >> request for -rc1 is already merged, so this will have to wait until next >> week and -rc2. >> >> Kevin >> > > I think that should be fine. Thank you! Thanks everyone! Kevin, any chance you could also add a CC: qemu-stable@ tag when you apply this to your branch? This will help ensure that when the next NetBSD release appears the fix should already be available for most people. ATB, Mark.
[Qemu-devel] [PATCH v4 11/23] hw/pvrdma: Add support to allow guest to configure GID table
The control over the RDMA device's GID table is done by updating the device's Ethernet function addresses. Usually the first GID entry is determined by the MAC address, the second by the first IPv6 address and the third by the IPv4 address. Other entries can be added by adding more IP addresses. The opposite is the same, i.e. whenever an address is removed, the corresponding GID entry is removed. The process is done by the network and RDMA stacks. Whenever an address is added the ib_core driver is notified and calls the device driver add_gid function which in turn update the device. To support this in pvrdma device we need to hook into the create_bind and destroy_bind HW commands triggered by pvrdma driver in guest. Whenever changed is made to the pvrdma port's GID table a special QMP messages is sent to be processed by libvirt to update the address of the backend Ethernet device. Signed-off-by: Yuval Shaia --- hw/rdma/rdma_backend.c | 268 hw/rdma/rdma_backend.h | 22 +-- hw/rdma/rdma_backend_defs.h | 3 +- hw/rdma/rdma_rm.c | 104 +- hw/rdma/rdma_rm.h | 17 ++- hw/rdma/rdma_rm_defs.h | 9 +- hw/rdma/rdma_utils.h| 15 ++ hw/rdma/vmw/pvrdma.h| 2 +- hw/rdma/vmw/pvrdma_cmd.c| 55 +--- hw/rdma/vmw/pvrdma_main.c | 25 +--- hw/rdma/vmw/pvrdma_qp_ops.c | 20 +++ 11 files changed, 394 insertions(+), 146 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index 7c220a5798..db78009bbb 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -18,12 +18,14 @@ #include "qapi/error.h" #include "qapi/qmp/qlist.h" #include "qapi/qmp/qnum.h" +#include "qapi/qapi-events-rdma.h" #include #include #include #include +#include "contrib/rdmacm-mux/rdmacm-mux.h" #include "trace.h" #include "rdma_utils.h" #include "rdma_rm.h" @@ -300,11 +302,35 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, return 0; } -static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, -uint32_t num_sge) +static int check_mux_op_status(CharBackend *mad_chr_be) { -struct backend_umad umad = {0}; -char *hdr, *msg; +RdmaCmMuxMsg msg = {0}; +int ret; + +ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *), sizeof(msg)); +if (ret != sizeof(msg)) { +pr_dbg("Invalid message size %d, expecting %ld\n", ret, sizeof(msg)); +return -EIO; +} + +if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) { +pr_dbg("Invalid message type %d\n", msg.hdr.msg_type); +return -EIO; +} + +if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) { +pr_dbg("Operation failed in mux, error code %d\n", msg.hdr.err_code); +return -EIO; +} + +return 0; +} + +static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx, +union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge) +{ +RdmaCmMuxMsg msg = {0}; +char *hdr, *data; int ret; pr_dbg("num_sge=%d\n", num_sge); @@ -313,26 +339,31 @@ static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, return -EINVAL; } -umad.hdr.length = sge[0].length + sge[1].length; -pr_dbg("msg_len=%d\n", umad.hdr.length); +msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD; +memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid)); + +msg.umad_len = sge[0].length + sge[1].length; +pr_dbg("umad_len=%d\n", msg.umad_len); -if (umad.hdr.length > sizeof(umad.mad)) { +if (msg.umad_len > sizeof(msg.umad.mad)) { return -ENOMEM; } -umad.hdr.addr.qpn = htobe32(1); -umad.hdr.addr.grh_present = 1; -umad.hdr.addr.gid_index = backend_dev->backend_gid_idx; -memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid)); -umad.hdr.addr.hop_limit = 1; +msg.umad.hdr.addr.qpn = htobe32(1); +msg.umad.hdr.addr.grh_present = 1; +pr_dbg("sgid_idx=%d\n", sgid_idx); +pr_dbg("sgid=0x%llx\n", sgid->global.interface_id); +msg.umad.hdr.addr.gid_index = sgid_idx; +memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid)); +msg.umad.hdr.addr.hop_limit = 1; hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length); if (!hdr) { pr_dbg("Fail to map to sge[0]\n"); return -ENOMEM; } -msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); -if (!msg) { +data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length); +if (!data) { pr_dbg("Fail to map to sge[1]\n"); rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length); return -ENOMEM; @@ -341,25 +372,36 @@ static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge, pr_dbg_buf("mad_hdr", hdr, sge[0].length); pr_dbg_buf("mad_data", data, sge[1].length); -memcpy([0], hdr, sge[0].length); -
[Qemu-devel] [PATCH v4 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid
When device goes down the function fini_ports loops over all entries in gid table regardless of the fact whether entry is valid or not. In case that entry is not valid we'd like to skip from any further processing in backend device. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 4 1 file changed, 4 insertions(+) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index ca127c8c26..f5b1295890 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, { int rc; +if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) { +return 0; +} + rc = rdma_backend_del_gid(backend_dev, ifname, _res->port.gid_tbl[gid_idx].gid); if (rc) { -- 2.17.2
[Qemu-devel] [PATCH v4 23/23] docs: Update pvrdma device documentation
Interface with the device is changed with the addition of support for MAD packets. Adjust documentation accordingly. While there fix a minor mistake which may lead to think that there is a relation between using RXE on host and the compatibility with bare-metal peers. Signed-off-by: Yuval Shaia --- docs/pvrdma.txt | 103 +++- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt index 5599318159..f82b2a69d2 100644 --- a/docs/pvrdma.txt +++ b/docs/pvrdma.txt @@ -9,8 +9,9 @@ It works with its Linux Kernel driver AS IS, no need for any special guest modifications. While it complies with the VMware device, it can also communicate with bare -metal RDMA-enabled machines and does not require an RDMA HCA in the host, it -can work with Soft-RoCE (rxe). +metal RDMA-enabled machines as peers. + +It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe). It does not require the whole guest RAM to be pinned allowing memory over-commit and, even if not implemented yet, migration support will be @@ -78,29 +79,93 @@ the required RDMA libraries. 3. Usage + + +3.1 VM Memory settings +==+++= Currently the device is working only with memory backed RAM and it must be mark as "shared": -m 1G \ -object memory-backend-ram,id=mb1,size=1G,share \ -numa node,memdev=mb1 \ -The pvrdma device is composed of two functions: - - Function 0 is a vmxnet Ethernet Device which is redundant in Guest - but is required to pass the ibdevice GID using its MAC. - Examples: - For an rxe backend using eth0 interface it will use its mac: - -device vmxnet3,addr=.0,multifunction=on,mac= - For an SRIOV VF, we take the Ethernet Interface exposed by it: - -device vmxnet3,multifunction=on,mac= - - Function 1 is the actual device: - -device pvrdma,addr=.1,backend-dev=,backend-gid-idx=,backend-port= - where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4) - Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's MAC. - The rules of conversion are part of the RoCE spec, but since manual conversion - is not required, spotting problems is not hard: -Example: GID: fe80::::7efe:90ff:fecb:743a - MAC: 7c:fe:90:cb:74:3a -Note the difference between the first byte of the MAC and the GID. + +3.2 MAD Multiplexer +=== +MAD Multiplexer is a service that exposes MAD-like interface for VMs in +order to overcome the limitation where only single entity can register with +MAD layer to send and receive RDMA-CM MAD packets. + +To build rdmacm-mux run +# make rdmacm-mux + +The application accepts 3 command line arguments and exposes a UNIX socket +to pass control and data to it. +-s unix-socket-path Path to unix socket to listen on + (default /var/run/rdmacm-mux) +-d rdma-device-name Name of RDMA device to register with + (default rxe0) +-p rdma-device-port Port number of RDMA device to register with + (default 1) +The final UNIX socket file name is a concatenation of the 3 arguments so +for example for device mlx5_0 on port 2 this /var/run/rdmacm-mux-mlx5_0-2 +will be created. + +Please refer to contrib/rdmacm-mux for more details. + + +3.3 PCI devices settings + +RoCE device exposes two functions - an Ethernet and RDMA. +To support it, pvrdma device is composed of two PCI functions, an Ethernet +device of type vmxnet3 on PCI slot 0 and a PVRDMA device on PCI slot 1. The +Ethernet function can be used for other Ethernet purposes such as IP. + + +3.4 Device parameters += +- netdev: Specifies the Ethernet device on host. For Soft-RoCE (rxe) this + would be the Ethernet device used to create it. For any other physical + RoCE device this would be the netdev name of the device. +- ibdev: The IB device name on host for example rxe0, mlx5_0 etc. +- mad-chardev: The name of the MAD multiplexer char device. +- ibport: In case of multi-port device (such as Mellanox's HCA) this + specify the port to use. If not set 1 will be used. +- dev-caps-max-mr-size: The maximum size of MR. +- dev-caps-max-qp: Maximum number of QPs. +- dev-caps-max-sge: Maximum number of SGE elements in WR. +- dev-caps-max-cq: Maximum number of CQs. +- dev-caps-max-mr: Maximum number of MRs. +- dev-caps-max-pd: Maximum number of PDs. +- dev-caps-max-ah: Maximum number of AHs. + +Notes: +- The first 3 parameters are mandatory settings, the rest have their + defaults. +- The last 8 parameters (the ones that prefixed by dev-caps) defines the top + limits but the final values is adjusted by the backend device limitations. + +3.5 Example +=== +Define bridge device with vmxnet3 network backend: + + + + + + + +Define pvrdma device: + + + + + + + + + + -- 2.17.2
[Qemu-devel] [PATCH v4 07/23] hw/pvrdma: Make default pkey 0xFFFF
Commit 6e7dba23af ("hw/pvrdma: Make default pkey 0x") exports default pkey as external definition but omit the change from 0x7FFF to 0x. Fixes: 6e7dba23af ("hw/pvrdma: Make default pkey 0x") Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h index e3742d893a..15c3f28b86 100644 --- a/hw/rdma/vmw/pvrdma.h +++ b/hw/rdma/vmw/pvrdma.h @@ -52,7 +52,7 @@ #define PVRDMA_FW_VERSION14 /* Some defaults */ -#define PVRDMA_PKEY 0x7FFF +#define PVRDMA_PKEY 0x typedef struct DSRInfo { dma_addr_t dma; -- 2.17.2
[Qemu-devel] [PATCH v4 20/23] hw/pvrdma: Clean device's resource when system is shutdown
In order to clean some external resources such as GIDs, QPs etc, register to receive notification when VM is shutdown. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma.h | 2 ++ hw/rdma/vmw/pvrdma_main.c | 12 2 files changed, 14 insertions(+) diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h index 10a3c4fb7c..ffae36986e 100644 --- a/hw/rdma/vmw/pvrdma.h +++ b/hw/rdma/vmw/pvrdma.h @@ -17,6 +17,7 @@ #define PVRDMA_PVRDMA_H #include "qemu/units.h" +#include "qemu/notify.h" #include "hw/pci/pci.h" #include "hw/pci/msix.h" #include "chardev/char-fe.h" @@ -87,6 +88,7 @@ typedef struct PVRDMADev { RdmaDeviceResources rdma_dev_res; CharBackend mad_chr; VMXNET3State *func0; +Notifier shutdown_notifier; } PVRDMADev; #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME) diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 150404dfa6..474cbfd78d 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -24,6 +24,7 @@ #include "hw/qdev-properties.h" #include "cpu.h" #include "trace.h" +#include "sysemu/sysemu.h" #include "../rdma_rm.h" #include "../rdma_backend.h" @@ -559,6 +560,14 @@ static int pvrdma_check_ram_shared(Object *obj, void *opaque) return 0; } +static void pvrdma_shutdown_notifier(Notifier *n, void *opaque) +{ +PVRDMADev *dev = container_of(n, PVRDMADev, shutdown_notifier); +PCIDevice *pci_dev = PCI_DEVICE(dev); + +pvrdma_fini(pci_dev); +} + static void pvrdma_realize(PCIDevice *pdev, Error **errp) { int rc; @@ -632,6 +641,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) goto out; } +dev->shutdown_notifier.notify = pvrdma_shutdown_notifier; +qemu_register_shutdown_notifier(>shutdown_notifier); + out: if (rc) { error_append_hint(errp, "Device fail to load\n"); -- 2.17.2
[Qemu-devel] [Bug 1803872] Re: gcc 8.2 reports stringop-truncation when building qemu
** Description changed: QEMU 3.0 block/sheepdog.c: In function 'find_vdi_name': block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] - strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); - ^~ + strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); + ^~ - - If this is the intended behavior, please suppress the warning. For example: + If this is the intended behavior, please suppress the warning. For + example: #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" - strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); + strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); #pragma GCC diagnostic pop + + This also happens on other sources, for example hw/acpi/core.c, so + another option is to suppress it globally on CFLAGS (-Wno-stringop- + truncation) -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1803872 Title: gcc 8.2 reports stringop-truncation when building qemu Status in QEMU: New Bug description: QEMU 3.0 block/sheepdog.c: In function 'find_vdi_name': block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); ^~ If this is the intended behavior, please suppress the warning. For example: #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); #pragma GCC diagnostic pop This also happens on other sources, for example hw/acpi/core.c, so another option is to suppress it globally on CFLAGS (-Wno-stringop- truncation) To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1803872/+subscriptions
Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap
Dario Faggioli writes: > Dockerfile for building an openSUSE Leap container. > > Tracks the latest release (at the time of writing this > patch, it is Leap 15). > > Signed-off-by: Dario Faggioli > --- > Cc: "Alex Bennée" > Cc: Fam Zheng > Cc: "Philippe Mathieu-Daudé" > --- > tests/docker/dockerfiles/opensuse-leap.docker | 62 > + > 1 file changed, 62 insertions(+) > create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker > > diff --git a/tests/docker/dockerfiles/opensuse-leap.docker > b/tests/docker/dockerfiles/opensuse-leap.docker > new file mode 100644 > index 00..9d00861e66 > --- /dev/null > +++ b/tests/docker/dockerfiles/opensuse-leap.docker > @@ -0,0 +1,62 @@ > +FROM opensuse/leap > +ENV PACKAGES \ > +tar \ > +usbredir-devel \ > +virglrenderer-devel \ > +vte-devel \ > +which \ > +xen-devel > +zlib-devel \ This hasn't been tested because the docker image fails to build due to continuation breakage. -- Alex Bennée
[Qemu-devel] 3.1.0-rc{0,1} doesn't start
hello I'm building qemu from source and happily using it since a bit (2.3.0) Since 3.1.0-rc0 (including latest 3.1.0-rc1) I'm no more able to start qemu, getting: 8< install:115> qemu qemu: error: failed to set MSR 0x10a to 0x0 qemu: /home/balducci/tmp/install-us-d/qemu-3.1.0-rc1.d/qemu-3.1.0-rc0/target/i386/kvm.c:2185: kvm_put_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed. Aborted >8 I have no idea about what the reason might be, apologies. Actually, I have found a recent (2018-10-16) post which might be related to this (it mentions the same error message from qemu): https://lkml.org/lkml/2018/10/16/440; but I'm not in the position to go through. AFAICS, the commit mentioned in the link is present in the 4.19.2 kernel I'm using, so...? I can add that 3.0.0 works nicely (everything else unchanged, including running kernel 4.19.2) OTOH, 3.1.0-rc0 dumps the same error message if I boot into 4.17.14 or 4.18.16 kernels. I enclose my specs hoping that somebody can spot where the problem might be. I will be happy to send any other detail which might be useful. I suspect that this might be some problem on my side, as I couldn't find any similar report (apart some old (qemu-2.8.50) threads, that didn't help) thanks a lot in advance for any hint/help ciao gabriele Here are my specs: # # command to run qemu is: qemu -m 2G /opt/windog \ -accel kvm,thread=multi \ -netdev user,id=net0,smb=/home/balducci \ -device rtl8139,netdev=net0 # # qemu build configuration: --prefix=/opt/stow.d/versions/qemu-3.1.0-rc1/usr --libdir=/opt/stow.d/versions/qemu-3.1.0-rc1/usr/lib64 --sysconfdir=/opt/stow.d/versions/qemu-3.1.0-rc1/etc --localstatedir=/var/run --docdir=/opt/stow.d/versions/qemu-3.1.0-rc1/usr/share/doc/qemu --target-list=x86_64-softmmu --audio-drv-list=alsa # install:154> uname -sr Linux 4.19.2 # install:155> cat /proc/cpuinfo processor : 0 vendor_id : AuthenticAMD cpu family : 21 model : 48 model name : AMD Athlon(tm) X4 860K Quad Core Processor stepping: 1 microcode : 0x6003106 cpu MHz : 3473.492 cache size : 2048 KB physical id : 0 siblings: 4 core id : 0 cpu cores : 2 apicid : 16 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 popcnt aes xsave avx f16c lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs xop skinit wdt lwp fma4 tce nodeid_msr tbm topoext perfctr_core perfctr_nb bpext ptsc cpb hw_pstate ssbd vmmcall fsgsbase bmi1 xsaveopt arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold overflow_recov bugs: fxsave_leak sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass bogomips: 7380.73 TLB size: 1536 4K pages clflush size: 64 cache_alignment : 64 address sizes : 48 bits physical, 48 bits virtual power management: ts ttp tm 100mhzsteps hwpstate cpb eff_freq_ro [13] [...cpus 1 2 3 omitted...] # install:156> egrep KVM .config-4.19.2 CONFIG_HAVE_KVM=y CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_HAVE_KVM_IRQFD=y CONFIG_HAVE_KVM_IRQ_ROUTING=y CONFIG_HAVE_KVM_EVENTFD=y CONFIG_KVM_MMIO=y CONFIG_KVM_ASYNC_PF=y CONFIG_HAVE_KVM_MSI=y CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y CONFIG_KVM_VFIO=y CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y CONFIG_KVM_COMPAT=y CONFIG_HAVE_KVM_IRQ_BYPASS=y CONFIG_KVM=y # CONFIG_KVM_INTEL is not set CONFIG_KVM_AMD=y [of course, I can send the whole kernel configuration file, if needed]
[Qemu-devel] [RFC PATCH 1/2] docker: Add gentoo-mipsr5900el-cross image
This image is based on Gentoo and the toolchain is built using crossdev. Recipe from: https://lists.gnu.org/archive/html/qemu-devel/2018-09/msg03944.html Suggested-by: Fredrik Noring Signed-off-by: Philippe Mathieu-Daudé --- TODO: - Add Fredrik Noring S-o-b in his patches - Check patch merged upstream tests/docker/Makefile.include | 6 + .../gentoo-mipsr5900el-cross.docker | 39 .../binutils-v2.30-ps2-llsc.patch | 36 +++ .../crossdev.conf | 5 + .../gcc-v7.2.0-ps2-llsc.patch | 23 ++ .../gcc-v7.2.0-ps2.patch | 219 ++ 6 files changed, 328 insertions(+) create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/crossdev.conf create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2-llsc.patch create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2.patch diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 9467e9d088..6ca615206f 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -119,6 +119,12 @@ docker-image-debian-sparc64-cross: docker-image-debian-sid docker-image-debian-mips64-cross: docker-image-debian-sid docker-image-debian-riscv64-cross: docker-image-debian-sid docker-image-debian-powerpc-cross: docker-image-debian-sid +docker-image-gentoo-mipsr5900el-cross: EXTRA_FILES:=$(addprefix \ + $(SRC_PATH)/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/,\ + crossdev.conf \ + binutils-v2.30-ps2-llsc.patch \ + gcc-v7.2.0-ps2.patch \ + gcc-v7.2.0-ps2-llsc.patch) docker-image-travis: NOUSER=1 # Specialist build images, sometimes very limited tools diff --git a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker new file mode 100644 index 00..dbc2eb007b --- /dev/null +++ b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker @@ -0,0 +1,39 @@ +# +# Docker mipsel (r5900) cross-compiler target +# +# Using multi-stage builds, this image requires docker-17.05.0 or later. +# (See: https://github.com/gentoo/gentoo-docker-images) +# +# SPDX-License-Identifier: GPL-2.0-or-later + +# name the portage image +FROM gentoo/portage:latest as portage + +# image is based on stage3-amd64 +FROM gentoo/stage3-amd64:latest + +# copy the entire portage volume in +COPY --from=portage /usr/portage /usr/portage + +MAINTAINER Philippe Mathieu-Daudé + +# continue with image build ... +RUN emerge -qv sys-devel/crossdev + +# set CROSSDEV_OVERLAY to /usr/local/portage-crossdev +RUN mkdir -p /usr/local/portage-crossdev/{profiles,metadata} && \ +echo 'crossdev' > /usr/local/portage-crossdev/profiles/repo_name && \ +echo 'masters = gentoo' > /usr/local/portage-crossdev/metadata/layout.conf && \ +chown -R portage:portage /usr/local/portage-crossdev && \ +mkdir -p /etc/portage/repos.conf +ADD crossdev.conf /etc/portage/repos.conf/crossdev.conf + +# Fredrik's patches +RUN mkdir -p /etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/{binutils,gcc} +ADD binutils-v2.30-ps2-llsc.patch /etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/binutils +ADD gcc-v7.2.0-ps2.patch /etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/gcc +ADD gcc-v7.2.0-ps2-llsc.patch /etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/gcc + +RUN crossdev -s3 -t mipsr5900el-unknown-linux-gnu --binutils ">=2.30" --gcc ">=7.2.0" + +ENV QEMU_CONFIGURE_OPTS --cross-prefix=mipsr5900el-unknown-linux-gnu- diff --git a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch new file mode 100644 index 00..62ecc3267a --- /dev/null +++ b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch @@ -0,0 +1,36 @@ +diff --git a/opcodes/mips-opc.c b/opcodes/mips-opc.c +index 1cbcbc6..0986b0a 100644 +--- a/opcodes/mips-opc.c b/opcodes/mips-opc.c +@@ -1280,11 +1280,11 @@ const struct mips_opcode mips_builtin_opcodes[] = + {"li.s", "t,f", 0,(int) M_LI_S, INSN_MACRO, INSN2_M_FP_S, I1, 0, 0 }, + {"li.s", "T,l", 0,(int) M_LI_SS,INSN_MACRO, INSN2_M_FP_S, I1, 0, 0 }, + {"ll","t,+j(b)", 0x7c36, 0xfc7f, WR_1|RD_3|LM, 0, I37,0, 0 }, +-{"ll","t,o(b)",
[Qemu-devel] [RFC PATCH 0/2] docker: Add gentoo-mipsr5900el-cross image
Hi, The first patch adds a cross toolchain for the R5900 MIPS. It is working correctly but the patches provided by Fredrik in [1] don't have proper S-o-b, thus it is tagged RFC. Fredrik: any update on the status of those patches upstream? I setup this image to try Fredrik's TCG tests in [2]. The second patch intents to have Shippable CI build the image and run the TCG tests. Since current idea of the Shippable tests is to cross-build QEMU for the target, it does not work (and makes sense) here. I don't think there is interest in running cross-compiled QEMU on a PS2... But we never know ;) The failure I have is pkg-config using an incorrect path. I suppose I am not using the CROSSDEV_OVERLAY path correctly. Help from Gentoo developers would be appreciated! I run the tests using: $ docker-image-gentoo-mipsr5900el-cross $ docker run --rm -it -v $PWD:$PWD \ -w $PWD/tests/tcg/mips/mipsr5900 qemu:gentoo-mipsr5900el-cross \ make all $ make -C $PWD/tests/tcg/mips/mipsr5900 \ check \ SIM=$PWD/build/mipsel-softmmu/qemu-system-mipsel With this minor modification: -- >8 -- diff --git a/tests/tcg/mips/mipsr5900/Makefile b/tests/tcg/mips/mipsr5900/Makefile @@ -5 +5 @@ CROSS=mipsr5900el-unknown-linux-gnu- -SIM=qemu-mipsel +SIM?=qemu-mipsel --- Regards, Phil. [1] https://lists.gnu.org/archive/html/qemu-devel/2018-09/msg03944.html [2] https://lists.gnu.org/archive/html/qemu-devel/2018-11/msg01284.html Philippe Mathieu-Daudé (2): docker: Add gentoo-mipsr5900el-cross image shippable: Add the mipsr5900el linux-user target .shippable.yml| 2 + tests/docker/Makefile.include | 6 + .../gentoo-mipsr5900el-cross.docker | 42 .../binutils-v2.30-ps2-llsc.patch | 36 +++ .../crossdev.conf | 5 + .../gcc-v7.2.0-ps2-llsc.patch | 23 ++ .../gcc-v7.2.0-ps2.patch | 219 ++ 7 files changed, 333 insertions(+) create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/crossdev.conf create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2-llsc.patch create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2.patch -- 2.17.2
[Qemu-devel] [RFC PATCH 2/2] shippable: Add the mipsr5900el linux-user target
Shippable node pool must be updated to use at lease v5.8.2: http://docs.shippable.com/platform/runtime/machine-image/ami-v582/ Signed-off-by: Philippe Mathieu-Daudé --- .shippable.yml | 2 ++ tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker | 3 +++ 2 files changed, 5 insertions(+) diff --git a/.shippable.yml b/.shippable.yml index f74a3de3ff..bd3316df91 100644 --- a/.shippable.yml +++ b/.shippable.yml @@ -25,6 +25,8 @@ env: TARGET_LIST=mips64el-softmmu,mips64el-linux-user - IMAGE=debian-ppc64el-cross TARGET_LIST=ppc64-softmmu,ppc64-linux-user,ppc64abi32-linux-user +- IMAGE=gentoo-mipsr5900el-cross + TARGET_LIST=mips64el-linux-user build: pre_ci: - make docker-image-${IMAGE} V=1 diff --git a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker index dbc2eb007b..b35f779718 100644 --- a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker +++ b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker @@ -37,3 +37,8 @@ ADD gcc-v7.2.0-ps2-llsc.patch /etc/portage/patches/cross-mipsr5900el-unknown-lin RUN crossdev -s3 -t mipsr5900el-unknown-linux-gnu --binutils ">=2.30" --gcc ">=7.2.0" ENV QEMU_CONFIGURE_OPTS --cross-prefix=mipsr5900el-unknown-linux-gnu- + +# kludge for Shippable which checks clang available. +RUN ln -s /bin/true /usr/bin/clang + +RUN emerge -qv dev-vcs/git ">=dev-libs/glib-2.0" sys-libs/zlib dev-lang/python -- 2.17.2
Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap
On Sun, 2018-11-18 at 19:47 +, Alex Bennée wrote: > Dario Faggioli writes: > > > Dockerfile for building an openSUSE Leap container. > > > > Tracks the latest release (at the time of writing this > > patch, it is Leap 15). > > > > Signed-off-by: Dario Faggioli > > --- > > Cc: "Alex Bennée" > > Cc: Fam Zheng > > Cc: "Philippe Mathieu-Daudé" > > --- > > tests/docker/dockerfiles/opensuse-leap.docker | 62 > > + > > 1 file changed, 62 insertions(+) > > create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker > > > > diff --git a/tests/docker/dockerfiles/opensuse-leap.docker > > b/tests/docker/dockerfiles/opensuse-leap.docker > > new file mode 100644 > > index 00..9d00861e66 > > --- /dev/null > > +++ b/tests/docker/dockerfiles/opensuse-leap.docker > > @@ -0,0 +1,62 @@ > > +FROM opensuse/leap > > +ENV PACKAGES \ > > > +tar \ > > +usbredir-devel \ > > +virglrenderer-devel \ > > +vte-devel \ > > +which \ > > +xen-devel > > +zlib-devel \ > > This hasn't been tested because the docker image fails to build due > to > continuation breakage. > It is indeed broken. Basically, I tested it thoroughly, and I'm quite sure it works ok (although, I'm no docker expert). Then, when I was about to send the mail, I saw in another commit that it was best to have the package list sorted; so I did that, and forgot to adjust the continuation. :-( Sorry. I'll send a v2 with this fixed. Thanks and Regards, Dario -- <> (Raistlin Majere) - Dario Faggioli, Ph.D, http://about.me/dario.faggioli Software Engineer @ SUSE https://www.suse.com/ signature.asc Description: This is a digitally signed message part
Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap
Hi Dario, On Sun, Nov 18, 2018 at 10:54 PM Dario Faggioli wrote: > > On Sun, 2018-11-18 at 19:47 +, Alex Bennée wrote: > > Dario Faggioli writes: > > > > > Dockerfile for building an openSUSE Leap container. > > > > > > Tracks the latest release (at the time of writing this > > > patch, it is Leap 15). > > > > > > Signed-off-by: Dario Faggioli > > > --- > > > Cc: "Alex Bennée" > > > Cc: Fam Zheng > > > Cc: "Philippe Mathieu-Daudé" > > > --- > > > tests/docker/dockerfiles/opensuse-leap.docker | 62 > > > + > > > 1 file changed, 62 insertions(+) > > > create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker > > > > > > diff --git a/tests/docker/dockerfiles/opensuse-leap.docker > > > b/tests/docker/dockerfiles/opensuse-leap.docker > > > new file mode 100644 > > > index 00..9d00861e66 > > > --- /dev/null > > > +++ b/tests/docker/dockerfiles/opensuse-leap.docker > > > @@ -0,0 +1,62 @@ > > > +FROM opensuse/leap > > > +ENV PACKAGES \ > > > > > +tar \ > > > +usbredir-devel \ > > > +virglrenderer-devel \ > > > +vte-devel \ > > > +which \ > > > +xen-devel > > > +zlib-devel \ > > > > This hasn't been tested because the docker image fails to build due > > to > > continuation breakage. > > > It is indeed broken. > > Basically, I tested it thoroughly, and I'm quite sure it works ok > (although, I'm no docker expert). I quickly fixed/tried it. First this package list pulls many unuseful system packages. I added '--no-recommends' to reduce a bit: RUN zypper ref && \ zypper up -y && \ zypper install -y --no-recommends $PACKAGES This still install a bunch of unnecessary stuffs: Creating group systemd-journal with gid 485. Creating group systemd-network with gid 484. Creating user systemd-network (systemd Network Management) with uid 484 and gid 484. Creating group systemd-coredump with gid 483. Creating user systemd-coredump (systemd Core Dumper) with uid 483 and gid 483. Creating group systemd-timesync with gid 482. Creating user systemd-timesync (systemd Time Synchronization) with uid 482 and gid 482. Failed to connect to bus: No such file or directory Then the resulting image takes the same than without --no-recommends: 1.1GB. No idea why. At some point some package tried to talk to the network manager via DBus to start sshd... Second, while this works on x86_64, it fails on aarch64: Problem: libcurl-devel-7.59.0-lp150.1.2.aarch64 requires libcurl4 = 7.59.0, but this requirement cannot be provided not installable providers: libcurl4-7.59.0-lp150.1.2.aarch64[repo-oss] Solution 1: downgrade of libcurl4-7.60.0-lp150.2.15.1.aarch64 to libcurl4-7.59.0-lp150.1.2.aarch64 Solution 2: do not install libcurl-devel-7.59.0-lp150.1.2.aarch64 Solution 3: break libcurl-devel-7.59.0-lp150.1.2.aarch64 by ignoring some of its dependencies Not your fault. So I suppose you are planning to use this image to compile QEMU for openSUSE/x86_64 only, is this correct? Regards, Phil. > > Then, when I was about to send the mail, I saw in another commit that > it was best to have the package list sorted; so I did that, and forgot > to adjust the continuation. :-( > > Sorry. > > I'll send a v2 with this fixed. > > Thanks and Regards, > Dario > -- > <> (Raistlin Majere) > - > Dario Faggioli, Ph.D, http://about.me/dario.faggioli > Software Engineer @ SUSE https://www.suse.com/
Re: [Qemu-devel] SeaBIOS booting time optimization
Hi Stefano, On Fri, Nov 16, 2018 at 05:13:59PM +0100, Stefano Garzarella wrote: > Hi, > I'm investigating the SeaBIOS booting time, to understand if we can reduce > the boot time in some cases (e.g. legacy hardware is not needed). I think > this > can be interesting also for NEMU developers. Definitely, thanks. > Following this thread ( > https://mail.coreboot.org/pipermail/seabios/2015-July/009497.html), > I'm using qboot (https://github.com/bonzini/qboot) to compare the SeaBIOS > booting time. > > As Paolo did in qboot, I manually add small debug port writes in SeaBIOS and > linuxboot_dma.c (QEMU) to trace the events with perf (kvm_pio events) When doing similar measuremnts, I also used the debug-exit qemu device in order to make imho simpler measuremnts, see https://github.com/intel/nemu/wiki/Measuring-Boot-Latency Probably not as precised as yours, but it was enough for what we tried to characterize. > The goal is to have only one image of SeaBIOS configurable at runtime to > reduce > the boot time, avoiding unnecessary initialization. > > Any pointers or suggestions would be helpful. > > Following I put some preliminary measurements that I obtained: > I used this QEMU command line: > ./qemu-system-x86_64 -bios $BIOS -m 1G -cpu host -M accel=kvm \ > -kernel /boot/vmlinuz-4.18.18-300.fc29.x86_64 -append 'console=ttyS0' \ > -nographic -serial mon:stdio > > For each test, I measured these times (in milliseconds) relative to the > "sched_process_exec" event: > - qemu_init_end: first kvm_entry (i.e. QEMU initialized has finished) > - fw_start: first entry of the BIOS > - fw_do_boot: after the BIOS initialization (e.g. PCI setup, etc.) > - linux_start_boot: before the jump to the Linux kernel Are you planning to also measure the total time to userspace as well? How we set the hardware up from the FW/BIOS can also influence the overall (up to ring 3) boot latency on our experiments. > # qboot > BIOS=/home/stefano/repos/qboot/bios.bin > qemu_init_end: 40.561234 > fw_start: 40.721729 (+0.160495) > fw_do_boot: 47.025591 (+6.303862) > linux_start_boot: 48.874112 (+1.848521) > > # SeaBIOS with default configuration > BIOS=/home/stefano/repos/seabios/out_default/bios.bin > qemu_init_end: 40.419451 > fw_start: 40.639967 (+0.220516) > fw_do_boot: 886.668828 (+846.028861) > linux_start_boot: 889.723547 (+3.054719) > > # SeaBIOS with Kevin's configuration > # (https://mail.coreboot.org/pipermail/seabios/2015-July/009508.html) > # Note: this SeaBIOS setup is so stripped down that it can't actually boot > an OS > BIOS=/home/stefano/repos/seabios/out_kevin/bios.bin > qemu_init_end: 40.676412 > fw_start: 40.755757 (+0.079345) > fw_do_boot: 56.427023 (+15.671266) That's a slight improvement ;) > I did the same tests also with NEMU (without using -M virt) and I have > approximately the same results. Yes, as expected. > As the next step, I'll start from Kevin's configuration to have a minimal > SeaBIOS image ables to boot a Linux kernel. Please keep us posted. Cheers, Samuel.
Re: [Qemu-devel] [PATCH 1/2] hw: fw_cfg: refactor fw_cfg_bootsplash()
At 2018-11-17 00:33:34, "Markus Armbruster" wrote: >Li Qiang writes: > >> Currently when the splash-time value is bigger than 0x >> we report and correct it, when it is less than 0 we just ingore it. > >s/ingore/ignore/ > >> Also we use qemu_opt_get() to get 'splash-time', then convert it to a number >> ourselves. This is wrong. > >Well, doing it that way isn't wrong, it's just needlessly complicated >and error-prone. > >Suggest starting a new paragraph right here. > >> This patch does following: >> 1. use qemu_opt_get_number() to parse 'splash-time' >> 2. exit when the splash-time is invalid or loading the splash file failed >> 3. simplify code >> >> Signed-off-by: Li Qiang >> --- >> hw/nvram/fw_cfg.c | 40 >> vl.c | 2 +- >> 2 files changed, 17 insertions(+), 25 deletions(-) >> >> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c >> index 946f765f7f..78f43dad93 100644 >> --- a/hw/nvram/fw_cfg.c >> +++ b/hw/nvram/fw_cfg.c >> @@ -118,55 +118,47 @@ error: >> >> static void fw_cfg_bootsplash(FWCfgState *s) >> { >> -int boot_splash_time = -1; >> const char *boot_splash_filename = NULL; >> -char *p; >> +const char *boot_splash_time = NULL; >> char *filename, *file_data; >> gsize file_size; >> int file_type; >> -const char *temp; >> >> /* get user configuration */ >> QemuOptsList *plist = qemu_find_opts("boot-opts"); >> QemuOpts *opts = QTAILQ_FIRST(>head); >> -if (opts != NULL) { >> -temp = qemu_opt_get(opts, "splash"); >> -if (temp != NULL) { >> -boot_splash_filename = temp; >> -} >> -temp = qemu_opt_get(opts, "splash-time"); >> -if (temp != NULL) { >> -p = (char *)temp; >> -boot_splash_time = strtol(p, , 10); >> -} >> -} >> +boot_splash_filename = qemu_opt_get(opts, "splash"); >> +boot_splash_time = qemu_opt_get(opts, "splash-time"); > >You first get "splash-time" as a string, and then ... >> >> /* insert splash time if user configurated */ >> -if (boot_splash_time >= 0) { >> +if (boot_splash_time) { >> +int64_t bst_val = qemu_opt_get_number(opts, "splash-time", -1); > >... you get it again as a number. I figure you do that because >"splash-time not specified" is not the same as "splash-time=T" for any >T. I don't like such interfaces. Not this patch's fault. > >Just noticed: qemu_extra_params_fw[] has external linkage, but is used >only in this function. Care to make it static in this function in a >separate patch? Will do in the next revision. Thanks, Li Qiang > >> /* validate the input */ >> -if (boot_splash_time > 0x) { >> -error_report("splash time is big than 65535, force it to >> 65535."); >> -boot_splash_time = 0x; >> +if (bst_val < 0 || bst_val > 0x) { >> +error_report("splash time is invalid," >> + "it should be a value between 0 and 65535"); >> +exit(1); >> } >> /* use little endian format */ >> -qemu_extra_params_fw[0] = (uint8_t)(boot_splash_time & 0xff); >> -qemu_extra_params_fw[1] = (uint8_t)((boot_splash_time >> 8) & 0xff); >> +qemu_extra_params_fw[0] = (uint8_t)(bst_val & 0xff); >> +qemu_extra_params_fw[1] = (uint8_t)((bst_val >> 8) & 0xff); >> fw_cfg_add_file(s, "etc/boot-menu-wait", qemu_extra_params_fw, 2); >> } >> >> /* insert splash file if user configurated */ >> -if (boot_splash_filename != NULL) { >> +if (boot_splash_filename) { >> filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, >> boot_splash_filename); >> if (filename == NULL) { >> -error_report("failed to find file '%s'.", boot_splash_filename); >> -return; >> +error_report("failed to find file '%s'", boot_splash_filename); >> +exit(1); >> } >> >> /* loading file data */ >> file_data = read_splashfile(filename, _size, _type); >> if (file_data == NULL) { >> g_free(filename); >> -return; >> +error_report("failed to read file '%s'", boot_splash_filename); >> +exit(1); >> } >> g_free(boot_splash_filedata); >> boot_splash_filedata = (uint8_t *)file_data; >> diff --git a/vl.c b/vl.c >> index 55bab005b6..be37da46f0 100644 >> --- a/vl.c >> +++ b/vl.c >> @@ -336,7 +336,7 @@ static QemuOptsList qemu_boot_opts = { >> .type = QEMU_OPT_STRING, >> }, { >> .name = "splash-time", >> -.type = QEMU_OPT_STRING, >> +.type = QEMU_OPT_NUMBER, >> }, { >> .name = "reboot-timeout", >> .type = QEMU_OPT_STRING, > >Reviewed-by: Markus Armbruster
Re: [Qemu-devel] [PATCH v5 00/11] hw/m68k: add Apple Machintosh Quadra 800 machine
On 11/2/18 6:25 AM, Laurent Vivier wrote: >> If that does not work, I'm also fine if we simply deprecate the simcalls >> (if possible). > > I have a patch to deprecate the interface. I will send it once the > release will be done. Did I ever point out that I got m68k running under your q800 emulation in my https://github.com/landley/mkroot project? Now that m68k is in musl-libc, you can build https://github.com/richfelker/musl-cross-make or grab a prebuilt binary a toolchain from http://b.zv.io/mcm/bin/m68k-linux-musl-cross.tar.gz and then: CROSS_COMPILE=/path/to/m68k-linux-musl-cross/bin/m68k-linux-musl- \ ./mkroot.sh kernel And then: cd output/m68k ./qemu-m68k.sh -hda log.txt The -hda is just so you have a hard drive image you can play with. Cat /dev/?da should show you the log.txt contents. The network interface works too, although I haven't got wget in toybox yet but: echo GET / | netcat landley.net / Works. :) When does this go upstream? P.S. to build a current kernel: cd download git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux cd .. And then mkroot.sh again. I tested with 4.20-rc1 and m68k worked fine. > Thanks, > Laurent Rob
Re: [Qemu-devel] [PATCH qemu RFC 7/7] spapr: Add NVLink2 pass through support
On Tue, Nov 13, 2018 at 07:31:04PM +1100, Alexey Kardashevskiy wrote: > The NVIDIA V100 GPU comes with some on-board RAM which is mapped into > the host memory space and accessible as normal RAM via NVLink bus. > The VFIO-PCI driver implements special regions for such GPU and emulated > NVLink bridge (below referred as NPU). The POWER9 CPU also provides > address translation services which includes TLB invalidation register > exposes via the NVLink bridge; the feature is called "ATSD". > > This adds a quirk to VFIO to map the memory and create an MR; the new MR > is stored in a GPU as a QOM link. The sPAPR PCI uses this to get the MR > and map it to the system address space. Another quirk does the same for > ATSD. > > This adds 3 additional steps to the FDT builder in spapr-pci: > 1. Search for specific GPUs and NPUs, collects findings in sPAPRPHBState; > 2. Adds several properties in the DT: "ibm,npu", "ibm,gpu", "memory-block", > and some other. These are required by the guest platform and GPU driver; > this also adds a new made-up compatible type for a PHB to signal > a modified guest that this particular PHB needs the default DMA window > removed as these GPUs have limited DMA mask size (way lower than usual 59); > 3. Adds new memory blocks with one addition - they have > "linux,memory-usable" property configured in the way which prevents > the guest from onlining it automatically as it needs to be deferred till > the guest GPU driver trains NVLink. > > A couple of notes: > - this changes the FDT rendeder as doing 1-2-3 from sPAPRPHBClass::realize > impossible - devices are not yet attached; > - this does not add VFIO quirk MRs to the system address space as > the address is selected in sPAPRPHBState, similar to MMIO. > > This puts new memory nodes in a separate NUMA node to replicate the host > system setup as close as possible (the GPU driver relies on this too). > > This adds fake NPU nodes to make the guest platform code work, > specifically "ibm,npu-link-index". > > Signed-off-by: Alexey Kardashevskiy > --- > hw/vfio/pci.h | 2 + > include/hw/pci-host/spapr.h | 28 > include/hw/ppc/spapr.h | 3 +- > hw/ppc/spapr.c | 14 +- > hw/ppc/spapr_pci.c | 256 +++- > hw/vfio/pci-quirks.c| 93 + > hw/vfio/pci.c | 14 ++ > hw/vfio/trace-events| 3 + > 8 files changed, 408 insertions(+), 5 deletions(-) > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index f4c5fb6..b8954cc 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -195,6 +195,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); > int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, > struct vfio_region_info *info, > Error **errp); > +int vfio_pci_nvlink2_ram_init(VFIOPCIDevice *vdev, Error **errp); > +int vfio_pci_npu2_atsd_init(VFIOPCIDevice *vdev, Error **errp); > > void vfio_display_reset(VFIOPCIDevice *vdev); > int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 7c66c38..1f8ebf3 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -87,6 +87,24 @@ struct sPAPRPHBState { > uint32_t mig_liobn; > hwaddr mig_mem_win_addr, mig_mem_win_size; > hwaddr mig_io_win_addr, mig_io_win_size; > +hwaddr nv2_gpa_win_addr; > +hwaddr nv2_atsd_win_addr; > + > +struct spapr_phb_pci_nvgpu_config { > +uint64_t nv2_ram; > +uint64_t nv2_atsd; > +int num; > +struct { > +int links; > +uint64_t tgt; > +uint64_t gpa; > +PCIDevice *gpdev; > +uint64_t atsd[3]; > +PCIDevice *npdev[3]; > +} gpus[6]; > +uint64_t atsd[64]; /* Big Endian (BE), ready for the DT */ > +int atsd_num; > +} nvgpus; Is this information always relevant for the PHB, or only for PHBs which have an NPU or GPU attached to them? If the latter I'm wondering if we can allocate it only when necessary. > }; > > #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL > @@ -104,6 +122,16 @@ struct sPAPRPHBState { > > #define SPAPR_PCI_MSI_WINDOW 0x400ULL > > +#define PHANDLE_PCIDEV(phb, pdev)(0x1200 | \ > + (((phb)->index) << 16) | > ((pdev)->devfn)) > +#define PHANDLE_GPURAM(phb, n) (0x11FF | ((n) << 8) | \ > + (((phb)->index) << 16)) > +#define GPURAM_ASSOCIATIVITY(phb, n) (255 - ((phb)->index * 3 + (n))) > +#define SPAPR_PCI_NV2RAM64_WIN_BASE 0x100ULL /* 1 TiB */ > +#define SPAPR_PCI_NV2RAM64_WIN_SIZE 0x020ULL > +#define PHANDLE_NVLINK(phb, gn, nn) (0x0013 | (((phb)->index) << 8) | \ > + ((gn) << 4) | (nn)) AFAICT many of these values are only used in
Re: [Qemu-devel] [PATCH qemu RFC 3/7] pci: Move NVIDIA vendor id to the rest of ids
On Tue, Nov 13, 2018 at 07:31:00PM +1100, Alexey Kardashevskiy wrote: > sPAPR code will use it too so move it from VFIO to the common code. > > Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson This looks correct to me independent of the rest of the series. > --- > include/hw/pci/pci_ids.h | 2 ++ > hw/vfio/pci-quirks.c | 2 -- > 2 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h > index 63acc72..3ed7d10 100644 > --- a/include/hw/pci/pci_ids.h > +++ b/include/hw/pci/pci_ids.h > @@ -271,4 +271,6 @@ > > #define PCI_VENDOR_ID_SYNOPSYS 0x16C3 > > +#define PCI_VENDOR_ID_NVIDIA 0x10de > + > #endif > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > index eae31c7..40a1200 100644 > --- a/hw/vfio/pci-quirks.c > +++ b/hw/vfio/pci-quirks.c > @@ -526,8 +526,6 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice > *vdev, int nr) > * note it for future reference. > */ > > -#define PCI_VENDOR_ID_NVIDIA0x10de > - > /* > * Nvidia has several different methods to get to config space, the > * nouveu project has several of these documented here: -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [Qemu-devel] [PATCH qemu RFC 1/7] vfio/spapr: Fix indirect levels calculation
On Tue, Nov 13, 2018 at 07:30:58PM +1100, Alexey Kardashevskiy wrote: > The current code assumes that we can address more bits on a PCI bus > for DMA than we really can. Limit to the known tested maximum of 55 bits > and assume 64K IOMMU pages. This doesn't make much sense to me. It looks nothing like the calculation it's replacing, and not just for extreme values. For example the new calculation doesn't depend on the size of the window at all, whereas the previous one did. You say you're assuming 64k IOMMU pages, but create.page_shift (the actual IOMMU page size) is in there. Nor is it clear to me why the maximum PCI address is relevant to the number of levels in the first place. In addition, AFAICT the new calculation gives the answer '2' for all likely IOMMU page sizes (4k, 64k, 2M) > > Signed-off-by: Alexey Kardashevskiy > --- > hw/vfio/spapr.c | 3 ++- > hw/vfio/trace-events | 2 +- > 2 files changed, 3 insertions(+), 2 deletions(-) > > diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c > index becf71a..f5fdc53 100644 > --- a/hw/vfio/spapr.c > +++ b/hw/vfio/spapr.c > @@ -183,7 +183,7 @@ int vfio_spapr_create_window(VFIOContainer *container, > entries = create.window_size >> create.page_shift; > pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); > pages = MAX(pow2ceil(pages), 1); /* Round up */ > -create.levels = ctz64(pages) / 6 + 1; > +create.levels = MAX(1, (55 - create.page_shift) / 16); > > ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, ); > if (ret) { > @@ -200,6 +200,7 @@ int vfio_spapr_create_window(VFIOContainer *container, > return -EINVAL; > } > trace_vfio_spapr_create_window(create.page_shift, > + create.levels, > create.window_size, > create.start_addr); > *pgsize = pagesize; > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index a85e866..db730f3 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -128,6 +128,6 @@ vfio_prereg_listener_region_add_skip(uint64_t start, > uint64_t end) "0x%"PRIx64" > vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) > "0x%"PRIx64" - 0x%"PRIx64 > vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" > size=0x%"PRIx64" ret=%d" > vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" > size=0x%"PRIx64" ret=%d" > -vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x > winsize=0x%"PRIx64" offset=0x%"PRIx64 > +vfio_spapr_create_window(int ps, unsigned int levels, uint64_t ws, uint64_t > off) "pageshift=0x%x levels=%u winsize=0x%"PRIx64" offset=0x%"PRIx64 > vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64 > vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to > liobn fd %d" -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size
On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote: > When deciding about the huge DMA window, the typical Linux pseries guest > uses the maximum allowed RAM size as the upper limit. We did the same > on QEMU side to match that logic. Now we are going to support GPU RAM > pass through which is not available at the guest boot time as it requires > the guest driver interaction. As the result, the guest requests a smaller > window than it should. Therefore the guest needs to be patched to > understand this new memory and so does QEMU. > > Instead of reimplementing here whatever solution we will choose for > the guest, this advertises the biggest possible window size limited by > 32 bit (as defined by LoPAPR). > > This seems to be safe as: > 1. The guest visible emulated table is allocated in KVM (actual pages > are allocated in page fault handler) and QEMU (actual pages are allocated > when changed); > 2. The hardware table (and corresponding userspace addresses cache) > supports sparse allocation and also checks for locked_vm limit so > it is unable to cause the host any damage. > > Signed-off-by: Alexey Kardashevskiy This seems like a good idea to me, even without the NPU stuff. It always bothered me slightly that we based what's effectively the IOVA limit on the guest RAM size which doesn't have any direct connection to it. As long as it doesn't hit the locked memory limits, I don't see any reason we should prevent a guest from doing something weird like mapping a small bit of RAM over and over in IOVA space, or mapping its RAM sparsely in IOVA space. > --- > hw/ppc/spapr_rtas_ddw.c | 19 +++ > 1 file changed, 3 insertions(+), 16 deletions(-) > > diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c > index 329feb1..df60351 100644 > --- a/hw/ppc/spapr_rtas_ddw.c > +++ b/hw/ppc/spapr_rtas_ddw.c > @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, > uint32_t nret, target_ulong rets) > { > sPAPRPHBState *sphb; > -uint64_t buid, max_window_size; > +uint64_t buid; > uint32_t avail, addr, pgmask = 0; > -MachineState *machine = MACHINE(spapr); > > if ((nargs != 3) || (nret != 5)) { > goto param_error_exit; > @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU > *cpu, > /* Translate page mask to LoPAPR format */ > pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); > > -/* > - * This is "Largest contiguous block of TCEs allocated specifically > - * for (that is, are reserved for) this PE". > - * Return the maximum number as maximum supported RAM size was in 4K > pages. > - */ > -if (machine->ram_size == machine->maxram_size) { > -max_window_size = machine->ram_size; > -} else { > -max_window_size = machine->device_memory->base + > - memory_region_size(>device_memory->mr); > -} > - > avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb); > > rtas_st(rets, 0, RTAS_OUT_SUCCESS); > rtas_st(rets, 1, avail); > -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); > +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */ One detail though.. where does this limit actually come from? Is it actually a limit imposed by the hardware somewhere, or just because the RTAS call doesn't ahve room for anything more? Previously limits would usually have been a power of 2; is it likely to matter that now it won't be? > rtas_st(rets, 3, pgmask); > rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ > > -trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask); > +trace_spapr_iommu_ddw_query(buid, addr, avail, 0x, pgmask); > return; > > param_error_exit: -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [Qemu-devel] [PATCH qemu RFC 4/7] vfio/nvidia-v100: Disable VBIOS update
On Tue, Nov 13, 2018 at 07:31:01PM +1100, Alexey Kardashevskiy wrote: > The NVIDIA V100 GPUs often come in several instances on the same system > board where they are connected directly via out of band fabric called > "NVLink". > > In order to make GPUs talk to each other, NVLink has to be enabled on > both GPUs and this is guaranteed by the firmware by providing special > MMIO registers to disable NVLink till GPU is reset. > > This blocks GPU VBIOS update to add an extra level of assurance that > the firmware does not get reflashed with a malicious firmware which > does not implement NVLink disabling mechanism. > > Signed-off-by: Alexey Kardashevskiy > --- > > NVIDIA firmwares come signed and GPUs do not accept unsigned images > anyway so this is probably overkill, or not? > > Also, there is no available documentation on the magic value of 0x22408; > however it does help as the nvflash upgrade tool stops working with this > applied. IIUC, the upshot of this is basically not to permit firmware updates of the V100 from within a guest, yes? However, it would still be possible to update the firmware from a userspace vfio program? Given that, I'm not sure this really gives us anything over the existing signature verifications. Alex, any thoughts? > --- > hw/vfio/pci.h| 1 + > include/hw/pci/pci_ids.h | 1 + > hw/vfio/pci-quirks.c | 26 ++ > hw/vfio/pci.c| 2 ++ > hw/vfio/trace-events | 1 + > 5 files changed, 31 insertions(+) > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index b1ae4c0..f4c5fb6 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -163,6 +163,7 @@ typedef struct VFIOPCIDevice { > bool no_kvm_msi; > bool no_kvm_msix; > bool no_geforce_quirks; > +bool no_nvidia_v100_quirks; > bool no_kvm_ioeventfd; > bool no_vfio_ioeventfd; > bool enable_ramfb; > diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h > index 3ed7d10..2140dad 100644 > --- a/include/hw/pci/pci_ids.h > +++ b/include/hw/pci/pci_ids.h > @@ -272,5 +272,6 @@ > #define PCI_VENDOR_ID_SYNOPSYS 0x16C3 > > #define PCI_VENDOR_ID_NVIDIA 0x10de > +#define PCI_VENDOR_ID_NVIDIA_V100_SXM2 0x1db1 > > #endif > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > index 40a1200..2796837 100644 > --- a/hw/vfio/pci-quirks.c > +++ b/hw/vfio/pci-quirks.c > @@ -996,6 +996,31 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice > *vdev, int nr) > trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name); > } > > +static void vfio_probe_nvidia_v100_bar0_quirk(VFIOPCIDevice *vdev, int nr) > +{ > +VFIOQuirk *quirk; > + > +if (vdev->no_nvidia_v100_quirks || > +!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, > + PCI_VENDOR_ID_NVIDIA_V100_SXM2) || > +nr != 0) { > +return; > +} > + > +quirk = vfio_quirk_alloc(1); > + > +memory_region_init_io(quirk->mem, OBJECT(vdev), > + NULL, quirk, > + "vfio-nvidia-v100_bar0-block-quirk", > + 4); > +memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > +0x22408, quirk->mem, 1); > + > +QLIST_INSERT_HEAD(>bars[nr].quirks, quirk, next); > + > +trace_vfio_quirk_nvidia_v100_bar0_probe(vdev->vbasedev.name); > +} > + > /* > * TODO - Some Nvidia devices provide config access to their companion HDA > * device and even to their parent bridge via these config space mirrors. > @@ -1853,6 +1878,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) > vfio_probe_ati_bar2_quirk(vdev, nr); > vfio_probe_nvidia_bar5_quirk(vdev, nr); > vfio_probe_nvidia_bar0_quirk(vdev, nr); > +vfio_probe_nvidia_v100_bar0_quirk(vdev, nr); > vfio_probe_rtl8168_bar2_quirk(vdev, nr); > vfio_probe_igd_bar4_quirk(vdev, nr); > } > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 5c7bd96..7848b28 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3203,6 +3203,8 @@ static Property vfio_pci_dev_properties[] = { > DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), > DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice, > no_geforce_quirks, false), > +DEFINE_PROP_BOOL("x-no-nvidia-v100-quirks", VFIOPCIDevice, > + no_nvidia_v100_quirks, false), > DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd, > false), > DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd, > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index db730f3..adfa75e 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -68,6 +68,7 @@ vfio_quirk_nvidia_bar5_state(const char *name, const char > *state) "%s %s" > vfio_quirk_nvidia_bar5_probe(const char *name) "%s" > vfio_quirk_nvidia_bar0_msi_ack(const char *name) "%s" >
[Qemu-devel] [PATCH] hw/arm/stm32f205: Fix the UART and Timer region size
From: Seth Kintigh I corrected these 2 memory regions based on specifications from the chip manufacturer. The existing ranges seem to overlap and and cause odd behavior and/or crashes when trying to set up multiple UARTs, Signed-off-by: Seth Kintigh --- Phil, I hope this is the right format. PMM, my original changes were made on an old version, but I made the patch from the latest version per the instructions. I'm glad to hear that serial port limit is gone! hw/char/stm32f2xx_usart.c | 2 +- hw/timer/stm32f2xx_timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/char/stm32f2xx_usart.c b/hw/char/stm32f2xx_usart.c index 032b5fda13..f3363a2952 100644 --- a/hw/char/stm32f2xx_usart.c +++ b/hw/char/stm32f2xx_usart.c @@ -202,7 +202,7 @@ static void stm32f2xx_usart_init(Object *obj) sysbus_init_irq(SYS_BUS_DEVICE(obj), >irq); memory_region_init_io(>mmio, obj, _usart_ops, s, - TYPE_STM32F2XX_USART, 0x2000); + TYPE_STM32F2XX_USART, 0x400); sysbus_init_mmio(SYS_BUS_DEVICE(obj), >mmio); } diff --git a/hw/timer/stm32f2xx_timer.c b/hw/timer/stm32f2xx_timer.c index 58fc7b1188..ae744d1642 100644 --- a/hw/timer/stm32f2xx_timer.c +++ b/hw/timer/stm32f2xx_timer.c @@ -308,7 +308,7 @@ static void stm32f2xx_timer_init(Object *obj) sysbus_init_irq(SYS_BUS_DEVICE(obj), >irq); memory_region_init_io(>iomem, obj, _timer_ops, s, - "stm32f2xx_timer", 0x4000); + "stm32f2xx_timer", 0x400); sysbus_init_mmio(SYS_BUS_DEVICE(obj), >iomem); s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, stm32f2xx_timer_interrupt, s); On Thu, Nov 15, 2018 at 7:05 AM Peter Maydell wrote: > On 4 November 2018 at 07:42, Seth K wrote: > > I corrected these 2 memory regions based on specifications from the chip > > manufacturer. The existing ranges seem to overlap and and cause odd > > behavior and/or crashes when trying to set up multiple UARTs, > > I also played with changing MAX_SERIAL_PORTS to 8 to match the hardware, > > but I did not include that in this patch as I never fully tested its > > effects. > > Hi; thanks for the patch. As Philippe says, it looks good, > but the one thing we definitely need is a Signed-off-by: > line from you. > > A minor note -- there is no "MAX_SERIAL_PORTS" definition > in QEMU any more: we removed that artificial limitation > earlier this year. Maybe you're basing your patch on an > older version of QEMU? It's best to use git master for > development. Our SoC model defines 6 uarts (STM_NUM_USARTS) > in hw/arm/stm32f205_soc.c, which should all now be connectable > on the command line, though I haven't tested that or checked > whether the hardware has 6 or some other number... > > thanks > -- PMM >
Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size
On 19/11/2018 13:42, David Gibson wrote: > On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote: >> When deciding about the huge DMA window, the typical Linux pseries guest >> uses the maximum allowed RAM size as the upper limit. We did the same >> on QEMU side to match that logic. Now we are going to support GPU RAM >> pass through which is not available at the guest boot time as it requires >> the guest driver interaction. As the result, the guest requests a smaller >> window than it should. Therefore the guest needs to be patched to >> understand this new memory and so does QEMU. >> >> Instead of reimplementing here whatever solution we will choose for >> the guest, this advertises the biggest possible window size limited by >> 32 bit (as defined by LoPAPR). >> >> This seems to be safe as: >> 1. The guest visible emulated table is allocated in KVM (actual pages >> are allocated in page fault handler) and QEMU (actual pages are allocated >> when changed); >> 2. The hardware table (and corresponding userspace addresses cache) >> supports sparse allocation and also checks for locked_vm limit so >> it is unable to cause the host any damage. >> >> Signed-off-by: Alexey Kardashevskiy > > This seems like a good idea to me, even without the NPU stuff. It > always bothered me slightly that we based what's effectively the IOVA > limit on the guest RAM size which doesn't have any direct connection > to it. > > As long as it doesn't hit the locked memory limits, I don't see any > reason we should prevent a guest from doing something weird like > mapping a small bit of RAM over and over in IOVA space, or mapping its > RAM sparsely in IOVA space. > >> --- >> hw/ppc/spapr_rtas_ddw.c | 19 +++ >> 1 file changed, 3 insertions(+), 16 deletions(-) >> >> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c >> index 329feb1..df60351 100644 >> --- a/hw/ppc/spapr_rtas_ddw.c >> +++ b/hw/ppc/spapr_rtas_ddw.c >> @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, >> uint32_t nret, target_ulong rets) >> { >> sPAPRPHBState *sphb; >> -uint64_t buid, max_window_size; >> +uint64_t buid; >> uint32_t avail, addr, pgmask = 0; >> -MachineState *machine = MACHINE(spapr); >> >> if ((nargs != 3) || (nret != 5)) { >> goto param_error_exit; >> @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU >> *cpu, >> /* Translate page mask to LoPAPR format */ >> pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); >> >> -/* >> - * This is "Largest contiguous block of TCEs allocated specifically >> - * for (that is, are reserved for) this PE". >> - * Return the maximum number as maximum supported RAM size was in 4K >> pages. >> - */ >> -if (machine->ram_size == machine->maxram_size) { >> -max_window_size = machine->ram_size; >> -} else { >> -max_window_size = machine->device_memory->base + >> - memory_region_size(>device_memory->mr); >> -} >> - >> avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb); >> >> rtas_st(rets, 0, RTAS_OUT_SUCCESS); >> rtas_st(rets, 1, avail); >> -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); >> +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */ > > One detail though.. where does this limit actually come from? Is it > actually a limit imposed by the hardware somewhere, or just because > the RTAS call doesn't ahve room for anything more? I used this limit just because of the parameter size. > > Previously limits would usually have been a power of 2; is it likely > to matter that now it won't be? LoPAPR says it is "Largest contiguous block of TCEs" and no mention of power of two but ibm,create-pe-dma-window takes a window shift so it is assumed that windows can still be only power of two. Not sure. The powernv code will fail if the requested window is not power of two. Replacing 0x with 0x8000 won't make a difference though here but probably will make the error clearer... > >> rtas_st(rets, 3, pgmask); >> rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ >> >> -trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask); >> +trace_spapr_iommu_ddw_query(buid, addr, avail, 0x, pgmask); >> return; >> >> param_error_exit: > -- Alexey
Re: [Qemu-devel] [PATCH qemu RFC 7/7] spapr: Add NVLink2 pass through support
On 19/11/2018 14:01, David Gibson wrote: > On Tue, Nov 13, 2018 at 07:31:04PM +1100, Alexey Kardashevskiy wrote: >> The NVIDIA V100 GPU comes with some on-board RAM which is mapped into >> the host memory space and accessible as normal RAM via NVLink bus. >> The VFIO-PCI driver implements special regions for such GPU and emulated >> NVLink bridge (below referred as NPU). The POWER9 CPU also provides >> address translation services which includes TLB invalidation register >> exposes via the NVLink bridge; the feature is called "ATSD". >> >> This adds a quirk to VFIO to map the memory and create an MR; the new MR >> is stored in a GPU as a QOM link. The sPAPR PCI uses this to get the MR >> and map it to the system address space. Another quirk does the same for >> ATSD. >> >> This adds 3 additional steps to the FDT builder in spapr-pci: >> 1. Search for specific GPUs and NPUs, collects findings in sPAPRPHBState; >> 2. Adds several properties in the DT: "ibm,npu", "ibm,gpu", "memory-block", >> and some other. These are required by the guest platform and GPU driver; >> this also adds a new made-up compatible type for a PHB to signal >> a modified guest that this particular PHB needs the default DMA window >> removed as these GPUs have limited DMA mask size (way lower than usual 59); >> 3. Adds new memory blocks with one addition - they have >> "linux,memory-usable" property configured in the way which prevents >> the guest from onlining it automatically as it needs to be deferred till >> the guest GPU driver trains NVLink. >> >> A couple of notes: >> - this changes the FDT rendeder as doing 1-2-3 from sPAPRPHBClass::realize >> impossible - devices are not yet attached; >> - this does not add VFIO quirk MRs to the system address space as >> the address is selected in sPAPRPHBState, similar to MMIO. >> >> This puts new memory nodes in a separate NUMA node to replicate the host >> system setup as close as possible (the GPU driver relies on this too). >> >> This adds fake NPU nodes to make the guest platform code work, >> specifically "ibm,npu-link-index". >> >> Signed-off-by: Alexey Kardashevskiy >> --- >> hw/vfio/pci.h | 2 + >> include/hw/pci-host/spapr.h | 28 >> include/hw/ppc/spapr.h | 3 +- >> hw/ppc/spapr.c | 14 +- >> hw/ppc/spapr_pci.c | 256 +++- >> hw/vfio/pci-quirks.c| 93 + >> hw/vfio/pci.c | 14 ++ >> hw/vfio/trace-events| 3 + >> 8 files changed, 408 insertions(+), 5 deletions(-) >> >> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h >> index f4c5fb6..b8954cc 100644 >> --- a/hw/vfio/pci.h >> +++ b/hw/vfio/pci.h >> @@ -195,6 +195,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); >> int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, >> struct vfio_region_info *info, >> Error **errp); >> +int vfio_pci_nvlink2_ram_init(VFIOPCIDevice *vdev, Error **errp); >> +int vfio_pci_npu2_atsd_init(VFIOPCIDevice *vdev, Error **errp); >> >> void vfio_display_reset(VFIOPCIDevice *vdev); >> int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); >> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h >> index 7c66c38..1f8ebf3 100644 >> --- a/include/hw/pci-host/spapr.h >> +++ b/include/hw/pci-host/spapr.h >> @@ -87,6 +87,24 @@ struct sPAPRPHBState { >> uint32_t mig_liobn; >> hwaddr mig_mem_win_addr, mig_mem_win_size; >> hwaddr mig_io_win_addr, mig_io_win_size; >> +hwaddr nv2_gpa_win_addr; >> +hwaddr nv2_atsd_win_addr; >> + >> +struct spapr_phb_pci_nvgpu_config { >> +uint64_t nv2_ram; >> +uint64_t nv2_atsd; >> +int num; >> +struct { >> +int links; >> +uint64_t tgt; >> +uint64_t gpa; >> +PCIDevice *gpdev; >> +uint64_t atsd[3]; >> +PCIDevice *npdev[3]; >> +} gpus[6]; >> +uint64_t atsd[64]; /* Big Endian (BE), ready for the DT */ >> +int atsd_num; >> +} nvgpus; > > Is this information always relevant for the PHB, or only for PHBs > which have an NPU or GPU attached to them? If the latter I'm > wondering if we can allocate it only when necessary. I think I can make it even local, just need to hack spapr_populate_pci_devices_dt's fdt struct to take the struct. > >> }; >> >> #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL >> @@ -104,6 +122,16 @@ struct sPAPRPHBState { >> >> #define SPAPR_PCI_MSI_WINDOW 0x400ULL >> >> +#define PHANDLE_PCIDEV(phb, pdev)(0x1200 | \ >> + (((phb)->index) << 16) | >> ((pdev)->devfn)) >> +#define PHANDLE_GPURAM(phb, n) (0x11FF | ((n) << 8) | \ >> + (((phb)->index) << 16)) >> +#define GPURAM_ASSOCIATIVITY(phb, n) (255 - ((phb)->index * 3 + (n))) >> +#define SPAPR_PCI_NV2RAM64_WIN_BASE
Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size
On Mon, Nov 19, 2018 at 04:08:33PM +1100, Alexey Kardashevskiy wrote: > > > On 19/11/2018 13:42, David Gibson wrote: > > On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote: > >> When deciding about the huge DMA window, the typical Linux pseries guest > >> uses the maximum allowed RAM size as the upper limit. We did the same > >> on QEMU side to match that logic. Now we are going to support GPU RAM > >> pass through which is not available at the guest boot time as it requires > >> the guest driver interaction. As the result, the guest requests a smaller > >> window than it should. Therefore the guest needs to be patched to > >> understand this new memory and so does QEMU. > >> > >> Instead of reimplementing here whatever solution we will choose for > >> the guest, this advertises the biggest possible window size limited by > >> 32 bit (as defined by LoPAPR). > >> > >> This seems to be safe as: > >> 1. The guest visible emulated table is allocated in KVM (actual pages > >> are allocated in page fault handler) and QEMU (actual pages are allocated > >> when changed); > >> 2. The hardware table (and corresponding userspace addresses cache) > >> supports sparse allocation and also checks for locked_vm limit so > >> it is unable to cause the host any damage. > >> > >> Signed-off-by: Alexey Kardashevskiy > > > > This seems like a good idea to me, even without the NPU stuff. It > > always bothered me slightly that we based what's effectively the IOVA > > limit on the guest RAM size which doesn't have any direct connection > > to it. > > > > As long as it doesn't hit the locked memory limits, I don't see any > > reason we should prevent a guest from doing something weird like > > mapping a small bit of RAM over and over in IOVA space, or mapping its > > RAM sparsely in IOVA space. > > > >> --- > >> hw/ppc/spapr_rtas_ddw.c | 19 +++ > >> 1 file changed, 3 insertions(+), 16 deletions(-) > >> > >> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c > >> index 329feb1..df60351 100644 > >> --- a/hw/ppc/spapr_rtas_ddw.c > >> +++ b/hw/ppc/spapr_rtas_ddw.c > >> @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, > >> uint32_t nret, target_ulong rets) > >> { > >> sPAPRPHBState *sphb; > >> -uint64_t buid, max_window_size; > >> +uint64_t buid; > >> uint32_t avail, addr, pgmask = 0; > >> -MachineState *machine = MACHINE(spapr); > >> > >> if ((nargs != 3) || (nret != 5)) { > >> goto param_error_exit; > >> @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU > >> *cpu, > >> /* Translate page mask to LoPAPR format */ > >> pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask); > >> > >> -/* > >> - * This is "Largest contiguous block of TCEs allocated specifically > >> - * for (that is, are reserved for) this PE". > >> - * Return the maximum number as maximum supported RAM size was in 4K > >> pages. > >> - */ > >> -if (machine->ram_size == machine->maxram_size) { > >> -max_window_size = machine->ram_size; > >> -} else { > >> -max_window_size = machine->device_memory->base + > >> - memory_region_size(>device_memory->mr); > >> -} > >> - > >> avail = SPAPR_PCI_DMA_MAX_WINDOWS - > >> spapr_phb_get_active_win_num(sphb); > >> > >> rtas_st(rets, 0, RTAS_OUT_SUCCESS); > >> rtas_st(rets, 1, avail); > >> -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT); > >> +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */ > > > > One detail though.. where does this limit actually come from? Is it > > actually a limit imposed by the hardware somewhere, or just because > > the RTAS call doesn't ahve room for anything more? > > I used this limit just because of the parameter size. > > > > > Previously limits would usually have been a power of 2; is it likely > > to matter that now it won't be? > > LoPAPR says it is "Largest contiguous block of TCEs" and no mention of > power of two but ibm,create-pe-dma-window takes a window shift so it is > assumed that windows can still be only power of two. Not sure. The > powernv code will fail if the requested window is not power of two. > > Replacing 0x with 0x8000 won't make a difference though here > but probably will make the error clearer... Yeah, given that the windows have to be a power of 2 in size, I think it makes sense for this to be a power of 2, even if it doesn't strictly have to be. So I think 0x8000 would be a better option. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: PGP signature
Re: [Qemu-devel] [PATCH qemu RFC 1/7] vfio/spapr: Fix indirect levels calculation
On 19/11/2018 12:45, David Gibson wrote: > On Tue, Nov 13, 2018 at 07:30:58PM +1100, Alexey Kardashevskiy wrote: >> The current code assumes that we can address more bits on a PCI bus >> for DMA than we really can. Limit to the known tested maximum of 55 bits >> and assume 64K IOMMU pages. > > This doesn't make much sense to me. It looks nothing like the > calculation it's replacing, and not just for extreme values. For > example the new calculation doesn't depend on the size of the window > at all, whereas the previous one did. Uff. Yes, this is confusing. There are a typical 64k IOMMU page size and a typical 64k system page size... For the window as big as 0x200.. and 64k pages (actual working example), 33554432 entries are needed, or 256MB of space. The existing code calculates it as 25/6+1=5 levels. Which the hardware can do. But: 1. all TCE levels are the same size and 2. for the minimum size of the level I take a system page size which is 64k. So with 5 levels 16bit each (and this is without counting IOMMU page size, which is plus extra 16 bits) the table is huuuge. And I know from tests that 55 bits is the maximum the hardware can handle, and I have this limit in the powernv code so VFIO ioctl to create a window will fail. If I do not have a check in powernv, than things silently do not work (no EEH though). > You say you're assuming 64k > IOMMU pages, but create.page_shift (the actual IOMMU page size) is in > there. It is a radix tree basically. Out of 64bits available, I cannot use bits 56..63 (hardware does not work) and 0..12/15/21 (IOMMU page size), so I have bits 13/16/22..55 which I need to split equally between levels, each of which is 64k (system page size). > Nor is it clear to me why the maximum PCI address is relevant > to the number of levels in the first place. The window can only as big as the PHB hardware supports, this is why maximum PCI address. > > In addition, AFAICT the new calculation gives the answer '2' for all > likely IOMMU page sizes (4k, 64k, 2M) Hm, I did not realize that :) Instead of using 64k as a default level size, I could use some idea of what CONFIG_FORCE_MAX_ZONEORDER could be. Currently it is: arch/powerpc/Kconfig config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES default "9" if PPC64 && PPC_64K_PAGES range 13 13 if PPC64 && !PPC_64K_PAGES default "13" if PPC64 && !PPC_64K_PAGES range 9 64 if PPC32 && PPC_16K_PAGES default "9" if PPC32 && PPC_16K_PAGES range 7 64 if PPC32 && PPC_64K_PAGES default "7" if PPC32 && PPC_64K_PAGES range 5 64 if PPC32 && PPC_256K_PAGES default "5" if PPC32 && PPC_256K_PAGES range 11 64 default "11" So assuming that "8" is the absolute minimum, I could try levels as big as 1<<(16+8-1) and if this fails, then increase the number of levels till a window is created or it is too many levels. Or there is some obviously better solution to the problem? > >> >> Signed-off-by: Alexey Kardashevskiy >> --- >> hw/vfio/spapr.c | 3 ++- >> hw/vfio/trace-events | 2 +- >> 2 files changed, 3 insertions(+), 2 deletions(-) >> >> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c >> index becf71a..f5fdc53 100644 >> --- a/hw/vfio/spapr.c >> +++ b/hw/vfio/spapr.c >> @@ -183,7 +183,7 @@ int vfio_spapr_create_window(VFIOContainer *container, >> entries = create.window_size >> create.page_shift; >> pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); >> pages = MAX(pow2ceil(pages), 1); /* Round up */ >> -create.levels = ctz64(pages) / 6 + 1; >> +create.levels = MAX(1, (55 - create.page_shift) / 16); >> >> ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, ); >> if (ret) { >> @@ -200,6 +200,7 @@ int vfio_spapr_create_window(VFIOContainer *container, >> return -EINVAL; >> } >> trace_vfio_spapr_create_window(create.page_shift, >> + create.levels, >> create.window_size, >> create.start_addr); >> *pgsize = pagesize; >> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events >> index a85e866..db730f3 100644 >> --- a/hw/vfio/trace-events >> +++ b/hw/vfio/trace-events >> @@ -128,6 +128,6 @@ vfio_prereg_listener_region_add_skip(uint64_t start, >> uint64_t end) "0x%"PRIx64" >> vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) >> "0x%"PRIx64" - 0x%"PRIx64 >> vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" >> size=0x%"PRIx64" ret=%d" >> vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" >> size=0x%"PRIx64" ret=%d" >> -vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x >> winsize=0x%"PRIx64" offset=0x%"PRIx64 >> +vfio_spapr_create_window(int ps, unsigned int levels, uint64_t ws, uint64_t >> off) "pageshift=0x%x
Re: [Qemu-devel] [PULL 0/8] Monitor patches for 2018-10-30
On Thu, Nov 08, 2018 at 10:44:06AM +0800, Peter Xu wrote: > On Wed, Nov 07, 2018 at 11:21:32AM +, Peter Maydell wrote: > > On 7 November 2018 at 02:56, Peter Xu wrote: > > > Strange, "make check -j8" failed on my hosts (I tried two) with either > > > Markus's pull tree or qemu master: > > > > > > hw/core/ptimer.o: In function `timer_new_tl': > > > /home/xz/git/qemu/include/qemu/timer.h:536: undefined reference to > > > `timer_init_tl' > > > collect2: error: ld returned 1 exit status > > > make: *** [/home/xz/git/qemu/rules.mak:124: tests/ptimer-test] Error 1 > > > > > > Is that only happening on my hosts? > > > > Commit 89a603a0c80ae3 changed things so that there is no > > timer_new_tl() or timer_init_tl() any more, so if you have > > an object file that's referring to it then it's probably > > stale. Try a make clean. > > Yeh it worked for me, thanks Peter. > > Though after running a few more rounds of "configure --enable-debug && > make check -j8" I still cannot see anything wrong with Markus's tree. > I'll see whether there's any news from Markus and then I'll consider > whether I should install a FreeBSD. I reproduced the error with a FreeBSD guest and this change (which possibly can be squashed into "tests: qmp-test: add queue full test") worked for me: diff --git a/tests/qmp-test.c b/tests/qmp-test.c index 6989acbca4..83f353db4f 100644 --- a/tests/qmp-test.c +++ b/tests/qmp-test.c @@ -281,8 +281,15 @@ static void test_qmp_oob(void) * will only be able to be handled after the queue is shrinked, so * it'll be processed only after one existing in-band command * finishes. + * + * NOTE: we need to feed the queue with one extra request to make + * sure it'll be stuck since when we have sent the Nth request + * it's possible that we have already popped and processing the + * 1st request so the Nth request (which could potentially be the + * [N-1]th element on the queue) might not trigger the + * monitor-full condition deterministically. */ -for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX; i++) { +for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) { id = g_strdup_printf("queue-blocks-%d", i); send_cmd_that_blocks(qts, id); g_free(id); @@ -291,7 +298,7 @@ static void test_qmp_oob(void) unblock_blocked_cmd(); recv_cmd_id(qts, "queue-blocks-1"); recv_cmd_id(qts, "oob-1"); -for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX; i++) { +for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) { unblock_blocked_cmd(); id = g_strdup_printf("queue-blocks-%d", i); recv_cmd_id(qts, id); So the problem here is that the queue-block-N command might not really suspend the monitor everytime if we already popped the 1st request, which will let the N-th request to be (N-1)th, then the parser will continue to eat the oob command and it could "preempt" the previous commands. Maybe FreeBSD is scheduling the threads in some pattern so it happens only on FreeBSD and very constantly, but anyway it should be a general fix to the test program. Regards, -- Peter Xu
Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()
ÀîÇ¿ writes: > At 2018-11-17 00:52:58, "Markus Armbruster" wrote: >>Li Qiang writes: >> >>> Currently the user can set a negative reboot_timeout. >>> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then >>> convert it to number. >> >>Again, it's not wrong per se, just needlessly complicated and >>error-prone. What makes it wrong is ... >> >>> convert it to number. This patch refactor this function by following: >>> 1. ensure reboot_timeout is in 0~0x >>> 2. use qemu_opt_get_number() to parse reboot_timeout >>> 3. simlify code >>> >>> Signed-off-by: Li Qiang >>> --- >>> hw/nvram/fw_cfg.c | 23 +++ >>> vl.c | 2 +- >>> 2 files changed, 12 insertions(+), 13 deletions(-) >>> >>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c >>> index 78f43dad93..6aca80846a 100644 >>> --- a/hw/nvram/fw_cfg.c >>> +++ b/hw/nvram/fw_cfg.c >>> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s) >>> >>> static void fw_cfg_reboot(FWCfgState *s) >>> { >>> -int reboot_timeout = -1; >>> -char *p; >>> -const char *temp; >>> +const char *reboot_timeout = NULL; >>> >>> /* get user configuration */ >>> QemuOptsList *plist = qemu_find_opts("boot-opts"); >>> QemuOpts *opts = QTAILQ_FIRST(>head); >>> -if (opts != NULL) { >>> -temp = qemu_opt_get(opts, "reboot-timeout"); >>> -if (temp != NULL) { >>> -p = (char *)temp; >>> -reboot_timeout = strtol(p, , 10); >> >>... the total lack of error checking here. Same in PATCH 1. > >> > > > Got. > > >>Here's my attempt at a clearer commit message: >> >>fw_cfg: Fix -boot reboot-timeout error checking >> >>fw_cfg_reboot() gets option parameter "reboot-timeout" with >>qemu_opt_get(), then converts it to an integer by hand. It neglects >>to check that conversion for errors, and fails to reject negative >>values. Positive values above the limit get reported and replaced >>by the limit. >> >>Check for conversion errors properly, and reject all values outside >>0..0x. > >> > > > Thanks for your advice, I appreciate it and will change in the revision > version. > > >>PATCH 1's commit message could be improved the same way. >> >>> -} >>> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout"); >>> + >>> +if (reboot_timeout == NULL) { >>> +return; >>> } >>> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1); >>> + >>> /* validate the input */ >>> -if (reboot_timeout > 0x) { >>> -error_report("reboot timeout is larger than 65535, force it to >>> 65535."); >>> -reboot_timeout = 0x; >>> +if (rt_val < 0 || rt_val > 0x) { >>> +error_report("reboot timeout is invalid," >>> + "it should be a value between 0 and 65535"); >>> +exit(1); >>> } >>> fw_cfg_add_file(s, "etc/boot-fail-wait", g_memdup(_timeout, 4), >>> 4); >>> } >> >>Change in behavior when "reboot-timeout" isn't specified. >> >>Before your patch, we fw_cfg_add_file() with a value of -1. >> >>After your patch, we don't fw_cfg_add_file(). >> >>Why is that okay? > >> > > > Here I following Gerd's advice. > For values >0x or < 0, report and exit. > -->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html Cases: 0. "reboot-timeout" not specified (e.g. no -boot option given) 1. "reboot-timeout" specified, value out of bounds 1.a. < 0 1.b. > 0x 2. "reboot-timeout" specified, value okay Gerd's advice is about case 1. Your patch implements it. My question is about case 0. Do you understand my question now? [...]
Re: [Qemu-devel] [PULL 0/8] Monitor patches for 2018-10-30
On Mon, Nov 19, 2018 at 02:17:27PM +0800, Peter Xu wrote: > On Thu, Nov 08, 2018 at 10:44:06AM +0800, Peter Xu wrote: > > On Wed, Nov 07, 2018 at 11:21:32AM +, Peter Maydell wrote: > > > On 7 November 2018 at 02:56, Peter Xu wrote: > > > > Strange, "make check -j8" failed on my hosts (I tried two) with either > > > > Markus's pull tree or qemu master: > > > > > > > > hw/core/ptimer.o: In function `timer_new_tl': > > > > /home/xz/git/qemu/include/qemu/timer.h:536: undefined reference to > > > > `timer_init_tl' > > > > collect2: error: ld returned 1 exit status > > > > make: *** [/home/xz/git/qemu/rules.mak:124: tests/ptimer-test] Error 1 > > > > > > > > Is that only happening on my hosts? > > > > > > Commit 89a603a0c80ae3 changed things so that there is no > > > timer_new_tl() or timer_init_tl() any more, so if you have > > > an object file that's referring to it then it's probably > > > stale. Try a make clean. > > > > Yeh it worked for me, thanks Peter. > > > > Though after running a few more rounds of "configure --enable-debug && > > make check -j8" I still cannot see anything wrong with Markus's tree. > > I'll see whether there's any news from Markus and then I'll consider > > whether I should install a FreeBSD. > > I reproduced the error with a FreeBSD guest and this change (which > possibly can be squashed into "tests: qmp-test: add queue full test") > worked for me: > > diff --git a/tests/qmp-test.c b/tests/qmp-test.c > index 6989acbca4..83f353db4f 100644 > --- a/tests/qmp-test.c > +++ b/tests/qmp-test.c > @@ -281,8 +281,15 @@ static void test_qmp_oob(void) > * will only be able to be handled after the queue is shrinked, so > * it'll be processed only after one existing in-band command > * finishes. > + * > + * NOTE: we need to feed the queue with one extra request to make > + * sure it'll be stuck since when we have sent the Nth request > + * it's possible that we have already popped and processing the > + * 1st request so the Nth request (which could potentially be the > + * [N-1]th element on the queue) might not trigger the > + * monitor-full condition deterministically. > */ > -for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX; i++) { > +for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) { > id = g_strdup_printf("queue-blocks-%d", i); > send_cmd_that_blocks(qts, id); > g_free(id); > @@ -291,7 +298,7 @@ static void test_qmp_oob(void) > unblock_blocked_cmd(); > recv_cmd_id(qts, "queue-blocks-1"); > recv_cmd_id(qts, "oob-1"); > -for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX; i++) { > +for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) { > unblock_blocked_cmd(); > id = g_strdup_printf("queue-blocks-%d", i); > recv_cmd_id(qts, id); > > So the problem here is that the queue-block-N command might not really > suspend the monitor everytime if we already popped the 1st request, > which will let the N-th request to be (N-1)th, then the parser will > continue to eat the oob command and it could "preempt" the previous > commands. > > Maybe FreeBSD is scheduling the threads in some pattern so it happens > only on FreeBSD and very constantly, but anyway it should be a general > fix to the test program. Markus, do you want me to repost a new version with this change? Is it still possible to have the oob-default series for 3.1? -- Peter Xu
Re: [Qemu-devel] [PATCH v4 10/23] json: Define new QMP message for pvrdma
The subsystem tag in the commit message's title should be "qapi: " not "json: ". Yuval Shaia writes: > pvrdma requires that the same GID attached to it will be attached to the > backend device in the host. > > A new QMP messages is defined so pvrdma device can broadcast any change > made to its GID table. This event is captured by libvirt which in turn > will update the GID table in the backend device. > > Signed-off-by: Yuval Shaia > Reviewed-by: Marcel Apfelbaum > --- > MAINTAINERS | 1 + > Makefile | 3 ++- > Makefile.objs | 4 > qapi/qapi-schema.json | 1 + > qapi/rdma.json| 38 ++ > 5 files changed, 46 insertions(+), 1 deletion(-) > create mode 100644 qapi/rdma.json > > diff --git a/MAINTAINERS b/MAINTAINERS > index e087d58ac6..a149f68a8f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -2232,6 +2232,7 @@ F: hw/rdma/* > F: hw/rdma/vmw/* > F: docs/pvrdma.txt > F: contrib/rdmacm-mux/* > +F: qapi/rdma.json > > Build and test automation > - > diff --git a/Makefile b/Makefile > index 94072776ff..db4ce60ee5 100644 > --- a/Makefile > +++ b/Makefile > @@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json > $(SRC_PATH)/qapi/common.json \ > $(SRC_PATH)/qapi/tpm.json \ > $(SRC_PATH)/qapi/trace.json \ > $(SRC_PATH)/qapi/transaction.json \ > - $(SRC_PATH)/qapi/ui.json > + $(SRC_PATH)/qapi/ui.json \ > + $(SRC_PATH)/qapi/rdma.json Please insert between net.json and rocker.json to maintain alphabetical order. > qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \ > qapi/qapi-types.c qapi/qapi-types.h \ > diff --git a/Makefile.objs b/Makefile.objs > index cc7df3ad80..76d8028f2f 100644 > --- a/Makefile.objs > +++ b/Makefile.objs > @@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o > util-obj-y += qapi/qapi-types-trace.o > util-obj-y += qapi/qapi-types-transaction.o > util-obj-y += qapi/qapi-types-ui.o > +util-obj-y += qapi/qapi-types-rdma.o > util-obj-y += qapi/qapi-builtin-visit.o > util-obj-y += qapi/qapi-visit.o > util-obj-y += qapi/qapi-visit-block-core.o > @@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o > util-obj-y += qapi/qapi-visit-trace.o > util-obj-y += qapi/qapi-visit-transaction.o > util-obj-y += qapi/qapi-visit-ui.o > +util-obj-y += qapi/qapi-visit-rdma.o > util-obj-y += qapi/qapi-events.o > util-obj-y += qapi/qapi-events-block-core.o > util-obj-y += qapi/qapi-events-block.o > @@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o > util-obj-y += qapi/qapi-events-trace.o > util-obj-y += qapi/qapi-events-transaction.o > util-obj-y += qapi/qapi-events-ui.o > +util-obj-y += qapi/qapi-events-rdma.o > util-obj-y += qapi/qapi-introspect.o > > chardev-obj-y = chardev/ > @@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o > common-obj-y += qapi/qapi-commands-trace.o > common-obj-y += qapi/qapi-commands-transaction.o > common-obj-y += qapi/qapi-commands-ui.o > +common-obj-y += qapi/qapi-commands-rdma.o > common-obj-y += qapi/qapi-introspect.o > common-obj-y += qmp.o hmp.o > endif This is incomplete, and it conflicts with Eric Blake's "[PATCH v3] qapi: Reduce Makefile boilerplate". I recommend to base on Eric's patch, because that'll let you add the new rdma.json much more easily. > diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json > index 65b6dc2f6f..a650d80f83 100644 > --- a/qapi/qapi-schema.json > +++ b/qapi/qapi-schema.json > @@ -94,3 +94,4 @@ > { 'include': 'trace.json' } > { 'include': 'introspect.json' } > { 'include': 'misc.json' } > +{ 'include': 'rdma.json' } This makes the "RDMA device" section appear last in the QEMU QMP reference manual, since they appear in include order there. $ grep -h @section bld-x86/qapi/qapi-doc.texi @section Introduction @section Stability Considerations @section Common data types @section Socket data types @section VM run state @section Cryptography @section Block devices @section Character devices @section Net devices @section Rocker switch device @section TPM (trusted platform module) devices @section Remote desktop @section Input @section Migration @section Transactions @section Tracing @section QMP introspection @section Miscellanea @section RDMA device It should go next to the other "device" sections, shouldn't it? > diff --git a/qapi/rdma.json b/qapi/rdma.json > new file mode 100644 > index 00..804c68ab36 > --- /dev/null > +++ b/qapi/rdma.json > @@ -0,0 +1,38 @@ > +# -*- Mode: Python -*- > +# > + > +## > +# = RDMA device > +## > + > +## > +# @RDMA_GID_STATUS_CHANGED: > +# > +# Emitted when guest driver adds/deletes GID to/from device > +# > +# @netdev: RoCE Network Device name - char * > +# > +# @gid-status: Add or delete indication - bool > +# > +# @subnet-prefix: Subnet Prefix - uint64 >
Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()
Markus Armbruster 于2018年11月19日周一 下午3:01写道: > ÀîÇ¿ writes: > > > At 2018-11-17 00:52:58, "Markus Armbruster" wrote: > >>Li Qiang writes: > >> > >>> Currently the user can set a negative reboot_timeout. > >>> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then > >>> convert it to number. > >> > >>Again, it's not wrong per se, just needlessly complicated and > >>error-prone. What makes it wrong is ... > >> > >>> convert it to number. This patch refactor this function by following: > >>> 1. ensure reboot_timeout is in 0~0x > >>> 2. use qemu_opt_get_number() to parse reboot_timeout > >>> 3. simlify code > >>> > >>> Signed-off-by: Li Qiang > >>> --- > >>> hw/nvram/fw_cfg.c | 23 +++ > >>> vl.c | 2 +- > >>> 2 files changed, 12 insertions(+), 13 deletions(-) > >>> > >>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c > >>> index 78f43dad93..6aca80846a 100644 > >>> --- a/hw/nvram/fw_cfg.c > >>> +++ b/hw/nvram/fw_cfg.c > >>> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s) > >>> > >>> static void fw_cfg_reboot(FWCfgState *s) > >>> { > >>> -int reboot_timeout = -1; > >>> -char *p; > >>> -const char *temp; > >>> +const char *reboot_timeout = NULL; > >>> > >>> /* get user configuration */ > >>> QemuOptsList *plist = qemu_find_opts("boot-opts"); > >>> QemuOpts *opts = QTAILQ_FIRST(>head); > >>> -if (opts != NULL) { > >>> -temp = qemu_opt_get(opts, "reboot-timeout"); > >>> -if (temp != NULL) { > >>> -p = (char *)temp; > >>> -reboot_timeout = strtol(p, , 10); > >> > >>... the total lack of error checking here. Same in PATCH 1. > > > >> > > > > > > Got. > > > > > >>Here's my attempt at a clearer commit message: > >> > >>fw_cfg: Fix -boot reboot-timeout error checking > >> > >>fw_cfg_reboot() gets option parameter "reboot-timeout" with > >>qemu_opt_get(), then converts it to an integer by hand. It neglects > >>to check that conversion for errors, and fails to reject negative > >>values. Positive values above the limit get reported and replaced > >>by the limit. > >> > >>Check for conversion errors properly, and reject all values outside > >>0..0x. > > > >> > > > > > > Thanks for your advice, I appreciate it and will change in the revision > version. > > > > > >>PATCH 1's commit message could be improved the same way. > >> > >>> -} > >>> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout"); > >>> + > >>> +if (reboot_timeout == NULL) { > >>> +return; > >>> } > >>> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1); > >>> + > >>> /* validate the input */ > >>> -if (reboot_timeout > 0x) { > >>> -error_report("reboot timeout is larger than 65535, force it > to 65535."); > >>> -reboot_timeout = 0x; > >>> +if (rt_val < 0 || rt_val > 0x) { > >>> +error_report("reboot timeout is invalid," > >>> + "it should be a value between 0 and 65535"); > >>> +exit(1); > >>> } > >>> fw_cfg_add_file(s, "etc/boot-fail-wait", > g_memdup(_timeout, 4), 4); > >>> } > >> > >>Change in behavior when "reboot-timeout" isn't specified. > >> > >>Before your patch, we fw_cfg_add_file() with a value of -1. > >> > >>After your patch, we don't fw_cfg_add_file(). > >> > >>Why is that okay? > > > >> > > > > > > Here I following Gerd's advice. > > For values >0x or < 0, report and exit. > > -->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html > > Cases: > > 0. "reboot-timeout" not specified (e.g. no -boot option given) > > 1. "reboot-timeout" specified, value out of bounds > 1.a. < 0 > 1.b. > 0x > > 2. "reboot-timeout" specified, value okay > > Gerd's advice is about case 1. Your patch implements it. > > My question is about case 0. > > Do you understand my question now? > OK got. Once I think the 'reboot_timeout' can't be -1(as the user can't set this), but seems it's ok to be -1(the default value as no 'reboot-timeout' specified). I will prepare another patchset later. Thanks, Li Qiang > > [...] >
Re: [Qemu-devel] SeaBIOS booting time optimization
Hi Stefan, On Fri, Nov 16, 2018 at 04:39:20PM +, Stefan Hajnoczi wrote: > On Fri, Nov 16, 2018 at 4:21 PM Stefano Garzarella > wrote: > > I'm investigating the SeaBIOS booting time, to understand if we can reduce > > the boot time in some cases (e.g. legacy hardware is not needed). I think > > this > > can be interesting also for NEMU developers. > > Questions for NEMU folks: > > qboot isn't part of the virt-x86 NEMU tree and is only mentioned in > one or two places. Is NEMU focussing exclusively on OVMF guest > firmware instead of qboot? Initally we were only focusing on OVMF, but we added support for qboot [1] as we were not happy with the OVMF boot latencies we measured, especially in the Kata containers context. Rob was about to start looking at seabios support as well. We are going to upstream a cleaner version of the virt support to qboot very soon now. > Are there other boot time optimizations in NEMU that are a candidate > for upstreaming? We did not add anything specific, but when comparing PC+qboot with virt+qboot, we're seeing interesting overall boot time improvements. We assume the simpler virt machine type explains the difference. Cheers, Samuel. [1] https://github.com/rbradford/qboot/commit/dbcaaa1f6921064cd46f330a3625eb01e44aa4df
Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()
At 2018-11-17 00:52:58, "Markus Armbruster" wrote: >Li Qiang writes: > >> Currently the user can set a negative reboot_timeout. >> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then >> convert it to number. > >Again, it's not wrong per se, just needlessly complicated and >error-prone. What makes it wrong is ... > >> convert it to number. This patch refactor this function by following: >> 1. ensure reboot_timeout is in 0~0x >> 2. use qemu_opt_get_number() to parse reboot_timeout >> 3. simlify code >> >> Signed-off-by: Li Qiang >> --- >> hw/nvram/fw_cfg.c | 23 +++ >> vl.c | 2 +- >> 2 files changed, 12 insertions(+), 13 deletions(-) >> >> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c >> index 78f43dad93..6aca80846a 100644 >> --- a/hw/nvram/fw_cfg.c >> +++ b/hw/nvram/fw_cfg.c >> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s) >> >> static void fw_cfg_reboot(FWCfgState *s) >> { >> -int reboot_timeout = -1; >> -char *p; >> -const char *temp; >> +const char *reboot_timeout = NULL; >> >> /* get user configuration */ >> QemuOptsList *plist = qemu_find_opts("boot-opts"); >> QemuOpts *opts = QTAILQ_FIRST(>head); >> -if (opts != NULL) { >> -temp = qemu_opt_get(opts, "reboot-timeout"); >> -if (temp != NULL) { >> -p = (char *)temp; >> -reboot_timeout = strtol(p, , 10); > >... the total lack of error checking here. Same in PATCH 1. > Got. >Here's my attempt at a clearer commit message: > >fw_cfg: Fix -boot reboot-timeout error checking > >fw_cfg_reboot() gets option parameter "reboot-timeout" with >qemu_opt_get(), then converts it to an integer by hand. It neglects >to check that conversion for errors, and fails to reject negative >values. Positive values above the limit get reported and replaced >by the limit. > >Check for conversion errors properly, and reject all values outside >0..0x. > Thanks for your advice, I appreciate it and will change in the revision version. >PATCH 1's commit message could be improved the same way. > >> -} >> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout"); >> + >> +if (reboot_timeout == NULL) { >> +return; >> } >> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1); >> + >> /* validate the input */ >> -if (reboot_timeout > 0x) { >> -error_report("reboot timeout is larger than 65535, force it to >> 65535."); >> -reboot_timeout = 0x; >> +if (rt_val < 0 || rt_val > 0x) { >> +error_report("reboot timeout is invalid," >> + "it should be a value between 0 and 65535"); >> +exit(1); >> } >> fw_cfg_add_file(s, "etc/boot-fail-wait", g_memdup(_timeout, 4), >> 4); >> } > >Change in behavior when "reboot-timeout" isn't specified. > >Before your patch, we fw_cfg_add_file() with a value of -1. > >After your patch, we don't fw_cfg_add_file(). > >Why is that okay? > Here I following Gerd's advice. For values >0x or < 0, report and exit. -->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html Thanks, Li Qiang >> diff --git a/vl.c b/vl.c >> index be37da46f0..086127ff0b 100644 >> --- a/vl.c >> +++ b/vl.c >> @@ -339,7 +339,7 @@ static QemuOptsList qemu_boot_opts = { >> .type = QEMU_OPT_NUMBER, >> }, { >> .name = "reboot-timeout", >> -.type = QEMU_OPT_STRING, >> +.type = QEMU_OPT_NUMBER, >> }, { >> .name = "strict", >> .type = QEMU_OPT_BOOL,