Re: [Qemu-devel] [PATCH v3 11/23] hw/pvrdma: Add support to allow guest to configure GID table

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 02:48:34PM +0200, Marcel Apfelbaum wrote:
> 
> 
> On 11/13/18 9:13 AM, Yuval Shaia wrote:
> > The control over the RDMA device's GID table is done by updating the
> > device's Ethernet function addresses.
> > Usually the first GID entry is determine by the MAC address, the second
> 
> s/determine/determined
> 
> > by the first IPv6 address and the third by the IPv4 address. Other
> > entries can be added by adding more IP addresses. The opposite is the
> > same, i.e. whenever an address is removed, the corresponding GID entry
> > is removed.
> > 
> > The process is done by the network and RDMA stacks. Whenever an address
> > is added the ib_core driver is notified and calls the device driver
> > add_gid function which in turn update the device.
> > 
> > To support this in pvrdma device we need to hook into the create_bind
> > and destroy_bind HW commands triggered by pvrdma driver in guest.
> > Whenever a changed is made to the pvrdma device's GID table a special
> without 'a'
> 
> > QMP messages is sent to be processed by libvirt to update the address of
> > the backend Ethernet device.
> 
> So the device can't be used without libvirt? How can we
> use it anyway only with QEMU ?

We can't.

(actually we can but this involve a hack since we have to know in advance
the IP addresses that will be assigned to the net device in the guest)

> 
> > 
> > Signed-off-by: Yuval Shaia 
> > ---
> >   hw/rdma/rdma_backend.c  | 243 +++-
> 
> rdma_backend.c is gettting larger...

1261 lines only, let's revisit it on next major change.

> 
> >   hw/rdma/rdma_backend.h  |  22 ++--
> >   hw/rdma/rdma_backend_defs.h |   3 +-
> >   hw/rdma/rdma_rm.c   | 104 ++-
> >   hw/rdma/rdma_rm.h   |  17 ++-
> >   hw/rdma/rdma_rm_defs.h  |   9 +-
> >   hw/rdma/rdma_utils.h|  15 +++
> >   hw/rdma/vmw/pvrdma.h|   2 +-
> >   hw/rdma/vmw/pvrdma_cmd.c|  55 
> >   hw/rdma/vmw/pvrdma_main.c   |  25 +---
> >   hw/rdma/vmw/pvrdma_qp_ops.c |  20 +++
> >   11 files changed, 370 insertions(+), 145 deletions(-)
> > 
> > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> > index 3eb0099f8d..5675504165 100644
> > --- a/hw/rdma/rdma_backend.c
> > +++ b/hw/rdma/rdma_backend.c
> > @@ -18,12 +18,14 @@
> >   #include "qapi/error.h"
> >   #include "qapi/qmp/qlist.h"
> >   #include "qapi/qmp/qnum.h"
> > +#include "qapi/qapi-events-rdma.h"
> >   #include 
> >   #include 
> >   #include 
> >   #include 
> > +#include "contrib/rdmacm-mux/rdmacm-mux.h"
> >   #include "trace.h"
> >   #include "rdma_utils.h"
> >   #include "rdma_rm.h"
> > @@ -300,11 +302,11 @@ static int build_host_sge_array(RdmaDeviceResources 
> > *rdma_dev_res,
> >   return 0;
> >   }
> > -static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
> > -uint32_t num_sge)
> > +static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
> > +union ibv_gid *sgid, struct ibv_sge *sge, uint32_t 
> > num_sge)
> >   {
> > -struct backend_umad umad = {0};
> > -char *hdr, *msg;
> > +RdmaCmMuxMsg msg = {0};
> > +char *hdr, *data;
> >   int ret;
> >   pr_dbg("num_sge=%d\n", num_sge);
> > @@ -313,41 +315,50 @@ static int mad_send(RdmaBackendDev *backend_dev, 
> > struct ibv_sge *sge,
> >   return -EINVAL;
> >   }
> > -umad.hdr.length = sge[0].length + sge[1].length;
> > -pr_dbg("msg_len=%d\n", umad.hdr.length);
> > +msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD;
> > +memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
> > -if (umad.hdr.length > sizeof(umad.mad)) {
> > +msg.umad_len = sge[0].length + sge[1].length;
> > +pr_dbg("umad_len=%d\n", msg.umad_len);
> > +
> > +if (msg.umad_len > sizeof(msg.umad.mad)) {
> >   return -ENOMEM;
> >   }
> > -umad.hdr.addr.qpn = htobe32(1);
> > -umad.hdr.addr.grh_present = 1;
> > -umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
> > -memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, 
> > sizeof(umad.hdr.addr.gid));
> > -umad.hdr.addr.hop_limit = 1;
> > +msg.umad.hdr.addr.qpn = htobe32(1);
> > +msg.umad.hdr.addr.grh_present = 1;
> > +pr_dbg("sgid_idx=%d\n", sgid_idx);
> > +pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
> > +msg.umad.hdr.addr.gid_index = sgid_idx;
> > +memcpy(msg.umad.hdr.addr.gid, sgid->raw, 
> > sizeof(msg.umad.hdr.addr.gid));
> > +msg.umad.hdr.addr.hop_limit = 1;
> >   hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
> > -msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
> > +data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
> > +
> > +pr_dbg_buf("mad_hdr", hdr, sge[0].length);
> > +pr_dbg_buf("mad_data", data, sge[1].length);
> > -memcpy([0], hdr, sge[0].length);
> > -memcpy([sge[0].length], msg, sge[1].length);
> > +memcpy([0], 

Re: [Qemu-devel] [PATCH v2 13/22] hw/pvrdma: Make sure PCI function 0 is vmxnet3

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 01:41:41PM +0200, Marcel Apfelbaum wrote:
> 
> 
> On 11/11/18 9:45 AM, Yuval Shaia wrote:
> > On Sat, Nov 10, 2018 at 08:27:44PM +0200, Marcel Apfelbaum wrote:
> > > 
> > > On 11/8/18 6:08 PM, Yuval Shaia wrote:
> > > > Guest driver enforces it, we should also.
> > > > 
> > > > Signed-off-by: Yuval Shaia 
> > > > ---
> > > >hw/rdma/vmw/pvrdma.h  | 2 ++
> > > >hw/rdma/vmw/pvrdma_main.c | 3 +++
> > > >2 files changed, 5 insertions(+)
> > > > 
> > > > diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
> > > > index b019cb843a..10a3c4fb7c 100644
> > > > --- a/hw/rdma/vmw/pvrdma.h
> > > > +++ b/hw/rdma/vmw/pvrdma.h
> > > > @@ -20,6 +20,7 @@
> > > >#include "hw/pci/pci.h"
> > > >#include "hw/pci/msix.h"
> > > >#include "chardev/char-fe.h"
> > > > +#include "hw/net/vmxnet3_defs.h"
> > > >#include "../rdma_backend_defs.h"
> > > >#include "../rdma_rm_defs.h"
> > > > @@ -85,6 +86,7 @@ typedef struct PVRDMADev {
> > > >RdmaBackendDev backend_dev;
> > > >RdmaDeviceResources rdma_dev_res;
> > > >CharBackend mad_chr;
> > > > +VMXNET3State *func0;
> > > >} PVRDMADev;
> > > >#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), 
> > > > PVRDMA_HW_NAME)
> > > > diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
> > > > index ac8c092db0..fa6468d221 100644
> > > > --- a/hw/rdma/vmw/pvrdma_main.c
> > > > +++ b/hw/rdma/vmw/pvrdma_main.c
> > > > @@ -576,6 +576,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error 
> > > > **errp)
> > > >return;
> > > >}
> > > > +/* Break if not vmxnet3 device in slot 0 */
> > > > +dev->func0 = VMXNET3(pci_get_function_0(pdev));
> > > > +
> > > I don't see the error code flow in case VMXNET3 is not func 0.
> > > Am I missing something?
> > Yes, this is a dynamic cast that will break the process when fail to cast.
> > 
> > This is the error message that you will get in case that device on function
> > 0 is not vmxnet3:
> > 
> > pvrdma_main.c:589:pvrdma_realize: Object 0x557b959841a0 is not an instance 
> > of type vmxnet3
> 
> I am not sure we will see this error if QEMU is compiled in Release mode.
> I think object_dynamic_cast_assert throws this error only if
> CONFIG_QOM_CAST_DEBUG
> is set, and is possible the mentioned flag is not set in Release.
> 
> Thanks,
> Marcel

Done.
Thanks.

> 
> > 
> > > 
> > > Thanks,
> > > Marcel
> > > 
> > > >memdev_root = object_resolve_path("/objects", NULL);
> > > >if (memdev_root) {
> > > >object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
> > > > _shared);
> 



Re: [Qemu-devel] [PATCH v3 05/23] hw/rdma: Add support for MAD packets

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 02:06:48PM +0200, Marcel Apfelbaum wrote:
> Hi Yuval,
> 
> On 11/13/18 9:12 AM, Yuval Shaia wrote:
> > MAD (Management Datagram) packets are widely used by various modules
> 
> Please add a link to Spec, I sent it in the V1 mail-thread
> Please add it also as a comment in the code. I know MADs
> are a complicated matter, but if somebody wants to have a look...

Added in v4, thanks.

> 
> > both in kernel and in user space for example the rdma_* API which is
> > used to create and maintain "connection" layer on top of RDMA uses
> > several types of MAD packets.
> > To support MAD packets the device uses an external utility
> > (contrib/rdmacm-mux) to relay packets from and to the guest driver.
> > 
> > Signed-off-by: Yuval Shaia 
> > ---
> >   hw/rdma/rdma_backend.c  | 263 +++-
> >   hw/rdma/rdma_backend.h  |   4 +-
> >   hw/rdma/rdma_backend_defs.h |  10 +-
> >   hw/rdma/vmw/pvrdma.h|   2 +
> >   hw/rdma/vmw/pvrdma_main.c   |   4 +-
> >   5 files changed, 273 insertions(+), 10 deletions(-)
> > 
> > diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> > index 1e148398a2..3eb0099f8d 100644
> > --- a/hw/rdma/rdma_backend.c
> > +++ b/hw/rdma/rdma_backend.c
> 
> rdma_backend is getting huge, have you consider taking out
> mad related code?

1261 lines only, is that so big? comparing to other files under hw/ i don't
see much difference here.

> > @@ -16,8 +16,13 @@
> >   #include "qemu/osdep.h"
> >   #include "qemu/error-report.h"
> >   #include "qapi/error.h"
> > +#include "qapi/qmp/qlist.h"
> > +#include "qapi/qmp/qnum.h"
> >   #include 
> > +#include 
> > +#include 
> > +#include 
> >   #include "trace.h"
> >   #include "rdma_utils.h"
> > @@ -33,16 +38,25 @@
> >   #define VENDOR_ERR_MAD_SEND 0x206
> >   #define VENDOR_ERR_INVLKEY  0x207
> >   #define VENDOR_ERR_MR_SMALL 0x208
> > +#define VENDOR_ERR_INV_MAD_BUFF 0x209
> > +#define VENDOR_ERR_INV_NUM_SGE  0x210
> >   #define THR_NAME_LEN 16
> >   #define THR_POLL_TO  5000
> > +#define MAD_HDR_SIZE sizeof(struct ibv_grh)
> > +
> >   typedef struct BackendCtx {
> > -uint64_t req_id;
> >   void *up_ctx;
> >   bool is_tx_req;
> > +struct ibv_sge sge; /* Used to save MAD recv buffer */
> >   } BackendCtx;
> > +struct backend_umad {
> > +struct ib_user_mad hdr;
> > +char mad[RDMA_MAX_PRIVATE_DATA];
> > +};
> > +
> >   static void (*comp_handler)(int status, unsigned int vendor_err, void 
> > *ctx);
> >   static void dummy_comp_handler(int status, unsigned int vendor_err, void 
> > *ctx)
> > @@ -286,6 +300,49 @@ static int build_host_sge_array(RdmaDeviceResources 
> > *rdma_dev_res,
> >   return 0;
> >   }
> > +static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
> > +uint32_t num_sge)
> > +{
> > +struct backend_umad umad = {0};
> > +char *hdr, *msg;
> > +int ret;
> > +
> > +pr_dbg("num_sge=%d\n", num_sge);
> > +
> > +if (num_sge != 2) {
> > +return -EINVAL;
> > +}
> > +
> > +umad.hdr.length = sge[0].length + sge[1].length;
> > +pr_dbg("msg_len=%d\n", umad.hdr.length);
> > +
> > +if (umad.hdr.length > sizeof(umad.mad)) {
> > +return -ENOMEM;
> > +}
> > +
> > +umad.hdr.addr.qpn = htobe32(1);
> > +umad.hdr.addr.grh_present = 1;
> > +umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
> > +memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, 
> > sizeof(umad.hdr.addr.gid));
> > +umad.hdr.addr.hop_limit = 1;
> > +
> > +hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
> > +msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
> > +
> 
> If rdma_pci_dma_map fails it will return NULL 

Done.

> 
> > +memcpy([0], hdr, sge[0].length);
> > +memcpy([sge[0].length], msg, sge[1].length);
> > +
> 
> ... and here we access a NULL pointer.
> Maybe is possible to return some error here.
> > +rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
> > +rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
> > +
> > +ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t 
> > *),
> > +sizeof(umad));
> > +
> > +pr_dbg("qemu_chr_fe_write=%d\n", ret);
> > +
> > +return (ret != sizeof(umad));
> > +}
> > +
> >   void rdma_backend_post_send(RdmaBackendDev *backend_dev,
> >   RdmaBackendQP *qp, uint8_t qp_type,
> >   struct ibv_sge *sge, uint32_t num_sge,
> > @@ -304,9 +361,13 @@ void rdma_backend_post_send(RdmaBackendDev 
> > *backend_dev,
> >   comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
> >   } else if (qp_type == IBV_QPT_GSI) {
> >   pr_dbg("QP1\n");
> > -comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
> > +rc = mad_send(backend_dev, sge, num_sge);
> > +if (rc) {
> > + 

Re: [Qemu-devel] [PATCH 2/2] scsi-disk: fix rerror/werror=ignore

2018-11-18 Thread Richard W.M. Jones
On Sat, Oct 13, 2018 at 11:53:23AM +0200, Paolo Bonzini wrote:
> rerror=ignore was returning true from scsi_handle_rw_error but the callers 
> were not
> calling scsi_req_complete when rerror=ignore returns true (this is the 
> correct thing
> to do when true is returned after executing a passthrough command).  Fix this 
> by
> calling it in scsi_handle_rw_error.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  hw/scsi/scsi-disk.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 4074d7c..e2c5408 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -473,10 +473,15 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int 
> error, bool acct_failed)
>  }
>  
>  blk_error_action(s->qdev.conf.blk, action, is_read, error);
> +if (action == BLOCK_ERROR_ACTION_IGNORE) {
> +scsi_req_complete(>req, 0);
> +return true;
> +}
> +
>  if (action == BLOCK_ERROR_ACTION_STOP) {
>  scsi_req_retry(>req);
>  }
> -return action != BLOCK_ERROR_ACTION_IGNORE;
> +return false;
>  }

This commit seems to cause an assertion failure in qemu which is
trivial to reproduce:

(1) Create an NBD disk which returns EIO for every request:

  $ nbdkit -f -v --filter=error memory size=64M error-rate=100%

(2) Attach the disk to qemu as a virtio-scsi device:

  $ x86_64-softmmu/qemu-system-x86_64 -device virtio-scsi,id=scsi -drive 
file=nbd:localhost:10809,format=raw,id=hd0,if=none -device scsi-hd,drive=hd0
  qemu-system-x86_64: hw/scsi/scsi-bus.c:1374: scsi_req_complete: Assertion 
`req->status == -1' failed.

More details including a stack trace here:

  https://bugzilla.redhat.com/show_bug.cgi?id=1650975

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-builder quickly builds VMs from scratch
http://libguestfs.org/virt-builder.1.html



[Qemu-devel] [PATCH v4 16/23] hw/pvrdma: Fill all CQE fields

2018-11-18 Thread Yuval Shaia
Add ability to pass specific WC attributes to CQE such as GRH_BIT flag.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_backend.c  | 59 +++--
 hw/rdma/rdma_backend.h  |  4 +--
 hw/rdma/vmw/pvrdma_qp_ops.c | 31 +++
 3 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index db78009bbb..b1259012e5 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -59,13 +59,24 @@ struct backend_umad {
 char mad[RDMA_MAX_PRIVATE_DATA];
 };
 
-static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
+static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
 
-static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
+static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
 {
 pr_err("No completion handler is registered\n");
 }
 
+static inline void complete_work(enum ibv_wc_status status, uint32_t 
vendor_err,
+ void *ctx)
+{
+struct ibv_wc wc = {0};
+
+wc.status = status;
+wc.vendor_err = vendor_err;
+
+comp_handler(ctx, );
+}
+
 static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
 {
 int i, ne;
@@ -90,7 +101,7 @@ static void poll_cq(RdmaDeviceResources *rdma_dev_res, 
struct ibv_cq *ibcq)
 }
 pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");
 
-comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx);
+comp_handler(bctx->up_ctx, [i]);
 
 rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
 g_free(bctx);
@@ -184,8 +195,8 @@ static void start_comp_thread(RdmaBackendDev *backend_dev)
comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
 }
 
-void rdma_backend_register_comp_handler(void (*handler)(int status,
-unsigned int vendor_err, void *ctx))
+void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
+ struct ibv_wc *wc))
 {
 comp_handler = handler;
 }
@@ -412,14 +423,14 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
 if (qp_type == IBV_QPT_SMI) {
 pr_dbg("QP0 unsupported\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
 rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
 if (rc) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
 } else {
-comp_handler(IBV_WC_SUCCESS, 0, ctx);
+complete_work(IBV_WC_SUCCESS, 0, ctx);
 }
 }
 return;
@@ -428,7 +439,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 pr_dbg("num_sge=%d\n", num_sge);
 if (!num_sge) {
 pr_dbg("num_sge=0\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
 return;
 }
 
@@ -439,21 +450,21 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx);
 if (unlikely(rc)) {
 pr_dbg("Failed to allocate cqe_ctx\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
 goto out_free_bctx;
 }
 
 rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, 
num_sge);
 if (rc) {
 pr_dbg("Error: Failed to build host SGE array\n");
-comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
+complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
 if (!wr.wr.ud.ah) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 wr.wr.ud.remote_qpn = dqpn;
@@ -471,7 +482,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (rc) {
 pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
 qp->ibqp->qp_num);
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
@@ -540,13 +551,13 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get 

[Qemu-devel] [PATCH v4 19/23] vl: Introduce shutdown_notifiers

2018-11-18 Thread Yuval Shaia
Notifier will be used for signaling shutdown event to inform system is
shutdown. This will allow devices and other component to run some
cleanup code needed before VM is shutdown.

Signed-off-by: Yuval Shaia 
Reviewed-by: Cornelia Huck 
---
 include/sysemu/sysemu.h |  1 +
 vl.c| 15 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 8d6095d98b..0d15f16492 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -80,6 +80,7 @@ void qemu_register_wakeup_notifier(Notifier *notifier);
 void qemu_system_shutdown_request(ShutdownCause reason);
 void qemu_system_powerdown_request(void);
 void qemu_register_powerdown_notifier(Notifier *notifier);
+void qemu_register_shutdown_notifier(Notifier *notifier);
 void qemu_system_debug_request(void);
 void qemu_system_vmstop_request(RunState reason);
 void qemu_system_vmstop_request_prepare(void);
diff --git a/vl.c b/vl.c
index 1fcacc5caa..d33d52522c 100644
--- a/vl.c
+++ b/vl.c
@@ -1578,6 +1578,8 @@ static NotifierList suspend_notifiers =
 NOTIFIER_LIST_INITIALIZER(suspend_notifiers);
 static NotifierList wakeup_notifiers =
 NOTIFIER_LIST_INITIALIZER(wakeup_notifiers);
+static NotifierList shutdown_notifiers =
+NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
 static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
 
 ShutdownCause qemu_shutdown_requested_get(void)
@@ -1809,6 +1811,12 @@ static void qemu_system_powerdown(void)
 notifier_list_notify(_notifiers, NULL);
 }
 
+static void qemu_system_shutdown(ShutdownCause cause)
+{
+qapi_event_send_shutdown(shutdown_caused_by_guest(cause));
+notifier_list_notify(_notifiers, );
+}
+
 void qemu_system_powerdown_request(void)
 {
 trace_qemu_system_powerdown_request();
@@ -1821,6 +1829,11 @@ void qemu_register_powerdown_notifier(Notifier *notifier)
 notifier_list_add(_notifiers, notifier);
 }
 
+void qemu_register_shutdown_notifier(Notifier *notifier)
+{
+notifier_list_add(_notifiers, notifier);
+}
+
 void qemu_system_debug_request(void)
 {
 debug_requested = 1;
@@ -1848,7 +1861,7 @@ static bool main_loop_should_exit(void)
 request = qemu_shutdown_requested();
 if (request) {
 qemu_kill_report();
-qapi_event_send_shutdown(shutdown_caused_by_guest(request));
+qemu_system_shutdown(request);
 if (no_shutdown) {
 vm_stop(RUN_STATE_SHUTDOWN);
 } else {
-- 
2.17.2




[Qemu-devel] [PATCH v4 02/23] hw/rdma: Add ability to force notification without re-arm

2018-11-18 Thread Yuval Shaia
Upon completion of incoming packet the device pushes CQE to driver's RX
ring and notify the driver (msix).
While for data-path incoming packets the driver needs the ability to
control whether it wished to receive interrupts or not, for control-path
packets such as incoming MAD the driver needs to be notified anyway, it
even do not need to re-arm the notification bit.

Enhance the notification field to support this.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_rm.c   | 12 ++--
 hw/rdma/rdma_rm_defs.h  |  8 +++-
 hw/rdma/vmw/pvrdma_qp_ops.c |  6 --
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 8d59a42cd1..4f10fcabcc 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -263,7 +263,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 }
 
 cq->opaque = opaque;
-cq->notify = false;
+cq->notify = CNT_CLEAR;
 
 rc = rdma_backend_create_cq(backend_dev, >backend_cq, cqe);
 if (rc) {
@@ -291,7 +291,10 @@ void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, 
uint32_t cq_handle,
 return;
 }
 
-cq->notify = notify;
+if (cq->notify != CNT_SET) {
+cq->notify = notify ? CNT_ARM : CNT_CLEAR;
+}
+
 pr_dbg("notify=%d\n", cq->notify);
 }
 
@@ -349,6 +352,11 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, 
uint32_t pd_handle,
 return -EINVAL;
 }
 
+if (qp_type == IBV_QPT_GSI) {
+scq->notify = CNT_SET;
+rcq->notify = CNT_SET;
+}
+
 qp = res_tbl_alloc(_res->qp_tbl, _qpn);
 if (!qp) {
 return -ENOMEM;
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7228151239..9b399063d3 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -49,10 +49,16 @@ typedef struct RdmaRmPD {
 uint32_t ctx_handle;
 } RdmaRmPD;
 
+typedef enum CQNotificationType {
+CNT_CLEAR,
+CNT_ARM,
+CNT_SET,
+} CQNotificationType;
+
 typedef struct RdmaRmCQ {
 RdmaBackendCQ backend_cq;
 void *opaque;
-bool notify;
+CQNotificationType notify;
 } RdmaRmCQ;
 
 /* MR (DMA region) */
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index c668afd0ed..762700a205 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -89,8 +89,10 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t 
cq_handle,
 pvrdma_ring_write_inc(>dsr_info.cq);
 
 pr_dbg("cq->notify=%d\n", cq->notify);
-if (cq->notify) {
-cq->notify = false;
+if (cq->notify != CNT_CLEAR) {
+if (cq->notify == CNT_ARM) {
+cq->notify = CNT_CLEAR;
+}
 post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
 }
 
-- 
2.17.2




Re: [Qemu-devel] [PATCH-for-3.1] ps2kbd: default to scan enabled after reset

2018-11-18 Thread Hervé Poussineau

Ping again.

v3.0 didn't contain 143c04c7e0639e53086519592ead15d2556bfbf2, so this commit 
fixes a regression.

Le 10/11/2018 à 21:53, Hervé Poussineau a écrit :

Ping.

Le 21/10/2018 à 21:07, Hervé Poussineau a écrit :

A check for scan_enabled has been added to ps2_keyboard_event in commit
143c04c7e0639e53086519592ead15d2556bfbf2 to prevent stream corruption.
This works well as long as operating system is resetting keyboard, or enabling 
it.

This fixes IBM 40p firmware, which doesn't bother sending KBD_CMD_RESET,
KBD_CMD_ENABLE or KBD_CMD_RESET_ENABLE before trying to use the keyboard.

Signed-off-by: Hervé Poussineau 
---
  hw/input/ps2.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/input/ps2.c b/hw/input/ps2.c
index fdfcadf9a1..eded4f0f8d 100644
--- a/hw/input/ps2.c
+++ b/hw/input/ps2.c
@@ -938,7 +938,7 @@ static void ps2_kbd_reset(void *opaque)
  trace_ps2_kbd_reset(opaque);
  ps2_common_reset(>common);
-    s->scan_enabled = 0;
+    s->scan_enabled = 1;
  s->translate = 0;
  s->scancode_set = 2;
  s->modifiers = 0;









[Qemu-devel] [PATCH v4 17/23] hw/pvrdma: Fill error code in command's response

2018-11-18 Thread Yuval Shaia
Driver checks error code let's set it.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_cmd.c | 67 
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 0d3c818c20..a326c5d470 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 if (rdma_backend_query_port(>backend_dev,
 (struct ibv_port_attr *))) {
-return -ENOMEM;
+resp->hdr.err = -ENOMEM;
+goto out;
 }
 
 memset(resp, 0, sizeof(*resp));
@@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->attrs.active_width = 1;
 resp->attrs.active_speed = 1;
 
-return 0;
+out:
+pr_dbg("ret=%d\n", resp->hdr.err);
+return resp->hdr.err;
 }
 
 static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->pkey = PVRDMA_PKEY;
 pr_dbg("pkey=0x%x\n", resp->pkey);
 
-return 0;
+return resp->hdr.err;
 }
 
 static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
@@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle);
 if (!cq) {
 pr_dbg("Invalid CQ handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 ring = (PvrdmaRing *)cq->opaque;
@@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
@@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle);
 if (!qp) {
 pr_dbg("Invalid QP handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle);
@@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE);
 g_free(ring);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
@@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev,
  dev->backend_eth_device_name, gid, cmd->index);
 if (rc < 0) {
-return -EINVAL;
+rsp->hdr.err = rc;
+goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -606,7 +629,8 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rc = rdma_rm_del_gid(>rdma_dev_res, >backend_dev,
@@ -617,7 +641,11 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -634,9 +662,8 @@ static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.err = rdma_rm_alloc_uc(>rdma_dev_res, cmd->pfn,
  >ctx_handle);
 
-pr_dbg("ret=%d\n", resp->hdr.err);
-
-return 0;
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_uc(PVRDMADev *dev, union 

[Qemu-devel] [Bug 1803872] [NEW] gcc 8.2 reports stringop-truncation when building qemu

2018-11-18 Thread Amir Gonnen
Public bug reported:

QEMU 3.0

block/sheepdog.c: In function 'find_vdi_name':
block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals 
destination size [-Werror=stringop-truncation]
 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
 ^~


If this is the intended behavior, please suppress the warning. For example:

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
#pragma GCC diagnostic pop

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1803872

Title:
  gcc 8.2 reports stringop-truncation when building qemu

Status in QEMU:
  New

Bug description:
  QEMU 3.0

  block/sheepdog.c: In function 'find_vdi_name':
  block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals 
destination size [-Werror=stringop-truncation]
   strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
   ^~

  
  If this is the intended behavior, please suppress the warning. For example:

  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wstringop-truncation"
  strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
  #pragma GCC diagnostic pop

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1803872/+subscriptions



Re: [Qemu-devel] [PATCH v3 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 02:25:55PM +0200, Marcel Apfelbaum wrote:
> 
> 
> On 11/13/18 9:13 AM, Yuval Shaia wrote:
> > When device goes down the function fini_ports loops over all entries in
> > gid table regardless of the fact whether entry is valid or not. In case
> > that entry is not valid we'd like to skip from any further processing in
> > backend device.
> > 
> > Signed-off-by: Yuval Shaia 
> > ---
> >   hw/rdma/rdma_rm.c | 4 
> >   1 file changed, 4 insertions(+)
> > 
> > diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
> > index 35a96d9a64..e3f6b2f6ea 100644
> > --- a/hw/rdma/rdma_rm.c
> > +++ b/hw/rdma/rdma_rm.c
> > @@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
> > RdmaBackendDev *backend_dev,
> >   {
> >   int rc;
> > +if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) {
> > +return 0;
> > +}
> > +
> >   rc = rdma_backend_del_gid(backend_dev, ifname,
> > _res->port.gid_tbl[gid_idx].gid);
> >   if (rc < 0) {
> 
> Reviewed-by: Marcel Apfelbaum

There seems to be a missing space separator between "Apfelbaum" and "<". Is
that ok?

> 
> Thanks,
> Marcel
> 



[Qemu-devel] [PATCH v4 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer

2018-11-18 Thread Yuval Shaia
RDMA MAD kernel module (ibcm) disallow more than one MAD-agent for a
given MAD class.
This does not go hand-by-hand with qemu pvrdma device's requirements
where each VM is MAD agent.
Fix it by adding implementation of RDMA MAD multiplexer service which on
one hand register as a sole MAD agent with the kernel module and on the
other hand gives service to more than one VM.

Design Overview:

A server process is registered to UMAD framework (for this to work the
rdma_cm kernel module needs to be unloaded) and creates a unix socket to
listen to incoming request from clients.
A client process (such as QEMU) connects to this unix socket and
registers with its own GID.

TX:
---
When client needs to send rdma_cm MAD message it construct it the same
way as without this multiplexer, i.e. creates a umad packet but this
time it writes its content to the socket instead of calling umad_send().
The server, upon receiving such a message fetch local_comm_id from it so
a context for this session can be maintain and relay the message to UMAD
layer by calling umad_send().

RX:
---
The server creates a worker thread to process incoming rdma_cm MAD
messages. When an incoming message arrived (umad_recv()) the server,
depending on the message type (attr_id) looks for target client by
either searching in gid->fd table or in local_comm_id->fd table. With
the extracted fd the server relays to incoming message to the client.

Signed-off-by: Yuval Shaia 
Reviewed-by: Shamir Rabinovitch 
---
 MAINTAINERS  |   1 +
 Makefile |   3 +
 Makefile.objs|   1 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 780 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 6 files changed, 845 insertions(+)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 100644 contrib/rdmacm-mux/main.c
 create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 98a1856afc..e087d58ac6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2231,6 +2231,7 @@ S: Maintained
 F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
+F: contrib/rdmacm-mux/*
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index f2947186a4..94072776ff 100644
--- a/Makefile
+++ b/Makefile
@@ -418,6 +418,7 @@ dummy := $(call unnest-vars,, \
 elf2dmp-obj-y \
 ivshmem-client-obj-y \
 ivshmem-server-obj-y \
+rdmacm-mux-obj-y \
 libvhost-user-obj-y \
 vhost-user-scsi-obj-y \
 vhost-user-blk-obj-y \
@@ -725,6 +726,8 @@ vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) 
libvhost-user.a
$(call LINK, $^)
 vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
$(call LINK, $^)
+rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS)
+   $(call LINK, $^)
 
 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
$(call quiet-command,$(PYTHON) $< $@ \
diff --git a/Makefile.objs b/Makefile.objs
index 1e1ff387d7..cc7df3ad80 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -194,6 +194,7 @@ vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
 vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
 vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
 vhost-user-blk-obj-y = contrib/vhost-user-blk/
+rdmacm-mux-obj-y = contrib/rdmacm-mux/
 
 ##
 trace-events-subdirs =
diff --git a/contrib/rdmacm-mux/Makefile.objs b/contrib/rdmacm-mux/Makefile.objs
new file mode 100644
index 00..be3eacb6f7
--- /dev/null
+++ b/contrib/rdmacm-mux/Makefile.objs
@@ -0,0 +1,4 @@
+ifdef CONFIG_PVRDMA
+CFLAGS += -libumad -Wno-format-truncation
+rdmacm-mux-obj-y = main.o
+endif
diff --git a/contrib/rdmacm-mux/main.c b/contrib/rdmacm-mux/main.c
new file mode 100644
index 00..98de95ee64
--- /dev/null
+++ b/contrib/rdmacm-mux/main.c
@@ -0,0 +1,780 @@
+/*
+ * QEMU paravirtual RDMA - rdmacm-mux implementation
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ * Yuval Shaia 
+ * Marcel Apfelbaum 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sys/poll.h"
+#include "sys/ioctl.h"
+#include "pthread.h"
+#include "syslog.h"
+
+#include "infiniband/verbs.h"
+#include "infiniband/umad.h"
+#include "infiniband/umad_types.h"
+#include "infiniband/umad_sa.h"
+#include "infiniband/umad_cm.h"
+
+#include "rdmacm-mux.h"
+
+#define SCALE_US 1000
+#define COMMID_TTL 2 /* How many SCALE_US a context of MAD session is saved */
+#define SLEEP_SECS 5 /* This is used both in poll() and thread */
+#define SERVER_LISTEN_BACKLOG 10
+#define MAX_CLIENTS 4096
+#define MAD_RMPP_VERSION 0
+#define MAD_METHOD_MASK0 0x8
+
+#define 

[Qemu-devel] [PATCH v4 12/23] vmxnet3: Move some definitions to header file

2018-11-18 Thread Yuval Shaia
pvrdma setup requires vmxnet3 device on PCI function 0 and PVRDMA device
on PCI function 1.
pvrdma device needs to access vmxnet3 device object for several reasons:
1. Make sure PCI function 0 is vmxnet3.
2. To monitor vmxnet3 device state.
3. To configure node_guid accoring to vmxnet3 device's MAC address.

To be able to access vmxnet3 device the definition of VMXNET3State is
moved to a new header file.

Signed-off-by: Yuval Shaia 
Reviewed-by: Dmitry Fleytman 
---
 hw/net/vmxnet3.c  | 116 +---
 hw/net/vmxnet3_defs.h | 133 ++
 2 files changed, 134 insertions(+), 115 deletions(-)
 create mode 100644 hw/net/vmxnet3_defs.h

diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3648630386..54746a4030 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -18,7 +18,6 @@
 #include "qemu/osdep.h"
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
-#include "net/net.h"
 #include "net/tap.h"
 #include "net/checksum.h"
 #include "sysemu/sysemu.h"
@@ -29,6 +28,7 @@
 #include "migration/register.h"
 
 #include "vmxnet3.h"
+#include "vmxnet3_defs.h"
 #include "vmxnet_debug.h"
 #include "vmware_utils.h"
 #include "net_tx_pkt.h"
@@ -131,23 +131,11 @@ typedef struct VMXNET3Class {
 DeviceRealize parent_dc_realize;
 } VMXNET3Class;
 
-#define TYPE_VMXNET3 "vmxnet3"
-#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3)
-
 #define VMXNET3_DEVICE_CLASS(klass) \
 OBJECT_CLASS_CHECK(VMXNET3Class, (klass), TYPE_VMXNET3)
 #define VMXNET3_DEVICE_GET_CLASS(obj) \
 OBJECT_GET_CLASS(VMXNET3Class, (obj), TYPE_VMXNET3)
 
-/* Cyclic ring abstraction */
-typedef struct {
-hwaddr pa;
-uint32_t size;
-uint32_t cell_size;
-uint32_t next;
-uint8_t gen;
-} Vmxnet3Ring;
-
 static inline void vmxnet3_ring_init(PCIDevice *d,
 Vmxnet3Ring *ring,
  hwaddr pa,
@@ -245,108 +233,6 @@ vmxnet3_dump_rx_descr(struct Vmxnet3_RxDesc *descr)
   descr->rsvd, descr->dtype, descr->ext1, descr->btype);
 }
 
-/* Device state and helper functions */
-#define VMXNET3_RX_RINGS_PER_QUEUE (2)
-
-typedef struct {
-Vmxnet3Ring tx_ring;
-Vmxnet3Ring comp_ring;
-
-uint8_t intr_idx;
-hwaddr tx_stats_pa;
-struct UPT1_TxStats txq_stats;
-} Vmxnet3TxqDescr;
-
-typedef struct {
-Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE];
-Vmxnet3Ring comp_ring;
-uint8_t intr_idx;
-hwaddr rx_stats_pa;
-struct UPT1_RxStats rxq_stats;
-} Vmxnet3RxqDescr;
-
-typedef struct {
-bool is_masked;
-bool is_pending;
-bool is_asserted;
-} Vmxnet3IntState;
-
-typedef struct {
-PCIDevice parent_obj;
-NICState *nic;
-NICConf conf;
-MemoryRegion bar0;
-MemoryRegion bar1;
-MemoryRegion msix_bar;
-
-Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES];
-Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES];
-
-/* Whether MSI-X support was installed successfully */
-bool msix_used;
-hwaddr drv_shmem;
-hwaddr temp_shared_guest_driver_memory;
-
-uint8_t txq_num;
-
-/* This boolean tells whether RX packet being indicated has to */
-/* be split into head and body chunks from different RX rings  */
-bool rx_packets_compound;
-
-bool rx_vlan_stripping;
-bool lro_supported;
-
-uint8_t rxq_num;
-
-/* Network MTU */
-uint32_t mtu;
-
-/* Maximum number of fragments for indicated TX packets */
-uint32_t max_tx_frags;
-
-/* Maximum number of fragments for indicated RX packets */
-uint16_t max_rx_frags;
-
-/* Index for events interrupt */
-uint8_t event_int_idx;
-
-/* Whether automatic interrupts masking enabled */
-bool auto_int_masking;
-
-bool peer_has_vhdr;
-
-/* TX packets to QEMU interface */
-struct NetTxPkt *tx_pkt;
-uint32_t offload_mode;
-uint32_t cso_or_gso_size;
-uint16_t tci;
-bool needs_vlan;
-
-struct NetRxPkt *rx_pkt;
-
-bool tx_sop;
-bool skip_current_tx_pkt;
-
-uint32_t device_active;
-uint32_t last_command;
-
-uint32_t link_status_and_speed;
-
-Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS];
-
-uint32_t temp_mac;   /* To store the low part first */
-
-MACAddr perm_mac;
-uint32_t vlan_table[VMXNET3_VFT_SIZE];
-uint32_t rx_mode;
-MACAddr *mcast_list;
-uint32_t mcast_list_len;
-uint32_t mcast_list_buff_size; /* needed for live migration. */
-
-/* Compatibility flags for migration */
-uint32_t compat_flags;
-} VMXNET3State;
-
 /* Interrupt management */
 
 /*
diff --git a/hw/net/vmxnet3_defs.h b/hw/net/vmxnet3_defs.h
new file mode 100644
index 00..6c19d29b12
--- /dev/null
+++ b/hw/net/vmxnet3_defs.h
@@ -0,0 +1,133 @@
+/*
+ * 

[Qemu-devel] [PATCH v4 14/23] hw/rdma: Initialize node_guid from vmxnet3 mac address

2018-11-18 Thread Yuval Shaia
node_guid should be set once device is load.
Make node_guid be GID format (32 bit) of PCI function 0 vmxnet3 device's
MAC.

A new function was added to do the conversion.
So for example the MAC 56:b6:44:e9:62:dc will be converted to GID
54b6:44ff:fee9:62dc.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_utils.h  |  9 +
 hw/rdma/vmw/pvrdma_cmd.c  | 10 --
 hw/rdma/vmw/pvrdma_main.c |  5 -
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h
index 989db249ef..202abb3366 100644
--- a/hw/rdma/rdma_utils.h
+++ b/hw/rdma/rdma_utils.h
@@ -63,4 +63,13 @@ extern unsigned long pr_dbg_cnt;
 void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen);
 void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
 
+static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
+{
+memcpy(eui, addr, 3);
+eui[3] = 0xFF;
+eui[4] = 0xFE;
+memcpy(eui + 5, addr + 3, 3);
+eui[0] ^= 2;
+}
+
 #endif
diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index a334f6205e..2979582fac 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -592,16 +592,6 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 return -EINVAL;
 }
 
-/* TODO: Since drivers stores node_guid at load_dsr phase then this
- * assignment is not relevant, i need to figure out a way how to
- * retrieve MAC of our netdev */
-if (!cmd->index) {
-dev->node_guid =
-dev->rdma_dev_res.ports[0].gid_tbl[0].gid.global.interface_id;
-pr_dbg("dev->node_guid=0x%llx\n",
-   (long long unsigned int)be64_to_cpu(dev->node_guid));
-}
-
 return 0;
 }
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index b35b5dc5f0..150404dfa6 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -264,7 +264,7 @@ static void init_dsr_dev_caps(PVRDMADev *dev)
 dsr->caps.sys_image_guid = 0;
 pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);
 
-dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
+dsr->caps.node_guid = dev->node_guid;
 pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));
 
 dsr->caps.phys_port_cnt = MAX_PORTS;
@@ -588,6 +588,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 }
 dev->func0 = VMXNET3(func0);
 
+addrconf_addr_eui48((unsigned char *)>node_guid,
+(const char *)>func0->conf.macaddr.a);
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




[Qemu-devel] [PATCH v4 15/23] hw/pvrdma: Make device state depend on Ethernet function state

2018-11-18 Thread Yuval Shaia
User should be able to control the device by changing Ethernet function
state so if user runs 'ifconfig ens3 down' the PVRDMA function should be
down as well.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma_cmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 2979582fac..0d3c818c20 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -139,7 +139,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
 resp->hdr.err = 0;
 
-resp->attrs.state = attrs.state;
+resp->attrs.state = dev->func0->device_active ? attrs.state :
+PVRDMA_PORT_DOWN;
 resp->attrs.max_mtu = attrs.max_mtu;
 resp->attrs.active_mtu = attrs.active_mtu;
 resp->attrs.phys_state = attrs.phys_state;
-- 
2.17.2




[Qemu-devel] [PATCH v4 21/23] hw/rdma: Do not use bitmap_zero_extend to free bitmap

2018-11-18 Thread Yuval Shaia
bitmap_zero_extend is designed to work for extending, not for
shrinking.
Using g_free instead.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_rm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index b7d4ebe972..ca127c8c26 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -43,7 +43,7 @@ static inline void res_tbl_free(RdmaRmResTbl *tbl)
 {
 qemu_mutex_destroy(>lock);
 g_free(tbl->tbl);
-bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0);
+g_free(tbl->bitmap);
 }
 
 static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle)
-- 
2.17.2




Re: [Qemu-devel] [PATCH v3 17/23] hw/pvrdma: Fill error code in command's response

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 02:22:31PM +0200, Marcel Apfelbaum wrote:
> 
> 
> On 11/13/18 9:13 AM, Yuval Shaia wrote:
> > Driver checks error code let's set it.
> > 
> > Signed-off-by: Yuval Shaia 
> > ---
> >   hw/rdma/vmw/pvrdma_cmd.c | 67 
> >   1 file changed, 48 insertions(+), 19 deletions(-)
> > 
> > diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
> > index 0d3c818c20..a326c5d470 100644
> > --- a/hw/rdma/vmw/pvrdma_cmd.c
> > +++ b/hw/rdma/vmw/pvrdma_cmd.c
> > @@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   if (rdma_backend_query_port(>backend_dev,
> >   (struct ibv_port_attr *))) {
> > -return -ENOMEM;
> > +resp->hdr.err = -ENOMEM;
> > +goto out;
> >   }
> >   memset(resp, 0, sizeof(*resp));
> > @@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   resp->attrs.active_width = 1;
> >   resp->attrs.active_speed = 1;
> > -return 0;
> > +out:
> > +pr_dbg("ret=%d\n", resp->hdr.err);
> > +return resp->hdr.err;
> >   }
> >   static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
> > @@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   resp->pkey = PVRDMA_PKEY;
> >   pr_dbg("pkey=0x%x\n", resp->pkey);
> > -return 0;
> > +return resp->hdr.err;
> >   }
> >   static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
> > @@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle);
> > -return 0;
> > +rsp->hdr.err = 0;
> 
> Is it possible to ensure err is 0 by default during hdr creation
> instead of manually setting it every time?

Yes we can but since these handlers any fills some other fields in the
response i thought it will be clean if they will fill the op-status as
well.
I believe filling it at "handler" level is more modular.
Do you think filling it outside will make the code cleaner?

> 
> Thanks,
> Marcel
> 
> > +
> > +return rsp->hdr.err;
> >   }
> >   static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
> > @@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle);
> > -return 0;
> > +rsp->hdr.err = 0;
> > +
> > +return rsp->hdr.err;
> >   }
> >   static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
> > @@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle);
> >   if (!cq) {
> >   pr_dbg("Invalid CQ handle\n");
> > -return -EINVAL;
> > +rsp->hdr.err = -EINVAL;
> > +goto out;
> >   }
> >   ring = (PvrdmaRing *)cq->opaque;
> > @@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle);
> > -return 0;
> > +rsp->hdr.err = 0;
> > +
> > +out:
> > +pr_dbg("ret=%d\n", rsp->hdr.err);
> > +return rsp->hdr.err;
> >   }
> >   static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
> > @@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle);
> >   if (!qp) {
> >   pr_dbg("Invalid QP handle\n");
> > -return -EINVAL;
> > +rsp->hdr.err = -EINVAL;
> > +goto out;
> >   }
> >   rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle);
> > @@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, 
> > TARGET_PAGE_SIZE);
> >   g_free(ring);
> > -return 0;
> > +rsp->hdr.err = 0;
> > +
> > +out:
> > +pr_dbg("ret=%d\n", rsp->hdr.err);
> > +return rsp->hdr.err;
> >   }
> >   static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
> > @@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   pr_dbg("index=%d\n", cmd->index);
> >   if (cmd->index >= MAX_PORT_GIDS) {
> > -return -EINVAL;
> > +rsp->hdr.err = -EINVAL;
> > +goto out;
> >   }
> >   pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
> > @@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union 
> > pvrdma_cmd_req *req,
> >   rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev,
> >dev->backend_eth_device_name, gid, cmd->index);
> >   if (rc < 0) {
> > -return -EINVAL;
> > +rsp->hdr.err = rc;
> > +goto out;
> >   }
> > -return 0;
> > +rsp->hdr.err = 0;
> > +
> > +out:
> > +pr_dbg("ret=%d\n", rsp->hdr.err);
> > +return rsp->hdr.err;
> >   }
> >   static int destroy_bind(PVRDMADev 

[Qemu-devel] [PATCH v4 10/23] json: Define new QMP message for pvrdma

2018-11-18 Thread Yuval Shaia
pvrdma requires that the same GID attached to it will be attached to the
backend device in the host.

A new QMP messages is defined so pvrdma device can broadcast any change
made to its GID table. This event is captured by libvirt which in turn
will update the GID table in the backend device.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 MAINTAINERS   |  1 +
 Makefile  |  3 ++-
 Makefile.objs |  4 
 qapi/qapi-schema.json |  1 +
 qapi/rdma.json| 38 ++
 5 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 qapi/rdma.json

diff --git a/MAINTAINERS b/MAINTAINERS
index e087d58ac6..a149f68a8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2232,6 +2232,7 @@ F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
 F: contrib/rdmacm-mux/*
+F: qapi/rdma.json
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index 94072776ff..db4ce60ee5 100644
--- a/Makefile
+++ b/Makefile
@@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json 
$(SRC_PATH)/qapi/common.json \
$(SRC_PATH)/qapi/tpm.json \
$(SRC_PATH)/qapi/trace.json \
$(SRC_PATH)/qapi/transaction.json \
-   $(SRC_PATH)/qapi/ui.json
+   $(SRC_PATH)/qapi/ui.json \
+   $(SRC_PATH)/qapi/rdma.json
 
 qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \
 qapi/qapi-types.c qapi/qapi-types.h \
diff --git a/Makefile.objs b/Makefile.objs
index cc7df3ad80..76d8028f2f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o
 util-obj-y += qapi/qapi-types-trace.o
 util-obj-y += qapi/qapi-types-transaction.o
 util-obj-y += qapi/qapi-types-ui.o
+util-obj-y += qapi/qapi-types-rdma.o
 util-obj-y += qapi/qapi-builtin-visit.o
 util-obj-y += qapi/qapi-visit.o
 util-obj-y += qapi/qapi-visit-block-core.o
@@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o
 util-obj-y += qapi/qapi-visit-trace.o
 util-obj-y += qapi/qapi-visit-transaction.o
 util-obj-y += qapi/qapi-visit-ui.o
+util-obj-y += qapi/qapi-visit-rdma.o
 util-obj-y += qapi/qapi-events.o
 util-obj-y += qapi/qapi-events-block-core.o
 util-obj-y += qapi/qapi-events-block.o
@@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o
 util-obj-y += qapi/qapi-events-trace.o
 util-obj-y += qapi/qapi-events-transaction.o
 util-obj-y += qapi/qapi-events-ui.o
+util-obj-y += qapi/qapi-events-rdma.o
 util-obj-y += qapi/qapi-introspect.o
 
 chardev-obj-y = chardev/
@@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o
 common-obj-y += qapi/qapi-commands-trace.o
 common-obj-y += qapi/qapi-commands-transaction.o
 common-obj-y += qapi/qapi-commands-ui.o
+common-obj-y += qapi/qapi-commands-rdma.o
 common-obj-y += qapi/qapi-introspect.o
 common-obj-y += qmp.o hmp.o
 endif
diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
index 65b6dc2f6f..a650d80f83 100644
--- a/qapi/qapi-schema.json
+++ b/qapi/qapi-schema.json
@@ -94,3 +94,4 @@
 { 'include': 'trace.json' }
 { 'include': 'introspect.json' }
 { 'include': 'misc.json' }
+{ 'include': 'rdma.json' }
diff --git a/qapi/rdma.json b/qapi/rdma.json
new file mode 100644
index 00..804c68ab36
--- /dev/null
+++ b/qapi/rdma.json
@@ -0,0 +1,38 @@
+# -*- Mode: Python -*-
+#
+
+##
+# = RDMA device
+##
+
+##
+# @RDMA_GID_STATUS_CHANGED:
+#
+# Emitted when guest driver adds/deletes GID to/from device
+#
+# @netdev: RoCE Network Device name - char *
+#
+# @gid-status: Add or delete indication - bool
+#
+# @subnet-prefix: Subnet Prefix - uint64
+#
+# @interface-id : Interface ID - uint64
+#
+# Since: 3.2
+#
+# Example:
+#
+# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
+# "event": "RDMA_GID_STATUS_CHANGED",
+# "data":
+# {"netdev": "bridge0",
+# "interface-id": 15880512517475447892,
+# "gid-status": true,
+# "subnet-prefix": 33022}}
+#
+##
+{ 'event': 'RDMA_GID_STATUS_CHANGED',
+  'data': { 'netdev': 'str',
+'gid-status': 'bool',
+'subnet-prefix' : 'uint64',
+'interface-id'  : 'uint64' } }
-- 
2.17.2




[Qemu-devel] [PATCH v4 08/23] hw/pvrdma: Set the correct opcode for recv completion

2018-11-18 Thread Yuval Shaia
The function pvrdma_post_cqe populates CQE entry with opcode from the
given completion element. For receive operation value was not set. Fix
it by setting it to IBV_WC_RECV.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 762700a205..7b0f440fda 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -196,8 +196,9 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx = g_malloc(sizeof(CompHandlerCtx));
 comp_ctx->dev = dev;
 comp_ctx->cq_handle = qp->recv_cq_handle;
-comp_ctx->cqe.qp = qp_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
+comp_ctx->cqe.qp = qp_handle;
+comp_ctx->cqe.opcode = IBV_WC_RECV;
 
 rdma_backend_post_recv(>backend_dev, >rdma_dev_res,
>backend_qp, qp->qp_type,
-- 
2.17.2




[Qemu-devel] [PATCH v4 04/23] hw/rdma: Abort send-op if fail to create addr handler

2018-11-18 Thread Yuval Shaia
Function create_ah might return NULL, let's exit with an error.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_backend.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index d7a4bbd91f..1e148398a2 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -338,6 +338,10 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd,
 backend_dev->backend_gid_idx, dgid);
+if (!wr.wr.ud.ah) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+goto out_dealloc_cqe_ctx;
+}
 wr.wr.ud.remote_qpn = dqpn;
 wr.wr.ud.remote_qkey = dqkey;
 }
-- 
2.17.2




[Qemu-devel] [PATCH v4 03/23] hw/rdma: Return qpn 1 if ibqp is NULL

2018-11-18 Thread Yuval Shaia
Device is not supporting QP0, only QP1.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 86e8fe8ab6..3ccc9a2494 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -33,7 +33,7 @@ static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev 
*dev)
 
 static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp)
 {
-return qp->ibqp ? qp->ibqp->qp_num : 0;
+return qp->ibqp ? qp->ibqp->qp_num : 1;
 }
 
 static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr)
-- 
2.17.2




[Qemu-devel] [PATCH v4 13/23] hw/pvrdma: Make sure PCI function 0 is vmxnet3

2018-11-18 Thread Yuval Shaia
Guest driver enforces it, we should also.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma.h  |  2 ++
 hw/rdma/vmw/pvrdma_main.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index b019cb843a..10a3c4fb7c 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -20,6 +20,7 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
+#include "hw/net/vmxnet3_defs.h"
 
 #include "../rdma_backend_defs.h"
 #include "../rdma_rm_defs.h"
@@ -85,6 +86,7 @@ typedef struct PVRDMADev {
 RdmaBackendDev backend_dev;
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
+VMXNET3State *func0;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index ac8c092db0..b35b5dc5f0 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -565,6 +565,7 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 PVRDMADev *dev = PVRDMA_DEV(pdev);
 Object *memdev_root;
 bool ram_shared = false;
+PCIDevice *func0;
 
 init_pr_dbg();
 
@@ -576,6 +577,17 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 return;
 }
 
+func0 = pci_get_function_0(pdev);
+/* Break if not vmxnet3 device in slot 0 */
+if (strcmp(object_get_typename(>qdev.parent_obj), TYPE_VMXNET3)) {
+pr_dbg("func0 type is %s\n",
+   object_get_typename(>qdev.parent_obj));
+error_setg(errp, "Device on %x.0 must be %s", PCI_SLOT(pdev->devfn),
+   TYPE_VMXNET3);
+return;
+}
+dev->func0 = VMXNET3(func0);
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




Re: [Qemu-devel] [PATCH v3 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer

2018-11-18 Thread Yuval Shaia
On Sat, Nov 17, 2018 at 07:27:43PM +0200, Shamir Rabinovitch wrote:
> On Tue, Nov 13, 2018 at 09:13:14AM +0200, Yuval Shaia wrote:
> > RDMA MAD kernel module (ibcm) disallow more than one MAD-agent for a
> > given MAD class.
> > This does not go hand-by-hand with qemu pvrdma device's requirements
> > where each VM is MAD agent.
> > Fix it by adding implementation of RDMA MAD multiplexer service which on
> > one hand register as a sole MAD agent with the kernel module and on the
> > other hand gives service to more than one VM.
> > 
> > Design Overview:
> > 
> > A server process is registered to UMAD framework (for this to work the
> > rdma_cm kernel module needs to be unloaded) and creates a unix socket to
> > listen to incoming request from clients.
> > A client process (such as QEMU) connects to this unix socket and
> > registers with its own GID.
> > 
> > TX:
> > ---
> > When client needs to send rdma_cm MAD message it construct it the same
> > way as without this multiplexer, i.e. creates a umad packet but this
> > time it writes its content to the socket instead of calling umad_send().
> > The server, upon receiving such a message fetch local_comm_id from it so
> > a context for this session can be maintain and relay the message to UMAD
> > layer by calling umad_send().
> > 
> > RX:
> > ---
> > The server creates a worker thread to process incoming rdma_cm MAD
> > messages. When an incoming message arrived (umad_recv()) the server,
> > depending on the message type (attr_id) looks for target client by
> > either searching in gid->fd table or in local_comm_id->fd table. With
> > the extracted fd the server relays to incoming message to the client.
> > 
> > Signed-off-by: Yuval Shaia 
> > ---
> >  MAINTAINERS  |   1 +
> >  Makefile |   3 +
> >  Makefile.objs|   1 +
> >  contrib/rdmacm-mux/Makefile.objs |   4 +
> >  contrib/rdmacm-mux/main.c| 771 +++
> >  contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
> >  6 files changed, 836 insertions(+)
> >  create mode 100644 contrib/rdmacm-mux/Makefile.objs
> >  create mode 100644 contrib/rdmacm-mux/main.c
> >  create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h
> >
> 
> Reviewed-by: Shamir Rabinovitch 

Thanks Shamir!

Thanks a lot also for all the MAD related tips and comments you gave
off-list, it make the code much more mature and correct.

> 



[Qemu-devel] [PATCH v4 06/23] hw/pvrdma: Make function reset_device return void

2018-11-18 Thread Yuval Shaia
This function cannot fail - fix it to return void

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma_main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 6c8c0154fa..fc2abd34af 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -369,13 +369,11 @@ static int unquiesce_device(PVRDMADev *dev)
 return 0;
 }
 
-static int reset_device(PVRDMADev *dev)
+static void reset_device(PVRDMADev *dev)
 {
 pvrdma_stop(dev);
 
 pr_dbg("Device reset complete\n");
-
-return 0;
 }
 
 static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
-- 
2.17.2




[Qemu-devel] [PATCH v4 05/23] hw/rdma: Add support for MAD packets

2018-11-18 Thread Yuval Shaia
MAD (Management Datagram) packets are widely used by various modules
both in kernel and in user space for example the rdma_* API which is
used to create and maintain "connection" layer on top of RDMA uses
several types of MAD packets.

For more information please refer to chapter 13.4 in Volume 1
Architecture Specification, Release 1.1 available here:
https://www.infinibandta.org/ibta-specifications-download/

To support MAD packets the device uses an external utility
(contrib/rdmacm-mux) to relay packets from and to the guest driver.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 275 +++-
 hw/rdma/rdma_backend.h  |   4 +-
 hw/rdma/rdma_backend_defs.h |  10 +-
 hw/rdma/vmw/pvrdma.h|   2 +
 hw/rdma/vmw/pvrdma_main.c   |   4 +-
 5 files changed, 285 insertions(+), 10 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 1e148398a2..7c220a5798 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -16,8 +16,13 @@
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qnum.h"
 
 #include 
+#include 
+#include 
+#include 
 
 #include "trace.h"
 #include "rdma_utils.h"
@@ -33,16 +38,25 @@
 #define VENDOR_ERR_MAD_SEND 0x206
 #define VENDOR_ERR_INVLKEY  0x207
 #define VENDOR_ERR_MR_SMALL 0x208
+#define VENDOR_ERR_INV_MAD_BUFF 0x209
+#define VENDOR_ERR_INV_NUM_SGE  0x210
 
 #define THR_NAME_LEN 16
 #define THR_POLL_TO  5000
 
+#define MAD_HDR_SIZE sizeof(struct ibv_grh)
+
 typedef struct BackendCtx {
-uint64_t req_id;
 void *up_ctx;
 bool is_tx_req;
+struct ibv_sge sge; /* Used to save MAD recv buffer */
 } BackendCtx;
 
+struct backend_umad {
+struct ib_user_mad hdr;
+char mad[RDMA_MAX_PRIVATE_DATA];
+};
+
 static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
 
 static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
+static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
+uint32_t num_sge)
+{
+struct backend_umad umad = {0};
+char *hdr, *msg;
+int ret;
+
+pr_dbg("num_sge=%d\n", num_sge);
+
+if (num_sge != 2) {
+return -EINVAL;
+}
+
+umad.hdr.length = sge[0].length + sge[1].length;
+pr_dbg("msg_len=%d\n", umad.hdr.length);
+
+if (umad.hdr.length > sizeof(umad.mad)) {
+return -ENOMEM;
+}
+
+umad.hdr.addr.qpn = htobe32(1);
+umad.hdr.addr.grh_present = 1;
+umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
+memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
+umad.hdr.addr.hop_limit = 1;
+
+hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
+if (!hdr) {
+pr_dbg("Fail to map to sge[0]\n");
+return -ENOMEM;
+}
+msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+if (!msg) {
+pr_dbg("Fail to map to sge[1]\n");
+rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+return -ENOMEM;
+}
+
+pr_dbg_buf("mad_hdr", hdr, sge[0].length);
+pr_dbg_buf("mad_data", data, sge[1].length);
+
+memcpy([0], hdr, sge[0].length);
+memcpy([sge[0].length], msg, sge[1].length);
+
+rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+
+ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
+sizeof(umad));
+
+pr_dbg("qemu_chr_fe_write=%d\n", ret);
+
+return (ret != sizeof(umad));
+}
+
 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 RdmaBackendQP *qp, uint8_t qp_type,
 struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+rc = mad_send(backend_dev, sge, num_sge);
+if (rc) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+} else {
+comp_handler(IBV_WC_SUCCESS, 0, ctx);
+}
 }
-pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
 return;
 }
 
@@ -370,6 +443,48 @@ out_free_bctx:
 g_free(bctx);
 }
 
+static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
+ struct ibv_sge *sge, uint32_t num_sge,
+ void *ctx)
+{
+BackendCtx *bctx;
+int rc;
+uint32_t bctx_id;
+
+if (num_sge != 1) {
+pr_dbg("Invalid 

[Qemu-devel] [PATCH v4 00/23] Add support for RDMA MAD

2018-11-18 Thread Yuval Shaia
Hi all.

This is a major enhancement to the pvrdma device to allow it to work with
state of the art applications such as MPI.

As described in patch #5, MAD packets are management packets that are used
for many purposes including but not limited to communication layer above IB
verbs API.

Patch 1 exposes new external executable (under contrib) that aims to
address a specific limitation in the RDMA usrespace MAD stack.

This patch-set mainly present MAD enhancement but during the work on it i
came across some bugs and enhancement needed to be implemented before doing
any MAD coding. This is the role of patches 2 to 4, 7 to 9 and 15 to 17.

Patches 6 and 18 are cosmetic changes while not relevant to this patchset
still introduce with it since (at least for 6) hard to decouple.

Patches 12 to 15 couple pvrdma device with vmxnet3 device as this is the
configuration enforced by pvrdma driver in guest - a vmxnet3 device in
function 0 and pvrdma device in function 1 in the same PCI slot. Patch 12
moves needed code from vmxnet3 device to a new header file that can be used
by pvrdma code while Patches 13 to 15 use of it.

Along with this patch-set there is a parallel patch posted to libvirt to
apply the change needed there as part of the process implemented in patches
10 and 11. This change is needed so that guest would be able to configure
any IP to the Ethernet function of the pvrdma device.
https://www.redhat.com/archives/libvir-list/2018-November/msg00135.html

Since we maintain external resources such as GIDs on host GID table we need
to do some cleanup before going down. This is the job of patches 19 and 20.
Patches 21 and 22 contain a fixes for bugs detected during the work on
processing cleanup code during shutdown.

Patch 23 fixes documentation.

v1 -> v2:
* Fix compilation issue detected when compiling for mingw
* Address comment from Eric Blake re version of QEMU in json
  message
* Fix example from QMP message in json file
* Fix case where a VM tries to remove an invalid GID from GID table
* rdmacm-mux: Cleanup entries in socket-gids table when socket is
  closed
* Cleanup resources (GIDs, QPs etc) when VM goes down

v2 -> v3:
* Address comment from Cornelia Huck for patch #19
* Add some R-Bs from Marcel Apfelbaum and Dmitry Fleytman
* Update docs/pvrdma.txt with the changes made by this patchset
* Address comments from Shamir Rabinovitch for UMAD multiplexer

v3 -> v4:
* Address some comments from Marcel
* Add some R-Bs from Cornelia Huck and Shamir Rabinovitch

Yuval Shaia (23):
  contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer
  hw/rdma: Add ability to force notification without re-arm
  hw/rdma: Return qpn 1 if ibqp is NULL
  hw/rdma: Abort send-op if fail to create addr handler
  hw/rdma: Add support for MAD packets
  hw/pvrdma: Make function reset_device return void
  hw/pvrdma: Make default pkey 0x
  hw/pvrdma: Set the correct opcode for recv completion
  hw/pvrdma: Set the correct opcode for send completion
  json: Define new QMP message for pvrdma
  hw/pvrdma: Add support to allow guest to configure GID table
  vmxnet3: Move some definitions to header file
  hw/pvrdma: Make sure PCI function 0 is vmxnet3
  hw/rdma: Initialize node_guid from vmxnet3 mac address
  hw/pvrdma: Make device state depend on Ethernet function state
  hw/pvrdma: Fill all CQE fields
  hw/pvrdma: Fill error code in command's response
  hw/rdma: Remove unneeded code that handles more that one port
  vl: Introduce shutdown_notifiers
  hw/pvrdma: Clean device's resource when system is shutdown
  hw/rdma: Do not use bitmap_zero_extend to free bitmap
  hw/rdma: Do not call rdma_backend_del_gid on an empty gid
  docs: Update pvrdma device documentation

 MAINTAINERS  |   2 +
 Makefile |   6 +-
 Makefile.objs|   5 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 780 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 docs/pvrdma.txt  | 103 +++-
 hw/net/vmxnet3.c | 116 +
 hw/net/vmxnet3_defs.h| 133 ++
 hw/rdma/rdma_backend.c   | 496 +---
 hw/rdma/rdma_backend.h   |  28 +-
 hw/rdma/rdma_backend_defs.h  |  13 +-
 hw/rdma/rdma_rm.c| 120 -
 hw/rdma/rdma_rm.h|  17 +-
 hw/rdma/rdma_rm_defs.h   |  21 +-
 hw/rdma/rdma_utils.h |  24 +
 hw/rdma/vmw/pvrdma.h |  10 +-
 hw/rdma/vmw/pvrdma_cmd.c | 119 +++--
 hw/rdma/vmw/pvrdma_main.c|  58 ++-
 hw/rdma/vmw/pvrdma_qp_ops.c  |  62 ++-
 include/sysemu/sysemu.h  |   1 +
 qapi/qapi-schema.json|   1 +
 qapi/rdma.json   |  38 ++
 vl.c |  15 +-
 24 files changed, 1921 insertions(+), 307 deletions(-)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 

[Qemu-devel] [PATCH v4 18/23] hw/rdma: Remove unneeded code that handles more that one port

2018-11-18 Thread Yuval Shaia
Device supports only one port, let's remove a dead code that handles
more than one port.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_rm.c  | 34 --
 hw/rdma/rdma_rm.h  |  2 +-
 hw/rdma/rdma_rm_defs.h |  4 ++--
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 250254561c..b7d4ebe972 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -545,7 +545,7 @@ int rdma_rm_add_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 return -EINVAL;
 }
 
-memcpy(_res->ports[0].gid_tbl[gid_idx].gid, gid, sizeof(*gid));
+memcpy(_res->port.gid_tbl[gid_idx].gid, gid, sizeof(*gid));
 
 return 0;
 }
@@ -556,15 +556,15 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 int rc;
 
 rc = rdma_backend_del_gid(backend_dev, ifname,
-  _res->ports[0].gid_tbl[gid_idx].gid);
+  _res->port.gid_tbl[gid_idx].gid);
 if (rc) {
 pr_dbg("Fail to delete gid\n");
 return -EINVAL;
 }
 
-memset(dev_res->ports[0].gid_tbl[gid_idx].gid.raw, 0,
-   sizeof(dev_res->ports[0].gid_tbl[gid_idx].gid));
-dev_res->ports[0].gid_tbl[gid_idx].backend_gid_index = -1;
+memset(dev_res->port.gid_tbl[gid_idx].gid.raw, 0,
+   sizeof(dev_res->port.gid_tbl[gid_idx].gid));
+dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1;
 
 return 0;
 }
@@ -577,16 +577,16 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 return -EINVAL;
 }
 
-if (unlikely(dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index == -1)) 
{
-dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index =
+if (unlikely(dev_res->port.gid_tbl[sgid_idx].backend_gid_index == -1)) {
+dev_res->port.gid_tbl[sgid_idx].backend_gid_index =
 rdma_backend_get_gid_index(backend_dev,
-   _res->ports[0].gid_tbl[sgid_idx].gid);
+   _res->port.gid_tbl[sgid_idx].gid);
 }
 
 pr_dbg("backend_gid_index=%d\n",
-   dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index);
+   dev_res->port.gid_tbl[sgid_idx].backend_gid_index);
 
-return dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index;
+return dev_res->port.gid_tbl[sgid_idx].backend_gid_index;
 }
 
 static void destroy_qp_hash_key(gpointer data)
@@ -596,15 +596,13 @@ static void destroy_qp_hash_key(gpointer data)
 
 static void init_ports(RdmaDeviceResources *dev_res)
 {
-int i, j;
+int i;
 
-memset(dev_res->ports, 0, sizeof(dev_res->ports));
+memset(_res->port, 0, sizeof(dev_res->port));
 
-for (i = 0; i < MAX_PORTS; i++) {
-dev_res->ports[i].state = IBV_PORT_DOWN;
-for (j = 0; j < MAX_PORT_GIDS; j++) {
-dev_res->ports[i].gid_tbl[j].backend_gid_index = -1;
-}
+dev_res->port.state = IBV_PORT_DOWN;
+for (i = 0; i < MAX_PORT_GIDS; i++) {
+dev_res->port.gid_tbl[i].backend_gid_index = -1;
 }
 }
 
@@ -613,7 +611,7 @@ static void fini_ports(RdmaDeviceResources *dev_res,
 {
 int i;
 
-dev_res->ports[0].state = IBV_PORT_DOWN;
+dev_res->port.state = IBV_PORT_DOWN;
 for (i = 0; i < MAX_PORT_GIDS; i++) {
 rdma_rm_del_gid(dev_res, backend_dev, ifname, i);
 }
diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h
index a7169b4e89..3c602c04c0 100644
--- a/hw/rdma/rdma_rm.h
+++ b/hw/rdma/rdma_rm.h
@@ -79,7 +79,7 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
  int sgid_idx)
 {
-return _res->ports[0].gid_tbl[sgid_idx].gid;
+return _res->port.gid_tbl[sgid_idx].gid;
 }
 
 #endif
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7b3435f991..0ba61d1838 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -18,7 +18,7 @@
 
 #include "rdma_backend_defs.h"
 
-#define MAX_PORTS 1
+#define MAX_PORTS 1 /* Do not change - we support only one port */
 #define MAX_PORT_GIDS 255
 #define MAX_GIDS  MAX_PORT_GIDS
 #define MAX_PORT_PKEYS1
@@ -97,7 +97,7 @@ typedef struct RdmaRmPort {
 } RdmaRmPort;
 
 typedef struct RdmaDeviceResources {
-RdmaRmPort ports[MAX_PORTS];
+RdmaRmPort port;
 RdmaRmResTbl pd_tbl;
 RdmaRmResTbl mr_tbl;
 RdmaRmResTbl uc_tbl;
-- 
2.17.2




[Qemu-devel] [PATCH v4 09/23] hw/pvrdma: Set the correct opcode for send completion

2018-11-18 Thread Yuval Shaia
opcode for WC should be set by the device and not taken from work
element.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 7b0f440fda..3388be1926 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -154,7 +154,7 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx->cq_handle = qp->send_cq_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
 comp_ctx->cqe.qp = qp_handle;
-comp_ctx->cqe.opcode = wqe->hdr.opcode;
+comp_ctx->cqe.opcode = IBV_WC_SEND;
 
 rdma_backend_post_send(>backend_dev, >backend_qp, qp->qp_type,
(struct ibv_sge *)>sge[0], 
wqe->hdr.num_sge,
-- 
2.17.2




Re: [Qemu-devel] [PATCH for-3.1] fdc: fix segfault in fdctrl_stop_transfer() when DMA is disabled

2018-11-18 Thread Mark Cave-Ayland
On 13/11/2018 20:29, John Snow wrote:

> On 11/13/18 8:16 AM, Kevin Wolf wrote:
>> Am 12.11.2018 um 20:58 hat John Snow geschrieben:
>>>
>>>
>>> On 11/11/18 4:40 AM, Mark Cave-Ayland wrote:
 Commit c8a35f1cf0f "fdc: use IsaDma interface instead of global DMA_*
 functions" accidentally introduced a segfault in fdctrl_stop_transfer() for
 non-DMA transfers.

 If fdctrl->dma_chann has not been configured then the fdctrl->dma interface
 reference isn't initialised during isabus_fdc_realize(). Unfortunately
 fdctrl_stop_transfer() unconditionally references the DMA interface when
 finishing the transfer causing a NULL pointer dereference.

 Fix the issue by adding a check in fdctrl_stop_transfer() so that the DMA
 interface reference and release method is only invoked if fdctrl->dma_chann
 has been set.

 (This issue was discovered by Martin testing a recent change in the NetBSD
 installer under qemu-system-sparc)

 Reported-by: Martin Husemann 
 Signed-off-by: Mark Cave-Ayland 
 ---
  hw/block/fdc.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/hw/block/fdc.c b/hw/block/fdc.c
 index 2e9c1e1e2f..6f19f127a5 100644
 --- a/hw/block/fdc.c
 +++ b/hw/block/fdc.c
 @@ -1617,7 +1617,7 @@ static void fdctrl_stop_transfer(FDCtrl *fdctrl, 
 uint8_t status0,
  fdctrl->fifo[5] = cur_drv->sect;
  fdctrl->fifo[6] = FD_SECTOR_SC;
  fdctrl->data_dir = FD_DIR_READ;
 -if (!(fdctrl->msr & FD_MSR_NONDMA)) {
 +if (fdctrl->dma_chann != -1 && !(fdctrl->msr & FD_MSR_NONDMA)) {
  IsaDmaClass *k = ISADMA_GET_CLASS(fdctrl->dma);
  k->release_DREQ(fdctrl->dma, fdctrl->dma_chann);
  }

>>>
>>> Thanks.
>>>
>>> Reviewed-by: John Snow 
>>>
>>> ... Kevin, would you mind staging this one-off for the next RC?
>>
>> No problem, I'm applying this to my block branch. However, my pull
>> request for -rc1 is already merged, so this will have to wait until next
>> week and -rc2.
>>
>> Kevin
>>
> 
> I think that should be fine. Thank you!

Thanks everyone! Kevin, any chance you could also add a CC: qemu-stable@ tag 
when you
apply this to your branch? This will help ensure that when the next NetBSD 
release
appears the fix should already be available for most people.


ATB,

Mark.



[Qemu-devel] [PATCH v4 11/23] hw/pvrdma: Add support to allow guest to configure GID table

2018-11-18 Thread Yuval Shaia
The control over the RDMA device's GID table is done by updating the
device's Ethernet function addresses.
Usually the first GID entry is determined by the MAC address, the second
by the first IPv6 address and the third by the IPv4 address. Other
entries can be added by adding more IP addresses. The opposite is the
same, i.e. whenever an address is removed, the corresponding GID entry
is removed.

The process is done by the network and RDMA stacks. Whenever an address
is added the ib_core driver is notified and calls the device driver
add_gid function which in turn update the device.

To support this in pvrdma device we need to hook into the create_bind
and destroy_bind HW commands triggered by pvrdma driver in guest.
Whenever changed is made to the pvrdma port's GID table a special QMP
messages is sent to be processed by libvirt to update the address of the
backend Ethernet device.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 268 
 hw/rdma/rdma_backend.h  |  22 +--
 hw/rdma/rdma_backend_defs.h |   3 +-
 hw/rdma/rdma_rm.c   | 104 +-
 hw/rdma/rdma_rm.h   |  17 ++-
 hw/rdma/rdma_rm_defs.h  |   9 +-
 hw/rdma/rdma_utils.h|  15 ++
 hw/rdma/vmw/pvrdma.h|   2 +-
 hw/rdma/vmw/pvrdma_cmd.c|  55 +---
 hw/rdma/vmw/pvrdma_main.c   |  25 +---
 hw/rdma/vmw/pvrdma_qp_ops.c |  20 +++
 11 files changed, 394 insertions(+), 146 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 7c220a5798..db78009bbb 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -18,12 +18,14 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qnum.h"
+#include "qapi/qapi-events-rdma.h"
 
 #include 
 #include 
 #include 
 #include 
 
+#include "contrib/rdmacm-mux/rdmacm-mux.h"
 #include "trace.h"
 #include "rdma_utils.h"
 #include "rdma_rm.h"
@@ -300,11 +302,35 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
-static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
-uint32_t num_sge)
+static int check_mux_op_status(CharBackend *mad_chr_be)
 {
-struct backend_umad umad = {0};
-char *hdr, *msg;
+RdmaCmMuxMsg msg = {0};
+int ret;
+
+ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *), sizeof(msg));
+if (ret != sizeof(msg)) {
+pr_dbg("Invalid message size %d, expecting %ld\n", ret, sizeof(msg));
+return -EIO;
+}
+
+if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
+pr_dbg("Invalid message type %d\n", msg.hdr.msg_type);
+return -EIO;
+}
+
+if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
+pr_dbg("Operation failed in mux, error code %d\n", msg.hdr.err_code);
+return -EIO;
+}
+
+return 0;
+}
+
+static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
+union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
+{
+RdmaCmMuxMsg msg = {0};
+char *hdr, *data;
 int ret;
 
 pr_dbg("num_sge=%d\n", num_sge);
@@ -313,26 +339,31 @@ static int mad_send(RdmaBackendDev *backend_dev, struct 
ibv_sge *sge,
 return -EINVAL;
 }
 
-umad.hdr.length = sge[0].length + sge[1].length;
-pr_dbg("msg_len=%d\n", umad.hdr.length);
+msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD;
+memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
+
+msg.umad_len = sge[0].length + sge[1].length;
+pr_dbg("umad_len=%d\n", msg.umad_len);
 
-if (umad.hdr.length > sizeof(umad.mad)) {
+if (msg.umad_len > sizeof(msg.umad.mad)) {
 return -ENOMEM;
 }
 
-umad.hdr.addr.qpn = htobe32(1);
-umad.hdr.addr.grh_present = 1;
-umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
-memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
-umad.hdr.addr.hop_limit = 1;
+msg.umad.hdr.addr.qpn = htobe32(1);
+msg.umad.hdr.addr.grh_present = 1;
+pr_dbg("sgid_idx=%d\n", sgid_idx);
+pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
+msg.umad.hdr.addr.gid_index = sgid_idx;
+memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
+msg.umad.hdr.addr.hop_limit = 1;
 
 hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
 if (!hdr) {
 pr_dbg("Fail to map to sge[0]\n");
 return -ENOMEM;
 }
-msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
-if (!msg) {
+data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+if (!data) {
 pr_dbg("Fail to map to sge[1]\n");
 rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
 return -ENOMEM;
@@ -341,25 +372,36 @@ static int mad_send(RdmaBackendDev *backend_dev, struct 
ibv_sge *sge,
 pr_dbg_buf("mad_hdr", hdr, sge[0].length);
 pr_dbg_buf("mad_data", data, sge[1].length);
 
-memcpy([0], hdr, sge[0].length);
-

[Qemu-devel] [PATCH v4 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid

2018-11-18 Thread Yuval Shaia
When device goes down the function fini_ports loops over all entries in
gid table regardless of the fact whether entry is valid or not. In case
that entry is not valid we'd like to skip from any further processing in
backend device.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/rdma_rm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index ca127c8c26..f5b1295890 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 {
 int rc;
 
+if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) {
+return 0;
+}
+
 rc = rdma_backend_del_gid(backend_dev, ifname,
   _res->port.gid_tbl[gid_idx].gid);
 if (rc) {
-- 
2.17.2




[Qemu-devel] [PATCH v4 23/23] docs: Update pvrdma device documentation

2018-11-18 Thread Yuval Shaia
Interface with the device is changed with the addition of support for
MAD packets.
Adjust documentation accordingly.

While there fix a minor mistake which may lead to think that there is a
relation between using RXE on host and the compatibility with bare-metal
peers.

Signed-off-by: Yuval Shaia 
---
 docs/pvrdma.txt | 103 +++-
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt
index 5599318159..f82b2a69d2 100644
--- a/docs/pvrdma.txt
+++ b/docs/pvrdma.txt
@@ -9,8 +9,9 @@ It works with its Linux Kernel driver AS IS, no need for any 
special guest
 modifications.
 
 While it complies with the VMware device, it can also communicate with bare
-metal RDMA-enabled machines and does not require an RDMA HCA in the host, it
-can work with Soft-RoCE (rxe).
+metal RDMA-enabled machines as peers.
+
+It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
 
 It does not require the whole guest RAM to be pinned allowing memory
 over-commit and, even if not implemented yet, migration support will be
@@ -78,29 +79,93 @@ the required RDMA libraries.
 
 3. Usage
 
+
+
+3.1 VM Memory settings
+==+++=
 Currently the device is working only with memory backed RAM
 and it must be mark as "shared":
-m 1G \
-object memory-backend-ram,id=mb1,size=1G,share \
-numa node,memdev=mb1 \
 
-The pvrdma device is composed of two functions:
- - Function 0 is a vmxnet Ethernet Device which is redundant in Guest
-   but is required to pass the ibdevice GID using its MAC.
-   Examples:
- For an rxe backend using eth0 interface it will use its mac:
-   -device vmxnet3,addr=.0,multifunction=on,mac=
- For an SRIOV VF, we take the Ethernet Interface exposed by it:
-   -device vmxnet3,multifunction=on,mac=
- - Function 1 is the actual device:
-   -device 
pvrdma,addr=.1,backend-dev=,backend-gid-idx=,backend-port=
-   where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4)
- Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's 
MAC.
- The rules of conversion are part of the RoCE spec, but since manual conversion
- is not required, spotting problems is not hard:
-Example: GID: fe80::::7efe:90ff:fecb:743a
- MAC: 7c:fe:90:cb:74:3a
-Note the difference between the first byte of the MAC and the GID.
+
+3.2 MAD Multiplexer
+===
+MAD Multiplexer is a service that exposes MAD-like interface for VMs in
+order to overcome the limitation where only single entity can register with
+MAD layer to send and receive RDMA-CM MAD packets.
+
+To build rdmacm-mux run
+# make rdmacm-mux
+
+The application accepts 3 command line arguments and exposes a UNIX socket
+to pass control and data to it.
+-s unix-socket-path   Path to unix socket to listen on
+  (default /var/run/rdmacm-mux)
+-d rdma-device-name   Name of RDMA device to register with
+  (default rxe0)
+-p rdma-device-port   Port number of RDMA device to register with
+  (default 1)
+The final UNIX socket file name is a concatenation of the 3 arguments so
+for example for device mlx5_0 on port 2 this /var/run/rdmacm-mux-mlx5_0-2
+will be created.
+
+Please refer to contrib/rdmacm-mux for more details.
+
+
+3.3 PCI devices settings
+
+RoCE device exposes two functions - an Ethernet and RDMA.
+To support it, pvrdma device is composed of two PCI functions, an Ethernet
+device of type vmxnet3 on PCI slot 0 and a PVRDMA device on PCI slot 1. The
+Ethernet function can be used for other Ethernet purposes such as IP.
+
+
+3.4 Device parameters
+=
+- netdev: Specifies the Ethernet device on host. For Soft-RoCE (rxe) this
+  would be the Ethernet device used to create it. For any other physical
+  RoCE device this would be the netdev name of the device.
+- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
+- mad-chardev: The name of the MAD multiplexer char device.
+- ibport: In case of multi-port device (such as Mellanox's HCA) this
+  specify the port to use. If not set 1 will be used.
+- dev-caps-max-mr-size: The maximum size of MR.
+- dev-caps-max-qp: Maximum number of QPs.
+- dev-caps-max-sge: Maximum number of SGE elements in WR.
+- dev-caps-max-cq: Maximum number of CQs.
+- dev-caps-max-mr: Maximum number of MRs.
+- dev-caps-max-pd: Maximum number of PDs.
+- dev-caps-max-ah: Maximum number of AHs.
+
+Notes:
+- The first 3 parameters are mandatory settings, the rest have their
+  defaults.
+- The last 8 parameters (the ones that prefixed by dev-caps) defines the top
+  limits but the final values is adjusted by the backend device limitations.
+
+3.5 Example
+===
+Define bridge device with vmxnet3 network backend:
+
+  
+  
+  
+  
+
+
+Define pvrdma device:
+
+  
+  
+  
+  
+  
+  
+  
+  
+
 
 
 
-- 
2.17.2




[Qemu-devel] [PATCH v4 07/23] hw/pvrdma: Make default pkey 0xFFFF

2018-11-18 Thread Yuval Shaia
Commit 6e7dba23af ("hw/pvrdma: Make default pkey 0x") exports
default pkey as external definition but omit the change from 0x7FFF to
0x.

Fixes: 6e7dba23af ("hw/pvrdma: Make default pkey 0x")

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index e3742d893a..15c3f28b86 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -52,7 +52,7 @@
 #define PVRDMA_FW_VERSION14
 
 /* Some defaults */
-#define PVRDMA_PKEY  0x7FFF
+#define PVRDMA_PKEY  0x
 
 typedef struct DSRInfo {
 dma_addr_t dma;
-- 
2.17.2




[Qemu-devel] [PATCH v4 20/23] hw/pvrdma: Clean device's resource when system is shutdown

2018-11-18 Thread Yuval Shaia
In order to clean some external resources such as GIDs, QPs etc,
register to receive notification when VM is shutdown.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum 
---
 hw/rdma/vmw/pvrdma.h  |  2 ++
 hw/rdma/vmw/pvrdma_main.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index 10a3c4fb7c..ffae36986e 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -17,6 +17,7 @@
 #define PVRDMA_PVRDMA_H
 
 #include "qemu/units.h"
+#include "qemu/notify.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
@@ -87,6 +88,7 @@ typedef struct PVRDMADev {
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
 VMXNET3State *func0;
+Notifier shutdown_notifier;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 150404dfa6..474cbfd78d 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "cpu.h"
 #include "trace.h"
+#include "sysemu/sysemu.h"
 
 #include "../rdma_rm.h"
 #include "../rdma_backend.h"
@@ -559,6 +560,14 @@ static int pvrdma_check_ram_shared(Object *obj, void 
*opaque)
 return 0;
 }
 
+static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
+{
+PVRDMADev *dev = container_of(n, PVRDMADev, shutdown_notifier);
+PCIDevice *pci_dev = PCI_DEVICE(dev);
+
+pvrdma_fini(pci_dev);
+}
+
 static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 {
 int rc;
@@ -632,6 +641,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 goto out;
 }
 
+dev->shutdown_notifier.notify = pvrdma_shutdown_notifier;
+qemu_register_shutdown_notifier(>shutdown_notifier);
+
 out:
 if (rc) {
 error_append_hint(errp, "Device fail to load\n");
-- 
2.17.2




[Qemu-devel] [Bug 1803872] Re: gcc 8.2 reports stringop-truncation when building qemu

2018-11-18 Thread Amir Gonnen
** Description changed:

  QEMU 3.0
  
  block/sheepdog.c: In function 'find_vdi_name':
  block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals 
destination size [-Werror=stringop-truncation]
-  strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
-  ^~
+  strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
+  ^~
  
- 
- If this is the intended behavior, please suppress the warning. For example:
+ If this is the intended behavior, please suppress the warning. For
+ example:
  
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wstringop-truncation"
- strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
+ strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
  #pragma GCC diagnostic pop
+ 
+ This also happens on other sources, for example hw/acpi/core.c, so
+ another option is to suppress it globally on CFLAGS (-Wno-stringop-
+ truncation)

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1803872

Title:
  gcc 8.2 reports stringop-truncation when building qemu

Status in QEMU:
  New

Bug description:
  QEMU 3.0

  block/sheepdog.c: In function 'find_vdi_name':
  block/sheepdog.c:1239:5: error: 'strncpy' specified bound 256 equals 
destination size [-Werror=stringop-truncation]
   strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
   ^~

  If this is the intended behavior, please suppress the warning. For
  example:

  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wstringop-truncation"
  strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
  #pragma GCC diagnostic pop

  This also happens on other sources, for example hw/acpi/core.c, so
  another option is to suppress it globally on CFLAGS (-Wno-stringop-
  truncation)

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1803872/+subscriptions



Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap

2018-11-18 Thread Alex Bennée


Dario Faggioli  writes:

> Dockerfile for building an openSUSE Leap container.
>
> Tracks the latest release (at the time of writing this
> patch, it is Leap 15).
>
> Signed-off-by: Dario Faggioli 
> ---
> Cc: "Alex Bennée" 
> Cc: Fam Zheng 
> Cc: "Philippe Mathieu-Daudé" 
> ---
>  tests/docker/dockerfiles/opensuse-leap.docker |   62 
> +
>  1 file changed, 62 insertions(+)
>  create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker
>
> diff --git a/tests/docker/dockerfiles/opensuse-leap.docker 
> b/tests/docker/dockerfiles/opensuse-leap.docker
> new file mode 100644
> index 00..9d00861e66
> --- /dev/null
> +++ b/tests/docker/dockerfiles/opensuse-leap.docker
> @@ -0,0 +1,62 @@
> +FROM opensuse/leap
> +ENV PACKAGES \

> +tar \
> +usbredir-devel \
> +virglrenderer-devel \
> +vte-devel \
> +which \
> +xen-devel
> +zlib-devel \

This hasn't been tested because the docker image fails to build due to
continuation breakage.

--
Alex Bennée



[Qemu-devel] 3.1.0-rc{0,1} doesn't start

2018-11-18 Thread balducci
hello

I'm building qemu from source and happily using it since a bit
(2.3.0)

Since 3.1.0-rc0 (including latest 3.1.0-rc1) I'm no more able to start
qemu, getting:

8<
install:115> qemu
qemu: error: failed to set MSR 0x10a to 0x0
qemu: 
/home/balducci/tmp/install-us-d/qemu-3.1.0-rc1.d/qemu-3.1.0-rc0/target/i386/kvm.c:2185:
 kvm_put_msrs: Assertion `ret == cpu->kvm_msr_buf->nmsrs' failed.
Aborted
>8

I have no idea about what the reason might be, apologies.

Actually, I have found a recent (2018-10-16) post which might be
related to this (it mentions the same error message from qemu):
https://lkml.org/lkml/2018/10/16/440; but I'm not in the position to
go through. AFAICS, the commit mentioned in the link is present in the
4.19.2 kernel I'm using, so...?

I can add that 3.0.0 works nicely (everything else unchanged,
including running kernel 4.19.2)

OTOH, 3.1.0-rc0 dumps the same error message if I boot into 4.17.14 or
4.18.16 kernels.

I enclose my specs hoping that somebody can spot where the problem
might be. I will be happy to send any other detail which might be
useful.

I suspect that this might be some problem on my side, as I couldn't
find any similar report (apart some old (qemu-2.8.50) threads, that
didn't help)


thanks a lot in advance for any hint/help

ciao
gabriele


Here are my specs:

# 
# command to run qemu is:
qemu -m 2G /opt/windog  \
-accel kvm,thread=multi \
-netdev user,id=net0,smb=/home/balducci \
-device rtl8139,netdev=net0

# 
# qemu build configuration:
--prefix=/opt/stow.d/versions/qemu-3.1.0-rc1/usr
--libdir=/opt/stow.d/versions/qemu-3.1.0-rc1/usr/lib64
--sysconfdir=/opt/stow.d/versions/qemu-3.1.0-rc1/etc
--localstatedir=/var/run
--docdir=/opt/stow.d/versions/qemu-3.1.0-rc1/usr/share/doc/qemu
--target-list=x86_64-softmmu
--audio-drv-list=alsa


# 
install:154> uname -sr
Linux 4.19.2

# 
install:155> cat /proc/cpuinfo
processor   : 0
vendor_id   : AuthenticAMD
cpu family  : 21
model   : 48
model name  : AMD Athlon(tm) X4 860K Quad Core Processor
stepping: 1
microcode   : 0x6003106
cpu MHz : 3473.492
cache size  : 2048 KB
physical id : 0
siblings: 4
core id : 0
cpu cores   : 2
apicid  : 16
initial apicid  : 0
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb 
rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf 
pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 popcnt aes xsave avx f16c 
lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch 
osvw ibs xop skinit wdt lwp fma4 tce nodeid_msr tbm topoext perfctr_core 
perfctr_nb bpext ptsc cpb hw_pstate ssbd vmmcall fsgsbase bmi1 xsaveopt arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists 
pausefilter pfthreshold overflow_recov
bugs: fxsave_leak sysret_ss_attrs null_seg spectre_v1 spectre_v2 
spec_store_bypass
bogomips: 7380.73
TLB size: 1536 4K pages
clflush size: 64
cache_alignment : 64
address sizes   : 48 bits physical, 48 bits virtual
power management: ts ttp tm 100mhzsteps hwpstate cpb eff_freq_ro [13]

[...cpus 1 2 3 omitted...]


# 
install:156> egrep KVM .config-4.19.2
CONFIG_HAVE_KVM=y
CONFIG_HAVE_KVM_IRQCHIP=y
CONFIG_HAVE_KVM_IRQFD=y
CONFIG_HAVE_KVM_IRQ_ROUTING=y
CONFIG_HAVE_KVM_EVENTFD=y
CONFIG_KVM_MMIO=y
CONFIG_KVM_ASYNC_PF=y
CONFIG_HAVE_KVM_MSI=y
CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y
CONFIG_KVM_VFIO=y
CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y
CONFIG_KVM_COMPAT=y
CONFIG_HAVE_KVM_IRQ_BYPASS=y
CONFIG_KVM=y
# CONFIG_KVM_INTEL is not set
CONFIG_KVM_AMD=y

[of course, I can send the whole kernel configuration file, if needed]




[Qemu-devel] [RFC PATCH 1/2] docker: Add gentoo-mipsr5900el-cross image

2018-11-18 Thread Philippe Mathieu-Daudé
This image is based on Gentoo and the toolchain is built using crossdev.

Recipe from:
  https://lists.gnu.org/archive/html/qemu-devel/2018-09/msg03944.html

Suggested-by: Fredrik Noring 
Signed-off-by: Philippe Mathieu-Daudé 
---
TODO:
- Add Fredrik Noring S-o-b in his patches
- Check patch merged upstream

 tests/docker/Makefile.include |   6 +
 .../gentoo-mipsr5900el-cross.docker   |  39 
 .../binutils-v2.30-ps2-llsc.patch |  36 +++
 .../crossdev.conf |   5 +
 .../gcc-v7.2.0-ps2-llsc.patch |  23 ++
 .../gcc-v7.2.0-ps2.patch  | 219 ++
 6 files changed, 328 insertions(+)
 create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/crossdev.conf
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2-llsc.patch
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2.patch

diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include
index 9467e9d088..6ca615206f 100644
--- a/tests/docker/Makefile.include
+++ b/tests/docker/Makefile.include
@@ -119,6 +119,12 @@ docker-image-debian-sparc64-cross: docker-image-debian-sid
 docker-image-debian-mips64-cross: docker-image-debian-sid
 docker-image-debian-riscv64-cross: docker-image-debian-sid
 docker-image-debian-powerpc-cross: docker-image-debian-sid
+docker-image-gentoo-mipsr5900el-cross: EXTRA_FILES:=$(addprefix \
+   
$(SRC_PATH)/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/,\
+   crossdev.conf \
+   binutils-v2.30-ps2-llsc.patch \
+   gcc-v7.2.0-ps2.patch \
+   gcc-v7.2.0-ps2-llsc.patch)
 docker-image-travis: NOUSER=1
 
 # Specialist build images, sometimes very limited tools
diff --git a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker 
b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
new file mode 100644
index 00..dbc2eb007b
--- /dev/null
+++ b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
@@ -0,0 +1,39 @@
+#
+# Docker mipsel (r5900) cross-compiler target
+#
+# Using multi-stage builds, this image requires docker-17.05.0 or later.
+# (See: https://github.com/gentoo/gentoo-docker-images)
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# name the portage image
+FROM gentoo/portage:latest as portage
+
+# image is based on stage3-amd64
+FROM gentoo/stage3-amd64:latest
+
+# copy the entire portage volume in
+COPY --from=portage /usr/portage /usr/portage
+
+MAINTAINER Philippe Mathieu-Daudé 
+
+# continue with image build ...
+RUN emerge -qv sys-devel/crossdev
+
+# set CROSSDEV_OVERLAY to /usr/local/portage-crossdev
+RUN mkdir -p /usr/local/portage-crossdev/{profiles,metadata} && \
+echo 'crossdev' > /usr/local/portage-crossdev/profiles/repo_name && \
+echo 'masters = gentoo' > /usr/local/portage-crossdev/metadata/layout.conf 
&& \
+chown -R portage:portage /usr/local/portage-crossdev && \
+mkdir -p /etc/portage/repos.conf
+ADD crossdev.conf /etc/portage/repos.conf/crossdev.conf
+
+# Fredrik's patches
+RUN mkdir -p 
/etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/{binutils,gcc}
+ADD binutils-v2.30-ps2-llsc.patch 
/etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/binutils
+ADD gcc-v7.2.0-ps2.patch 
/etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/gcc
+ADD gcc-v7.2.0-ps2-llsc.patch 
/etc/portage/patches/cross-mipsr5900el-unknown-linux-gnu/gcc
+
+RUN crossdev -s3 -t mipsr5900el-unknown-linux-gnu --binutils ">=2.30" --gcc 
">=7.2.0"
+
+ENV QEMU_CONFIGURE_OPTS --cross-prefix=mipsr5900el-unknown-linux-gnu-
diff --git 
a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch
 
b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch
new file mode 100644
index 00..62ecc3267a
--- /dev/null
+++ 
b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch
@@ -0,0 +1,36 @@
+diff --git a/opcodes/mips-opc.c b/opcodes/mips-opc.c
+index 1cbcbc6..0986b0a 100644
+--- a/opcodes/mips-opc.c
 b/opcodes/mips-opc.c
+@@ -1280,11 +1280,11 @@ const struct mips_opcode mips_builtin_opcodes[] =
+ {"li.s",  "t,f",  0,(int) M_LI_S, INSN_MACRO, 
INSN2_M_FP_S,   I1, 0,  0 },
+ {"li.s",  "T,l",  0,(int) M_LI_SS,INSN_MACRO, 
INSN2_M_FP_S,   I1, 0,  0 },
+ {"ll","t,+j(b)",  0x7c36, 0xfc7f, 
WR_1|RD_3|LM,   0,  I37,0,  0 },
+-{"ll","t,o(b)",   

[Qemu-devel] [RFC PATCH 0/2] docker: Add gentoo-mipsr5900el-cross image

2018-11-18 Thread Philippe Mathieu-Daudé
Hi,

The first patch adds a cross toolchain for the R5900 MIPS.
It is working correctly but the patches provided by Fredrik in [1] don't
have proper S-o-b, thus it is tagged RFC.
Fredrik: any update on the status of those patches upstream?
I setup this image to try Fredrik's TCG tests in [2].

The second patch intents to have Shippable CI build the image and run the
TCG tests.
Since current idea of the Shippable tests is to cross-build QEMU for the
target, it does not work (and makes sense) here. I don't think there is
interest in running cross-compiled QEMU on a PS2... But we never know ;)
The failure I have is pkg-config using an incorrect path. I suppose I
am not using the CROSSDEV_OVERLAY path correctly.
Help from Gentoo developers would be appreciated!

I run the tests using:

$ docker-image-gentoo-mipsr5900el-cross
$ docker run --rm -it -v $PWD:$PWD \
-w $PWD/tests/tcg/mips/mipsr5900 qemu:gentoo-mipsr5900el-cross \
make all
$ make -C $PWD/tests/tcg/mips/mipsr5900 \
check \
SIM=$PWD/build/mipsel-softmmu/qemu-system-mipsel

With this minor modification:
-- >8 --
diff --git a/tests/tcg/mips/mipsr5900/Makefile 
b/tests/tcg/mips/mipsr5900/Makefile
@@ -5 +5 @@ CROSS=mipsr5900el-unknown-linux-gnu-
-SIM=qemu-mipsel
+SIM?=qemu-mipsel
---

Regards,

Phil.

[1]  https://lists.gnu.org/archive/html/qemu-devel/2018-09/msg03944.html
[2] https://lists.gnu.org/archive/html/qemu-devel/2018-11/msg01284.html

Philippe Mathieu-Daudé (2):
  docker: Add gentoo-mipsr5900el-cross image
  shippable: Add the mipsr5900el linux-user target

 .shippable.yml|   2 +
 tests/docker/Makefile.include |   6 +
 .../gentoo-mipsr5900el-cross.docker   |  42 
 .../binutils-v2.30-ps2-llsc.patch |  36 +++
 .../crossdev.conf |   5 +
 .../gcc-v7.2.0-ps2-llsc.patch |  23 ++
 .../gcc-v7.2.0-ps2.patch  | 219 ++
 7 files changed, 333 insertions(+)
 create mode 100644 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/binutils-v2.30-ps2-llsc.patch
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/crossdev.conf
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2-llsc.patch
 create mode 100644 
tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker.d/gcc-v7.2.0-ps2.patch

-- 
2.17.2




[Qemu-devel] [RFC PATCH 2/2] shippable: Add the mipsr5900el linux-user target

2018-11-18 Thread Philippe Mathieu-Daudé
Shippable node pool must be updated to use at lease v5.8.2:

  http://docs.shippable.com/platform/runtime/machine-image/ami-v582/

Signed-off-by: Philippe Mathieu-Daudé 
---
 .shippable.yml   | 2 ++
 tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/.shippable.yml b/.shippable.yml
index f74a3de3ff..bd3316df91 100644
--- a/.shippable.yml
+++ b/.shippable.yml
@@ -25,6 +25,8 @@ env:
   TARGET_LIST=mips64el-softmmu,mips64el-linux-user
 - IMAGE=debian-ppc64el-cross
   TARGET_LIST=ppc64-softmmu,ppc64-linux-user,ppc64abi32-linux-user
+- IMAGE=gentoo-mipsr5900el-cross
+  TARGET_LIST=mips64el-linux-user
 build:
   pre_ci:
 - make docker-image-${IMAGE} V=1
diff --git a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker 
b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
index dbc2eb007b..b35f779718 100644
--- a/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
+++ b/tests/docker/dockerfiles/gentoo-mipsr5900el-cross.docker
@@ -37,3 +37,8 @@ ADD gcc-v7.2.0-ps2-llsc.patch 
/etc/portage/patches/cross-mipsr5900el-unknown-lin
 RUN crossdev -s3 -t mipsr5900el-unknown-linux-gnu --binutils ">=2.30" --gcc 
">=7.2.0"
 
 ENV QEMU_CONFIGURE_OPTS --cross-prefix=mipsr5900el-unknown-linux-gnu-
+
+# kludge for Shippable which checks clang available.
+RUN ln -s /bin/true /usr/bin/clang
+
+RUN emerge -qv dev-vcs/git ">=dev-libs/glib-2.0" sys-libs/zlib dev-lang/python
-- 
2.17.2




Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap

2018-11-18 Thread Dario Faggioli
On Sun, 2018-11-18 at 19:47 +, Alex Bennée wrote:
> Dario Faggioli  writes:
> 
> > Dockerfile for building an openSUSE Leap container.
> > 
> > Tracks the latest release (at the time of writing this
> > patch, it is Leap 15).
> > 
> > Signed-off-by: Dario Faggioli 
> > ---
> > Cc: "Alex Bennée" 
> > Cc: Fam Zheng 
> > Cc: "Philippe Mathieu-Daudé" 
> > ---
> >  tests/docker/dockerfiles/opensuse-leap.docker |   62
> > +
> >  1 file changed, 62 insertions(+)
> >  create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker
> > 
> > diff --git a/tests/docker/dockerfiles/opensuse-leap.docker
> > b/tests/docker/dockerfiles/opensuse-leap.docker
> > new file mode 100644
> > index 00..9d00861e66
> > --- /dev/null
> > +++ b/tests/docker/dockerfiles/opensuse-leap.docker
> > @@ -0,0 +1,62 @@
> > +FROM opensuse/leap
> > +ENV PACKAGES \
> 
> > +tar \
> > +usbredir-devel \
> > +virglrenderer-devel \
> > +vte-devel \
> > +which \
> > +xen-devel
> > +zlib-devel \
> 
> This hasn't been tested because the docker image fails to build due
> to
> continuation breakage.
>
It is indeed broken.

Basically, I tested it thoroughly, and I'm quite sure it works ok
(although, I'm no docker expert).

Then, when I was about to send the mail, I saw in another commit that
it was best to have the package list sorted; so I did that, and forgot
to adjust the continuation. :-(

Sorry.

I'll send a v2 with this fixed.

Thanks and Regards,
Dario
-- 
<> (Raistlin Majere)
-
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Software Engineer @ SUSE https://www.suse.com/


signature.asc
Description: This is a digitally signed message part


Re: [Qemu-devel] [PATCH] docker: dockerfile for openSUSE Leap

2018-11-18 Thread Philippe Mathieu-Daudé
Hi Dario,

On Sun, Nov 18, 2018 at 10:54 PM Dario Faggioli  wrote:
>
> On Sun, 2018-11-18 at 19:47 +, Alex Bennée wrote:
> > Dario Faggioli  writes:
> >
> > > Dockerfile for building an openSUSE Leap container.
> > >
> > > Tracks the latest release (at the time of writing this
> > > patch, it is Leap 15).
> > >
> > > Signed-off-by: Dario Faggioli 
> > > ---
> > > Cc: "Alex Bennée" 
> > > Cc: Fam Zheng 
> > > Cc: "Philippe Mathieu-Daudé" 
> > > ---
> > >  tests/docker/dockerfiles/opensuse-leap.docker |   62
> > > +
> > >  1 file changed, 62 insertions(+)
> > >  create mode 100644 tests/docker/dockerfiles/opensuse-leap.docker
> > >
> > > diff --git a/tests/docker/dockerfiles/opensuse-leap.docker
> > > b/tests/docker/dockerfiles/opensuse-leap.docker
> > > new file mode 100644
> > > index 00..9d00861e66
> > > --- /dev/null
> > > +++ b/tests/docker/dockerfiles/opensuse-leap.docker
> > > @@ -0,0 +1,62 @@
> > > +FROM opensuse/leap
> > > +ENV PACKAGES \
> > 
> > > +tar \
> > > +usbredir-devel \
> > > +virglrenderer-devel \
> > > +vte-devel \
> > > +which \
> > > +xen-devel
> > > +zlib-devel \
> >
> > This hasn't been tested because the docker image fails to build due
> > to
> > continuation breakage.
> >
> It is indeed broken.
>
> Basically, I tested it thoroughly, and I'm quite sure it works ok
> (although, I'm no docker expert).

I quickly fixed/tried it.
First this package list pulls many unuseful system packages.
I added '--no-recommends' to reduce a bit:

RUN zypper ref && \
zypper up -y && \
zypper install -y --no-recommends $PACKAGES

This still install a bunch of unnecessary stuffs:

Creating group systemd-journal with gid 485.
Creating group systemd-network with gid 484.
Creating user systemd-network (systemd Network Management) with uid
484 and gid 484.
Creating group systemd-coredump with gid 483.
Creating user systemd-coredump (systemd Core Dumper) with uid 483 and gid 483.
Creating group systemd-timesync with gid 482.
Creating user systemd-timesync (systemd Time Synchronization) with uid
482 and gid 482.
Failed to connect to bus: No such file or directory

Then the resulting image takes the same than without --no-recommends:
1.1GB. No idea why.
At some point some package tried to talk to the network manager via
DBus to start sshd...

Second, while this works on x86_64, it fails on aarch64:

Problem: libcurl-devel-7.59.0-lp150.1.2.aarch64 requires libcurl4 =
7.59.0, but this requirement cannot be provided
  not installable providers: libcurl4-7.59.0-lp150.1.2.aarch64[repo-oss]
 Solution 1: downgrade of libcurl4-7.60.0-lp150.2.15.1.aarch64 to
libcurl4-7.59.0-lp150.1.2.aarch64
 Solution 2: do not install libcurl-devel-7.59.0-lp150.1.2.aarch64
 Solution 3: break libcurl-devel-7.59.0-lp150.1.2.aarch64 by ignoring
some of its dependencies

Not your fault. So I suppose you are planning to use this image to
compile QEMU for openSUSE/x86_64 only, is this correct?

Regards,

Phil.

>
> Then, when I was about to send the mail, I saw in another commit that
> it was best to have the package list sorted; so I did that, and forgot
> to adjust the continuation. :-(
>
> Sorry.
>
> I'll send a v2 with this fixed.
>
> Thanks and Regards,
> Dario
> --
> <> (Raistlin Majere)
> -
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Software Engineer @ SUSE https://www.suse.com/



Re: [Qemu-devel] SeaBIOS booting time optimization

2018-11-18 Thread Samuel Ortiz
Hi Stefano,

On Fri, Nov 16, 2018 at 05:13:59PM +0100, Stefano Garzarella wrote:
> Hi,
> I'm investigating the SeaBIOS booting time, to understand if we can reduce
> the boot time in some cases (e.g. legacy hardware is not needed). I think
> this
> can be interesting also for NEMU developers.
Definitely, thanks.


> Following this thread (
> https://mail.coreboot.org/pipermail/seabios/2015-July/009497.html),
> I'm using qboot (https://github.com/bonzini/qboot) to compare the SeaBIOS
> booting time.
> 
> As Paolo did in qboot, I manually add small debug port writes in SeaBIOS and
> linuxboot_dma.c (QEMU) to trace the events with perf (kvm_pio events)
When doing similar measuremnts, I also used the debug-exit qemu device
in order to make imho simpler measuremnts, see
https://github.com/intel/nemu/wiki/Measuring-Boot-Latency

Probably not as precised as yours, but it was enough for what we tried
to characterize.

> The goal is to have only one image of SeaBIOS configurable at runtime to
> reduce
> the boot time, avoiding unnecessary initialization.
> 
> Any pointers or suggestions would be helpful.
> 
> Following I put some preliminary measurements that I obtained:
> I used this QEMU command line:
> ./qemu-system-x86_64 -bios $BIOS -m 1G -cpu host -M accel=kvm \
> -kernel /boot/vmlinuz-4.18.18-300.fc29.x86_64 -append 'console=ttyS0' \
> -nographic -serial mon:stdio
> 
> For each test, I measured these times (in milliseconds) relative to the
> "sched_process_exec" event:
> - qemu_init_end: first kvm_entry (i.e. QEMU initialized has finished)
> - fw_start: first entry of the BIOS
> - fw_do_boot: after the BIOS initialization (e.g. PCI setup, etc.)
> - linux_start_boot: before the jump to the Linux kernel
Are you planning to also measure the total time to userspace as well?
How we set the hardware up from the FW/BIOS can also influence the
overall (up to ring 3) boot latency on our experiments.

> # qboot
> BIOS=/home/stefano/repos/qboot/bios.bin
>  qemu_init_end: 40.561234
>  fw_start: 40.721729 (+0.160495)
>  fw_do_boot: 47.025591 (+6.303862)
>  linux_start_boot: 48.874112 (+1.848521)
> 
> # SeaBIOS with default configuration
> BIOS=/home/stefano/repos/seabios/out_default/bios.bin
>  qemu_init_end: 40.419451
>  fw_start: 40.639967 (+0.220516)
>  fw_do_boot: 886.668828 (+846.028861)
>  linux_start_boot: 889.723547 (+3.054719)
> 
> # SeaBIOS with Kevin's configuration
> # (https://mail.coreboot.org/pipermail/seabios/2015-July/009508.html)
> # Note: this SeaBIOS setup is so stripped down that it can't actually boot
> an OS
> BIOS=/home/stefano/repos/seabios/out_kevin/bios.bin
>  qemu_init_end: 40.676412
>  fw_start: 40.755757 (+0.079345)
>  fw_do_boot: 56.427023 (+15.671266)
That's a slight improvement ;)


> I did the same tests also with NEMU (without using -M virt) and I have
> approximately the same results.
Yes, as expected.


> As the next step, I'll start from Kevin's configuration to have a minimal
> SeaBIOS image ables to boot a Linux kernel.
Please keep us posted.

Cheers,
Samuel.



Re: [Qemu-devel] [PATCH 1/2] hw: fw_cfg: refactor fw_cfg_bootsplash()

2018-11-18 Thread 李强









At 2018-11-17 00:33:34, "Markus Armbruster"  wrote:
>Li Qiang  writes:
>
>> Currently when the splash-time value is bigger than 0x
>> we report and correct it, when it is less than 0 we just ingore it.
>
>s/ingore/ignore/
>
>> Also we use qemu_opt_get() to get 'splash-time', then convert it to a number
>> ourselves. This is wrong.
>
>Well, doing it that way isn't wrong, it's just needlessly complicated
>and error-prone.
>
>Suggest starting a new paragraph right here.
>
>>   This patch does following:
>> 1. use qemu_opt_get_number() to parse 'splash-time'
>> 2. exit when the splash-time is invalid or loading the splash file failed
>> 3. simplify code
>>
>> Signed-off-by: Li Qiang 
>> ---
>>  hw/nvram/fw_cfg.c | 40 
>>  vl.c  |  2 +-
>>  2 files changed, 17 insertions(+), 25 deletions(-)
>>
>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
>> index 946f765f7f..78f43dad93 100644
>> --- a/hw/nvram/fw_cfg.c
>> +++ b/hw/nvram/fw_cfg.c
>> @@ -118,55 +118,47 @@ error:
>>  
>>  static void fw_cfg_bootsplash(FWCfgState *s)
>>  {
>> -int boot_splash_time = -1;
>>  const char *boot_splash_filename = NULL;
>> -char *p;
>> +const char *boot_splash_time = NULL;
>>  char *filename, *file_data;
>>  gsize file_size;
>>  int file_type;
>> -const char *temp;
>>  
>>  /* get user configuration */
>>  QemuOptsList *plist = qemu_find_opts("boot-opts");
>>  QemuOpts *opts = QTAILQ_FIRST(>head);
>> -if (opts != NULL) {
>> -temp = qemu_opt_get(opts, "splash");
>> -if (temp != NULL) {
>> -boot_splash_filename = temp;
>> -}
>> -temp = qemu_opt_get(opts, "splash-time");
>> -if (temp != NULL) {
>> -p = (char *)temp;
>> -boot_splash_time = strtol(p, , 10);
>> -}
>> -}
>> +boot_splash_filename = qemu_opt_get(opts, "splash");
>> +boot_splash_time = qemu_opt_get(opts, "splash-time");
>
>You first get "splash-time" as a string, and then ...
>>  
>>  /* insert splash time if user configurated */
>> -if (boot_splash_time >= 0) {
>> +if (boot_splash_time) {
>> +int64_t bst_val = qemu_opt_get_number(opts, "splash-time", -1);
>
>... you get it again as a number.  I figure you do that because
>"splash-time not specified" is not the same as "splash-time=T" for any
>T.  I don't like such interfaces.  Not this patch's fault.
>
>Just noticed: qemu_extra_params_fw[] has external linkage, but is used
>only in this function.  Care to make it static in this function in a

>separate patch?


Will do in the next revision.


Thanks,
Li Qiang


>
>>  /* validate the input */
>> -if (boot_splash_time > 0x) {
>> -error_report("splash time is big than 65535, force it to 
>> 65535.");
>> -boot_splash_time = 0x;
>> +if (bst_val < 0 || bst_val > 0x) {
>> +error_report("splash time is invalid,"
>> + "it should be a value between 0 and 65535");
>> +exit(1);
>>  }
>>  /* use little endian format */
>> -qemu_extra_params_fw[0] = (uint8_t)(boot_splash_time & 0xff);
>> -qemu_extra_params_fw[1] = (uint8_t)((boot_splash_time >> 8) & 0xff);
>> +qemu_extra_params_fw[0] = (uint8_t)(bst_val & 0xff);
>> +qemu_extra_params_fw[1] = (uint8_t)((bst_val >> 8) & 0xff);
>>  fw_cfg_add_file(s, "etc/boot-menu-wait", qemu_extra_params_fw, 2);
>>  }
>>  
>>  /* insert splash file if user configurated */
>> -if (boot_splash_filename != NULL) {
>> +if (boot_splash_filename) {
>>  filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, 
>> boot_splash_filename);
>>  if (filename == NULL) {
>> -error_report("failed to find file '%s'.", boot_splash_filename);
>> -return;
>> +error_report("failed to find file '%s'", boot_splash_filename);
>> +exit(1);
>>  }
>>  
>>  /* loading file data */
>>  file_data = read_splashfile(filename, _size, _type);
>>  if (file_data == NULL) {
>>  g_free(filename);
>> -return;
>> +error_report("failed to read file '%s'", boot_splash_filename);
>> +exit(1);
>>  }
>>  g_free(boot_splash_filedata);
>>  boot_splash_filedata = (uint8_t *)file_data;
>> diff --git a/vl.c b/vl.c
>> index 55bab005b6..be37da46f0 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -336,7 +336,7 @@ static QemuOptsList qemu_boot_opts = {
>>  .type = QEMU_OPT_STRING,
>>  }, {
>>  .name = "splash-time",
>> -.type = QEMU_OPT_STRING,
>> +.type = QEMU_OPT_NUMBER,
>>  }, {
>>  .name = "reboot-timeout",
>>  .type = QEMU_OPT_STRING,
>
>Reviewed-by: Markus Armbruster 


Re: [Qemu-devel] [PATCH v5 00/11] hw/m68k: add Apple Machintosh Quadra 800 machine

2018-11-18 Thread Rob Landley
On 11/2/18 6:25 AM, Laurent Vivier wrote:
>> If that does not work, I'm also fine if we simply deprecate the simcalls
>> (if possible).
> 
> I have a patch to deprecate the interface. I will send it once the
> release will be done.

Did I ever point out that I got m68k running under your q800 emulation in my
https://github.com/landley/mkroot project?

Now that m68k is in musl-libc, you can build
https://github.com/richfelker/musl-cross-make or grab a prebuilt binary a
toolchain from http://b.zv.io/mcm/bin/m68k-linux-musl-cross.tar.gz and then:

  CROSS_COMPILE=/path/to/m68k-linux-musl-cross/bin/m68k-linux-musl- \
./mkroot.sh kernel

And then:

  cd output/m68k
  ./qemu-m68k.sh -hda log.txt

The -hda is just so you have a hard drive image you can play with. Cat /dev/?da
should show you the log.txt contents. The network interface works too, although
I haven't got wget in toybox yet but:

  echo GET / | netcat landley.net /

Works. :)

When does this go upstream?

P.S. to build a current kernel:

  cd download
  git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux
  cd ..

And then mkroot.sh again. I tested with 4.20-rc1 and m68k worked fine.

> Thanks,
> Laurent

Rob



Re: [Qemu-devel] [PATCH qemu RFC 7/7] spapr: Add NVLink2 pass through support

2018-11-18 Thread David Gibson
On Tue, Nov 13, 2018 at 07:31:04PM +1100, Alexey Kardashevskiy wrote:
> The NVIDIA V100 GPU comes with some on-board RAM which is mapped into
> the host memory space and accessible as normal RAM via NVLink bus.
> The VFIO-PCI driver implements special regions for such GPU and emulated
> NVLink bridge (below referred as NPU). The POWER9 CPU also provides
> address translation services which includes TLB invalidation register
> exposes via the NVLink bridge; the feature is called "ATSD".
> 
> This adds a quirk to VFIO to map the memory and create an MR; the new MR
> is stored in a GPU as a QOM link. The sPAPR PCI uses this to get the MR
> and map it to the system address space. Another quirk does the same for
> ATSD.
> 
> This adds 3 additional steps to the FDT builder in spapr-pci:
> 1. Search for specific GPUs and NPUs, collects findings in sPAPRPHBState;
> 2. Adds several properties in the DT: "ibm,npu", "ibm,gpu", "memory-block",
> and some other. These are required by the guest platform and GPU driver;
> this also adds a new made-up compatible type for a PHB to signal
> a modified guest that this particular PHB needs the default DMA window
> removed as these GPUs have limited DMA mask size (way lower than usual 59);
> 3. Adds new memory blocks with one addition - they have
> "linux,memory-usable" property configured in the way which prevents
> the guest from onlining it automatically as it needs to be deferred till
> the guest GPU driver trains NVLink.
> 
> A couple of notes:
> - this changes the FDT rendeder as doing 1-2-3 from sPAPRPHBClass::realize
> impossible - devices are not yet attached;
> - this does not add VFIO quirk MRs to the system address space as
> the address is selected in sPAPRPHBState, similar to MMIO.
> 
> This puts new memory nodes in a separate NUMA node to replicate the host
> system setup as close as possible (the GPU driver relies on this too).
> 
> This adds fake NPU nodes to make the guest platform code work,
> specifically "ibm,npu-link-index".
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  hw/vfio/pci.h   |   2 +
>  include/hw/pci-host/spapr.h |  28 
>  include/hw/ppc/spapr.h  |   3 +-
>  hw/ppc/spapr.c  |  14 +-
>  hw/ppc/spapr_pci.c  | 256 +++-
>  hw/vfio/pci-quirks.c|  93 +
>  hw/vfio/pci.c   |  14 ++
>  hw/vfio/trace-events|   3 +
>  8 files changed, 408 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index f4c5fb6..b8954cc 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -195,6 +195,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
>  int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
> struct vfio_region_info *info,
> Error **errp);
> +int vfio_pci_nvlink2_ram_init(VFIOPCIDevice *vdev, Error **errp);
> +int vfio_pci_npu2_atsd_init(VFIOPCIDevice *vdev, Error **errp);
>  
>  void vfio_display_reset(VFIOPCIDevice *vdev);
>  int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
> index 7c66c38..1f8ebf3 100644
> --- a/include/hw/pci-host/spapr.h
> +++ b/include/hw/pci-host/spapr.h
> @@ -87,6 +87,24 @@ struct sPAPRPHBState {
>  uint32_t mig_liobn;
>  hwaddr mig_mem_win_addr, mig_mem_win_size;
>  hwaddr mig_io_win_addr, mig_io_win_size;
> +hwaddr nv2_gpa_win_addr;
> +hwaddr nv2_atsd_win_addr;
> +
> +struct spapr_phb_pci_nvgpu_config {
> +uint64_t nv2_ram;
> +uint64_t nv2_atsd;
> +int num;
> +struct {
> +int links;
> +uint64_t tgt;
> +uint64_t gpa;
> +PCIDevice *gpdev;
> +uint64_t atsd[3];
> +PCIDevice *npdev[3];
> +} gpus[6];
> +uint64_t atsd[64]; /* Big Endian (BE), ready for the DT */
> +int atsd_num;
> +} nvgpus;

Is this information always relevant for the PHB, or only for PHBs
which have an NPU or GPU attached to them?  If the latter I'm
wondering if we can allocate it only when necessary.

>  };
>  
>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL
> @@ -104,6 +122,16 @@ struct sPAPRPHBState {
>  
>  #define SPAPR_PCI_MSI_WINDOW 0x400ULL
>  
> +#define PHANDLE_PCIDEV(phb, pdev)(0x1200 | \
> + (((phb)->index) << 16) | 
> ((pdev)->devfn))
> +#define PHANDLE_GPURAM(phb, n)   (0x11FF | ((n) << 8) | \
> + (((phb)->index) << 16))
> +#define GPURAM_ASSOCIATIVITY(phb, n) (255 - ((phb)->index * 3 + (n)))
> +#define SPAPR_PCI_NV2RAM64_WIN_BASE  0x100ULL /* 1 TiB */
> +#define SPAPR_PCI_NV2RAM64_WIN_SIZE  0x020ULL
> +#define PHANDLE_NVLINK(phb, gn, nn)  (0x0013 | (((phb)->index) << 8) | \
> + ((gn) << 4) | (nn))

AFAICT many of these values are only used in 

Re: [Qemu-devel] [PATCH qemu RFC 3/7] pci: Move NVIDIA vendor id to the rest of ids

2018-11-18 Thread David Gibson
On Tue, Nov 13, 2018 at 07:31:00PM +1100, Alexey Kardashevskiy wrote:
> sPAPR code will use it too so move it from VFIO to the common code.
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

This looks correct to me independent of the rest of the series.

> ---
>  include/hw/pci/pci_ids.h | 2 ++
>  hw/vfio/pci-quirks.c | 2 --
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
> index 63acc72..3ed7d10 100644
> --- a/include/hw/pci/pci_ids.h
> +++ b/include/hw/pci/pci_ids.h
> @@ -271,4 +271,6 @@
>  
>  #define PCI_VENDOR_ID_SYNOPSYS   0x16C3
>  
> +#define PCI_VENDOR_ID_NVIDIA 0x10de
> +
>  #endif
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index eae31c7..40a1200 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -526,8 +526,6 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice 
> *vdev, int nr)
>   * note it for future reference.
>   */
>  
> -#define PCI_VENDOR_ID_NVIDIA0x10de
> -
>  /*
>   * Nvidia has several different methods to get to config space, the
>   * nouveu project has several of these documented here:

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu RFC 1/7] vfio/spapr: Fix indirect levels calculation

2018-11-18 Thread David Gibson
On Tue, Nov 13, 2018 at 07:30:58PM +1100, Alexey Kardashevskiy wrote:
> The current code assumes that we can address more bits on a PCI bus
> for DMA than we really can. Limit to the known tested maximum of 55 bits
> and assume 64K IOMMU pages.

This doesn't make much sense to me.  It looks nothing like the
calculation it's replacing, and not just for extreme values.  For
example the new calculation doesn't depend on the size of the window
at all, whereas the previous one did.  You say you're assuming 64k
IOMMU pages, but create.page_shift (the actual IOMMU page size) is in
there.  Nor is it clear to me why the maximum PCI address is relevant
to the number of levels in the first place.

In addition, AFAICT the new calculation gives the answer '2' for all
likely IOMMU page sizes (4k, 64k, 2M)

> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  hw/vfio/spapr.c  | 3 ++-
>  hw/vfio/trace-events | 2 +-
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
> index becf71a..f5fdc53 100644
> --- a/hw/vfio/spapr.c
> +++ b/hw/vfio/spapr.c
> @@ -183,7 +183,7 @@ int vfio_spapr_create_window(VFIOContainer *container,
>  entries = create.window_size >> create.page_shift;
>  pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
>  pages = MAX(pow2ceil(pages), 1); /* Round up */
> -create.levels = ctz64(pages) / 6 + 1;
> +create.levels = MAX(1, (55 - create.page_shift) / 16);
>  
>  ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, );
>  if (ret) {
> @@ -200,6 +200,7 @@ int vfio_spapr_create_window(VFIOContainer *container,
>  return -EINVAL;
>  }
>  trace_vfio_spapr_create_window(create.page_shift,
> +   create.levels,
> create.window_size,
> create.start_addr);
>  *pgsize = pagesize;
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index a85e866..db730f3 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -128,6 +128,6 @@ vfio_prereg_listener_region_add_skip(uint64_t start, 
> uint64_t end) "0x%"PRIx64"
>  vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) 
> "0x%"PRIx64" - 0x%"PRIx64
>  vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" 
> size=0x%"PRIx64" ret=%d"
>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" 
> size=0x%"PRIx64" ret=%d"
> -vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x 
> winsize=0x%"PRIx64" offset=0x%"PRIx64
> +vfio_spapr_create_window(int ps, unsigned int levels, uint64_t ws, uint64_t 
> off) "pageshift=0x%x levels=%u winsize=0x%"PRIx64" offset=0x%"PRIx64
>  vfio_spapr_remove_window(uint64_t off) "offset=0x%"PRIx64
>  vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to 
> liobn fd %d"

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size

2018-11-18 Thread David Gibson
On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote:
> When deciding about the huge DMA window, the typical Linux pseries guest
> uses the maximum allowed RAM size as the upper limit. We did the same
> on QEMU side to match that logic. Now we are going to support GPU RAM
> pass through which is not available at the guest boot time as it requires
> the guest driver interaction. As the result, the guest requests a smaller
> window than it should. Therefore the guest needs to be patched to
> understand this new memory and so does QEMU.
> 
> Instead of reimplementing here whatever solution we will choose for
> the guest, this advertises the biggest possible window size limited by
> 32 bit (as defined by LoPAPR).
> 
> This seems to be safe as:
> 1. The guest visible emulated table is allocated in KVM (actual pages
> are allocated in page fault handler) and QEMU (actual pages are allocated
> when changed);
> 2. The hardware table (and corresponding userspace addresses cache)
> supports sparse allocation and also checks for locked_vm limit so
> it is unable to cause the host any damage.
> 
> Signed-off-by: Alexey Kardashevskiy 

This seems like a good idea to me, even without the NPU stuff.  It
always bothered me slightly that we based what's effectively the IOVA
limit on the guest RAM size which doesn't have any direct connection
to it.

As long as it doesn't hit the locked memory limits, I don't see any
reason we should prevent a guest from doing something weird like
mapping a small bit of RAM over and over in IOVA space, or mapping its
RAM sparsely in IOVA space.

> ---
>  hw/ppc/spapr_rtas_ddw.c | 19 +++
>  1 file changed, 3 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> index 329feb1..df60351 100644
> --- a/hw/ppc/spapr_rtas_ddw.c
> +++ b/hw/ppc/spapr_rtas_ddw.c
> @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
>   uint32_t nret, target_ulong rets)
>  {
>  sPAPRPHBState *sphb;
> -uint64_t buid, max_window_size;
> +uint64_t buid;
>  uint32_t avail, addr, pgmask = 0;
> -MachineState *machine = MACHINE(spapr);
>  
>  if ((nargs != 3) || (nret != 5)) {
>  goto param_error_exit;
> @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU 
> *cpu,
>  /* Translate page mask to LoPAPR format */
>  pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
>  
> -/*
> - * This is "Largest contiguous block of TCEs allocated specifically
> - * for (that is, are reserved for) this PE".
> - * Return the maximum number as maximum supported RAM size was in 4K 
> pages.
> - */
> -if (machine->ram_size == machine->maxram_size) {
> -max_window_size = machine->ram_size;
> -} else {
> -max_window_size = machine->device_memory->base +
> -  memory_region_size(>device_memory->mr);
> -}
> -
>  avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb);
>  
>  rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>  rtas_st(rets, 1, avail);
> -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT);
> +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */

One detail though.. where does this limit actually come from?  Is it
actually a limit imposed by the hardware somewhere, or just because
the RTAS call doesn't ahve room for anything more?

Previously limits would usually have been a power of 2; is it likely
to matter that now it won't be?

>  rtas_st(rets, 3, pgmask);
>  rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
>  
> -trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
> +trace_spapr_iommu_ddw_query(buid, addr, avail, 0x, pgmask);
>  return;
>  
>  param_error_exit:

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu RFC 4/7] vfio/nvidia-v100: Disable VBIOS update

2018-11-18 Thread David Gibson
On Tue, Nov 13, 2018 at 07:31:01PM +1100, Alexey Kardashevskiy wrote:
> The NVIDIA V100 GPUs often come in several instances on the same system
> board where they are connected directly via out of band fabric called
> "NVLink".
> 
> In order to make GPUs talk to each other, NVLink has to be enabled on
> both GPUs and this is guaranteed by the firmware by providing special
> MMIO registers to disable NVLink till GPU is reset.
> 
> This blocks GPU VBIOS update to add an extra level of assurance that
> the firmware does not get reflashed with a malicious firmware which
> does not implement NVLink disabling mechanism.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> 
> NVIDIA firmwares come signed and GPUs do not accept unsigned images
> anyway so this is probably overkill, or not?
> 
> Also, there is no available documentation on the magic value of 0x22408;
> however it does help as the nvflash upgrade tool stops working with this
> applied.

IIUC, the upshot of this is basically not to permit firmware updates
of the V100 from within a guest, yes?  However, it would still be
possible to update the firmware from a userspace vfio program?

Given that, I'm not sure this really gives us anything over the
existing signature verifications.  Alex, any thoughts?

> ---
>  hw/vfio/pci.h|  1 +
>  include/hw/pci/pci_ids.h |  1 +
>  hw/vfio/pci-quirks.c | 26 ++
>  hw/vfio/pci.c|  2 ++
>  hw/vfio/trace-events |  1 +
>  5 files changed, 31 insertions(+)
> 
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index b1ae4c0..f4c5fb6 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -163,6 +163,7 @@ typedef struct VFIOPCIDevice {
>  bool no_kvm_msi;
>  bool no_kvm_msix;
>  bool no_geforce_quirks;
> +bool no_nvidia_v100_quirks;
>  bool no_kvm_ioeventfd;
>  bool no_vfio_ioeventfd;
>  bool enable_ramfb;
> diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
> index 3ed7d10..2140dad 100644
> --- a/include/hw/pci/pci_ids.h
> +++ b/include/hw/pci/pci_ids.h
> @@ -272,5 +272,6 @@
>  #define PCI_VENDOR_ID_SYNOPSYS   0x16C3
>  
>  #define PCI_VENDOR_ID_NVIDIA 0x10de
> +#define PCI_VENDOR_ID_NVIDIA_V100_SXM2   0x1db1
>  
>  #endif
> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> index 40a1200..2796837 100644
> --- a/hw/vfio/pci-quirks.c
> +++ b/hw/vfio/pci-quirks.c
> @@ -996,6 +996,31 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice 
> *vdev, int nr)
>  trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
>  }
>  
> +static void vfio_probe_nvidia_v100_bar0_quirk(VFIOPCIDevice *vdev, int nr)
> +{
> +VFIOQuirk *quirk;
> +
> +if (vdev->no_nvidia_v100_quirks ||
> +!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA,
> + PCI_VENDOR_ID_NVIDIA_V100_SXM2) ||
> +nr != 0) {
> +return;
> +}
> +
> +quirk = vfio_quirk_alloc(1);
> +
> +memory_region_init_io(quirk->mem, OBJECT(vdev),
> +  NULL, quirk,
> +  "vfio-nvidia-v100_bar0-block-quirk",
> +  4);
> +memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
> +0x22408, quirk->mem, 1);
> +
> +QLIST_INSERT_HEAD(>bars[nr].quirks, quirk, next);
> +
> +trace_vfio_quirk_nvidia_v100_bar0_probe(vdev->vbasedev.name);
> +}
> +
>  /*
>   * TODO - Some Nvidia devices provide config access to their companion HDA
>   * device and even to their parent bridge via these config space mirrors.
> @@ -1853,6 +1878,7 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
>  vfio_probe_ati_bar2_quirk(vdev, nr);
>  vfio_probe_nvidia_bar5_quirk(vdev, nr);
>  vfio_probe_nvidia_bar0_quirk(vdev, nr);
> +vfio_probe_nvidia_v100_bar0_quirk(vdev, nr);
>  vfio_probe_rtl8168_bar2_quirk(vdev, nr);
>  vfio_probe_igd_bar4_quirk(vdev, nr);
>  }
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 5c7bd96..7848b28 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3203,6 +3203,8 @@ static Property vfio_pci_dev_properties[] = {
>  DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
>  DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
>   no_geforce_quirks, false),
> +DEFINE_PROP_BOOL("x-no-nvidia-v100-quirks", VFIOPCIDevice,
> + no_nvidia_v100_quirks, false),
>  DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
>   false),
>  DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index db730f3..adfa75e 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -68,6 +68,7 @@ vfio_quirk_nvidia_bar5_state(const char *name, const char 
> *state) "%s %s"
>  vfio_quirk_nvidia_bar5_probe(const char *name) "%s"
>  vfio_quirk_nvidia_bar0_msi_ack(const char *name) "%s"
>  

[Qemu-devel] [PATCH] hw/arm/stm32f205: Fix the UART and Timer region size

2018-11-18 Thread Seth K
From: Seth Kintigh 

I corrected these 2 memory regions based on specifications from the chip
manufacturer. The existing ranges seem to overlap and and cause odd
behavior and/or crashes when trying to set up multiple UARTs,

Signed-off-by: Seth Kintigh 
---
Phil, I hope this is the right format.

PMM, my original changes were made on an old version, but I made the patch
from the latest version per the instructions. I'm glad to hear that serial
port limit is gone!

 hw/char/stm32f2xx_usart.c  | 2 +-
 hw/timer/stm32f2xx_timer.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/char/stm32f2xx_usart.c b/hw/char/stm32f2xx_usart.c
index 032b5fda13..f3363a2952 100644
--- a/hw/char/stm32f2xx_usart.c
+++ b/hw/char/stm32f2xx_usart.c
@@ -202,7 +202,7 @@ static void stm32f2xx_usart_init(Object *obj)
 sysbus_init_irq(SYS_BUS_DEVICE(obj), >irq);

 memory_region_init_io(>mmio, obj, _usart_ops, s,
-  TYPE_STM32F2XX_USART, 0x2000);
+  TYPE_STM32F2XX_USART, 0x400);
 sysbus_init_mmio(SYS_BUS_DEVICE(obj), >mmio);
 }

diff --git a/hw/timer/stm32f2xx_timer.c b/hw/timer/stm32f2xx_timer.c
index 58fc7b1188..ae744d1642 100644
--- a/hw/timer/stm32f2xx_timer.c
+++ b/hw/timer/stm32f2xx_timer.c
@@ -308,7 +308,7 @@ static void stm32f2xx_timer_init(Object *obj)
 sysbus_init_irq(SYS_BUS_DEVICE(obj), >irq);

 memory_region_init_io(>iomem, obj, _timer_ops, s,
-  "stm32f2xx_timer", 0x4000);
+  "stm32f2xx_timer", 0x400);
 sysbus_init_mmio(SYS_BUS_DEVICE(obj), >iomem);

 s->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, stm32f2xx_timer_interrupt,
s);


On Thu, Nov 15, 2018 at 7:05 AM Peter Maydell 
wrote:

> On 4 November 2018 at 07:42, Seth K  wrote:
> > I corrected these 2 memory regions based on specifications from the chip
> > manufacturer. The existing ranges seem to overlap and and cause odd
> > behavior and/or crashes when trying to set up multiple UARTs,
> > I also played with changing MAX_SERIAL_PORTS to 8 to match the hardware,
> > but I did not include that in this patch as I never fully tested its
> > effects.
>
> Hi; thanks for the patch. As Philippe says, it looks good,
> but the one thing we definitely need is a Signed-off-by:
> line from you.
>
> A minor note -- there is no "MAX_SERIAL_PORTS" definition
> in QEMU any more: we removed that artificial limitation
> earlier this year. Maybe you're basing your patch on an
> older version of QEMU? It's best to use git master for
> development. Our SoC model defines 6 uarts (STM_NUM_USARTS)
> in hw/arm/stm32f205_soc.c, which should all now be connectable
> on the command line, though I haven't tested that or checked
> whether the hardware has 6 or some other number...
>
> thanks
> -- PMM
>


Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size

2018-11-18 Thread Alexey Kardashevskiy



On 19/11/2018 13:42, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote:
>> When deciding about the huge DMA window, the typical Linux pseries guest
>> uses the maximum allowed RAM size as the upper limit. We did the same
>> on QEMU side to match that logic. Now we are going to support GPU RAM
>> pass through which is not available at the guest boot time as it requires
>> the guest driver interaction. As the result, the guest requests a smaller
>> window than it should. Therefore the guest needs to be patched to
>> understand this new memory and so does QEMU.
>>
>> Instead of reimplementing here whatever solution we will choose for
>> the guest, this advertises the biggest possible window size limited by
>> 32 bit (as defined by LoPAPR).
>>
>> This seems to be safe as:
>> 1. The guest visible emulated table is allocated in KVM (actual pages
>> are allocated in page fault handler) and QEMU (actual pages are allocated
>> when changed);
>> 2. The hardware table (and corresponding userspace addresses cache)
>> supports sparse allocation and also checks for locked_vm limit so
>> it is unable to cause the host any damage.
>>
>> Signed-off-by: Alexey Kardashevskiy 
> 
> This seems like a good idea to me, even without the NPU stuff.  It
> always bothered me slightly that we based what's effectively the IOVA
> limit on the guest RAM size which doesn't have any direct connection
> to it.
> 
> As long as it doesn't hit the locked memory limits, I don't see any
> reason we should prevent a guest from doing something weird like
> mapping a small bit of RAM over and over in IOVA space, or mapping its
> RAM sparsely in IOVA space.
> 
>> ---
>>  hw/ppc/spapr_rtas_ddw.c | 19 +++
>>  1 file changed, 3 insertions(+), 16 deletions(-)
>>
>> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
>> index 329feb1..df60351 100644
>> --- a/hw/ppc/spapr_rtas_ddw.c
>> +++ b/hw/ppc/spapr_rtas_ddw.c
>> @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
>>   uint32_t nret, target_ulong rets)
>>  {
>>  sPAPRPHBState *sphb;
>> -uint64_t buid, max_window_size;
>> +uint64_t buid;
>>  uint32_t avail, addr, pgmask = 0;
>> -MachineState *machine = MACHINE(spapr);
>>  
>>  if ((nargs != 3) || (nret != 5)) {
>>  goto param_error_exit;
>> @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU 
>> *cpu,
>>  /* Translate page mask to LoPAPR format */
>>  pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
>>  
>> -/*
>> - * This is "Largest contiguous block of TCEs allocated specifically
>> - * for (that is, are reserved for) this PE".
>> - * Return the maximum number as maximum supported RAM size was in 4K 
>> pages.
>> - */
>> -if (machine->ram_size == machine->maxram_size) {
>> -max_window_size = machine->ram_size;
>> -} else {
>> -max_window_size = machine->device_memory->base +
>> -  memory_region_size(>device_memory->mr);
>> -}
>> -
>>  avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb);
>>  
>>  rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>>  rtas_st(rets, 1, avail);
>> -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT);
>> +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */
> 
> One detail though.. where does this limit actually come from?  Is it
> actually a limit imposed by the hardware somewhere, or just because
> the RTAS call doesn't ahve room for anything more?

I used this limit just because of the parameter size.

> 
> Previously limits would usually have been a power of 2; is it likely
> to matter that now it won't be?

LoPAPR says it is "Largest contiguous block of TCEs" and no mention of
power of two but ibm,create-pe-dma-window takes a window shift so it is
assumed that windows can still be only power of two. Not sure. The
powernv code will fail if the requested window is not power of two.

Replacing 0x with 0x8000 won't make a difference though here
but probably will make the error clearer...




> 
>>  rtas_st(rets, 3, pgmask);
>>  rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
>>  
>> -trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
>> +trace_spapr_iommu_ddw_query(buid, addr, avail, 0x, pgmask);
>>  return;
>>  
>>  param_error_exit:
> 

-- 
Alexey



Re: [Qemu-devel] [PATCH qemu RFC 7/7] spapr: Add NVLink2 pass through support

2018-11-18 Thread Alexey Kardashevskiy



On 19/11/2018 14:01, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:31:04PM +1100, Alexey Kardashevskiy wrote:
>> The NVIDIA V100 GPU comes with some on-board RAM which is mapped into
>> the host memory space and accessible as normal RAM via NVLink bus.
>> The VFIO-PCI driver implements special regions for such GPU and emulated
>> NVLink bridge (below referred as NPU). The POWER9 CPU also provides
>> address translation services which includes TLB invalidation register
>> exposes via the NVLink bridge; the feature is called "ATSD".
>>
>> This adds a quirk to VFIO to map the memory and create an MR; the new MR
>> is stored in a GPU as a QOM link. The sPAPR PCI uses this to get the MR
>> and map it to the system address space. Another quirk does the same for
>> ATSD.
>>
>> This adds 3 additional steps to the FDT builder in spapr-pci:
>> 1. Search for specific GPUs and NPUs, collects findings in sPAPRPHBState;
>> 2. Adds several properties in the DT: "ibm,npu", "ibm,gpu", "memory-block",
>> and some other. These are required by the guest platform and GPU driver;
>> this also adds a new made-up compatible type for a PHB to signal
>> a modified guest that this particular PHB needs the default DMA window
>> removed as these GPUs have limited DMA mask size (way lower than usual 59);
>> 3. Adds new memory blocks with one addition - they have
>> "linux,memory-usable" property configured in the way which prevents
>> the guest from onlining it automatically as it needs to be deferred till
>> the guest GPU driver trains NVLink.
>>
>> A couple of notes:
>> - this changes the FDT rendeder as doing 1-2-3 from sPAPRPHBClass::realize
>> impossible - devices are not yet attached;
>> - this does not add VFIO quirk MRs to the system address space as
>> the address is selected in sPAPRPHBState, similar to MMIO.
>>
>> This puts new memory nodes in a separate NUMA node to replicate the host
>> system setup as close as possible (the GPU driver relies on this too).
>>
>> This adds fake NPU nodes to make the guest platform code work,
>> specifically "ibm,npu-link-index".
>>
>> Signed-off-by: Alexey Kardashevskiy 
>> ---
>>  hw/vfio/pci.h   |   2 +
>>  include/hw/pci-host/spapr.h |  28 
>>  include/hw/ppc/spapr.h  |   3 +-
>>  hw/ppc/spapr.c  |  14 +-
>>  hw/ppc/spapr_pci.c  | 256 +++-
>>  hw/vfio/pci-quirks.c|  93 +
>>  hw/vfio/pci.c   |  14 ++
>>  hw/vfio/trace-events|   3 +
>>  8 files changed, 408 insertions(+), 5 deletions(-)
>>
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index f4c5fb6..b8954cc 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -195,6 +195,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
>>  int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
>> struct vfio_region_info *info,
>> Error **errp);
>> +int vfio_pci_nvlink2_ram_init(VFIOPCIDevice *vdev, Error **errp);
>> +int vfio_pci_npu2_atsd_init(VFIOPCIDevice *vdev, Error **errp);
>>  
>>  void vfio_display_reset(VFIOPCIDevice *vdev);
>>  int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>> index 7c66c38..1f8ebf3 100644
>> --- a/include/hw/pci-host/spapr.h
>> +++ b/include/hw/pci-host/spapr.h
>> @@ -87,6 +87,24 @@ struct sPAPRPHBState {
>>  uint32_t mig_liobn;
>>  hwaddr mig_mem_win_addr, mig_mem_win_size;
>>  hwaddr mig_io_win_addr, mig_io_win_size;
>> +hwaddr nv2_gpa_win_addr;
>> +hwaddr nv2_atsd_win_addr;
>> +
>> +struct spapr_phb_pci_nvgpu_config {
>> +uint64_t nv2_ram;
>> +uint64_t nv2_atsd;
>> +int num;
>> +struct {
>> +int links;
>> +uint64_t tgt;
>> +uint64_t gpa;
>> +PCIDevice *gpdev;
>> +uint64_t atsd[3];
>> +PCIDevice *npdev[3];
>> +} gpus[6];
>> +uint64_t atsd[64]; /* Big Endian (BE), ready for the DT */
>> +int atsd_num;
>> +} nvgpus;
> 
> Is this information always relevant for the PHB, or only for PHBs
> which have an NPU or GPU attached to them?  If the latter I'm
> wondering if we can allocate it only when necessary.


I think I can make it even local, just need to hack
spapr_populate_pci_devices_dt's fdt struct to take the struct.


> 
>>  };
>>  
>>  #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL
>> @@ -104,6 +122,16 @@ struct sPAPRPHBState {
>>  
>>  #define SPAPR_PCI_MSI_WINDOW 0x400ULL
>>  
>> +#define PHANDLE_PCIDEV(phb, pdev)(0x1200 | \
>> + (((phb)->index) << 16) | 
>> ((pdev)->devfn))
>> +#define PHANDLE_GPURAM(phb, n)   (0x11FF | ((n) << 8) | \
>> + (((phb)->index) << 16))
>> +#define GPURAM_ASSOCIATIVITY(phb, n) (255 - ((phb)->index * 3 + (n)))
>> +#define SPAPR_PCI_NV2RAM64_WIN_BASE  

Re: [Qemu-devel] [PATCH qemu RFC 5/7] spapr-iommu: Always advertise the maximum possible DMA window size

2018-11-18 Thread David Gibson
On Mon, Nov 19, 2018 at 04:08:33PM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 19/11/2018 13:42, David Gibson wrote:
> > On Tue, Nov 13, 2018 at 07:31:02PM +1100, Alexey Kardashevskiy wrote:
> >> When deciding about the huge DMA window, the typical Linux pseries guest
> >> uses the maximum allowed RAM size as the upper limit. We did the same
> >> on QEMU side to match that logic. Now we are going to support GPU RAM
> >> pass through which is not available at the guest boot time as it requires
> >> the guest driver interaction. As the result, the guest requests a smaller
> >> window than it should. Therefore the guest needs to be patched to
> >> understand this new memory and so does QEMU.
> >>
> >> Instead of reimplementing here whatever solution we will choose for
> >> the guest, this advertises the biggest possible window size limited by
> >> 32 bit (as defined by LoPAPR).
> >>
> >> This seems to be safe as:
> >> 1. The guest visible emulated table is allocated in KVM (actual pages
> >> are allocated in page fault handler) and QEMU (actual pages are allocated
> >> when changed);
> >> 2. The hardware table (and corresponding userspace addresses cache)
> >> supports sparse allocation and also checks for locked_vm limit so
> >> it is unable to cause the host any damage.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> > 
> > This seems like a good idea to me, even without the NPU stuff.  It
> > always bothered me slightly that we based what's effectively the IOVA
> > limit on the guest RAM size which doesn't have any direct connection
> > to it.
> > 
> > As long as it doesn't hit the locked memory limits, I don't see any
> > reason we should prevent a guest from doing something weird like
> > mapping a small bit of RAM over and over in IOVA space, or mapping its
> > RAM sparsely in IOVA space.
> > 
> >> ---
> >>  hw/ppc/spapr_rtas_ddw.c | 19 +++
> >>  1 file changed, 3 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> >> index 329feb1..df60351 100644
> >> --- a/hw/ppc/spapr_rtas_ddw.c
> >> +++ b/hw/ppc/spapr_rtas_ddw.c
> >> @@ -96,9 +96,8 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
> >>   uint32_t nret, target_ulong rets)
> >>  {
> >>  sPAPRPHBState *sphb;
> >> -uint64_t buid, max_window_size;
> >> +uint64_t buid;
> >>  uint32_t avail, addr, pgmask = 0;
> >> -MachineState *machine = MACHINE(spapr);
> >>  
> >>  if ((nargs != 3) || (nret != 5)) {
> >>  goto param_error_exit;
> >> @@ -114,27 +113,15 @@ static void rtas_ibm_query_pe_dma_window(PowerPCCPU 
> >> *cpu,
> >>  /* Translate page mask to LoPAPR format */
> >>  pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
> >>  
> >> -/*
> >> - * This is "Largest contiguous block of TCEs allocated specifically
> >> - * for (that is, are reserved for) this PE".
> >> - * Return the maximum number as maximum supported RAM size was in 4K 
> >> pages.
> >> - */
> >> -if (machine->ram_size == machine->maxram_size) {
> >> -max_window_size = machine->ram_size;
> >> -} else {
> >> -max_window_size = machine->device_memory->base +
> >> -  memory_region_size(>device_memory->mr);
> >> -}
> >> -
> >>  avail = SPAPR_PCI_DMA_MAX_WINDOWS - 
> >> spapr_phb_get_active_win_num(sphb);
> >>  
> >>  rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >>  rtas_st(rets, 1, avail);
> >> -rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT);
> >> +rtas_st(rets, 2, 0x); /* Largest contiguous block of TCEs */
> > 
> > One detail though.. where does this limit actually come from?  Is it
> > actually a limit imposed by the hardware somewhere, or just because
> > the RTAS call doesn't ahve room for anything more?
> 
> I used this limit just because of the parameter size.
> 
> > 
> > Previously limits would usually have been a power of 2; is it likely
> > to matter that now it won't be?
> 
> LoPAPR says it is "Largest contiguous block of TCEs" and no mention of
> power of two but ibm,create-pe-dma-window takes a window shift so it is
> assumed that windows can still be only power of two. Not sure. The
> powernv code will fail if the requested window is not power of two.
> 
> Replacing 0x with 0x8000 won't make a difference though here
> but probably will make the error clearer...

Yeah, given that the windows have to be a power of 2 in size, I think
it makes sense for this to be a power of 2, even if it doesn't
strictly have to be.  So I think 0x8000 would be a better option.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH qemu RFC 1/7] vfio/spapr: Fix indirect levels calculation

2018-11-18 Thread Alexey Kardashevskiy



On 19/11/2018 12:45, David Gibson wrote:
> On Tue, Nov 13, 2018 at 07:30:58PM +1100, Alexey Kardashevskiy wrote:
>> The current code assumes that we can address more bits on a PCI bus
>> for DMA than we really can. Limit to the known tested maximum of 55 bits
>> and assume 64K IOMMU pages.
> 
> This doesn't make much sense to me.  It looks nothing like the
> calculation it's replacing, and not just for extreme values.  For
> example the new calculation doesn't depend on the size of the window
> at all, whereas the previous one did.

Uff. Yes, this is confusing. There are a typical 64k IOMMU page size and
a typical 64k system page size...

For the window as big as 0x200.. and 64k pages (actual working
example), 33554432 entries are needed, or 256MB of space. The existing
code calculates it as 25/6+1=5 levels. Which the hardware can do.

But:
1. all TCE levels are the same size and
2. for the minimum size of the level I take a system page size which is 64k.

So with 5 levels 16bit each (and this is without counting IOMMU page
size, which is plus extra 16 bits) the table is huuuge.

And I know from tests that 55 bits is the maximum the hardware can
handle, and I have this limit in the powernv code so VFIO ioctl to
create a window will fail. If I do not have a check in powernv, than
things silently do not work (no EEH though).


> You say you're assuming 64k
> IOMMU pages, but create.page_shift (the actual IOMMU page size) is in
> there. 

It is a radix tree basically. Out of 64bits available, I cannot use bits
56..63 (hardware does not work) and 0..12/15/21 (IOMMU page size), so I
have bits 13/16/22..55 which I need to split equally between levels,
each of which is 64k (system page size).


> Nor is it clear to me why the maximum PCI address is relevant
> to the number of levels in the first place.


The window can only as big as the PHB hardware supports, this is why
maximum PCI address.


> 
> In addition, AFAICT the new calculation gives the answer '2' for all
> likely IOMMU page sizes (4k, 64k, 2M)

Hm, I did not realize that :) Instead of using 64k as a default level
size, I could use some idea of what CONFIG_FORCE_MAX_ZONEORDER could be.

Currently it is:

arch/powerpc/Kconfig
config FORCE_MAX_ZONEORDER
int "Maximum zone order"
range 8 9 if PPC64 && PPC_64K_PAGES
default "9" if PPC64 && PPC_64K_PAGES
range 13 13 if PPC64 && !PPC_64K_PAGES
default "13" if PPC64 && !PPC_64K_PAGES
range 9 64 if PPC32 && PPC_16K_PAGES
default "9" if PPC32 && PPC_16K_PAGES
range 7 64 if PPC32 && PPC_64K_PAGES
default "7" if PPC32 && PPC_64K_PAGES
range 5 64 if PPC32 && PPC_256K_PAGES
default "5" if PPC32 && PPC_256K_PAGES
range 11 64
default "11"

So assuming that "8" is the absolute minimum, I could try levels as big
as 1<<(16+8-1) and if this fails, then increase the number of levels
till a window is created or it is too many levels. Or there is some
obviously better solution to the problem?





> 
>>
>> Signed-off-by: Alexey Kardashevskiy 
>> ---
>>  hw/vfio/spapr.c  | 3 ++-
>>  hw/vfio/trace-events | 2 +-
>>  2 files changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
>> index becf71a..f5fdc53 100644
>> --- a/hw/vfio/spapr.c
>> +++ b/hw/vfio/spapr.c
>> @@ -183,7 +183,7 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>  entries = create.window_size >> create.page_shift;
>>  pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
>>  pages = MAX(pow2ceil(pages), 1); /* Round up */
>> -create.levels = ctz64(pages) / 6 + 1;
>> +create.levels = MAX(1, (55 - create.page_shift) / 16);
>>  
>>  ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, );
>>  if (ret) {
>> @@ -200,6 +200,7 @@ int vfio_spapr_create_window(VFIOContainer *container,
>>  return -EINVAL;
>>  }
>>  trace_vfio_spapr_create_window(create.page_shift,
>> +   create.levels,
>> create.window_size,
>> create.start_addr);
>>  *pgsize = pagesize;
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index a85e866..db730f3 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -128,6 +128,6 @@ vfio_prereg_listener_region_add_skip(uint64_t start, 
>> uint64_t end) "0x%"PRIx64"
>>  vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) 
>> "0x%"PRIx64" - 0x%"PRIx64
>>  vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" 
>> size=0x%"PRIx64" ret=%d"
>>  vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=0x%"PRIx64" 
>> size=0x%"PRIx64" ret=%d"
>> -vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x 
>> winsize=0x%"PRIx64" offset=0x%"PRIx64
>> +vfio_spapr_create_window(int ps, unsigned int levels, uint64_t ws, uint64_t 
>> off) "pageshift=0x%x 

Re: [Qemu-devel] [PULL 0/8] Monitor patches for 2018-10-30

2018-11-18 Thread Peter Xu
On Thu, Nov 08, 2018 at 10:44:06AM +0800, Peter Xu wrote:
> On Wed, Nov 07, 2018 at 11:21:32AM +, Peter Maydell wrote:
> > On 7 November 2018 at 02:56, Peter Xu  wrote:
> > > Strange, "make check -j8" failed on my hosts (I tried two) with either
> > > Markus's pull tree or qemu master:
> > >
> > > hw/core/ptimer.o: In function `timer_new_tl':
> > > /home/xz/git/qemu/include/qemu/timer.h:536: undefined reference to 
> > > `timer_init_tl'
> > > collect2: error: ld returned 1 exit status
> > > make: *** [/home/xz/git/qemu/rules.mak:124: tests/ptimer-test] Error 1
> > >
> > > Is that only happening on my hosts?
> > 
> > Commit 89a603a0c80ae3 changed things so that there is no
> > timer_new_tl() or timer_init_tl() any more, so if you have
> > an object file that's referring to it then it's probably
> > stale. Try a make clean.
> 
> Yeh it worked for me, thanks Peter.
> 
> Though after running a few more rounds of "configure --enable-debug &&
> make check -j8" I still cannot see anything wrong with Markus's tree.
> I'll see whether there's any news from Markus and then I'll consider
> whether I should install a FreeBSD.

I reproduced the error with a FreeBSD guest and this change (which
possibly can be squashed into "tests: qmp-test: add queue full test")
worked for me:

diff --git a/tests/qmp-test.c b/tests/qmp-test.c
index 6989acbca4..83f353db4f 100644
--- a/tests/qmp-test.c
+++ b/tests/qmp-test.c
@@ -281,8 +281,15 @@ static void test_qmp_oob(void)
  * will only be able to be handled after the queue is shrinked, so
  * it'll be processed only after one existing in-band command
  * finishes.
+ *
+ * NOTE: we need to feed the queue with one extra request to make
+ * sure it'll be stuck since when we have sent the Nth request
+ * it's possible that we have already popped and processing the
+ * 1st request so the Nth request (which could potentially be the
+ * [N-1]th element on the queue) might not trigger the
+ * monitor-full condition deterministically.
  */
-for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX; i++) {
+for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) {
 id = g_strdup_printf("queue-blocks-%d", i);
 send_cmd_that_blocks(qts, id);
 g_free(id);
@@ -291,7 +298,7 @@ static void test_qmp_oob(void)
 unblock_blocked_cmd();
 recv_cmd_id(qts, "queue-blocks-1");
 recv_cmd_id(qts, "oob-1");
-for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX; i++) {
+for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) {
 unblock_blocked_cmd();
 id = g_strdup_printf("queue-blocks-%d", i);
 recv_cmd_id(qts, id);

So the problem here is that the queue-block-N command might not really
suspend the monitor everytime if we already popped the 1st request,
which will let the N-th request to be (N-1)th, then the parser will
continue to eat the oob command and it could "preempt" the previous
commands.

Maybe FreeBSD is scheduling the threads in some pattern so it happens
only on FreeBSD and very constantly, but anyway it should be a general
fix to the test program.

Regards,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()

2018-11-18 Thread Markus Armbruster
ÀîÇ¿  writes:

> At 2018-11-17 00:52:58, "Markus Armbruster"  wrote:
>>Li Qiang  writes:
>>
>>> Currently the user can set a negative reboot_timeout.
>>> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then
>>> convert it to number.
>>
>>Again, it's not wrong per se, just needlessly complicated and
>>error-prone.  What makes it wrong is ...
>>
>>> convert it to number. This patch refactor this function by following:
>>> 1. ensure reboot_timeout is in 0~0x
>>> 2. use qemu_opt_get_number() to parse reboot_timeout
>>> 3. simlify code
>>>
>>> Signed-off-by: Li Qiang 
>>> ---
>>>  hw/nvram/fw_cfg.c | 23 +++
>>>  vl.c  |  2 +-
>>>  2 files changed, 12 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
>>> index 78f43dad93..6aca80846a 100644
>>> --- a/hw/nvram/fw_cfg.c
>>> +++ b/hw/nvram/fw_cfg.c
>>> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s)
>>>  
>>>  static void fw_cfg_reboot(FWCfgState *s)
>>>  {
>>> -int reboot_timeout = -1;
>>> -char *p;
>>> -const char *temp;
>>> +const char *reboot_timeout = NULL;
>>>  
>>>  /* get user configuration */
>>>  QemuOptsList *plist = qemu_find_opts("boot-opts");
>>>  QemuOpts *opts = QTAILQ_FIRST(>head);
>>> -if (opts != NULL) {
>>> -temp = qemu_opt_get(opts, "reboot-timeout");
>>> -if (temp != NULL) {
>>> -p = (char *)temp;
>>> -reboot_timeout = strtol(p, , 10);
>>
>>... the total lack of error checking here.  Same in PATCH 1.
>
>>
>
>
> Got.
>
>
>>Here's my attempt at a clearer commit message:
>>
>>fw_cfg: Fix -boot reboot-timeout error checking
>>
>>fw_cfg_reboot() gets option parameter "reboot-timeout" with
>>qemu_opt_get(), then converts it to an integer by hand.  It neglects
>>to check that conversion for errors, and fails to reject negative
>>values.  Positive values above the limit get reported and replaced
>>by the limit.
>>
>>Check for conversion errors properly, and reject all values outside
>>0..0x.
>
>>
>
>
> Thanks for your advice, I appreciate it and will change in the revision 
> version.
>
>
>>PATCH 1's commit message could be improved the same way.
>>
>>> -}
>>> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout");
>>> +
>>> +if (reboot_timeout == NULL) {
>>> +return;
>>>  }
>>> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1);
>>> +
>>>  /* validate the input */
>>> -if (reboot_timeout > 0x) {
>>> -error_report("reboot timeout is larger than 65535, force it to 
>>> 65535.");
>>> -reboot_timeout = 0x;
>>> +if (rt_val < 0 || rt_val > 0x) {
>>> +error_report("reboot timeout is invalid,"
>>> + "it should be a value between 0 and 65535");
>>> +exit(1);
>>>  }
>>>  fw_cfg_add_file(s, "etc/boot-fail-wait", g_memdup(_timeout, 4), 
>>> 4);
>>>  }
>>
>>Change in behavior when "reboot-timeout" isn't specified.
>>
>>Before your patch, we fw_cfg_add_file() with a value of -1.
>>
>>After your patch, we don't fw_cfg_add_file().
>>
>>Why is that okay?
>
>>
>
>
> Here I following Gerd's advice. 
> For values >0x  or < 0, report and exit.
> -->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html

Cases:

0. "reboot-timeout" not specified (e.g. no -boot option given)

1. "reboot-timeout" specified, value out of bounds
1.a. < 0
1.b. > 0x

2. "reboot-timeout" specified, value okay

Gerd's advice is about case 1.  Your patch implements it.

My question is about case 0.

Do you understand my question now?

[...]



Re: [Qemu-devel] [PULL 0/8] Monitor patches for 2018-10-30

2018-11-18 Thread Peter Xu
On Mon, Nov 19, 2018 at 02:17:27PM +0800, Peter Xu wrote:
> On Thu, Nov 08, 2018 at 10:44:06AM +0800, Peter Xu wrote:
> > On Wed, Nov 07, 2018 at 11:21:32AM +, Peter Maydell wrote:
> > > On 7 November 2018 at 02:56, Peter Xu  wrote:
> > > > Strange, "make check -j8" failed on my hosts (I tried two) with either
> > > > Markus's pull tree or qemu master:
> > > >
> > > > hw/core/ptimer.o: In function `timer_new_tl':
> > > > /home/xz/git/qemu/include/qemu/timer.h:536: undefined reference to 
> > > > `timer_init_tl'
> > > > collect2: error: ld returned 1 exit status
> > > > make: *** [/home/xz/git/qemu/rules.mak:124: tests/ptimer-test] Error 1
> > > >
> > > > Is that only happening on my hosts?
> > > 
> > > Commit 89a603a0c80ae3 changed things so that there is no
> > > timer_new_tl() or timer_init_tl() any more, so if you have
> > > an object file that's referring to it then it's probably
> > > stale. Try a make clean.
> > 
> > Yeh it worked for me, thanks Peter.
> > 
> > Though after running a few more rounds of "configure --enable-debug &&
> > make check -j8" I still cannot see anything wrong with Markus's tree.
> > I'll see whether there's any news from Markus and then I'll consider
> > whether I should install a FreeBSD.
> 
> I reproduced the error with a FreeBSD guest and this change (which
> possibly can be squashed into "tests: qmp-test: add queue full test")
> worked for me:
> 
> diff --git a/tests/qmp-test.c b/tests/qmp-test.c
> index 6989acbca4..83f353db4f 100644
> --- a/tests/qmp-test.c
> +++ b/tests/qmp-test.c
> @@ -281,8 +281,15 @@ static void test_qmp_oob(void)
>   * will only be able to be handled after the queue is shrinked, so
>   * it'll be processed only after one existing in-band command
>   * finishes.
> + *
> + * NOTE: we need to feed the queue with one extra request to make
> + * sure it'll be stuck since when we have sent the Nth request
> + * it's possible that we have already popped and processing the
> + * 1st request so the Nth request (which could potentially be the
> + * [N-1]th element on the queue) might not trigger the
> + * monitor-full condition deterministically.
>   */
> -for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX; i++) {
> +for (i = 1; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) {
>  id = g_strdup_printf("queue-blocks-%d", i);
>  send_cmd_that_blocks(qts, id);
>  g_free(id);
> @@ -291,7 +298,7 @@ static void test_qmp_oob(void)
>  unblock_blocked_cmd();
>  recv_cmd_id(qts, "queue-blocks-1");
>  recv_cmd_id(qts, "oob-1");
> -for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX; i++) {
> +for (i = 2; i <= QMP_REQ_QUEUE_LEN_MAX + 1; i++) {
>  unblock_blocked_cmd();
>  id = g_strdup_printf("queue-blocks-%d", i);
>  recv_cmd_id(qts, id);
> 
> So the problem here is that the queue-block-N command might not really
> suspend the monitor everytime if we already popped the 1st request,
> which will let the N-th request to be (N-1)th, then the parser will
> continue to eat the oob command and it could "preempt" the previous
> commands.
> 
> Maybe FreeBSD is scheduling the threads in some pattern so it happens
> only on FreeBSD and very constantly, but anyway it should be a general
> fix to the test program.

Markus, do you want me to repost a new version with this change?  Is
it still possible to have the oob-default series for 3.1?

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v4 10/23] json: Define new QMP message for pvrdma

2018-11-18 Thread Markus Armbruster
The subsystem tag in the commit message's title should be "qapi: " not
"json: ".

Yuval Shaia  writes:

> pvrdma requires that the same GID attached to it will be attached to the
> backend device in the host.
>
> A new QMP messages is defined so pvrdma device can broadcast any change
> made to its GID table. This event is captured by libvirt which in turn
> will update the GID table in the backend device.
>
> Signed-off-by: Yuval Shaia 
> Reviewed-by: Marcel Apfelbaum 
> ---
>  MAINTAINERS   |  1 +
>  Makefile  |  3 ++-
>  Makefile.objs |  4 
>  qapi/qapi-schema.json |  1 +
>  qapi/rdma.json| 38 ++
>  5 files changed, 46 insertions(+), 1 deletion(-)
>  create mode 100644 qapi/rdma.json
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e087d58ac6..a149f68a8f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2232,6 +2232,7 @@ F: hw/rdma/*
>  F: hw/rdma/vmw/*
>  F: docs/pvrdma.txt
>  F: contrib/rdmacm-mux/*
> +F: qapi/rdma.json
>  
>  Build and test automation
>  -
> diff --git a/Makefile b/Makefile
> index 94072776ff..db4ce60ee5 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json 
> $(SRC_PATH)/qapi/common.json \
> $(SRC_PATH)/qapi/tpm.json \
> $(SRC_PATH)/qapi/trace.json \
> $(SRC_PATH)/qapi/transaction.json \
> -   $(SRC_PATH)/qapi/ui.json
> +   $(SRC_PATH)/qapi/ui.json \
> +   $(SRC_PATH)/qapi/rdma.json

Please insert between net.json and rocker.json to maintain alphabetical
order.

>  qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \
>  qapi/qapi-types.c qapi/qapi-types.h \
> diff --git a/Makefile.objs b/Makefile.objs
> index cc7df3ad80..76d8028f2f 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o
>  util-obj-y += qapi/qapi-types-trace.o
>  util-obj-y += qapi/qapi-types-transaction.o
>  util-obj-y += qapi/qapi-types-ui.o
> +util-obj-y += qapi/qapi-types-rdma.o
>  util-obj-y += qapi/qapi-builtin-visit.o
>  util-obj-y += qapi/qapi-visit.o
>  util-obj-y += qapi/qapi-visit-block-core.o
> @@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o
>  util-obj-y += qapi/qapi-visit-trace.o
>  util-obj-y += qapi/qapi-visit-transaction.o
>  util-obj-y += qapi/qapi-visit-ui.o
> +util-obj-y += qapi/qapi-visit-rdma.o
>  util-obj-y += qapi/qapi-events.o
>  util-obj-y += qapi/qapi-events-block-core.o
>  util-obj-y += qapi/qapi-events-block.o
> @@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o
>  util-obj-y += qapi/qapi-events-trace.o
>  util-obj-y += qapi/qapi-events-transaction.o
>  util-obj-y += qapi/qapi-events-ui.o
> +util-obj-y += qapi/qapi-events-rdma.o
>  util-obj-y += qapi/qapi-introspect.o
>  
>  chardev-obj-y = chardev/
> @@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o
>  common-obj-y += qapi/qapi-commands-trace.o
>  common-obj-y += qapi/qapi-commands-transaction.o
>  common-obj-y += qapi/qapi-commands-ui.o
> +common-obj-y += qapi/qapi-commands-rdma.o
>  common-obj-y += qapi/qapi-introspect.o
>  common-obj-y += qmp.o hmp.o
>  endif

This is incomplete, and it conflicts with Eric Blake's "[PATCH v3] qapi:
Reduce Makefile boilerplate".  I recommend to base on Eric's patch,
because that'll let you add the new rdma.json much more easily.

> diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
> index 65b6dc2f6f..a650d80f83 100644
> --- a/qapi/qapi-schema.json
> +++ b/qapi/qapi-schema.json
> @@ -94,3 +94,4 @@
>  { 'include': 'trace.json' }
>  { 'include': 'introspect.json' }
>  { 'include': 'misc.json' }
> +{ 'include': 'rdma.json' }

This makes the "RDMA device" section appear last in the QEMU QMP
reference manual, since they appear in include order there.

$ grep -h @section bld-x86/qapi/qapi-doc.texi 
@section Introduction
@section Stability Considerations
@section Common data types
@section Socket data types
@section VM run state
@section Cryptography
@section Block devices
@section Character devices
@section Net devices
@section Rocker switch device
@section TPM (trusted platform module) devices
@section Remote desktop
@section Input
@section Migration
@section Transactions
@section Tracing
@section QMP introspection
@section Miscellanea
@section RDMA device

It should go next to the other "device" sections, shouldn't it?

> diff --git a/qapi/rdma.json b/qapi/rdma.json
> new file mode 100644
> index 00..804c68ab36
> --- /dev/null
> +++ b/qapi/rdma.json
> @@ -0,0 +1,38 @@
> +# -*- Mode: Python -*-
> +#
> +
> +##
> +# = RDMA device
> +##
> +
> +##
> +# @RDMA_GID_STATUS_CHANGED:
> +#
> +# Emitted when guest driver adds/deletes GID to/from device
> +#
> +# @netdev: RoCE Network Device name - char *
> +#
> +# @gid-status: Add or delete indication - bool
> +#
> +# @subnet-prefix: Subnet Prefix - uint64
> 

Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()

2018-11-18 Thread Li Qiang
Markus Armbruster  于2018年11月19日周一 下午3:01写道:

> ÀîÇ¿  writes:
>
> > At 2018-11-17 00:52:58, "Markus Armbruster"  wrote:
> >>Li Qiang  writes:
> >>
> >>> Currently the user can set a negative reboot_timeout.
> >>> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then
> >>> convert it to number.
> >>
> >>Again, it's not wrong per se, just needlessly complicated and
> >>error-prone.  What makes it wrong is ...
> >>
> >>> convert it to number. This patch refactor this function by following:
> >>> 1. ensure reboot_timeout is in 0~0x
> >>> 2. use qemu_opt_get_number() to parse reboot_timeout
> >>> 3. simlify code
> >>>
> >>> Signed-off-by: Li Qiang 
> >>> ---
> >>>  hw/nvram/fw_cfg.c | 23 +++
> >>>  vl.c  |  2 +-
> >>>  2 files changed, 12 insertions(+), 13 deletions(-)
> >>>
> >>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
> >>> index 78f43dad93..6aca80846a 100644
> >>> --- a/hw/nvram/fw_cfg.c
> >>> +++ b/hw/nvram/fw_cfg.c
> >>> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s)
> >>>
> >>>  static void fw_cfg_reboot(FWCfgState *s)
> >>>  {
> >>> -int reboot_timeout = -1;
> >>> -char *p;
> >>> -const char *temp;
> >>> +const char *reboot_timeout = NULL;
> >>>
> >>>  /* get user configuration */
> >>>  QemuOptsList *plist = qemu_find_opts("boot-opts");
> >>>  QemuOpts *opts = QTAILQ_FIRST(>head);
> >>> -if (opts != NULL) {
> >>> -temp = qemu_opt_get(opts, "reboot-timeout");
> >>> -if (temp != NULL) {
> >>> -p = (char *)temp;
> >>> -reboot_timeout = strtol(p, , 10);
> >>
> >>... the total lack of error checking here.  Same in PATCH 1.
> >
> >>
> >
> >
> > Got.
> >
> >
> >>Here's my attempt at a clearer commit message:
> >>
> >>fw_cfg: Fix -boot reboot-timeout error checking
> >>
> >>fw_cfg_reboot() gets option parameter "reboot-timeout" with
> >>qemu_opt_get(), then converts it to an integer by hand.  It neglects
> >>to check that conversion for errors, and fails to reject negative
> >>values.  Positive values above the limit get reported and replaced
> >>by the limit.
> >>
> >>Check for conversion errors properly, and reject all values outside
> >>0..0x.
> >
> >>
> >
> >
> > Thanks for your advice, I appreciate it and will change in the revision
> version.
> >
> >
> >>PATCH 1's commit message could be improved the same way.
> >>
> >>> -}
> >>> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout");
> >>> +
> >>> +if (reboot_timeout == NULL) {
> >>> +return;
> >>>  }
> >>> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1);
> >>> +
> >>>  /* validate the input */
> >>> -if (reboot_timeout > 0x) {
> >>> -error_report("reboot timeout is larger than 65535, force it
> to 65535.");
> >>> -reboot_timeout = 0x;
> >>> +if (rt_val < 0 || rt_val > 0x) {
> >>> +error_report("reboot timeout is invalid,"
> >>> + "it should be a value between 0 and 65535");
> >>> +exit(1);
> >>>  }
> >>>  fw_cfg_add_file(s, "etc/boot-fail-wait",
> g_memdup(_timeout, 4), 4);
> >>>  }
> >>
> >>Change in behavior when "reboot-timeout" isn't specified.
> >>
> >>Before your patch, we fw_cfg_add_file() with a value of -1.
> >>
> >>After your patch, we don't fw_cfg_add_file().
> >>
> >>Why is that okay?
> >
> >>
> >
> >
> > Here I following Gerd's advice.
> > For values >0x  or < 0, report and exit.
> > -->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html
>
> Cases:
>
> 0. "reboot-timeout" not specified (e.g. no -boot option given)
>
> 1. "reboot-timeout" specified, value out of bounds
> 1.a. < 0
> 1.b. > 0x
>
> 2. "reboot-timeout" specified, value okay
>
> Gerd's advice is about case 1.  Your patch implements it.
>
> My question is about case 0.
>
> Do you understand my question now?
>


OK got. Once I think the 'reboot_timeout' can't be -1(as the user can't set
this),
but seems it's ok to be -1(the default value as no 'reboot-timeout'
specified).

I will prepare another patchset later.

Thanks,
Li Qiang



>
> [...]
>


Re: [Qemu-devel] SeaBIOS booting time optimization

2018-11-18 Thread Samuel Ortiz
Hi Stefan,

On Fri, Nov 16, 2018 at 04:39:20PM +, Stefan Hajnoczi wrote:
> On Fri, Nov 16, 2018 at 4:21 PM Stefano Garzarella  
> wrote:
> > I'm investigating the SeaBIOS booting time, to understand if we can reduce
> > the boot time in some cases (e.g. legacy hardware is not needed). I think
> > this
> > can be interesting also for NEMU developers.
> 
> Questions for NEMU folks:
> 
> qboot isn't part of the virt-x86 NEMU tree and is only mentioned in
> one or two places.  Is NEMU focussing exclusively on OVMF guest
> firmware instead of qboot?
Initally we were only focusing on OVMF, but we added support for qboot
[1] as we were not happy with the OVMF boot latencies we measured,
especially in the Kata containers context. Rob was about to start
looking at seabios support as well.

We are going to upstream a cleaner version of the virt support to qboot
very soon now.

> Are there other boot time optimizations in NEMU that are a candidate
> for upstreaming?

We did not add anything specific, but when comparing PC+qboot with
virt+qboot, we're seeing interesting overall boot time improvements. We
assume the simpler virt machine type explains the difference.

Cheers,
Samuel.

[1] 
https://github.com/rbradford/qboot/commit/dbcaaa1f6921064cd46f330a3625eb01e44aa4df



Re: [Qemu-devel] [PATCH 2/2] hw: fw_cfg: refactor fw_cfg_reboot()

2018-11-18 Thread 李强









At 2018-11-17 00:52:58, "Markus Armbruster"  wrote:
>Li Qiang  writes:
>
>> Currently the user can set a negative reboot_timeout.
>> Also it is wrong to parse 'reboot-timeout' with qemu_opt_get() and then
>> convert it to number.
>
>Again, it's not wrong per se, just needlessly complicated and
>error-prone.  What makes it wrong is ...
>
>> convert it to number. This patch refactor this function by following:
>> 1. ensure reboot_timeout is in 0~0x
>> 2. use qemu_opt_get_number() to parse reboot_timeout
>> 3. simlify code
>>
>> Signed-off-by: Li Qiang 
>> ---
>>  hw/nvram/fw_cfg.c | 23 +++
>>  vl.c  |  2 +-
>>  2 files changed, 12 insertions(+), 13 deletions(-)
>>
>> diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
>> index 78f43dad93..6aca80846a 100644
>> --- a/hw/nvram/fw_cfg.c
>> +++ b/hw/nvram/fw_cfg.c
>> @@ -178,24 +178,23 @@ static void fw_cfg_bootsplash(FWCfgState *s)
>>  
>>  static void fw_cfg_reboot(FWCfgState *s)
>>  {
>> -int reboot_timeout = -1;
>> -char *p;
>> -const char *temp;
>> +const char *reboot_timeout = NULL;
>>  
>>  /* get user configuration */
>>  QemuOptsList *plist = qemu_find_opts("boot-opts");
>>  QemuOpts *opts = QTAILQ_FIRST(>head);
>> -if (opts != NULL) {
>> -temp = qemu_opt_get(opts, "reboot-timeout");
>> -if (temp != NULL) {
>> -p = (char *)temp;
>> -reboot_timeout = strtol(p, , 10);
>
>... the total lack of error checking here.  Same in PATCH 1.

>


Got.


>Here's my attempt at a clearer commit message:
>
>fw_cfg: Fix -boot reboot-timeout error checking
>
>fw_cfg_reboot() gets option parameter "reboot-timeout" with
>qemu_opt_get(), then converts it to an integer by hand.  It neglects
>to check that conversion for errors, and fails to reject negative
>values.  Positive values above the limit get reported and replaced
>by the limit.
>
>Check for conversion errors properly, and reject all values outside
>0..0x.

>


Thanks for your advice, I appreciate it and will change in the revision version.


>PATCH 1's commit message could be improved the same way.
>
>> -}
>> +reboot_timeout = qemu_opt_get(opts, "reboot-timeout");
>> +
>> +if (reboot_timeout == NULL) {
>> +return;
>>  }
>> +int64_t rt_val = qemu_opt_get_number(opts, "reboot-timeout", -1);
>> +
>>  /* validate the input */
>> -if (reboot_timeout > 0x) {
>> -error_report("reboot timeout is larger than 65535, force it to 
>> 65535.");
>> -reboot_timeout = 0x;
>> +if (rt_val < 0 || rt_val > 0x) {
>> +error_report("reboot timeout is invalid,"
>> + "it should be a value between 0 and 65535");
>> +exit(1);
>>  }
>>  fw_cfg_add_file(s, "etc/boot-fail-wait", g_memdup(_timeout, 4), 
>> 4);
>>  }
>
>Change in behavior when "reboot-timeout" isn't specified.
>
>Before your patch, we fw_cfg_add_file() with a value of -1.
>
>After your patch, we don't fw_cfg_add_file().
>
>Why is that okay?

>


Here I following Gerd's advice. 
For values >0x  or < 0, report and exit.
-->http://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00551.html
Thanks,
Li Qiang
>> diff --git a/vl.c b/vl.c
>> index be37da46f0..086127ff0b 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -339,7 +339,7 @@ static QemuOptsList qemu_boot_opts = {
>>  .type = QEMU_OPT_NUMBER,
>>  }, {
>>  .name = "reboot-timeout",
>> -.type = QEMU_OPT_STRING,
>> +.type = QEMU_OPT_NUMBER,
>>  }, {
>>  .name = "strict",
>>  .type = QEMU_OPT_BOOL,