Re: [PATCH net 3/4] vhost: vsock: add weight support

2019-05-16 Thread Jason Wang


On 2019/5/16 5:33 PM, Stefan Hajnoczi wrote:

On Thu, May 16, 2019 at 03:47:41AM -0400, Jason Wang wrote:

@@ -183,7 +184,8 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
virtio_transport_deliver_tap_pkt(pkt);
  
  		virtio_transport_free_pkt(pkt);

-   }
+   total_len += pkt->len;

Please increment total_len before virtio_transport_free_pkt(pkt) to
avoid use-after-free.



Right, let me fix this.

Thanks
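
For reference, the V2 3/4 patch later in this digest applies the suggested
ordering, accumulating the length before the packet is freed:

		total_len += pkt->len;
		virtio_transport_free_pkt(pkt);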



Re: [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Jakub Staroń
On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> + if (!list_empty(&vpmem->req_list)) {
> + req_buf = list_first_entry(&vpmem->req_list,
> + struct virtio_pmem_request, list);
> + req_buf->wq_buf_avail = true;
> + wake_up(&req_buf->wq_buf);
> + list_del(&req_buf->list);
Yes, this change is the right one, thank you!

> +  /*
> +   * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
> +   * queue does not have free descriptor. We add the request
> +   * to req_list and wait for host_ack to wake us up when free
> +   * slots are available.
> +   */
> + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req,
> + GFP_ATOMIC)) == -ENOSPC) {
> +
> + dev_err(&vdev->dev, "failed to send command to virtio pmem" \
> + "device, no free slots in the virtqueue\n");
> + req->wq_buf_avail = false;
> + list_add_tail(&req->list, &vpmem->req_list);
> + spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> + /* A host response results in "host_ack" getting called */
> + wait_event(req->wq_buf, req->wq_buf_avail);
> + spin_lock_irqsave(&vpmem->pmem_lock, flags);
> + }
> + err1 = virtqueue_kick(vpmem->req_vq);
> + spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> +
> + /*
> +  * virtqueue_add_sgs failed with error different than -ENOSPC, we can't
> +  * do anything about that.
> +  */
> + if (err || !err1) {
> + dev_info(&vdev->dev, "failed to send command to virtio pmem 
> device\n");
> + err = -EIO;
> + } else {
> + /* A host repsonse results in "host_ack" getting called */
> + wait_event(req->host_acked, req->done);
> + err = req->ret;

I confirm that the failures I was facing with the `-ENOSPC` error path are
not present in v9.

Best,
Jakub Staron


[PATCH V2 1/4] vhost: introduce vhost_exceeds_weight()

2019-05-16 Thread Jason Wang
We used to have vhost_exceeds_weight() for vhost-net to:

- prevent vhost kthread from hogging the cpu
- balance the time spent between TX and RX

This function could be useful for vsock and scsi as well. So move it
to vhost.c. A device must specify a weight which counts the number of
requests, or it can also specify a byte_weight which counts the
number of bytes that have been processed.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 22 ++
 drivers/vhost/scsi.c  |  9 -
 drivers/vhost/vhost.c | 20 +++-
 drivers/vhost/vhost.h |  5 -
 drivers/vhost/vsock.c | 12 +++-
 5 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index df51a35..061a06d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -604,12 +604,6 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, 
struct iov_iter *iter,
return iov_iter_count(iter);
 }
 
-static bool vhost_exceeds_weight(int pkts, int total_len)
-{
-   return total_len >= VHOST_NET_WEIGHT ||
-  pkts >= VHOST_NET_PKT_WEIGHT;
-}
-
 static int get_tx_bufs(struct vhost_net *net,
   struct vhost_net_virtqueue *nvq,
   struct msghdr *msg,
@@ -845,10 +839,8 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
++nvq->done_idx;
-   if (vhost_exceeds_weight(++sent_pkts, total_len)) {
-   vhost_poll_queue(&vq->poll);
+   if (vhost_exceeds_weight(vq, ++sent_pkts, total_len))
break;
-   }
}
 
vhost_tx_batch(net, nvq, sock, &msg);
@@ -951,10 +943,9 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
-   if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
-   vhost_poll_queue(&vq->poll);
+   if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
+ total_len)))
break;
-   }
}
 }
 
@@ -1239,10 +1230,8 @@ static void handle_rx(struct vhost_net *net)
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
total_len += vhost_len;
-   if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
-   vhost_poll_queue(&vq->poll);
+   if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
goto out;
-   }
}
if (unlikely(busyloop_intr))
vhost_poll_queue(&vq->poll);
@@ -1338,7 +1327,8 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vhost_net_buf_init(&n->vqs[i].rxq);
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
-  UIO_MAXIOV + VHOST_NET_BATCH);
+  UIO_MAXIOV + VHOST_NET_BATCH,
+  VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
 
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, 
dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 618fb64..d830579 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -57,6 +57,12 @@
 #define VHOST_SCSI_PREALLOC_UPAGES 2048
 #define VHOST_SCSI_PREALLOC_PROT_SGLS 2048
 
+/* Max number of requests before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others with
+ * request.
+ */
+#define VHOST_SCSI_WEIGHT 256
+
 struct vhost_scsi_inflight {
/* Wait for the flush operation to finish */
struct completion comp;
@@ -1622,7 +1628,8 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
vqs[i] = &vs->vqs[i].vq;
vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
}
-   vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV);
+   vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
+  VHOST_SCSI_WEIGHT, 0);
 
vhost_scsi_init_inflight(vs, NULL);
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 351af88..290d6e5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -413,8 +413,24 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
vhost_vq_free_iovecs(dev->vqs[i]);
 }
 
+bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
+ int pkts, int total_len)
+{
+   struct vhost_dev *dev = vq->dev;
+
+   if ((dev->byte_weight && total_len >= dev->byte_weight) ||
+   pkts >= dev->weight) {
+   
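The helper is truncated here in the archive; judging from its call sites in
this series (and the code it replaces in net.c above), the body plausibly
continues along these lines (a sketch, not the verbatim patch):

		vhost_poll_queue(&vq->poll);
		return true;
	}

	return false;
}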

Re: [Qemu-devel] [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Pankaj Gupta



Hi Jakub,

> 
> On 5/14/19 7:54 AM, Pankaj Gupta wrote:
> > +   if (!list_empty(&vpmem->req_list)) {
> > +   req_buf = list_first_entry(&vpmem->req_list,
> > +   struct virtio_pmem_request, list);
> > +   req_buf->wq_buf_avail = true;
> > +   wake_up(&req_buf->wq_buf);
> > +   list_del(&req_buf->list);
> Yes, this change is the right one, thank you!

Thank you for the confirmation.

> 
> > +/*
> > + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual
> > + * queue does not have free descriptor. We add the request
> > + * to req_list and wait for host_ack to wake us up when free
> > + * slots are available.
> > + */
> > +   while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req,
> > +   GFP_ATOMIC)) == -ENOSPC) {
> > +
> > +   dev_err(&vdev->dev, "failed to send command to virtio pmem" \
> > +   "device, no free slots in the virtqueue\n");
> > +   req->wq_buf_avail = false;
> > +   list_add_tail(&req->list, &vpmem->req_list);
> > +   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +   /* A host response results in "host_ack" getting called */
> > +   wait_event(req->wq_buf, req->wq_buf_avail);
> > +   spin_lock_irqsave(&vpmem->pmem_lock, flags);
> > +   }
> > +   err1 = virtqueue_kick(vpmem->req_vq);
> > +   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +   /*
> > +* virtqueue_add_sgs failed with error different than -ENOSPC, we can't
> > +* do anything about that.
> > +*/
> > +   if (err || !err1) {
> > +   dev_info(&vdev->dev, "failed to send command to virtio pmem 
> > device\n");
> > +   err = -EIO;
> > +   } else {
> > +   /* A host repsonse results in "host_ack" getting called */
> > +   wait_event(req->host_acked, req->done);
> > +   err = req->ret;
> 
> I confirm that the failures I was facing with the `-ENOSPC` error path are
> not present in v9.

Can I take that as your Reviewed-by/Acked-by or Tested-by tag for the virtio patch? :)

Thank you,
Pankaj

> 
> Best,
> Jakub Staron
> 
> 


[PATCH V2 3/4] vhost: vsock: add weight support

2019-05-16 Thread Jason Wang
This patch will check the weight and exit the loop if we exceed the
weight. This is useful for preventing the vsock kthread from hogging the
CPU, which is guest triggerable. The weight can help to avoid starving
requests from one direction while the other direction is being processed.

The value of weight is picked from vhost-net.

This addresses CVE-2019-3900.

Cc: Stefan Hajnoczi 
Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
Signed-off-by: Jason Wang 
---
 drivers/vhost/vsock.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 47c6d4d..814bed7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -86,6 +86,7 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
struct vhost_virtqueue *vq)
 {
struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+   int pkts = 0, total_len = 0;
bool added = false;
bool restart_tx = false;
 
@@ -97,7 +98,7 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
/* Avoid further vmexits, we're already processing the virtqueue */
vhost_disable_notify(&vsock->dev, vq);
 
-   for (;;) {
+   do {
struct virtio_vsock_pkt *pkt;
struct iov_iter iov_iter;
unsigned out, in;
@@ -182,8 +183,9 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
 */
virtio_transport_deliver_tap_pkt(pkt);
 
+   total_len += pkt->len;
virtio_transport_free_pkt(pkt);
-   }
+   } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added)
vhost_signal(&vsock->dev, vq);
 
@@ -358,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 dev);
struct virtio_vsock_pkt *pkt;
-   int head;
+   int head, pkts = 0, total_len = 0;
unsigned int out, in;
bool added = false;
 
@@ -368,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
goto out;
 
vhost_disable_notify(&vsock->dev, vq);
-   for (;;) {
+   do {
u32 len;
 
if (!vhost_vsock_more_replies(vsock)) {
@@ -409,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
else
virtio_transport_free_pkt(pkt);
 
-   vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+   len += sizeof(pkt->hdr);
+   vhost_add_used(vq, head, len);
+   total_len += len;
added = true;
-   }
+   } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
 
 no_more_replies:
if (added)
-- 
1.8.3.1



[PATCH V2 2/4] vhost_net: fix possible infinite loop

2019-05-16 Thread Jason Wang
When the rx buffer is too small for a packet, we will discard the vq
descriptor and retry it for the next packet:

while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
  &busyloop_intr))) {
...
/* On overrun, truncate and discard */
if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
 1, MSG_DONTWAIT | MSG_TRUNC);
pr_debug("Discarded rx packet: len %zd\n", sock_len);
continue;
}
...
}

This makes it possible to trigger an infinite while..continue loop
through the co-operation of two VMs like:

1) Malicious VM1 allocates a 1 byte rx buffer and tries to slow down the
   vhost process as much as possible, e.g. using indirect descriptors or
   other means.
2) Malicious VM2 generates packets to VM1 as fast as possible

Fix this by checking against the weight at the end of the RX and TX
loops. This also eliminates other similar cases when:

- userspace is consuming the packets in the meanwhile
- a theoretical TOCTOU attack if the guest moves the avail index back and
  forth to hit the continue after vhost finds the guest just added new buffers

This addresses CVE-2019-3900.

Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short")
Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
Signed-off-by: Jason Wang 
---
The patch is needed for stable.
---
 drivers/vhost/net.c | 29 +
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 061a06d..2d9df78 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -773,7 +773,7 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 
-   for (;;) {
+   do {
bool busyloop_intr = false;
 
if (nvq->done_idx == VHOST_NET_BATCH)
@@ -839,9 +839,7 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
++nvq->done_idx;
-   if (vhost_exceeds_weight(vq, ++sent_pkts, total_len))
-   break;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
vhost_tx_batch(net, nvq, sock, &msg);
 }
@@ -866,7 +864,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
bool zcopy_used;
int sent_pkts = 0;
 
-   for (;;) {
+   do {
bool busyloop_intr;
 
/* Release DMAs done buffers first */
@@ -943,10 +941,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
-   if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
- total_len)))
-   break;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -1144,8 +1139,11 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
-   while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
- &busyloop_intr))) {
+   do {
+   sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr);
+   if (!sock_len)
+   break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -1230,12 +1228,11 @@ static void handle_rx(struct vhost_net *net)
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
total_len += vhost_len;
-   if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
-   goto out;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
+
if (unlikely(busyloop_intr))
vhost_poll_queue(&vq->poll);
-   else
+   else if (!sock_len)
vhost_net_enable_vq(net, vq);
 out:
vhost_net_signal_used(nvq);
@@ -1328,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
   UIO_MAXIOV + VHOST_NET_BATCH,
-  VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
+  VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
 

[PATCH V2 0/4] Prevent vhost kthread from hogging CPU

2019-05-16 Thread Jason Wang
Hi:

This series tries to prevent guest-triggerable CPU hogging through the
vhost kthread. This is done by introducing and checking a weight after
each request. The patches have been tested with reproducers for vsock
and virtio-net. Only a compile test was done for vhost-scsi.

Please review.

This addresses CVE-2019-3900.
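
The handlers are converted to a common bounded-loop pattern, roughly (a
sketch of the idea, not code copied from any single patch):

	do {
		/* handle one request, accumulate total_len */
	} while (likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

where vhost_exceeds_weight() requeues the work once the request or byte
budget is exhausted, so the kthread yields instead of spinning.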

Changes from V1:
- fix use-after-free in vsock patch

Jason Wang (4):
  vhost: introduce vhost_exceeds_weight()
  vhost_net: fix possible infinite loop
  vhost: vsock: add weight support
  vhost: scsi: add weight support

 drivers/vhost/net.c   | 41 ++---
 drivers/vhost/scsi.c  | 21 ++---
 drivers/vhost/vhost.c | 20 +++-
 drivers/vhost/vhost.h |  5 -
 drivers/vhost/vsock.c | 28 +---
 5 files changed, 72 insertions(+), 43 deletions(-)

-- 
1.8.3.1



[PATCH V2 4/4] vhost: scsi: add weight support

2019-05-16 Thread Jason Wang
This patch will check the weight and exit the loop if we exceed the
weight. This is useful for preventing the scsi kthread from hogging the
CPU, which is guest triggerable.

This addresses CVE-2019-3900.

Cc: Paolo Bonzini 
Cc: Stefan Hajnoczi 
Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric 
driver")
Signed-off-by: Jason Wang 
---
 drivers/vhost/scsi.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index d830579..3a59f47 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -918,7 +918,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
struct iov_iter in_iter, prot_iter, data_iter;
u64 tag;
u32 exp_data_len, data_direction;
-   int ret, prot_bytes;
+   int ret, prot_bytes, c = 0;
u16 lun;
u8 task_attr;
bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI);
@@ -938,7 +938,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 
vhost_disable_notify(&vs->dev, vq);
 
-   for (;;) {
+   do {
ret = vhost_scsi_get_desc(vs, vq, &vc);
if (ret)
goto err;
@@ -1118,7 +1118,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
break;
else if (ret == -EIO)
vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
mutex_unlock(&vq->mutex);
 }
@@ -1177,7 +1177,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
} v_req;
struct vhost_scsi_ctx vc;
size_t typ_size;
-   int ret;
+   int ret, c = 0;
 
mutex_lock(&vq->mutex);
/*
@@ -1191,7 +1191,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 
vhost_disable_notify(&vs->dev, vq);
 
-   for (;;) {
+   do {
ret = vhost_scsi_get_desc(vs, vq, &vc);
if (ret)
goto err;
@@ -1270,7 +1270,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
break;
else if (ret == -EIO)
vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
mutex_unlock(&vq->mutex);
 }
-- 
1.8.3.1



Re: [Qemu-devel] [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Pankaj Gupta


> 
> On Wed, May 15, 2019 at 10:46:00PM +0200, David Hildenbrand wrote:
> > > + vpmem->vdev = vdev;
> > > + vdev->priv = vpmem;
> > > + err = init_vq(vpmem);
> > > + if (err) {
> > > + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> > > + goto out_err;
> > > + }
> > > +
> > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > > + start, &vpmem->start);
> > > + virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > > + size, &vpmem->size);
> > > +
> > > + res.start = vpmem->start;
> > > + res.end   = vpmem->start + vpmem->size-1;
> > 
> > nit: " - 1;"
> > 
> > > + vpmem->nd_desc.provider_name = "virtio-pmem";
> > > + vpmem->nd_desc.module = THIS_MODULE;
> > > +
> > > + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> > > + &vpmem->nd_desc);
> > > + if (!vpmem->nvdimm_bus) {
> > > + dev_err(&vdev->dev, "failed to register device with 
> > > nvdimm_bus\n");
> > > + err = -ENXIO;
> > > + goto out_vq;
> > > + }
> > > +
> > > + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> > > +
> > > + ndr_desc.res = &res;
> > > + ndr_desc.numa_node = nid;
> > > + ndr_desc.flush = async_pmem_flush;
> > > + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> > > + set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> > > + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> > > + if (!nd_region) {
> > > + dev_err(&vdev->dev, "failed to create nvdimm region\n");
> > > + err = -ENXIO;
> > > + goto out_nd;
> > > + }
> > > + nd_region->provider_data =
> > > dev_to_virtio(nd_region->dev.parent->parent);
> > > + return 0;
> > > +out_nd:
> > > + nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > > +out_vq:
> > > + vdev->config->del_vqs(vdev);
> > > +out_err:
> > > + return err;
> > > +}
> > > +
> > > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > > +{
> > > + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> > > +
> > > + nvdimm_bus_unregister(nvdimm_bus);
> > > + vdev->config->del_vqs(vdev);
> > > + vdev->config->reset(vdev);
> > > +}
> > > +
> > > +static struct virtio_driver virtio_pmem_driver = {
> > > + .driver.name= KBUILD_MODNAME,
> > > + .driver.owner   = THIS_MODULE,
> > > + .id_table   = id_table,
> > > + .probe  = virtio_pmem_probe,
> > > + .remove = virtio_pmem_remove,
> > > +};
> > > +
> > > +module_virtio_driver(virtio_pmem_driver);
> > > +MODULE_DEVICE_TABLE(virtio, id_table);
> > > +MODULE_DESCRIPTION("Virtio pmem driver");
> > > +MODULE_LICENSE("GPL");
> > > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > > new file mode 100644
> > > index ..ab1da877575d
> > > --- /dev/null
> > > +++ b/drivers/nvdimm/virtio_pmem.h
> > > @@ -0,0 +1,60 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * virtio_pmem.h: virtio pmem Driver
> > > + *
> > > + * Discovers persistent memory range information
> > > + * from host and provides a virtio based flushing
> > > + * interface.
> > > + **/
> > > +
> > > +#ifndef _LINUX_VIRTIO_PMEM_H
> > > +#define _LINUX_VIRTIO_PMEM_H
> > > +
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +struct virtio_pmem_request {
> > > + /* Host return status corresponding to flush request */
> > > + int ret;
> > > +
> > > + /* command name*/
> > > + char name[16];
> > 
> > So ... why are we sending string commands and expect native-endianess
> > integers and don't define a proper request/response structure + request
> > types in include/uapi/linux/virtio_pmem.h like
> 
> passing names could be ok.
> I missed the fact we return a native endian int.
> Pls fix that.

Sure. will fix this.

> 
> 
> > 
> > struct virtio_pmem_resp {
> > __virtio32 ret;
> > }
> > 
> > #define VIRTIO_PMEM_REQ_TYPE_FLUSH  1
> > struct virtio_pmem_req {
> > __virtio16 type;
> > }
> > 
> > ... and this way we also define a proper endianess format for exchange
> > and keep it extensible
> > 
> > @MST, what's your take on this?
> 
> Extensions can always use feature bits so I don't think
> it's a problem.

That was exactly my thought when I implemented this. Though I am
fine with separate structures for request/response and I made the
change. 
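
For illustration, the split definitions could look roughly like this, used
with the usual virtio endian helpers (a sketch based on David's proposal;
struct and field names are illustrative, not the final v10 code):

	#define VIRTIO_PMEM_REQ_TYPE_FLUSH	1

	struct virtio_pmem_req {
		__virtio16 type;
	};

	struct virtio_pmem_resp {
		__virtio32 ret;
	};

	/* driver side */
	req->req.type = cpu_to_virtio16(vpmem->vdev, VIRTIO_PMEM_REQ_TYPE_FLUSH);
	...
	err = virtio32_to_cpu(vpmem->vdev, req->resp.ret);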

Thank you for all the comments.

Best regards,
Pankaj 
> > 
> > --
> > 
> > Thanks,
> > 
> > David / dhildenb
> 
> 


Re: [PATCH 05/10] s390/cio: introduce DMA pools to cio

2019-05-16 Thread Cornelia Huck
On Wed, 15 May 2019 19:12:57 +0200
Halil Pasic  wrote:

> On Mon, 13 May 2019 15:29:24 +0200
> Cornelia Huck  wrote:
> 
> > On Sun, 12 May 2019 20:22:56 +0200
> > Halil Pasic  wrote:
> >   
> > > On Fri, 10 May 2019 16:10:13 +0200
> > > Cornelia Huck  wrote:
> > >   
> > > > On Fri, 10 May 2019 00:11:12 +0200
> > > > Halil Pasic  wrote:
> > > > 
> > > > > On Thu, 9 May 2019 12:11:06 +0200
> > > > > Cornelia Huck  wrote:
> > > > > 
> > > > > > On Wed, 8 May 2019 23:22:10 +0200
> > > > > > Halil Pasic  wrote:
> > > > > >   
> > > > > > > On Wed, 8 May 2019 15:18:10 +0200 (CEST)
> > > > > > > Sebastian Ott  wrote:  
> > > > > >   
> > > > > > > > > @@ -1063,6 +1163,7 @@ static int __init css_bus_init(void)
> > > > > > > > >   
> > > > > > > > > unregister_reboot_notifier(&css_reboot_notifier);
> > > > > > > > >   goto out_unregister;
> > > > > > > > >   }
> > > > > > > > > + cio_dma_pool_init();  
> > > > > > > > 
> > > > > > > > This is too late for early devices (ccw console!).
> > > > > > > 
> > > > > > > You have already raised concern about this last time (thanks). I 
> > > > > > > think,
> > > > > > > I've addressed this issue: the cio_dma_pool is only used by the 
> > > > > > > airq
> > > > > > > stuff. I don't think the ccw console needs it. Please have an 
> > > > > > > other look
> > > > > > > at patch #6, and explain your concern in more detail if it 
> > > > > > > persists.  
> > > > > > 
> > > > > > What about changing the naming/adding comments here, so that (1) 
> > > > > > folks
> > > > > > aren't confused by the same thing in the future and (2) folks don't 
> > > > > > try
> > > > > > to use that pool for something needed for the early ccw consoles?
> > > > > >   
> > > > > 
> > > > > I'm all for clarity! Suggestions for better names?
> > > > 
> > > > css_aiv_dma_pool, maybe? Or is there other cross-device stuff that may
> > > > need it?
> > > > 
> > > 
> > > Ouch! I was considering to use cio_dma_zalloc() for the adapter
> > > interruption vectors but I ended up between the two chairs in the end.
> > > So with this series there are no uses for cio_dma pool.
> > > 
> > > I don't feel strongly about this going one way the other.
> > > 
> > > Against getting rid of the cio_dma_pool and sticking with the speaks
> > > dma_alloc_coherent() that we waste a DMA page per vector, which is a
> > > non obvious side effect.  
> > 
> > That would basically mean one DMA page per virtio-ccw device, right?  
> 
> Not quite: virtio-ccw shares airq vectors among multiple devices. We
> alloc 64 bytes a time and use that as long as we don't run out of bits.
>  
> > For single queue devices, this seems like quite a bit of overhead.
> >  
> 
> Nod.
>  
> > Are we expecting many devices in use per guest?  
> 
> This affects linux in general, not only guest 2 stuff (i.e. we
> also waste in vanilla lpar mode). And zpci seems to do at least one
> airq_iv_create() per pci function.

Hm, I thought that guests with virtio-ccw devices were the most heavy
user of this.

On LPAR (which would be the environment where I'd expect lots of
devices), how many users of this infrastructure do we typically have?
DASD do not use adapter interrupts, and QDIO devices (qeth/zfcp) use
their own indicator handling (that pre-dates this infrastructure) IIRC,
which basically only leaves the PCI functions you mention above.

> 
> >   
> > > 
> > > What speaks against cio_dma_pool is that it is slightly more code, and
> > > this currently can not be used for very early stuff, which I don't
> > > think is relevant.   
> > 
> > Unless properly documented, it feels like something you can easily trip
> > over, however.
> > 
> > I assume that the "very early stuff" is basically only ccw consoles.
> > Not sure if we can use virtio-serial as an early ccw console -- IIRC
> > that was only about 3215/3270? While QEMU guests are able to use a 3270
> > console, this is experimental and I would not count that as a use case.
> > Anyway, 3215/3270 don't use adapter interrupts, and probably not
> > anything cross-device, either; so unless early virtio-serial is a
> > thing, this restriction is fine if properly documented.
> >   
> 
> Mimu can you dig into this a bit?
> 
> We could also aim for getting rid of this limitation. One idea would be
> some sort of lazy initialization (i.e. triggered by first usage).
> Another idea would be simply doing the initialization earlier.
> Unfortunately I'm not all that familiar with the early stuff. Is there
> some doc you can recommend?

I'd probably look at the code for that.

> 
> Anyway before investing much more into this, I think we should have a
> fair idea which option do we prefer...

Agreed.

> 
> > > What also used to speak against it is that
> > > allocations asking for more than a page would just fail, but I addressed
> > > that in the patch I've hacked up on top of the series, and I'm going to
> > > paste below. While at it I 

Re: [PATCH 06/10] s390/cio: add basic protected virtualization support

2019-05-16 Thread Cornelia Huck
On Wed, 15 May 2019 22:51:58 +0200
Halil Pasic  wrote:

> On Mon, 13 May 2019 11:41:36 +0200
> Cornelia Huck  wrote:
> 
> > On Fri, 26 Apr 2019 20:32:41 +0200
> > Halil Pasic  wrote:
> >   
> > > As virtio-ccw devices are channel devices, we need to use the dma area
> > > for any communication with the hypervisor.
> > > 
> > > This patch addresses the most basic stuff (mostly what is required for
> > > virtio-ccw), and does take care of QDIO or any devices.  
> > 
> > "does not take care of QDIO", surely?   
> 
> I did not bother making the QDIO library code use dma memory for
> anything that is conceptually dma memory. AFAIK QDIO is out of scope for
> prot virt for now. If one were to do some emulated qdio with prot virt
> guests, one would need to make a bunch of things shared.

And unless you wanted to support protected virt under z/VM as well, it
would be wasted effort :)

> 
> > (Also, what does "any devices"
> > mean? Do you mean "every arbitrary device", perhaps?)  
> 
> What I mean is: this patch takes care of the core stuff, but any
> particular device is likely to have to do more -- that is it ain't all
> the cio devices support prot virt with this patch. For example
> virtio-ccw needs to make sure that the ccws constituting the channel
> programs, as well as the data pointed by the ccws is shared. If one
> would want to make vfio-ccw DASD pass-through work under prot virt, one
> would need to make sure, that everything that needs to be shared is
> shared (data buffers, channel programs).
> 
> Does is clarify things?

That's what I thought, but the sentence was confusing :) What about

"This patch addresses the most basic stuff (mostly what is required to
support virtio-ccw devices). It handles neither QDIO devices, nor
arbitrary non-virtio-ccw devices." ?

> 
> >   
> > > 
> > > An interesting side effect is that virtio structures are now going to
> > > get allocated in 31 bit addressable storage.  
> > 
> > Hm...
> >   
> > > 
> > > Signed-off-by: Halil Pasic 
> > > ---
> > >  arch/s390/include/asm/ccwdev.h   |  4 +++
> > >  drivers/s390/cio/ccwreq.c|  8 ++---
> > >  drivers/s390/cio/device.c| 65 
> > > +---
> > >  drivers/s390/cio/device_fsm.c| 40 -
> > >  drivers/s390/cio/device_id.c | 18 +--
> > >  drivers/s390/cio/device_ops.c| 21 +++--
> > >  drivers/s390/cio/device_pgid.c   | 20 ++---
> > >  drivers/s390/cio/device_status.c | 24 +++
> > >  drivers/s390/cio/io_sch.h| 21 +
> > >  drivers/s390/virtio/virtio_ccw.c | 10 ---
> > >  10 files changed, 148 insertions(+), 83 deletions(-)  
> > 
> > (...)
> >   
> > > diff --git a/drivers/s390/virtio/virtio_ccw.c 
> > > b/drivers/s390/virtio/virtio_ccw.c
> > > index 6d989c360f38..bb7a92316fc8 100644
> > > --- a/drivers/s390/virtio/virtio_ccw.c
> > > +++ b/drivers/s390/virtio/virtio_ccw.c
> > > @@ -66,7 +66,6 @@ struct virtio_ccw_device {
> > >   bool device_lost;
> > >   unsigned int config_ready;
> > >   void *airq_info;
> > > - u64 dma_mask;
> > >  };
> > >  
> > >  struct vq_info_block_legacy {
> > > @@ -1255,16 +1254,7 @@ static int virtio_ccw_online(struct ccw_device 
> > > *cdev)
> > >   ret = -ENOMEM;
> > >   goto out_free;
> > >   }
> > > -
> > >   vcdev->vdev.dev.parent = &cdev->dev;
> > > - cdev->dev.dma_mask = &vcdev->dma_mask;
> > > - /* we are fine with common virtio infrastructure using 64 bit DMA */
> > > - ret = dma_set_mask_and_coherent(&cdev->dev, DMA_BIT_MASK(64));
> > > - if (ret) {
> > > - dev_warn(&cdev->dev, "Failed to enable 64-bit DMA.\n");
> > > - goto out_free;
> > > - }  
> > 
> > This means that vring structures now need to fit into 31 bits as well,
> > I think?  
> 
> Nod.
> 
> > Is there any way to reserve the 31 bit restriction for channel
> > subsystem structures and keep vring in the full 64 bit range? (Or am I
> > fundamentally misunderstanding something?)
> >   
> 
> At the root of this problem is that the DMA API basically says devices
> may have addressing limitations expressed by the dma_mask, while our
> addressing limitations are not coming from the device but from the IO
> arch: e.g. orb.cpa and ccw.cda are 31 bit addresses. In our case it
> depends on how and for what is the device going to use the memory (e.g.
> buffers addressed by MIDA vs IDA vs direct).
> 
> Virtio uses the DMA properties of the parent, that is in our case the
> struct device embedded in struct ccw_device.
> 
> The previous version (RFC) used to allocate all the cio DMA stuff from
> this global cio_dma_pool using the css0.dev for the DMA API
> interactions. And we set *css0.dev.dma_mask == DMA_BIT_MASK(31) so
> e.g. the allocated ccws are 31 bit addressable.
> 
> But I was asked to change this so that when I allocate DMA memory for a
> channel program of particular ccw device, a struct device of that ccw
> device is used as the first argument of dma_alloc_coherent().
> 
> Considering

Re: [PATCH 06/10] s390/cio: add basic protected virtualization support

2019-05-16 Thread Cornelia Huck
On Wed, 15 May 2019 23:08:17 +0200
Halil Pasic  wrote:

> On Tue, 14 May 2019 10:47:34 -0400
> "Jason J. Herne"  wrote:

> > Are we 
> > worried that virtio data structures are going to be a burden on the 31-bit 
> > address space?
> > 
> >   
> 
> That is a good question I can not answer. Since it is currently at least
> a page per queue (because we use dma direct, right Mimu?), I am concerned
> about this.
> 
> Connie, what is your opinion?

Yes, running into problems there was one of my motivations for my
question. I guess it depends on the number of devices and how many
queues they use. The problem is that it affects not only protected virt
guests, but all guests.


Re: [PATCH 06/10] s390/cio: add basic protected virtualization support

2019-05-16 Thread Cornelia Huck
On Thu, 16 May 2019 15:42:45 +0200
Halil Pasic  wrote:

> On Thu, 16 May 2019 08:32:28 +0200
> Cornelia Huck  wrote:
> 
> > On Wed, 15 May 2019 23:08:17 +0200
> > Halil Pasic  wrote:
> >   
> > > On Tue, 14 May 2019 10:47:34 -0400
> > > "Jason J. Herne"  wrote:  
> >   
> > > > Are we 
> > > > worried that virtio data structures are going to be a burden on the 
> > > > 31-bit address space?
> > > > 
> > > > 
> > > 
> > > That is a good question I can not answer. Since it is currently at least
> > > a page per queue (because we use dma direct, right Mimu?), I am concerned
> > > about this.
> > > 
> > > Connie, what is your opinion?  
> > 
> > Yes, running into problems there was one of my motivations for my
> > question. I guess it depends on the number of devices and how many
> > queues they use. The problem is that it affects not only protected virt
> > guests, but all guests.
> >   
> 
> Unless things are about to change only devices that have
> VIRTIO_F_IOMMU_PLATFORM are affected. So it does not necessarily affect
> > non-protected virt guests. (With prot virt we have to use
> VIRTIO_F_IOMMU_PLATFORM.)
> 
> If it were not like this, I would be much more worried.

If we go forward with this approach, documenting this side effect of
VIRTIO_F_IOMMU_PLATFORM is something that needs to happen.

> 
> @Mimu: Could you please discuss this problem with the team? It might be
> worth considering to go back to the design of the RFC (i.e. cio/ccw stuff
> allocated from a common cio dma pool which gives you 31 bit addressable
> memory, and 64 bit dma mask for a ccw device of a virtio device).
> 
> Regards,
> Halil
> 



Re: [PATCH 06/10] s390/cio: add basic protected virtualization support

2019-05-16 Thread Halil Pasic
On Thu, 16 May 2019 08:32:28 +0200
Cornelia Huck  wrote:

> On Wed, 15 May 2019 23:08:17 +0200
> Halil Pasic  wrote:
> 
> > On Tue, 14 May 2019 10:47:34 -0400
> > "Jason J. Herne"  wrote:
> 
> > > Are we 
> > > worried that virtio data structures are going to be a burden on the 
> > > 31-bit address space?
> > > 
> > >   
> > 
> > That is a good question I can not answer. Since it is currently at least
> > a page per queue (because we use dma direct, right Mimu?), I am concerned
> > about this.
> > 
> > Connie, what is your opinion?
> 
> Yes, running into problems there was one of my motivations for my
> question. I guess it depends on the number of devices and how many
> queues they use. The problem is that it affects not only protected virt
> guests, but all guests.
> 

Unless things are about to change only devices that have
VIRTIO_F_IOMMU_PLATFORM are affected. So it does not necessarily affect
non-protected virt guests. (With prot virt we have to use
VIRTIO_F_IOMMU_PLATFORM.)

If it were not like this, I would be much more worried.

@Mimu: Could you please discuss this problem with the team? It might be
worth considering to go back to the design of the RFC (i.e. cio/ccw stuff
allocated from a common cio dma pool which gives you 31 bit addressable
memory, and 64 bit dma mask for a ccw device of a virtio device).

Regards,
Halil



Re: [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Michael S. Tsirkin
On Wed, May 15, 2019 at 10:46:00PM +0200, David Hildenbrand wrote:
> > +   vpmem->vdev = vdev;
> > +   vdev->priv = vpmem;
> > +   err = init_vq(vpmem);
> > +   if (err) {
> > +   dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
> > +   goto out_err;
> > +   }
> > +
> > +   virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +   start, &vpmem->start);
> > +   virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +   size, &vpmem->size);
> > +
> > +   res.start = vpmem->start;
> > +   res.end   = vpmem->start + vpmem->size-1;
> 
> nit: " - 1;"
> 
> > +   vpmem->nd_desc.provider_name = "virtio-pmem";
> > +   vpmem->nd_desc.module = THIS_MODULE;
> > +
> > +   vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
> > +   &vpmem->nd_desc);
> > +   if (!vpmem->nvdimm_bus) {
> > +   dev_err(&vdev->dev, "failed to register device with 
> > nvdimm_bus\n");
> > +   err = -ENXIO;
> > +   goto out_vq;
> > +   }
> > +
> > +   dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
> > +
> > +   ndr_desc.res = &res;
> > +   ndr_desc.numa_node = nid;
> > +   ndr_desc.flush = async_pmem_flush;
> > +   set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
> > +   set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> > +   nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
> > +   if (!nd_region) {
> > +   dev_err(&vdev->dev, "failed to create nvdimm region\n");
> > +   err = -ENXIO;
> > +   goto out_nd;
> > +   }
> > +   nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> > +   return 0;
> > +out_nd:
> > +   nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > +out_vq:
> > +   vdev->config->del_vqs(vdev);
> > +out_err:
> > +   return err;
> > +}
> > +
> > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > +{
> > +   struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
> > +
> > +   nvdimm_bus_unregister(nvdimm_bus);
> > +   vdev->config->del_vqs(vdev);
> > +   vdev->config->reset(vdev);
> > +}
> > +
> > +static struct virtio_driver virtio_pmem_driver = {
> > +   .driver.name= KBUILD_MODNAME,
> > +   .driver.owner   = THIS_MODULE,
> > +   .id_table   = id_table,
> > +   .probe  = virtio_pmem_probe,
> > +   .remove = virtio_pmem_remove,
> > +};
> > +
> > +module_virtio_driver(virtio_pmem_driver);
> > +MODULE_DEVICE_TABLE(virtio, id_table);
> > +MODULE_DESCRIPTION("Virtio pmem driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > new file mode 100644
> > index ..ab1da877575d
> > --- /dev/null
> > +++ b/drivers/nvdimm/virtio_pmem.h
> > @@ -0,0 +1,60 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * virtio_pmem.h: virtio pmem Driver
> > + *
> > + * Discovers persistent memory range information
> > + * from host and provides a virtio based flushing
> > + * interface.
> > + **/
> > +
> > +#ifndef _LINUX_VIRTIO_PMEM_H
> > +#define _LINUX_VIRTIO_PMEM_H
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +struct virtio_pmem_request {
> > +   /* Host return status corresponding to flush request */
> > +   int ret;
> > +
> > +   /* command name*/
> > +   char name[16];
> 
> So ... why are we sending string commands and expect native-endianess
> integers and don't define a proper request/response structure + request
> types in include/uapi/linux/virtio_pmem.h like

passing names could be ok.
I missed the fact we return a native endian int.
Pls fix that.


> 
> struct virtio_pmem_resp {
>   __virtio32 ret;
> }
> 
> #define VIRTIO_PMEM_REQ_TYPE_FLUSH1
> struct virtio_pmem_req {
>   __virtio16 type;
> }
> 
> ... and this way we also define a proper endianess format for exchange
> and keep it extensible
> 
> @MST, what's your take on this?

Extensions can always use feature bits so I don't think
it's a problem.

> 
> -- 
> 
> Thanks,
> 
> David / dhildenb


Re: [PATCH 05/10] s390/cio: introduce DMA pools to cio

2019-05-16 Thread Sebastian Ott
On Sun, 12 May 2019, Halil Pasic wrote:
> I've also got code that deals with AIRQ_IV_CACHELINE by turning the
> kmem_cache into a dma_pool.
> 
> Cornelia, Sebastian which approach do you prefer:
> 1) get rid of cio_dma_pool and AIRQ_IV_CACHELINE, and waste a page per
> vector, or
> 2) go with the approach taken by the patch below?

We only have a couple of users for airq_iv:

virtio_ccw.c: 2K bits

pci with floating IRQs: <= 2K (for the per-function bit vectors)
1..4K (for the summary bit vector)

pci with CPU directed IRQs: 2K (for the per-CPU bit vectors)
1..nr_cpu (for the summary bit vector)


The options are:
* page allocations for everything
* dma_pool for AIRQ_IV_CACHELINE ,gen_pool for others
* dma_pool for everything

I think we should do option 3 and use a dma_pool with cachesize
alignment for everything (as a prerequisite we have to limit
config PCI_NR_FUNCTIONS to 2K - but that is not a real constraint).
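
Concretely, option 3 would boil down to something like the following (a
sketch with illustrative names, assuming 2K-bit i.e. 256-byte vectors and a
cio-owned device that carries the 31-bit DMA mask):

	airq_pool = dma_pool_create("airq_iv", cio_dev, 256,
				    L1_CACHE_BYTES, 0);
	...
	vector = dma_pool_zalloc(airq_pool, GFP_KERNEL, &vector_dma);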

Sebastian



Re: [PATCH] vsock/virtio: Initialize core virtio vsock before registering the driver

2019-05-16 Thread Stefan Hajnoczi
On Thu, May 16, 2019 at 09:48:52AM +0200, Stefano Garzarella wrote:
> On Wed, May 15, 2019 at 04:24:00PM +0100, Stefan Hajnoczi wrote:
> > On Tue, May 07, 2019 at 02:25:43PM +0200, Stefano Garzarella wrote:
> > > Hi Jorge,
> > > 
> > > On Mon, May 06, 2019 at 01:19:55PM -0700, Jorge Moreira Broche wrote:
> > > > > On Wed, May 01, 2019 at 03:08:31PM -0400, Stefan Hajnoczi wrote:
> > > > > > On Tue, Apr 30, 2019 at 05:30:01PM -0700, Jorge E. Moreira wrote:
> > > > > > > Avoid a race in which static variables in 
> > > > > > > net/vmw_vsock/af_vsock.c are
> > > > > > > accessed (while handling interrupts) before they are initialized.
> > > > > > >
> > > > > > >
> > > > > > > [4.201410] BUG: unable to handle kernel paging request at 
> > > > > > > ffe8
> > > > > > > [4.207829] IP: vsock_addr_equals_addr+0x3/0x20
> > > > > > > [4.211379] PGD 28210067 P4D 28210067 PUD 28212067 PMD 0
> > > > > > > [4.211379] Oops:  [#1] PREEMPT SMP PTI
> > > > > > > [4.211379] Modules linked in:
> > > > > > > [4.211379] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 
> > > > > > > 4.14.106-419297-gd7e28cc1f241 #1
> > > > > > > [4.211379] Hardware name: QEMU Standard PC (i440FX + PIIX, 
> > > > > > > 1996), BIOS 1.10.2-1 04/01/2014
> > > > > > > [4.211379] Workqueue: virtio_vsock virtio_transport_rx_work
> > > > > > > [4.211379] task: a3273d175280 task.stack: aea1800e8000
> > > > > > > [4.211379] RIP: 0010:vsock_addr_equals_addr+0x3/0x20
> > > > > > > [4.211379] RSP: :aea1800ebd28 EFLAGS: 00010286
> > > > > > > [4.211379] RAX: 0002 RBX:  RCX: 
> > > > > > > b94e42f0
> > > > > > > [4.211379] RDX: 0400 RSI: ffe0 RDI: 
> > > > > > > aea1800ebdd0
> > > > > > > [4.211379] RBP: aea1800ebd58 R08: 0001 R09: 
> > > > > > > 0001
> > > > > > > [4.211379] R10:  R11: b89d5d60 R12: 
> > > > > > > aea1800ebdd0
> > > > > > > [4.211379] R13: 828cbfbf R14:  R15: 
> > > > > > > aea1800ebdc0
> > > > > > > [4.211379] FS:  () 
> > > > > > > GS:a3273fd0() knlGS:
> > > > > > > [4.211379] CS:  0010 DS:  ES:  CR0: 80050033
> > > > > > > [4.211379] CR2: ffe8 CR3: 2820e001 CR4: 
> > > > > > > 001606e0
> > > > > > > [4.211379] DR0:  DR1:  DR2: 
> > > > > > > 
> > > > > > > [4.211379] DR3:  DR6: fffe0ff0 DR7: 
> > > > > > > 0400
> > > > > > > [4.211379] Call Trace:
> > > > > > > [4.211379]  ? vsock_find_connected_socket+0x6c/0xe0
> > > > > > > [4.211379]  virtio_transport_recv_pkt+0x15f/0x740
> > > > > > > [4.211379]  ? detach_buf+0x1b5/0x210
> > > > > > > [4.211379]  virtio_transport_rx_work+0xb7/0x140
> > > > > > > [4.211379]  process_one_work+0x1ef/0x480
> > > > > > > [4.211379]  worker_thread+0x312/0x460
> > > > > > > [4.211379]  kthread+0x132/0x140
> > > > > > > [4.211379]  ? process_one_work+0x480/0x480
> > > > > > > [4.211379]  ? kthread_destroy_worker+0xd0/0xd0
> > > > > > > [4.211379]  ret_from_fork+0x35/0x40
> > > > > > > [4.211379] Code: c7 47 08 00 00 00 00 66 c7 07 28 00 c7 47 08 
> > > > > > > ff ff ff ff c7 47 04 ff ff ff ff c3 0f 1f 00 66 2e 0f 1f 84 00 00 
> > > > > > > 00 00 00 8b 47 08 <3b> 46 08 75 0a 8b 47 04 3b 46 04 0f 94 c0 c3 
> > > > > > > 31 c0 c3 90 66 2e
> > > > > > > [4.211379] RIP: vsock_addr_equals_addr+0x3/0x20 RSP: 
> > > > > > > aea1800ebd28
> > > > > > > [4.211379] CR2: ffe8
> > > > > > > [4.211379] ---[ end trace f31cc4a2e6df3689 ]---
> > > > > > > [4.211379] Kernel panic - not syncing: Fatal exception in 
> > > > > > > interrupt
> > > > > > > [4.211379] Kernel Offset: 0x3700 from 0x8100 
> > > > > > > (relocation range: 0x8000-0xbfff)
> > > > > > > [4.211379] Rebooting in 5 seconds..
> > > > > > >
> > > > > > > Fixes: 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device 
> > > > > > > hot-unplug")
> > > > > > > Cc: Stefan Hajnoczi 
> > > > > > > Cc: "David S. Miller" 
> > > > > > > Cc: k...@vger.kernel.org
> > > > > > > Cc: virtualization@lists.linux-foundation.org
> > > > > > > Cc: net...@vger.kernel.org
> > > > > > > Cc: kernel-t...@android.com
> > > > > > > Cc: sta...@vger.kernel.org [4.9+]
> > > > > > > Signed-off-by: Jorge E. Moreira 
> > > > > > > ---
> > > > > > >  net/vmw_vsock/virtio_transport.c | 13 ++---
> > > > > > >  1 file changed, 6 insertions(+), 7 deletions(-)
> > > > > > >
> > > > > > > diff --git a/net/vmw_vsock/virtio_transport.c 
> > > > > > > b/net/vmw_vsock/virtio_transport.c
> > > > > > > index 15eb5d3d4750..96ab344f17bb 100644
> > > > > > > --- a/net/vmw_vsock/virtio_transport.c
> > > > > > > +++ 

Re: [PATCH net 3/4] vhost: vsock: add weight support

2019-05-16 Thread Stefan Hajnoczi
On Thu, May 16, 2019 at 03:47:41AM -0400, Jason Wang wrote:
> @@ -183,7 +184,8 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>   virtio_transport_deliver_tap_pkt(pkt);
>  
>   virtio_transport_free_pkt(pkt);
> - }
> + total_len += pkt->len;

Please increment total_len before virtio_transport_free_pkt(pkt) to
avoid use-after-free.



Re: [PATCH] vsock/virtio: Initialize core virtio vsock before registering the driver

2019-05-16 Thread Stefano Garzarella
On Wed, May 15, 2019 at 04:24:00PM +0100, Stefan Hajnoczi wrote:
> On Tue, May 07, 2019 at 02:25:43PM +0200, Stefano Garzarella wrote:
> > Hi Jorge,
> > 
> > On Mon, May 06, 2019 at 01:19:55PM -0700, Jorge Moreira Broche wrote:
> > > > On Wed, May 01, 2019 at 03:08:31PM -0400, Stefan Hajnoczi wrote:
> > > > > On Tue, Apr 30, 2019 at 05:30:01PM -0700, Jorge E. Moreira wrote:
> > > > > > Avoid a race in which static variables in net/vmw_vsock/af_vsock.c 
> > > > > > are
> > > > > > accessed (while handling interrupts) before they are initialized.
> > > > > >
> > > > > >
> > > > > > [4.201410] BUG: unable to handle kernel paging request at 
> > > > > > ffe8
> > > > > > [4.207829] IP: vsock_addr_equals_addr+0x3/0x20
> > > > > > [4.211379] PGD 28210067 P4D 28210067 PUD 28212067 PMD 0
> > > > > > [4.211379] Oops:  [#1] PREEMPT SMP PTI
> > > > > > [4.211379] Modules linked in:
> > > > > > [4.211379] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 
> > > > > > 4.14.106-419297-gd7e28cc1f241 #1
> > > > > > [4.211379] Hardware name: QEMU Standard PC (i440FX + PIIX, 
> > > > > > 1996), BIOS 1.10.2-1 04/01/2014
> > > > > > [4.211379] Workqueue: virtio_vsock virtio_transport_rx_work
> > > > > > [4.211379] task: a3273d175280 task.stack: aea1800e8000
> > > > > > [4.211379] RIP: 0010:vsock_addr_equals_addr+0x3/0x20
> > > > > > [4.211379] RSP: :aea1800ebd28 EFLAGS: 00010286
> > > > > > [4.211379] RAX: 0002 RBX:  RCX: 
> > > > > > b94e42f0
> > > > > > [4.211379] RDX: 0400 RSI: ffe0 RDI: 
> > > > > > aea1800ebdd0
> > > > > > [4.211379] RBP: aea1800ebd58 R08: 0001 R09: 
> > > > > > 0001
> > > > > > [4.211379] R10:  R11: b89d5d60 R12: 
> > > > > > aea1800ebdd0
> > > > > > [4.211379] R13: 828cbfbf R14:  R15: 
> > > > > > aea1800ebdc0
> > > > > > [4.211379] FS:  () 
> > > > > > GS:a3273fd0() knlGS:
> > > > > > [4.211379] CS:  0010 DS:  ES:  CR0: 80050033
> > > > > > [4.211379] CR2: ffe8 CR3: 2820e001 CR4: 
> > > > > > 001606e0
> > > > > > [4.211379] DR0:  DR1:  DR2: 
> > > > > > 
> > > > > > [4.211379] DR3:  DR6: fffe0ff0 DR7: 
> > > > > > 0400
> > > > > > [4.211379] Call Trace:
> > > > > > [4.211379]  ? vsock_find_connected_socket+0x6c/0xe0
> > > > > > [4.211379]  virtio_transport_recv_pkt+0x15f/0x740
> > > > > > [4.211379]  ? detach_buf+0x1b5/0x210
> > > > > > [4.211379]  virtio_transport_rx_work+0xb7/0x140
> > > > > > [4.211379]  process_one_work+0x1ef/0x480
> > > > > > [4.211379]  worker_thread+0x312/0x460
> > > > > > [4.211379]  kthread+0x132/0x140
> > > > > > [4.211379]  ? process_one_work+0x480/0x480
> > > > > > [4.211379]  ? kthread_destroy_worker+0xd0/0xd0
> > > > > > [4.211379]  ret_from_fork+0x35/0x40
> > > > > > [4.211379] Code: c7 47 08 00 00 00 00 66 c7 07 28 00 c7 47 08 
> > > > > > ff ff ff ff c7 47 04 ff ff ff ff c3 0f 1f 00 66 2e 0f 1f 84 00 00 
> > > > > > 00 00 00 8b 47 08 <3b> 46 08 75 0a 8b 47 04 3b 46 04 0f 94 c0 c3 31 
> > > > > > c0 c3 90 66 2e
> > > > > > [4.211379] RIP: vsock_addr_equals_addr+0x3/0x20 RSP: 
> > > > > > aea1800ebd28
> > > > > > [4.211379] CR2: ffe8
> > > > > > [4.211379] ---[ end trace f31cc4a2e6df3689 ]---
> > > > > > [4.211379] Kernel panic - not syncing: Fatal exception in 
> > > > > > interrupt
> > > > > > [4.211379] Kernel Offset: 0x3700 from 0x8100 
> > > > > > (relocation range: 0x8000-0xbfff)
> > > > > > [4.211379] Rebooting in 5 seconds..
> > > > > >
> > > > > > Fixes: 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device 
> > > > > > hot-unplug")
> > > > > > Cc: Stefan Hajnoczi 
> > > > > > Cc: "David S. Miller" 
> > > > > > Cc: k...@vger.kernel.org
> > > > > > Cc: virtualization@lists.linux-foundation.org
> > > > > > Cc: net...@vger.kernel.org
> > > > > > Cc: kernel-t...@android.com
> > > > > > Cc: sta...@vger.kernel.org [4.9+]
> > > > > > Signed-off-by: Jorge E. Moreira 
> > > > > > ---
> > > > > >  net/vmw_vsock/virtio_transport.c | 13 ++---
> > > > > >  1 file changed, 6 insertions(+), 7 deletions(-)
> > > > > >
> > > > > > diff --git a/net/vmw_vsock/virtio_transport.c 
> > > > > > b/net/vmw_vsock/virtio_transport.c
> > > > > > index 15eb5d3d4750..96ab344f17bb 100644
> > > > > > --- a/net/vmw_vsock/virtio_transport.c
> > > > > > +++ b/net/vmw_vsock/virtio_transport.c
> > > > > > @@ -702,28 +702,27 @@ static int __init virtio_vsock_init(void)
> > > > > > if (!virtio_vsock_workqueue)
> > > > > > return -ENOMEM;
> > > > > >
> > > > > > -   ret = register_virtio_driver(&virtio_vsock_driver);
> > > > > 
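
(The quoted hunk is cut off by the archive; the gist of the fix is to
initialize the vsock core before registering the virtio driver, roughly as
follows, a sketch of the reordering rather than the verbatim patch:

	ret = vsock_core_init(&virtio_transport.transport);
	if (ret)
		goto out_wq;

	ret = register_virtio_driver(&virtio_vsock_driver);
	if (ret)
		goto out_vci;

with the error unwind adjusted to match.)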

[PATCH net 2/4] vhost_net: fix possible infinite loop

2019-05-16 Thread Jason Wang
When the rx buffer is too small for a packet, we will discard the vq
descriptor and retry it for the next packet:

while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
  &busyloop_intr))) {
...
/* On overrun, truncate and discard */
if (unlikely(headcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
 1, MSG_DONTWAIT | MSG_TRUNC);
pr_debug("Discarded rx packet: len %zd\n", sock_len);
continue;
}
...
}

This makes it possible to trigger an infinite while..continue loop
through the co-operation of two VMs like:

1) Malicious VM1 allocates a 1 byte rx buffer and tries to slow down the
   vhost process as much as possible, e.g. using indirect descriptors or
   other means.
2) Malicious VM2 generates packets to VM1 as fast as possible

Fix this by checking against the weight at the end of the RX and TX
loops. This also eliminates other similar cases when:

- userspace is consuming the packets in the meanwhile
- a theoretical TOCTOU attack if the guest moves the avail index back and
  forth to hit the continue after vhost finds the guest just added new buffers

This addresses CVE-2019-3900.

Fixes: d8316f3991d20 ("vhost: fix total length when packets are too short")
Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
Signed-off-by: Jason Wang 
---
The patch is needed for stable.
---
 drivers/vhost/net.c | 29 +
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 061a06d..2d9df78 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -773,7 +773,7 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 
-   for (;;) {
+   do {
bool busyloop_intr = false;
 
if (nvq->done_idx == VHOST_NET_BATCH)
@@ -839,9 +839,7 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
++nvq->done_idx;
-   if (vhost_exceeds_weight(vq, ++sent_pkts, total_len))
-   break;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 
vhost_tx_batch(net, nvq, sock, &msg);
 }
@@ -866,7 +864,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
bool zcopy_used;
int sent_pkts = 0;
 
-   for (;;) {
+   do {
bool busyloop_intr;
 
/* Release DMAs done buffers first */
@@ -943,10 +941,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
-   if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
- total_len)))
-   break;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -1144,8 +1139,11 @@ static void handle_rx(struct vhost_net *net)
vq->log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
-   while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
- &busyloop_intr))) {
+   do {
+   sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
+ &busyloop_intr);
+   if (!sock_len)
+   break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
@@ -1230,12 +1228,11 @@ static void handle_rx(struct vhost_net *net)
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
total_len += vhost_len;
-   if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
-   goto out;
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
+
if (unlikely(busyloop_intr))
vhost_poll_queue(&vq->poll);
-   else
+   else if (!sock_len)
vhost_net_enable_vq(net, vq);
 out:
vhost_net_signal_used(nvq);
@@ -1328,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
   UIO_MAXIOV + VHOST_NET_BATCH,
-  VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
+  VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
 

[PATCH net 4/4] vhost: scsi: add weight support

2019-05-16 Thread Jason Wang
This patch will check the weight and exit the loop if we exceed the
weight. This is useful for preventing the scsi kthread from hogging the
CPU, which is guest triggerable.

This addresses CVE-2019-3900.

Cc: Paolo Bonzini 
Cc: Stefan Hajnoczi 
Fixes: 057cbf49a1f0 ("tcm_vhost: Initial merge for vhost level target fabric 
driver")
Signed-off-by: Jason Wang 
---
 drivers/vhost/scsi.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index d830579..3a59f47 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -918,7 +918,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
struct iov_iter in_iter, prot_iter, data_iter;
u64 tag;
u32 exp_data_len, data_direction;
-   int ret, prot_bytes;
+   int ret, prot_bytes, c = 0;
u16 lun;
u8 task_attr;
bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI);
@@ -938,7 +938,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 
vhost_disable_notify(&vs->dev, vq);
 
-   for (;;) {
+   do {
ret = vhost_scsi_get_desc(vs, vq, &vc);
if (ret)
goto err;
@@ -1118,7 +1118,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
break;
else if (ret == -EIO)
vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
mutex_unlock(&vq->mutex);
 }
@@ -1177,7 +1177,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
} v_req;
struct vhost_scsi_ctx vc;
size_t typ_size;
-   int ret;
+   int ret, c = 0;
 
mutex_lock(&vq->mutex);
/*
@@ -1191,7 +1191,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 
vhost_disable_notify(&vs->dev, vq);
 
-   for (;;) {
+   do {
ret = vhost_scsi_get_desc(vs, vq, &vc);
if (ret)
goto err;
@@ -1270,7 +1270,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
break;
else if (ret == -EIO)
vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
-   }
+   } while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
 out:
mutex_unlock(&vq->mutex);
 }
-- 
1.8.3.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Pankaj Gupta


> 
> > +   vpmem->vdev = vdev;
> > +   vdev->priv = vpmem;
> > +   err = init_vq(vpmem);
> > +   if (err) {
> > +   dev_err(>dev, "failed to initialize virtio pmem vq's\n");
> > +   goto out_err;
> > +   }
> > +
> > +   virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +   start, >start);
> > +   virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> > +   size, >size);
> > +
> > +   res.start = vpmem->start;
> > +   res.end   = vpmem->start + vpmem->size-1;
> 
> nit: " - 1;"

Sure.

> 
> > +   vpmem->nd_desc.provider_name = "virtio-pmem";
> > +   vpmem->nd_desc.module = THIS_MODULE;
> > +
> > +   vpmem->nvdimm_bus = nvdimm_bus_register(>dev,
> > +   >nd_desc);
> > +   if (!vpmem->nvdimm_bus) {
> > +   dev_err(>dev, "failed to register device with 
> > nvdimm_bus\n");
> > +   err = -ENXIO;
> > +   goto out_vq;
> > +   }
> > +
> > +   dev_set_drvdata(>dev, vpmem->nvdimm_bus);
> > +
> > +   ndr_desc.res = 
> > +   ndr_desc.numa_node = nid;
> > +   ndr_desc.flush = async_pmem_flush;
> > +   set_bit(ND_REGION_PAGEMAP, _desc.flags);
> > +   set_bit(ND_REGION_ASYNC, _desc.flags);
> > +   nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, _desc);
> > +   if (!nd_region) {
> > +   dev_err(>dev, "failed to create nvdimm region\n");
> > +   err = -ENXIO;
> > +   goto out_nd;
> > +   }
> > +   nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> > +   return 0;
> > +out_nd:
> > +   nvdimm_bus_unregister(vpmem->nvdimm_bus);
> > +out_vq:
> > +   vdev->config->del_vqs(vdev);
> > +out_err:
> > +   return err;
> > +}
> > +
> > +static void virtio_pmem_remove(struct virtio_device *vdev)
> > +{
> > +   struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(>dev);
> > +
> > +   nvdimm_bus_unregister(nvdimm_bus);
> > +   vdev->config->del_vqs(vdev);
> > +   vdev->config->reset(vdev);
> > +}
> > +
> > +static struct virtio_driver virtio_pmem_driver = {
> > +   .driver.name= KBUILD_MODNAME,
> > +   .driver.owner   = THIS_MODULE,
> > +   .id_table   = id_table,
> > +   .probe  = virtio_pmem_probe,
> > +   .remove = virtio_pmem_remove,
> > +};
> > +
> > +module_virtio_driver(virtio_pmem_driver);
> > +MODULE_DEVICE_TABLE(virtio, id_table);
> > +MODULE_DESCRIPTION("Virtio pmem driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> > new file mode 100644
> > index ..ab1da877575d
> > --- /dev/null
> > +++ b/drivers/nvdimm/virtio_pmem.h
> > @@ -0,0 +1,60 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * virtio_pmem.h: virtio pmem Driver
> > + *
> > + * Discovers persistent memory range information
> > + * from host and provides a virtio based flushing
> > + * interface.
> > + **/
> > +
> > +#ifndef _LINUX_VIRTIO_PMEM_H
> > +#define _LINUX_VIRTIO_PMEM_H
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +struct virtio_pmem_request {
> > +   /* Host return status corresponding to flush request */
> > +   int ret;
> > +
> > +   /* command name*/
> > +   char name[16];
> 
> So ... why are we sending string commands and expect native-endianess
> integers and don't define a proper request/response structure + request
> types in include/uapi/linux/virtio_pmem.h like
> 
> struct virtio_pmem_resp {
>   __virtio32 ret;
> }
> 
> #define VIRTIO_PMEM_REQ_TYPE_FLUSH1
> struct virtio_pmem_req {
>   __virtio16 type;
> }
> 
> ... and this way we also define a proper endianess format for exchange
> and keep it extensible

Done the suggested change.

Thank you,
Pankaj

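For reference, the request/response definitions being suggested would look
roughly like the sketch below. This is an illustration based on David's
quoted proposal only; the merged header may differ in field widths or in
the request type value.

/* Sketch of a proposed include/uapi/linux/virtio_pmem.h, following the
 * suggestion quoted above. Field widths and the FLUSH value are
 * assumptions, not the final layout.
 */
#ifndef _UAPI_LINUX_VIRTIO_PMEM_H
#define _UAPI_LINUX_VIRTIO_PMEM_H

#include <linux/virtio_types.h>

/* Guest-to-host request: currently only a flush is defined. */
#define VIRTIO_PMEM_REQ_TYPE_FLUSH	1

struct virtio_pmem_req {
	__virtio16 type;
};

/* Host-to-guest response: 0 on success, an error code otherwise. */
struct virtio_pmem_resp {
	__virtio32 ret;
};

#endif /* _UAPI_LINUX_VIRTIO_PMEM_H */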
> 
> @MST, what's your take on this?
> 
> 
> --
> 
> Thanks,
> 
> David / dhildenb
> 


[PATCH net 1/4] vhost: introduce vhost_exceeds_weight()

2019-05-16 Thread Jason Wang
We used to have vhost_exceeds_weight() for vhost-net to:

- prevent vhost kthread from hogging the cpu
- balance the time spent between TX and RX

This function could be useful for vsock and scsi as well. So move it
to vhost.c. A device must specify a weight, which counts the number of
requests; it can also specify a byte_weight, which counts the
number of bytes that have been processed.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 22 ++
 drivers/vhost/scsi.c  |  9 -
 drivers/vhost/vhost.c | 20 +++-
 drivers/vhost/vhost.h |  5 -
 drivers/vhost/vsock.c | 12 +++-
 5 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index df51a35..061a06d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -604,12 +604,6 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, 
struct iov_iter *iter,
return iov_iter_count(iter);
 }
 
-static bool vhost_exceeds_weight(int pkts, int total_len)
-{
-   return total_len >= VHOST_NET_WEIGHT ||
-  pkts >= VHOST_NET_PKT_WEIGHT;
-}
-
 static int get_tx_bufs(struct vhost_net *net,
   struct vhost_net_virtqueue *nvq,
   struct msghdr *msg,
@@ -845,10 +839,8 @@ static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock)
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->done_idx].len = 0;
++nvq->done_idx;
-   if (vhost_exceeds_weight(++sent_pkts, total_len)) {
-   vhost_poll_queue(&vq->poll);
+   if (vhost_exceeds_weight(vq, ++sent_pkts, total_len))
break;
-   }
}
 
vhost_tx_batch(net, nvq, sock, &msg);
@@ -951,10 +943,9 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
struct socket *sock)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
-   if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
-   vhost_poll_queue(&vq->poll);
+   if (unlikely(vhost_exceeds_weight(vq, ++sent_pkts,
+ total_len)))
break;
-   }
}
 }
 
@@ -1239,10 +1230,8 @@ static void handle_rx(struct vhost_net *net)
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
total_len += vhost_len;
-   if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
-   vhost_poll_queue(&vq->poll);
+   if (unlikely(vhost_exceeds_weight(vq, ++recv_pkts, total_len)))
goto out;
-   }
}
if (unlikely(busyloop_intr))
vhost_poll_queue(&vq->poll);
@@ -1338,7 +1327,8 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
vhost_net_buf_init(&n->vqs[i].rxq);
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
-  UIO_MAXIOV + VHOST_NET_BATCH);
+  UIO_MAXIOV + VHOST_NET_BATCH,
+  VHOST_NET_WEIGHT, VHOST_NET_PKT_WEIGHT);
 
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, 
dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 618fb64..d830579 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -57,6 +57,12 @@
 #define VHOST_SCSI_PREALLOC_UPAGES 2048
 #define VHOST_SCSI_PREALLOC_PROT_SGLS 2048
 
+/* Max number of requests before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others with
+ * request.
+ */
+#define VHOST_SCSI_WEIGHT 256
+
 struct vhost_scsi_inflight {
/* Wait for the flush operation to finish */
struct completion comp;
@@ -1622,7 +1628,8 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
vqs[i] = &vs->vqs[i].vq;
vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
}
-   vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV);
+   vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
+  VHOST_SCSI_WEIGHT, 0);
 
vhost_scsi_init_inflight(vs, NULL);
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 351af88..290d6e5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -413,8 +413,24 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
vhost_vq_free_iovecs(dev->vqs[i]);
 }
 
+bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
+ int pkts, int total_len)
+{
+   struct vhost_dev *dev = vq->dev;
+
+   if ((dev->byte_weight && total_len >= dev->byte_weight) ||
+   pkts >= dev->weight) {
+   
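(The archive truncates the rest of this patch here. Based on the
description above and on how the helper is used by the later patches,
vhost_exceeds_weight() most likely continues along these lines; this is a
reconstruction for readability, not the verbatim patch:)

		vhost_poll_queue(&vq->poll);
		return true;
	}

	return false;
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);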

[PATCH net 0/4] Prevent vhost kthread from hogging CPU

2019-05-16 Thread Jason Wang
Hi:

This series tries to prevent guest-triggerable CPU hogging through the
vhost kthread. This is done by introducing and checking a weight
after each request. The patches have been tested with reproducers for
vsock and virtio-net. Only a compile test was done for vhost-scsi.

Please review.

This addresses CVE-2019-3900.

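The common pattern shared by the individual patches is sketched below:
each device declares its limits through vhost_dev_init() and checks
vhost_exceeds_weight() after every processed request. The device pointer,
the numeric values and the handle_one_request() helper are placeholders
for illustration, not code from the series:

	/* At device open time: the last two arguments are the new weights. */
	vhost_dev_init(&d->dev, vqs, nvqs, UIO_MAXIOV,
		       256,	/* weight: max requests per work item */
		       0);	/* byte_weight: 0 disables the byte limit */

	/* In the virtqueue kick handler: */
	do {
		if (handle_one_request(vq, &total_len) <= 0)
			break;
	} while (likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));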
Jason Wang (4):
  vhost: introduce vhost_exceeds_weight()
  vhost_net: fix possible infinite loop
  vhost: vsock: add weight support
  vhost: scsi: add weight support

 drivers/vhost/net.c   | 41 ++---
 drivers/vhost/scsi.c  | 21 ++---
 drivers/vhost/vhost.c | 20 +++-
 drivers/vhost/vhost.h |  5 -
 drivers/vhost/vsock.c | 28 +---
 5 files changed, 72 insertions(+), 43 deletions(-)

-- 
1.8.3.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH net 3/4] vhost: vsock: add weight support

2019-05-16 Thread Jason Wang
This patch will check the weight and exit the loop if we exceed the
weight. This is useful for preventing the vsock kthread from hogging the
cpu, which is guest triggerable. The weight also helps to avoid starving
requests from one direction while the other direction is being processed.

The value of weight is picked from vhost-net.

This addresses CVE-2019-3900.

Cc: Stefan Hajnoczi 
Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
Signed-off-by: Jason Wang 
---
 drivers/vhost/vsock.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 47c6d4d..1fa9deb 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -86,6 +86,7 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
struct vhost_virtqueue *vq)
 {
struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+   int pkts = 0, total_len = 0;
bool added = false;
bool restart_tx = false;
 
@@ -97,7 +98,7 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
/* Avoid further vmexits, we're already processing the virtqueue */
vhost_disable_notify(&vsock->dev, vq);
 
-   for (;;) {
+   do {
struct virtio_vsock_pkt *pkt;
struct iov_iter iov_iter;
unsigned out, in;
@@ -183,7 +184,8 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
virtio_transport_deliver_tap_pkt(pkt);
 
virtio_transport_free_pkt(pkt);
-   }
+   total_len += pkt->len;
+   } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added)
vhost_signal(&vsock->dev, vq);
 
@@ -358,7 +360,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
 dev);
struct virtio_vsock_pkt *pkt;
-   int head;
+   int head, pkts = 0, total_len = 0;
unsigned int out, in;
bool added = false;
 
@@ -368,7 +370,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
goto out;
 
vhost_disable_notify(&vsock->dev, vq);
-   for (;;) {
+   do {
u32 len;
 
if (!vhost_vsock_more_replies(vsock)) {
@@ -409,9 +411,11 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work 
*work)
else
virtio_transport_free_pkt(pkt);
 
-   vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
+   len += sizeof(pkt->hdr);
+   vhost_add_used(vq, head, len);
+   total_len += len;
added = true;
-   }
+   } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
 
 no_more_replies:
if (added)
-- 
1.8.3.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vsock/virtio: Initialize core virtio vsock before registering the driver

2019-05-16 Thread Stefan Hajnoczi
On Tue, Apr 30, 2019 at 05:30:01PM -0700, Jorge E. Moreira wrote:
> Avoid a race in which static variables in net/vmw_vsock/af_vsock.c are
> accessed (while handling interrupts) before they are initialized.
> 
> [4.201410] BUG: unable to handle kernel paging request at ffe8
> [4.207829] IP: vsock_addr_equals_addr+0x3/0x20
> [4.211379] PGD 28210067 P4D 28210067 PUD 28212067 PMD 0
> [4.211379] Oops:  [#1] PREEMPT SMP PTI
> [4.211379] Modules linked in:
> [4.211379] CPU: 1 PID: 30 Comm: kworker/1:1 Not tainted 
> 4.14.106-419297-gd7e28cc1f241 #1
> [4.211379] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.10.2-1 04/01/2014
> [4.211379] Workqueue: virtio_vsock virtio_transport_rx_work
> [4.211379] task: a3273d175280 task.stack: aea1800e8000
> [4.211379] RIP: 0010:vsock_addr_equals_addr+0x3/0x20
> [4.211379] RSP: :aea1800ebd28 EFLAGS: 00010286
> [4.211379] RAX: 0002 RBX:  RCX: 
> b94e42f0
> [4.211379] RDX: 0400 RSI: ffe0 RDI: 
> aea1800ebdd0
> [4.211379] RBP: aea1800ebd58 R08: 0001 R09: 
> 0001
> [4.211379] R10:  R11: b89d5d60 R12: 
> aea1800ebdd0
> [4.211379] R13: 828cbfbf R14:  R15: 
> aea1800ebdc0
> [4.211379] FS:  () GS:a3273fd0() 
> knlGS:
> [4.211379] CS:  0010 DS:  ES:  CR0: 80050033
> [4.211379] CR2: ffe8 CR3: 2820e001 CR4: 
> 001606e0
> [4.211379] DR0:  DR1:  DR2: 
> 
> [4.211379] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [4.211379] Call Trace:
> [4.211379]  ? vsock_find_connected_socket+0x6c/0xe0
> [4.211379]  virtio_transport_recv_pkt+0x15f/0x740
> [4.211379]  ? detach_buf+0x1b5/0x210
> [4.211379]  virtio_transport_rx_work+0xb7/0x140
> [4.211379]  process_one_work+0x1ef/0x480
> [4.211379]  worker_thread+0x312/0x460
> [4.211379]  kthread+0x132/0x140
> [4.211379]  ? process_one_work+0x480/0x480
> [4.211379]  ? kthread_destroy_worker+0xd0/0xd0
> [4.211379]  ret_from_fork+0x35/0x40
> [4.211379] Code: c7 47 08 00 00 00 00 66 c7 07 28 00 c7 47 08 ff ff ff ff 
> c7 47 04 ff ff ff ff c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 8b 47 08 <3b> 
> 46 08 75 0a 8b 47 04 3b 46 04 0f 94 c0 c3 31 c0 c3 90 66 2e
> [4.211379] RIP: vsock_addr_equals_addr+0x3/0x20 RSP: aea1800ebd28
> [4.211379] CR2: ffe8
> [4.211379] ---[ end trace f31cc4a2e6df3689 ]---
> [4.211379] Kernel panic - not syncing: Fatal exception in interrupt
> [4.211379] Kernel Offset: 0x3700 from 0x8100 (relocation 
> range: 0x8000-0xbfff)
> [4.211379] Rebooting in 5 seconds..
> 
> Fixes: 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device hot-unplug")
> Cc: Stefan Hajnoczi 
> Cc: "David S. Miller" 
> Cc: k...@vger.kernel.org
> Cc: virtualization@lists.linux-foundation.org
> Cc: net...@vger.kernel.org
> Cc: kernel-t...@android.com
> Cc: sta...@vger.kernel.org [4.9+]
> Signed-off-by: Jorge E. Moreira 
> ---
>  net/vmw_vsock/virtio_transport.c | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
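The reordering under review amounts to making the transport's module init
set up the af_vsock core before registering the virtio driver, so an early
RX interrupt cannot touch uninitialized state. Roughly (a sketch of the
idea using the names from net/vmw_vsock/virtio_transport.c; the exact
patch may differ in details such as error labels):

static int __init virtio_vsock_init(void)
{
	int ret;

	/* virtio_vsock_workqueue is the driver's existing workqueue pointer */
	virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
	if (!virtio_vsock_workqueue)
		return -ENOMEM;

	/* Initialize the core (and its static state) first ... */
	ret = vsock_core_init(&virtio_transport.transport);
	if (ret)
		goto out_wq;

	/* ... and only then let the device start delivering packets. */
	ret = register_virtio_driver(&virtio_vsock_driver);
	if (ret)
		goto out_vci;

	return 0;

out_vci:
	vsock_core_exit();
out_wq:
	destroy_workqueue(virtio_vsock_workqueue);
	return ret;
}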

This patch is good to go, I've posted my R-b downthread.

Stefan


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH net 0/4] Prevent vhost kthread from hogging CPU

2019-05-16 Thread Stefan Hajnoczi
On Thu, May 16, 2019 at 03:47:38AM -0400, Jason Wang wrote:
> Hi:
> 
> This series tries to prevent guest-triggerable CPU hogging through the
> vhost kthread. This is done by introducing and checking a weight
> after each request. The patches have been tested with reproducers for
> vsock and virtio-net. Only a compile test was done for vhost-scsi.
> 
> Please review.
> 
> This addresses CVE-2019-3900.
> 
> Jason Wang (4):
>   vhost: introduce vhost_exceeds_weight()
>   vhost_net: fix possible infinite loop
>   vhost: vsock: add weight support
>   vhost: scsi: add weight support
> 
>  drivers/vhost/net.c   | 41 ++---
>  drivers/vhost/scsi.c  | 21 ++---
>  drivers/vhost/vhost.c | 20 +++-
>  drivers/vhost/vhost.h |  5 -
>  drivers/vhost/vsock.c | 28 +---
>  5 files changed, 72 insertions(+), 43 deletions(-)
> 
> -- 
> 1.8.3.1
> 

Looks good aside from the use-after-free in the vsock patch.


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v9 2/7] virtio-pmem: Add virtio pmem driver

2019-05-16 Thread Pankaj Gupta


> >> +  vpmem->vdev = vdev;
> >> +  vdev->priv = vpmem;
> >> +  err = init_vq(vpmem);
> >> +  if (err) {
> >> +  dev_err(>dev, "failed to initialize virtio pmem vq's\n");
> >> +  goto out_err;
> >> +  }
> >> +
> >> +  virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> >> +  start, >start);
> >> +  virtio_cread(vpmem->vdev, struct virtio_pmem_config,
> >> +  size, >size);
> >> +
> >> +  res.start = vpmem->start;
> >> +  res.end   = vpmem->start + vpmem->size-1;
> > 
> > nit: " - 1;"
> > 
> >> +  vpmem->nd_desc.provider_name = "virtio-pmem";
> >> +  vpmem->nd_desc.module = THIS_MODULE;
> >> +
> >> +  vpmem->nvdimm_bus = nvdimm_bus_register(>dev,
> >> +  >nd_desc);
> >> +  if (!vpmem->nvdimm_bus) {
> >> +  dev_err(>dev, "failed to register device with 
> >> nvdimm_bus\n");
> >> +  err = -ENXIO;
> >> +  goto out_vq;
> >> +  }
> >> +
> >> +  dev_set_drvdata(>dev, vpmem->nvdimm_bus);
> >> +
> >> +  ndr_desc.res = 
> >> +  ndr_desc.numa_node = nid;
> >> +  ndr_desc.flush = async_pmem_flush;
> >> +  set_bit(ND_REGION_PAGEMAP, _desc.flags);
> >> +  set_bit(ND_REGION_ASYNC, _desc.flags);
> >> +  nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, _desc);
> >> +  if (!nd_region) {
> >> +  dev_err(>dev, "failed to create nvdimm region\n");
> >> +  err = -ENXIO;
> >> +  goto out_nd;
> >> +  }
> >> +  nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
> >> +  return 0;
> >> +out_nd:
> >> +  nvdimm_bus_unregister(vpmem->nvdimm_bus);
> >> +out_vq:
> >> +  vdev->config->del_vqs(vdev);
> >> +out_err:
> >> +  return err;
> >> +}
> >> +
> >> +static void virtio_pmem_remove(struct virtio_device *vdev)
> >> +{
> >> +  struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(>dev);
> >> +
> >> +  nvdimm_bus_unregister(nvdimm_bus);
> >> +  vdev->config->del_vqs(vdev);
> >> +  vdev->config->reset(vdev);
> >> +}
> >> +
> >> +static struct virtio_driver virtio_pmem_driver = {
> >> +  .driver.name= KBUILD_MODNAME,
> >> +  .driver.owner   = THIS_MODULE,
> >> +  .id_table   = id_table,
> >> +  .probe  = virtio_pmem_probe,
> >> +  .remove = virtio_pmem_remove,
> >> +};
> >> +
> >> +module_virtio_driver(virtio_pmem_driver);
> >> +MODULE_DEVICE_TABLE(virtio, id_table);
> >> +MODULE_DESCRIPTION("Virtio pmem driver");
> >> +MODULE_LICENSE("GPL");
> >> diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
> >> new file mode 100644
> >> index ..ab1da877575d
> >> --- /dev/null
> >> +++ b/drivers/nvdimm/virtio_pmem.h
> >> @@ -0,0 +1,60 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +/*
> >> + * virtio_pmem.h: virtio pmem Driver
> >> + *
> >> + * Discovers persistent memory range information
> >> + * from host and provides a virtio based flushing
> >> + * interface.
> >> + **/
> >> +
> >> +#ifndef _LINUX_VIRTIO_PMEM_H
> >> +#define _LINUX_VIRTIO_PMEM_H
> >> +
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +
> >> +struct virtio_pmem_request {
> >> +  /* Host return status corresponding to flush request */
> >> +  int ret;
> >> +
> >> +  /* command name*/
> >> +  char name[16];
> > 
> > So ... why are we sending string commands and expect native-endianess
> > integers and don't define a proper request/response structure + request
> > types in include/uapi/linux/virtio_pmem.h like
> > 
> > struct virtio_pmem_resp {
> > __virtio32 ret;
> > }
> 
> FWIW, I wonder if we should even properly translate return values and
> define types like
> 
> VIRTIO_PMEM_RESP_TYPE_OK  0
> VIRTIO_PMEM_RESP_TYPE_EIO 1

I don't think these are required, as only failure and success
return types are easy to understand.

Thanks,
Pankaj
> 
> ..
> 
> > 
> > #define VIRTIO_PMEM_REQ_TYPE_FLUSH  1
> > struct virtio_pmem_req {
> > __virtio16 type;
> > }
> > 
> > ... and this way we also define a proper endianess format for exchange
> > and keep it extensible
> > 
> > @MST, what's your take on this?
> > 
> > 
> 
> 
> --
> 
> Thanks,
> 
> David / dhildenb
> 
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2 2/8] vsock/virtio: free packets during the socket release

2019-05-16 Thread Stefan Hajnoczi
On Fri, May 10, 2019 at 02:58:37PM +0200, Stefano Garzarella wrote:
> When the socket is released, we should free all packets
> queued in the per-socket list in order to avoid a memory
> leak.
> 
> Signed-off-by: Stefano Garzarella 
> ---
>  net/vmw_vsock/virtio_transport_common.c | 8 
>  1 file changed, 8 insertions(+)

Ouch, this would be nice as a separate patch that can be merged right
away (with s/virtio_vsock_buf/virtio_vsock_pkt/).

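Concretely, the idea amounts to draining the per-socket rx_queue on
release, along these lines (a sketch; the helper name is hypothetical and
the exact hook and locking in the final patch may differ):

static void virtio_transport_free_queued_pkts(struct vsock_sock *vsk)
{
	struct virtio_vsock_sock *vvs = vsk->trans;
	struct virtio_vsock_pkt *pkt, *tmp;

	spin_lock_bh(&vvs->rx_lock);
	list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) {
		list_del(&pkt->list);
		virtio_transport_free_pkt(pkt);
	}
	spin_unlock_bh(&vvs->rx_lock);
}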

signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v2 1/8] vsock/virtio: limit the memory used per-socket

2019-05-16 Thread Stefan Hajnoczi
On Fri, May 10, 2019 at 02:58:36PM +0200, Stefano Garzarella wrote:
> +struct virtio_vsock_buf {

Please add a comment describing the purpose of this struct and to
differentiate its use from struct virtio_vsock_pkt.

> +static struct virtio_vsock_buf *
> +virtio_transport_alloc_buf(struct virtio_vsock_pkt *pkt, bool zero_copy)
> +{
> + struct virtio_vsock_buf *buf;
> +
> + if (pkt->len == 0)
> + return NULL;
> +
> + buf = kzalloc(sizeof(*buf), GFP_KERNEL);
> + if (!buf)
> + return NULL;
> +
> + /* If the buffer in the virtio_vsock_pkt is full, we can move it to
> +  * the new virtio_vsock_buf avoiding the copy, because we are sure that
> +  * we are not use more memory than that counted by the credit mechanism.
> +  */
> + if (zero_copy && pkt->len == pkt->buf_len) {
> + buf->addr = pkt->buf;
> + pkt->buf = NULL;
> + } else {
> + buf->addr = kmalloc(pkt->len, GFP_KERNEL);

buf and buf->addr could be allocated in a single call, though I'm not
sure how big an optimization this is.

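For what it's worth, one way to make it a single allocation is a flexible
array member, roughly as below. This is an illustration only, not the
patch under review; the helper name is made up, and this layout would give
up the zero-copy branch above (which moves pkt->buf into buf->addr), so it
may not be a clear win.

/* Store the payload inline, right after the bookkeeping fields. */
struct virtio_vsock_buf {
	struct list_head list;
	size_t len;
	u8 data[];
};

static struct virtio_vsock_buf *virtio_vsock_buf_alloc(size_t len)
{
	struct virtio_vsock_buf *buf;

	/* struct_size() computes sizeof(*buf) + len bytes of payload */
	buf = kmalloc(struct_size(buf, data, len), GFP_KERNEL);
	if (!buf)
		return NULL;

	buf->len = len;
	return buf;
}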
> @@ -841,20 +882,24 @@ virtio_transport_recv_connected(struct sock *sk,
>  {
>   struct vsock_sock *vsk = vsock_sk(sk);
>   struct virtio_vsock_sock *vvs = vsk->trans;
> + struct virtio_vsock_buf *buf;
>   int err = 0;
>  
>   switch (le16_to_cpu(pkt->hdr.op)) {
>   case VIRTIO_VSOCK_OP_RW:
>   pkt->len = le32_to_cpu(pkt->hdr.len);
> - pkt->off = 0;
> + buf = virtio_transport_alloc_buf(pkt, true);
>  
> - spin_lock_bh(>rx_lock);
> - virtio_transport_inc_rx_pkt(vvs, pkt);
> - list_add_tail(>list, >rx_queue);
> - spin_unlock_bh(>rx_lock);
> + if (buf) {
> + spin_lock_bh(>rx_lock);
> + virtio_transport_inc_rx_pkt(vvs, pkt->len);
> + list_add_tail(>list, >rx_queue);
> + spin_unlock_bh(>rx_lock);
>  
> - sk->sk_data_ready(sk);
> - return err;
> + sk->sk_data_ready(sk);
> + }

The return value of this function isn't used but the code still makes an
effort to return errors.  Please return -ENOMEM when buf == NULL.

If you'd like to remove the return value that's fine too, but please do
it for the whole function to be consistent.


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCH 2/2] drm: Reserve/unreserve GEM VRAM BOs from within pin/unpin functions

2019-05-16 Thread Thomas Zimmermann
The original bochs and vbox implementations of pin and unpin functions
automatically reserved BOs during validation. This functionality got lost
while converting the code to a generic implementation. This may result
in validating unlocked TTM BOs.

Adding the reserve and unreserve operations to GEM VRAM's pin and unpin
functions fixes the bochs and vbox drivers. Additionally the patch changes
the mgag200, ast and hibmc drivers to not reserve BOs by themselves.

Signed-off-by: Thomas Zimmermann 
Fixes: a3232987fdbf0bede92a9d7c7e2db99a5084d31b ("drm/bochs: Convert [...]")
Reported-by: kernel test robot 
---
 drivers/gpu/drm/ast/ast_mode.c| 24 +
 drivers/gpu/drm/drm_gem_vram_helper.c | 54 ++-
 .../gpu/drm/hisilicon/hibmc/hibmc_drm_de.c|  6 ---
 .../gpu/drm/hisilicon/hibmc/hibmc_drm_fbdev.c | 17 +-
 drivers/gpu/drm/mgag200/mgag200_mode.c| 19 +--
 5 files changed, 45 insertions(+), 75 deletions(-)

diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index 3475591a22c3..9aca9135a5cc 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -539,24 +539,16 @@ static int ast_crtc_do_set_base(struct drm_crtc *crtc,
ast_fb = to_ast_framebuffer(fb);
obj = ast_fb->obj;
gbo = drm_gem_vram_of_gem(obj);
-   ret = drm_gem_vram_reserve(gbo, false);
-   if (ret)
-   return ret;
drm_gem_vram_push_to_system(gbo);
-   drm_gem_vram_unreserve(gbo);
}
 
ast_fb = to_ast_framebuffer(crtc->primary->fb);
obj = ast_fb->obj;
gbo = drm_gem_vram_of_gem(obj);
 
-   ret = drm_gem_vram_reserve(gbo, false);
-   if (ret)
-   return ret;
-
ret = drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
if (ret)
-   goto err_drm_gem_vram_unreserve;
+   return ret;
gpu_addr = drm_gem_vram_offset(gbo);
if (gpu_addr < 0) {
ret = (int)gpu_addr;
@@ -573,7 +565,6 @@ static int ast_crtc_do_set_base(struct drm_crtc *crtc,
ast_fbdev_set_base(ast, gpu_addr);
}
}
-   drm_gem_vram_unreserve(gbo);
 
ast_set_offset_reg(crtc);
ast_set_start_address_crt1(crtc, (u32)gpu_addr);
@@ -582,8 +573,6 @@ static int ast_crtc_do_set_base(struct drm_crtc *crtc,
 
 err_drm_gem_vram_unpin:
drm_gem_vram_unpin(gbo);
-err_drm_gem_vram_unreserve:
-   drm_gem_vram_unreserve(gbo);
return ret;
 }
 
@@ -630,8 +619,6 @@ static int ast_crtc_mode_set(struct drm_crtc *crtc,
 
 static void ast_crtc_disable(struct drm_crtc *crtc)
 {
-   int ret;
-
DRM_DEBUG_KMS("\n");
ast_crtc_dpms(crtc, DRM_MODE_DPMS_OFF);
if (crtc->primary->fb) {
@@ -639,11 +626,7 @@ static void ast_crtc_disable(struct drm_crtc *crtc)
struct drm_gem_object *obj = ast_fb->obj;
struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(obj);
 
-   ret = drm_gem_vram_reserve(gbo, false);
-   if (ret)
-   return;
drm_gem_vram_push_to_system(gbo);
-   drm_gem_vram_unreserve(gbo);
}
crtc->primary->fb = NULL;
 }
@@ -939,12 +922,7 @@ static int ast_cursor_init(struct drm_device *dev)
if (ret)
return ret;
gbo = drm_gem_vram_of_gem(obj);
-   ret = drm_gem_vram_reserve(gbo, false);
-   if (unlikely(ret != 0))
-   goto fail;
-
ret = drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
-   drm_gem_vram_unreserve(gbo);
if (ret)
goto fail;
gpu_addr = drm_gem_vram_offset(gbo);
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
b/drivers/gpu/drm/drm_gem_vram_helper.c
index a002c03eaf4c..bde8237e8021 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -235,10 +235,12 @@ int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, 
unsigned long pl_flag)
int i, ret;
struct ttm_operation_ctx ctx = { false, false };
 
-   if (gbo->pin_count) {
-   ++gbo->pin_count;
-   return 0;
-   }
+   ret = ttm_bo_reserve(&gbo->bo, true, false, NULL);
+   if (ret < 0)
+   return ret;
+
+   if (gbo->pin_count)
+   goto out;
 
drm_gem_vram_placement(gbo, pl_flag);
for (i = 0; i < gbo->placement.num_placement; ++i)
@@ -246,11 +248,17 @@ int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, 
unsigned long pl_flag)
 
ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
if (ret < 0)
-   return ret;
+   goto err_ttm_bo_unreserve;
 
-   gbo->pin_count = 1;
+out:
+   ++gbo->pin_count;
+   ttm_bo_unreserve(&gbo->bo);
 
return 0;
+
+err_ttm_bo_unreserve:
+   ttm_bo_unreserve(&gbo->bo);
+   return ret;
 }
 

[PATCH 1/2] drm: Add drm_gem_vram_{pin/unpin}_reserved() and convert mgag200

2019-05-16 Thread Thomas Zimmermann
The new interfaces drm_gem_vram_{pin/unpin}_reserved() are variants of the
GEM VRAM pin/unpin functions that do not reserve the BO during validation.
The mgag200 driver requires this behavior for its cursor handling. The
patch also converts the driver to use the new interfaces.

Signed-off-by: Thomas Zimmermann 
---
 drivers/gpu/drm/drm_gem_vram_helper.c| 75 
 drivers/gpu/drm/mgag200/mgag200_cursor.c | 18 +++---
 include/drm/drm_gem_vram_helper.h|  3 +
 3 files changed, 88 insertions(+), 8 deletions(-)

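To make the locking contract of the new _reserved variants concrete: the
caller is expected to hold the BO reservation across the call, roughly as
in the sketch below (an illustration only; the mgag200 cursor path takes
the reservation elsewhere in the same function rather than inline like
this):

static int example_pin_while_reserved(struct drm_gem_vram_object *gbo)
{
	int ret;

	ret = ttm_bo_reserve(&gbo->bo, true, false, NULL);
	if (ret)
		return ret;

	/* BO is reserved; safe to validate/pin without re-reserving. */
	ret = drm_gem_vram_pin_reserved(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);

	/* ... use the pinned BO while still holding the reservation ... */

	ttm_bo_unreserve(&gbo->bo);
	return ret;
}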
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
b/drivers/gpu/drm/drm_gem_vram_helper.c
index 8f142b810eb4..a002c03eaf4c 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -254,6 +254,47 @@ int drm_gem_vram_pin(struct drm_gem_vram_object *gbo, 
unsigned long pl_flag)
 }
 EXPORT_SYMBOL(drm_gem_vram_pin);
 
+/**
+ * drm_gem_vram_pin_reserved() - Pins a GEM VRAM object in a region.
+ * @gbo:   the GEM VRAM object
+ * @pl_flag:   a bitmask of possible memory regions
+ *
+ * Pinning a buffer object ensures that it is not evicted from
+ * a memory region. A pinned buffer object has to be unpinned before
+ * it can be pinned to another region.
+ *
+ * This function pins a GEM VRAM object that has already been
+ * reserved. Use drm_gem_vram_pin() if possible.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_pin_reserved(struct drm_gem_vram_object *gbo,
+ unsigned long pl_flag)
+{
+   int i, ret;
+   struct ttm_operation_ctx ctx = { false, false };
+
+   if (gbo->pin_count) {
+   ++gbo->pin_count;
+   return 0;
+   }
+
+   drm_gem_vram_placement(gbo, pl_flag);
+   for (i = 0; i < gbo->placement.num_placement; ++i)
+   gbo->placements[i].flags |= TTM_PL_FLAG_NO_EVICT;
+
+   ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+   if (ret < 0)
+   return ret;
+
+   gbo->pin_count = 1;
+
+   return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_pin_reserved);
+
 /**
  * drm_gem_vram_unpin() - Unpins a GEM VRAM object
  * @gbo:   the GEM VRAM object
@@ -285,6 +326,40 @@ int drm_gem_vram_unpin(struct drm_gem_vram_object *gbo)
 }
 EXPORT_SYMBOL(drm_gem_vram_unpin);
 
+/**
+ * drm_gem_vram_unpin_reserved() - Unpins a GEM VRAM object
+ * @gbo:   the GEM VRAM object
+ *
+ * This function unpins a GEM VRAM object that has already been
+ * reserved. Use drm_gem_vram_unpin() if possible.
+ *
+ * Returns:
+ * 0 on success, or
+ * a negative error code otherwise.
+ */
+int drm_gem_vram_unpin_reserved(struct drm_gem_vram_object *gbo)
+{
+   int i, ret;
+   struct ttm_operation_ctx ctx = { false, false };
+
+   if (WARN_ON_ONCE(!gbo->pin_count))
+   return 0;
+
+   --gbo->pin_count;
+   if (gbo->pin_count)
+   return 0;
+
+   for (i = 0; i < gbo->placement.num_placement ; ++i)
+   gbo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT;
+
+   ret = ttm_bo_validate(&gbo->bo, &gbo->placement, &ctx);
+   if (ret < 0)
+   return ret;
+
+   return 0;
+}
+EXPORT_SYMBOL(drm_gem_vram_unpin_reserved);
+
 /**
  * drm_gem_vram_push_to_system() - \
Unpins a GEM VRAM object and moves it to system memory
diff --git a/drivers/gpu/drm/mgag200/mgag200_cursor.c 
b/drivers/gpu/drm/mgag200/mgag200_cursor.c
index 6c1a9d724d85..1c4fc85315a0 100644
--- a/drivers/gpu/drm/mgag200/mgag200_cursor.c
+++ b/drivers/gpu/drm/mgag200/mgag200_cursor.c
@@ -23,9 +23,9 @@ static void mga_hide_cursor(struct mga_device *mdev)
WREG8(MGA_CURPOSXL, 0);
WREG8(MGA_CURPOSXH, 0);
if (mdev->cursor.pixels_1->pin_count)
-   drm_gem_vram_unpin(mdev->cursor.pixels_1);
+   drm_gem_vram_unpin_reserved(mdev->cursor.pixels_1);
if (mdev->cursor.pixels_2->pin_count)
-   drm_gem_vram_unpin(mdev->cursor.pixels_2);
+   drm_gem_vram_unpin_reserved(mdev->cursor.pixels_2);
 }
 
 int mga_crtc_cursor_set(struct drm_crtc *crtc,
@@ -96,26 +96,28 @@ int mga_crtc_cursor_set(struct drm_crtc *crtc,
 
/* Move cursor buffers into VRAM if they aren't already */
if (!pixels_1->pin_count) {
-   ret = drm_gem_vram_pin(pixels_1, DRM_GEM_VRAM_PL_FLAG_VRAM);
+   ret = drm_gem_vram_pin_reserved(pixels_1,
+   DRM_GEM_VRAM_PL_FLAG_VRAM);
if (ret)
goto out1;
gpu_addr = drm_gem_vram_offset(pixels_1);
if (gpu_addr < 0) {
-   drm_gem_vram_unpin(pixels_1);
+   drm_gem_vram_unpin_reserved(pixels_1);
goto out1;
}
mdev->cursor.pixels_1_gpu_addr = gpu_addr;
}
if (!pixels_2->pin_count) {
-   ret = drm_gem_vram_pin(pixels_2, DRM_GEM_VRAM_PL_FLAG_VRAM);
+   

[PATCH 0/2] Add BO reservation to GEM VRAM pin/unpin/push_to_system

2019-05-16 Thread Thomas Zimmermann
A kernel test bot reported a problem with the locktorture testcase that
was triggered by the GEM VRAM helpers.

  ...
  [   10.004734] RIP: 0010:ttm_bo_validate+0x41/0x141 [ttm]
  ...
  [   10.015669]  ? kvm_sched_clock_read+0x5/0xd
  [   10.016157]  ? get_lock_stats+0x11/0x3f
  [   10.016607]  drm_gem_vram_pin+0x77/0xa2 [drm_vram_helper]
  [   10.017229]  drm_gem_vram_driver_gem_prime_vmap+0xe/0x39 [drm_vram_helper]
  [   10.018015]  drm_gem_vmap+0x36/0x43 [drm]
  [   10.018491]  drm_client_framebuffer_create+0xc6/0x1ca [drm]
  [   10.019143]  drm_fb_helper_generic_probe+0x4c/0x157 [drm_kms_helper]
  [   10.019864]  __drm_fb_helper_initial_config_and_unlock+0x307/0x442 
[drm_kms_helper]
  [   10.020739]  drm_fbdev_client_hotplug+0xc8/0x115 [drm_kms_helper]
  [   10.021442]  drm_fbdev_generic_setup+0xc4/0xf1 [drm_kms_helper]
  [   10.022120]  bochs_pci_probe+0x123/0x143 [bochs_drm]
  [   10.022688]  local_pci_probe+0x34/0x75
  [   10.023127]  pci_device_probe+0xf8/0x150A
  ...

It turns out that the bochs and vbox drivers automatically reserved and
unreserved the BO from within their pin and unpin functions. The other
drivers, ast, hibmc and mgag200, performed reservation explicitly. With the
GEM VRAM conversion, automatic BO reservation within pin and unpin functions
accidentally got lost. So for bochs and vbox, ttm_bo_validate() worked on
unlocked BOs.

This patch set fixes the problem by adding automatic reservation to the
implementation of drm_gem_vram_{pin,unpin,push_to_system}() to fix bochs
and vbox. It removes explicit BO reservation around the pin, unpin and
push-to-system calls in the ast, hibmc and mgag200 drivers.

The only exception is the cursor handling of mgag200. In this case, the
mgag200 driver now calls drm_gem_vram_{pin,unpin}_reserved(), which work
with reserved BOs. The respective code should be refactored in a future
patch to work with the regular pin and unpin functions.
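In terms of driver code, the main change boils down to the following
before/after shape (a sketch; the real call sites are in the per-driver
hunks of the patches below):

/* Before this series: the caller reserves explicitly around pin. */
static int example_pin_old(struct drm_gem_vram_object *gbo)
{
	int ret;

	ret = drm_gem_vram_reserve(gbo, false);
	if (ret)
		return ret;
	ret = drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
	drm_gem_vram_unreserve(gbo);
	return ret;
}

/* After: pin/unpin reserve and unreserve the BO internally. */
static int example_pin_new(struct drm_gem_vram_object *gbo)
{
	return drm_gem_vram_pin(gbo, DRM_GEM_VRAM_PL_FLAG_VRAM);
}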

Thomas Zimmermann (2):
  drm: Add drm_gem_vram_{pin/unpin}_reserved() and convert mgag200
  drm: Reserve/unreserve GEM VRAM BOs from within pin/unpin functions

 drivers/gpu/drm/ast/ast_mode.c|  24 +---
 drivers/gpu/drm/drm_gem_vram_helper.c | 115 +-
 .../gpu/drm/hisilicon/hibmc/hibmc_drm_de.c|   6 -
 .../gpu/drm/hisilicon/hibmc/hibmc_drm_fbdev.c |  17 +--
 drivers/gpu/drm/mgag200/mgag200_cursor.c  |  18 +--
 drivers/gpu/drm/mgag200/mgag200_mode.c|  19 +--
 include/drm/drm_gem_vram_helper.h |   3 +
 7 files changed, 126 insertions(+), 76 deletions(-)

--
2.21.0

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization