[PATCH v9 16/17] vhost_scsi: add support for worker ioctls

2023-06-26 Thread Mike Christie
This has vhost-scsi support the worker ioctls by calling the
vhost_worker_ioctl helper.

With a single worker, the single thread becomes a bottleneck when trying
to use 3 or more virtqueues like:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=3

With the patches and doing a worker per vq, we can scale to at least
16 vCPUs/vqs (that's my system limit) with the same fio command
above with numjobs=16:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=64  --numjobs=16

which gives around 2002K IOPs.

Note that for testing I dropped depth to 64 above because the vhost/virt
layer supports only 1024 total commands per device. And the only tuning I
did was set LIO's emulate_pr to 0 to avoid LIO's PR lock in the main IO
path which becomes an issue at around 12 jobs/virtqueues.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 2c3cf487cc71..c83f7f043470 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1927,6 +1927,14 @@ vhost_scsi_ioctl(struct file *f,
if (copy_from_user(&features, featurep, sizeof features))
return -EFAULT;
return vhost_scsi_set_features(vs, features);
+   case VHOST_NEW_WORKER:
+   case VHOST_FREE_WORKER:
+   case VHOST_ATTACH_VRING_WORKER:
+   case VHOST_GET_VRING_WORKER:
+   mutex_lock(&vs->dev.mutex);
+   r = vhost_worker_ioctl(&vs->dev, ioctl, argp);
+   mutex_unlock(&vs->dev.mutex);
+   return r;
default:
mutex_lock(&vs->dev.mutex);
r = vhost_dev_ioctl(&vs->dev, ioctl, argp);
-- 
2.25.1



[PATCH v9 13/17] vhost: add helper to parse userspace vring state/file

2023-06-26 Thread Mike Christie
The next patches add new vhost worker ioctls which will need to get a
vhost_virtqueue from a userspace struct which specifies the vq's index.
This moves the code in vhost_vring_ioctl that does this into a helper so
it can be shared.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6cadbc6e6d11..12203d3893c5 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -599,6 +599,27 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
return NULL;
 }
 
+static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
+ struct vhost_virtqueue **vq, u32 *id)
+{
+   u32 __user *idxp = argp;
+   u32 idx;
+   long r;
+
+   r = get_user(idx, idxp);
+   if (r < 0)
+   return r;
+
+   if (idx >= dev->nvqs)
+   return -ENOBUFS;
+
+   idx = array_index_nospec(idx, dev->nvqs);
+
+   *vq = dev->vqs[idx];
+   *id = idx;
+   return 0;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -1618,21 +1639,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
struct file *eventfp, *filep = NULL;
bool pollstart = false, pollstop = false;
struct eventfd_ctx *ctx = NULL;
-   u32 __user *idxp = argp;
struct vhost_virtqueue *vq;
struct vhost_vring_state s;
struct vhost_vring_file f;
u32 idx;
long r;
 
-   r = get_user(idx, idxp);
+   r = vhost_get_vq_from_user(d, argp, &vq, &idx);
if (r < 0)
return r;
-   if (idx >= d->nvqs)
-   return -ENOBUFS;
-
-   idx = array_index_nospec(idx, d->nvqs);
-   vq = d->vqs[idx];
 
if (ioctl == VHOST_SET_VRING_NUM ||
ioctl == VHOST_SET_VRING_ADDR) {
-- 
2.25.1



[PATCH v9 05/17] vhost: take worker or vq instead of dev for queueing

2023-06-26 Thread Mike Christie
This patch has the core work queueing function take a worker for when we
support multiple workers. It also adds a helper that takes a vq during
queueing so modules can control which vq/worker to queue work on.

This temporarily leaves vhost_work_queue in place. It will be removed
when the drivers are converted in the next patches.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 44 +++
 drivers/vhost/vhost.h |  1 +
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index aafb23e12477..611e495eeb3c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,21 +231,10 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-void vhost_dev_flush(struct vhost_dev *dev)
+static bool vhost_worker_queue(struct vhost_worker *worker,
+  struct vhost_work *work)
 {
-   struct vhost_flush_struct flush;
-
-   init_completion(&flush.wait_event);
-   vhost_work_init(&flush.work, vhost_flush_work);
-
-   if (vhost_work_queue(dev, &flush.work))
-   wait_for_completion(&flush.wait_event);
-}
-EXPORT_SYMBOL_GPL(vhost_dev_flush);
-
-bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-   if (!dev->worker)
+   if (!worker)
return false;
/*
 * vsock can queue while we do a VHOST_SET_OWNER, so we have a smp_wmb
@@ -257,14 +246,37 @@ bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 * sure it was not in the list.
 * test_and_set_bit() implies a memory barrier.
 */
-   llist_add(&work->node, &dev->worker->work_list);
-   vhost_task_wake(dev->worker->vtsk);
+   llist_add(&work->node, &worker->work_list);
+   vhost_task_wake(worker->vtsk);
}
 
return true;
 }
+
+bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+   return vhost_worker_queue(dev->worker, work);
+}
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
+bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
+{
+   return vhost_worker_queue(vq->worker, work);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
+
+void vhost_dev_flush(struct vhost_dev *dev)
+{
+   struct vhost_flush_struct flush;
+
+   init_completion(&flush.wait_event);
+   vhost_work_init(&flush.work, vhost_flush_work);
+
+   if (vhost_work_queue(dev, &flush.work))
+   wait_for_completion(&flush.wait_event);
+}
+EXPORT_SYMBOL_GPL(vhost_dev_flush);
+
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d5728f986744..e2dd4558a1f9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.25.1



[PATCH v9 10/17] vhost_scsi: convert to vhost_vq_work_queue

2023-06-26 Thread Mike Christie
Convert from vhost_work_queue to vhost_vq_work_queue so we can
remove vhost_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index a77c53bb035a..1668009bd489 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -353,8 +353,9 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) {
struct vhost_scsi_tmf *tmf = container_of(se_cmd,
struct vhost_scsi_tmf, se_cmd);
+   struct vhost_virtqueue *vq = &tmf->svq->vq;
 
-   vhost_work_queue(&tmf->vhost->dev, &tmf->vwork);
+   vhost_vq_work_queue(vq, &tmf->vwork);
} else {
struct vhost_scsi_cmd *cmd = container_of(se_cmd,
struct vhost_scsi_cmd, tvc_se_cmd);
@@ -1332,11 +1333,9 @@ static void vhost_scsi_ctl_handle_kick(struct vhost_work *work)
 }
 
 static void
-vhost_scsi_send_evt(struct vhost_scsi *vs,
-  struct vhost_scsi_tpg *tpg,
-  struct se_lun *lun,
-  u32 event,
-  u32 reason)
+vhost_scsi_send_evt(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
+   struct vhost_scsi_tpg *tpg, struct se_lun *lun,
+   u32 event, u32 reason)
 {
struct vhost_scsi_evt *evt;
 
@@ -1358,7 +1357,7 @@ vhost_scsi_send_evt(struct vhost_scsi *vs,
}
 
llist_add(&evt->list, &vs->vs_event_list);
-   vhost_work_queue(&vs->dev, &vs->vs_event_work);
+   vhost_vq_work_queue(vq, &vs->vs_event_work);
 }
 
 static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
@@ -1372,7 +1371,8 @@ static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
goto out;
 
if (vs->vs_events_missed)
-   vhost_scsi_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+   vhost_scsi_send_evt(vs, vq, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT,
+   0);
 out:
mutex_unlock(&vq->mutex);
 }
@@ -1991,7 +1991,7 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
goto unlock;
 
if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
-   vhost_scsi_send_evt(vs, tpg, lun,
+   vhost_scsi_send_evt(vs, vq, tpg, lun,
   VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
 unlock:
mutex_unlock(&vq->mutex);
-- 
2.25.1



[PATCH v9 12/17] vhost: remove vhost_work_queue

2023-06-26 Thread Mike Christie
vhost_work_queue is no longer used. Each driver is using the poll or vq
based queueing, so remove vhost_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 6 --
 drivers/vhost/vhost.h | 5 ++---
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b6149ed8acb4..6cadbc6e6d11 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -255,12 +255,6 @@ static bool vhost_worker_queue(struct vhost_worker *worker,
return true;
 }
 
-bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
-{
-   return vhost_worker_queue(dev->worker, work);
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
-
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 {
return vhost_worker_queue(vq->worker, work);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3882266fbbf3..83f78b6f563b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -44,15 +44,14 @@ struct vhost_poll {
struct vhost_virtqueue  *vq;
 };
 
-void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
-bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
-
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 __poll_t mask, struct vhost_dev *dev,
 struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
+
+void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_dev_flush(struct vhost_dev *dev);
 
 struct vhost_log {
-- 
2.25.1



[PATCH v9 11/17] vhost_scsi: flush IO vqs then send TMF rsp

2023-06-26 Thread Mike Christie
With one worker we will always send the scsi cmd responses then send the
TMF rsp, because LIO will always complete the scsi cmds first then call
into us to send the TMF response.

With multiple workers, the IO vq workers could be running while the
TMF/ctl vq worker is running, so this has us do a flush before completing
the TMF to make sure cmds are completed when its work is later queued
and run.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 1668009bd489..2c3cf487cc71 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1133,12 +1133,27 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work)
 {
struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
  vwork);
-   int resp_code;
+   struct vhost_virtqueue *ctl_vq, *vq;
+   int resp_code, i;
+
+   if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
+   /*
+* Flush IO vqs that don't share a worker with the ctl to make
+* sure they have sent their responses before us.
+*/
+   ctl_vq = &tmf->vhost->vqs[VHOST_SCSI_VQ_CTL].vq;
+   for (i = VHOST_SCSI_VQ_IO; i < tmf->vhost->dev.nvqs; i++) {
+   vq = &tmf->vhost->vqs[i].vq;
+
+   if (vhost_vq_is_setup(vq) &&
+   vq->worker != ctl_vq->worker)
+   vhost_vq_flush(vq);
+   }
 
-   if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE)
resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
-   else
+   } else {
resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
+   }
 
vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 tmf->vq_desc, &tmf->resp_iov, resp_code);
-- 
2.25.1



[PATCH v9 08/17] vhost_vsock: convert to vhost_vq_work_queue

2023-06-26 Thread Mike Christie
Convert from vhost_work_queue to vhost_vq_work_queue, so we can drop
vhost_work_queue.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6578db78f0ae..817d377a3f36 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -285,7 +285,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
atomic_inc(&vsock->queued_replies);
 
virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-   vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+   vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
rcu_read_unlock();
return len;
@@ -583,7 +583,7 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
/* Some packets may have been queued before the device was started,
 * let's kick the send worker to send them.
 */
-   vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+   vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
 
mutex_unlock(&vsock->dev.mutex);
return 0;
-- 
2.25.1



[PATCH v9 17/17] vhost: Allow worker switching while work is queueing

2023-06-26 Thread Mike Christie
This patch drops the requirement that we can only switch workers if work
has not been queued, by using RCU for the vq based queueing paths and a
mutex for the device-wide flush.

We can also use this to support SIGKILL properly in the future where we
should exit almost immediately after getting that signal. With this
patch, when get_signal returns true, we can set the vq->worker to NULL
and do a synchronize_rcu to prevent new work from being queued to the
vhost_task that has been killed.
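
For reference, the switching pattern this enables looks roughly like the
sketch below. This is not the patch code itself: vhost_vq_swap_worker is a
made-up helper name and the locking context is assumed (the caller is
expected to serialize attach operations), but vhost_vq_work_queue,
vhost_worker_flush and the RCU-protected vq->worker pointer are the ones
added in this series.

static void vhost_vq_swap_worker(struct vhost_virtqueue *vq,
                                 struct vhost_worker *new_worker)
{
        struct vhost_worker *old;

        old = rcu_dereference_protected(vq->worker,
                                        lockdep_is_held(&vq->mutex));

        /* Publish the new worker. Concurrent vhost_vq_work_queue()
         * callers see either the old or the new pointer under
         * rcu_read_lock().
         */
        rcu_assign_pointer(vq->worker, new_worker);

        /* Wait for queueing paths that still hold the old pointer. */
        synchronize_rcu();

        /* Work they added went to the old worker, so drain it before
         * the old worker is freed or its vhost_task is killed.
         */
        if (old)
                vhost_worker_flush(old);
}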

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c  | 153 +++--
 drivers/vhost/vhost.h  |   4 +-
 include/uapi/linux/vhost.h |   4 +-
 3 files changed, 115 insertions(+), 46 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index bfba91ecbd0a..c71d573f1c94 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -233,16 +233,9 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-static bool vhost_worker_queue(struct vhost_worker *worker,
+static void vhost_worker_queue(struct vhost_worker *worker,
   struct vhost_work *work)
 {
-   if (!worker)
-   return false;
-   /*
-* vsock can queue while we do a VHOST_SET_OWNER, so we have a smp_wmb
-* when setting up the worker. We don't have a smp_rmb here because
-* test_and_set_bit gives us a mb already.
-*/
if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
/* We can only add the work to the list after we're
 * sure it was not in the list.
@@ -251,47 +244,85 @@ static bool vhost_worker_queue(struct vhost_worker *worker,
llist_add(&work->node, &worker->work_list);
vhost_task_wake(worker->vtsk);
}
-
-   return true;
 }
 
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 {
-   return vhost_worker_queue(vq->worker, work);
+   struct vhost_worker *worker;
+   bool queued = false;
+
+   rcu_read_lock();
+   worker = rcu_dereference(vq->worker);
+   if (worker) {
+   queued = true;
+   vhost_worker_queue(worker, work);
+   }
+   rcu_read_unlock();
+
+   return queued;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
-static void vhost_worker_flush(struct vhost_worker *worker)
+void vhost_vq_flush(struct vhost_virtqueue *vq)
 {
struct vhost_flush_struct flush;
 
init_completion(&flush.wait_event);
vhost_work_init(&flush.work, vhost_flush_work);
 
-   if (vhost_worker_queue(worker, &flush.work))
+   if (vhost_vq_work_queue(vq, &flush.work))
wait_for_completion(&flush.wait_event);
 }
+EXPORT_SYMBOL_GPL(vhost_vq_flush);
 
-void vhost_vq_flush(struct vhost_virtqueue *vq)
+/**
+ * vhost_worker_flush - flush a worker
+ * @worker: worker to flush
+ *
+ * This does not use RCU to protect the worker, so the device or worker
+ * mutex must be held.
+ */
+static void vhost_worker_flush(struct vhost_worker *worker)
 {
-   vhost_worker_flush(vq->worker);
+   struct vhost_flush_struct flush;
+
+   init_completion(&flush.wait_event);
+   vhost_work_init(&flush.work, vhost_flush_work);
+
+   vhost_worker_queue(worker, &flush.work);
+   wait_for_completion(&flush.wait_event);
 }
-EXPORT_SYMBOL_GPL(vhost_vq_flush);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
struct vhost_worker *worker;
unsigned long i;
 
-   xa_for_each(&dev->worker_xa, i, worker)
+   xa_for_each(&dev->worker_xa, i, worker) {
+   mutex_lock(&worker->mutex);
+   if (!worker->attachment_cnt) {
+   mutex_unlock(&worker->mutex);
+   continue;
+   }
vhost_worker_flush(worker);
+   mutex_unlock(&worker->mutex);
+   }
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
-   return !llist_empty(&vq->worker->work_list);
+   struct vhost_worker *worker;
+   bool has_work = false;
+
+   rcu_read_lock();
+   worker = rcu_dereference(vq->worker);
+   if (worker && !llist_empty(&worker->work_list))
+   has_work = true;
+   rcu_read_unlock();
+
+   return has_work;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
@@ -356,7 +387,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
-   vq->worker = NULL;
+   rcu_assign_pointer(vq->worker, NULL);
vhost_vring_call_reset(&vq->call_ctx);
__vhost_vq_meta_reset(vq);
 }
@@ -578,7 +609,7 @@ static void vhost_workers_free(struct vhost_dev *dev)
return;
 
for (i = 0; i < dev->nvqs; i++)
-   dev->vqs[i]->worker = NULL;
+   rcu_assign_pointer

[PATCH v9 14/17] vhost: replace single worker pointer with xarray

2023-06-26 Thread Mike Christie
The next patch allows userspace to create multiple workers per device,
so this patch replaces the vhost_worker pointer with an xarray so we
can store multiple workers and look them up.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 64 ---
 drivers/vhost/vhost.h |  3 +-
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 12203d3893c5..ffbaf7d32e2c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -280,7 +280,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_flush);
 
 void vhost_dev_flush(struct vhost_dev *dev)
 {
-   vhost_worker_flush(dev->worker);
+   struct vhost_worker *worker;
+   unsigned long i;
+
+   xa_for_each(&dev->worker_xa, i, worker)
+   vhost_worker_flush(worker);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
@@ -482,7 +486,6 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->umem = NULL;
dev->iotlb = NULL;
dev->mm = NULL;
-   dev->worker = NULL;
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
@@ -492,7 +495,7 @@ void vhost_dev_init(struct vhost_dev *dev,
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
spin_lock_init(&dev->iotlb_lock);
-
+   xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
 
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
@@ -554,15 +557,35 @@ static void vhost_detach_mm(struct vhost_dev *dev)
dev->mm = NULL;
 }
 
-static void vhost_worker_free(struct vhost_dev *dev)
+static void vhost_worker_destroy(struct vhost_dev *dev,
+struct vhost_worker *worker)
+{
+   if (!worker)
+   return;
+
+   WARN_ON(!llist_empty(&worker->work_list));
+   xa_erase(&dev->worker_xa, worker->id);
+   vhost_task_stop(worker->vtsk);
+   kfree(worker);
+}
+
+static void vhost_workers_free(struct vhost_dev *dev)
 {
-   if (!dev->worker)
+   struct vhost_worker *worker;
+   unsigned long i;
+
+   if (!dev->use_worker)
return;
 
-   WARN_ON(!llist_empty(&dev->worker->work_list));
-   vhost_task_stop(dev->worker->vtsk);
-   kfree(dev->worker);
-   dev->worker = NULL;
+   for (i = 0; i < dev->nvqs; i++)
+   dev->vqs[i]->worker = NULL;
+   /*
+* Free the default worker we created and cleanup workers userspace
+* created but couldn't clean up (it forgot or crashed).
+*/
+   xa_for_each(&dev->worker_xa, i, worker)
+   vhost_worker_destroy(dev, worker);
+   xa_destroy(&dev->worker_xa);
 }
 
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
@@ -570,6 +593,8 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
struct vhost_worker *worker;
struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
+   int ret;
+   u32 id;
 
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
@@ -584,16 +609,18 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
init_llist_head(&worker->work_list);
worker->kcov_handle = kcov_common_handle();
worker->vtsk = vtsk;
-   /*
-* vsock can already try to queue so make sure llist and vtsk are both
-* set before vhost_work_queue sees dev->worker is set.
-*/
-   smp_wmb();
-   dev->worker = worker;
 
vhost_task_start(vtsk);
+
+   ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+   if (ret < 0)
+   goto stop_worker;
+   worker->id = id;
+
return worker;
 
+stop_worker:
+   vhost_task_stop(vtsk);
 free_worker:
kfree(worker);
return NULL;
@@ -650,6 +677,11 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
err = -ENOMEM;
goto err_worker;
}
+   /*
+* vsock can already try to queue so make sure the worker
+* is setup before vhost_vq_work_queue sees vq->worker is set.
+*/
+   smp_wmb();
 
for (i = 0; i < dev->nvqs; i++)
dev->vqs[i]->worker = worker;
@@ -751,7 +783,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
dev->iotlb = NULL;
vhost_clear_msg(dev);
wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-   vhost_worker_free(dev);
+   vhost_workers_free(dev);
vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 83f78b6f563b..44c3017766b2 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -30,6 +30,7 @@ struct vhost_worker {
struct vhost_task   *vtsk;
struct llist_head   work_list;
u64 kcov_

[PATCH v9 15/17] vhost: allow userspace to create workers

2023-06-26 Thread Mike Christie
For vhost-scsi with 3 vqs or more and a workload that tries to use
them in parallel like:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=3

the single vhost worker thread will become a bottleneck and we are stuck
at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
used.

To better utilize virtqueues and available CPUs, this patch allows
userspace to create workers and bind them to vqs. You can have N workers
per dev and also share N workers with M vqs on that dev.
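
The new ioctls exchange small structs with the kernel. The exact uapi
layout is in the vhost_types.h hunk (not shown here), so treat the field
names and widths below as a sketch implied by the code in this patch:

struct vhost_worker_state {
        /* Returned by VHOST_NEW_WORKER; set by userspace for
         * VHOST_FREE_WORKER to pick the worker to free.
         */
        unsigned int worker_id;
};

struct vhost_vring_worker {
        /* vring index; read first by vhost_get_vq_from_user() */
        unsigned int index;
        /* worker to attach, or the currently attached worker for
         * VHOST_GET_VRING_WORKER
         */
        unsigned int worker_id;
};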

This patch adds the interface related code and the next patch will hook
vhost-scsi into it. The patches do not try to hook net and vsock into
the interface because:

1. multiple workers don't seem to help vsock. The problem is that with
only 2 virtqueues we never fully use the existing worker when doing
bidirectional tests. This seems to match vhost-scsi where we don't see
the worker as a bottleneck until 3 virtqueues are used.

2. net already has a way to use multiple workers.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c| 142 ++-
 drivers/vhost/vhost.h|   3 +
 include/uapi/linux/vhost.h   |  33 +++
 include/uapi/linux/vhost_types.h |  16 
 4 files changed, 193 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ffbaf7d32e2c..bfba91ecbd0a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -626,6 +626,76 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
return NULL;
 }
 
+/* Caller must have device mutex */
+static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
+struct vhost_worker *worker)
+{
+   if (vq->worker)
+   vq->worker->attachment_cnt--;
+   worker->attachment_cnt++;
+   vq->worker = worker;
+}
+
+ /* Caller must have device and virtqueue mutex */
+static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
+ struct vhost_vring_worker *info)
+{
+   unsigned long index = info->worker_id;
+   struct vhost_dev *dev = vq->dev;
+   struct vhost_worker *worker;
+
+   if (!dev->use_worker)
+   return -EINVAL;
+
+   /*
+* We only allow userspace to set a virtqueue's worker if it's not
+* active and polling is not enabled. We also assume drivers
+* supporting this will not be internally queueing works directly or
+* via calls like vhost_dev_flush at this time.
+*/
+   if (vhost_vq_get_backend(vq) || vq->kick)
+   return -EBUSY;
+
+   worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
+   if (!worker || worker->id != info->worker_id)
+   return -ENODEV;
+
+   __vhost_vq_attach_worker(vq, worker);
+   return 0;
+}
+
+/* Caller must have device mutex */
+static int vhost_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *info)
+{
+   struct vhost_worker *worker;
+
+   worker = vhost_worker_create(dev);
+   if (!worker)
+   return -ENOMEM;
+
+   info->worker_id = worker->id;
+   return 0;
+}
+
+/* Caller must have device mutex */
+static int vhost_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *info)
+{
+   unsigned long index = info->worker_id;
+   struct vhost_worker *worker;
+
+   worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
+   if (!worker || worker->id != info->worker_id)
+   return -ENODEV;
+
+   if (worker->attachment_cnt)
+   return -EBUSY;
+
+   vhost_worker_destroy(dev, worker);
+   return 0;
+}
+
 static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
  struct vhost_virtqueue **vq, u32 *id)
 {
@@ -647,6 +717,76 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
return 0;
 }
 
+/* Caller must have device mutex */
+long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
+   void __user *argp)
+{
+   struct vhost_vring_worker ring_worker;
+   struct vhost_worker_state state;
+   struct vhost_virtqueue *vq;
+   long ret;
+   u32 idx;
+
+   if (!dev->use_worker)
+   return -EINVAL;
+
+   if (!vhost_dev_has_owner(dev))
+   return -EINVAL;
+
+   ret = vhost_dev_check_owner(dev);
+   if (ret)
+   return ret;
+
+   switch (ioctl) {
+   /* dev worker ioctls */
+   case VHOST_NEW_WORKER:
+   ret = vhost_new_worker(dev, &state);
+   if (!ret && copy_to_user(argp, &state, sizeof(state)))
+   ret = -EFAULT;
+   return ret;
+   case VHOST_FREE_WORKER:
+   if (copy_from_user(&state, argp, sizeof(state)))
+   return -

[PATCH v9 03/17] vhost: add vhost_worker pointer to vhost_virtqueue

2023-06-26 Thread Mike Christie
This patchset allows userspace to map vqs to different workers. This
patch adds a worker pointer to the vq so in later patches in this set
we can queue/flush specific vqs and their workers.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 21 ++---
 drivers/vhost/vhost.h |  1 +
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dfd96cf6a152..db88464c1691 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -333,6 +333,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->busyloop_timeout = 0;
vq->umem = NULL;
vq->iotlb = NULL;
+   vq->worker = NULL;
vhost_vring_call_reset(&vq->call_ctx);
__vhost_vq_meta_reset(vq);
 }
@@ -545,7 +546,7 @@ static void vhost_worker_free(struct vhost_dev *dev)
dev->worker = NULL;
 }
 
-static int vhost_worker_create(struct vhost_dev *dev)
+static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
struct vhost_worker *worker;
struct vhost_task *vtsk;
@@ -553,7 +554,7 @@ static int vhost_worker_create(struct vhost_dev *dev)
 
worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
if (!worker)
-   return -ENOMEM;
+   return NULL;
 
snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
@@ -572,17 +573,18 @@ static int vhost_worker_create(struct vhost_dev *dev)
dev->worker = worker;
 
vhost_task_start(vtsk);
-   return 0;
+   return worker;
 
 free_worker:
kfree(worker);
-   return -ENOMEM;
+   return NULL;
 }
 
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-   int err;
+   struct vhost_worker *worker;
+   int err, i;
 
/* Is there an owner already? */
if (vhost_dev_has_owner(dev)) {
@@ -603,9 +605,14 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 * below since we don't have to worry about vsock queueing
 * while we free the worker.
 */
-   err = vhost_worker_create(dev);
-   if (err)
+   worker = vhost_worker_create(dev);
+   if (!worker) {
+   err = -ENOMEM;
goto err_worker;
+   }
+
+   for (i = 0; i < dev->nvqs; i++)
+   dev->vqs[i]->worker = worker;
}
 
return 0;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6cd72d0b5245..0baacc245934 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -74,6 +74,7 @@ struct vhost_vring_call {
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
struct vhost_dev *dev;
+   struct vhost_worker *worker;
 
/* The actual ring of buffers. */
struct mutex mutex;
-- 
2.25.1



[PATCH v9 02/17] vhost: dynamically allocate vhost_worker

2023-06-26 Thread Mike Christie
This patchset allows us to allocate multiple workers, so this has us
move from the vhost_worker that's embedded in the vhost_dev to
dynamically allocating it.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 66 ---
 drivers/vhost/vhost.h |  4 +--
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 82966ffb4a5c..dfd96cf6a152 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -235,36 +235,40 @@ void vhost_dev_flush(struct vhost_dev *dev)
 {
struct vhost_flush_struct flush;
 
-   if (dev->worker.vtsk) {
-   init_completion(&flush.wait_event);
-   vhost_work_init(&flush.work, vhost_flush_work);
+   init_completion(&flush.wait_event);
+   vhost_work_init(&flush.work, vhost_flush_work);
 
-   vhost_work_queue(dev, &flush.work);
+   if (vhost_work_queue(dev, &flush.work))
wait_for_completion(&flush.wait_event);
-   }
 }
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 {
-   if (!dev->worker.vtsk)
-   return;
-
+   if (!dev->worker)
+   return false;
+   /*
+* vsock can queue while we do a VHOST_SET_OWNER, so we have a smp_wmb
+* when setting up the worker. We don't have a smp_rmb here because
+* test_and_set_bit gives us a mb already.
+*/
if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
/* We can only add the work to the list after we're
 * sure it was not in the list.
 * test_and_set_bit() implies a memory barrier.
 */
-   llist_add(&work->node, &dev->worker.work_list);
-   vhost_task_wake(dev->worker.vtsk);
+   llist_add(&work->node, &dev->worker->work_list);
+   vhost_task_wake(dev->worker->vtsk);
}
+
+   return true;
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
-   return !llist_empty(&dev->worker.work_list);
+   return !llist_empty(&dev->worker->work_list);
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -458,8 +462,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->umem = NULL;
dev->iotlb = NULL;
dev->mm = NULL;
-   memset(&dev->worker, 0, sizeof(dev->worker));
-   init_llist_head(&dev->worker.work_list);
+   dev->worker = NULL;
dev->iov_limit = iov_limit;
dev->weight = weight;
dev->byte_weight = byte_weight;
@@ -533,30 +536,47 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 
 static void vhost_worker_free(struct vhost_dev *dev)
 {
-   if (!dev->worker.vtsk)
+   if (!dev->worker)
return;
 
-   WARN_ON(!llist_empty(&dev->worker.work_list));
-   vhost_task_stop(dev->worker.vtsk);
-   dev->worker.kcov_handle = 0;
-   dev->worker.vtsk = NULL;
+   WARN_ON(!llist_empty(&dev->worker->work_list));
+   vhost_task_stop(dev->worker->vtsk);
+   kfree(dev->worker);
+   dev->worker = NULL;
 }
 
 static int vhost_worker_create(struct vhost_dev *dev)
 {
+   struct vhost_worker *worker;
struct vhost_task *vtsk;
char name[TASK_COMM_LEN];
 
+   worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
+   if (!worker)
+   return -ENOMEM;
+
snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-   vtsk = vhost_task_create(vhost_worker, &dev->worker, name);
+   vtsk = vhost_task_create(vhost_worker, worker, name);
if (!vtsk)
-   return -ENOMEM;
+   goto free_worker;
+
+   init_llist_head(&worker->work_list);
+   worker->kcov_handle = kcov_common_handle();
+   worker->vtsk = vtsk;
+   /*
+* vsock can already try to queue so make sure llist and vtsk are both
+* set before vhost_work_queue sees dev->worker is set.
+*/
+   smp_wmb();
+   dev->worker = worker;
 
-   dev->worker.kcov_handle = kcov_common_handle();
-   dev->worker.vtsk = vtsk;
vhost_task_start(vtsk);
return 0;
+
+free_worker:
+   kfree(worker);
+   return -ENOMEM;
 }
 
 /* Caller should have device mutex */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 37ce869f8a5c..6cd72d0b5245 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -44,7 +44,7 @@ struct vhost_poll {
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
+bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
@@ -1

[PATCH v9 09/17] vhost_scsi: make SCSI cmd completion per vq

2023-06-26 Thread Mike Christie
This patch separates the scsi cmd completion code paths so we can complete
cmds based on their vq instead of having all cmds complete on the same
worker/CPU. This will be useful with the next patches that allow us to
create multiple worker threads and bind them to different vqs, and we can
have completions running on different threads/CPUs.

Signed-off-by: Mike Christie 
Reviewed-by: Stefan Hajnoczi 
---
 drivers/vhost/scsi.c | 56 
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index bb10fa4bb4f6..a77c53bb035a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -167,6 +167,7 @@ MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO virtqueues a vhost scsi d
 
 struct vhost_scsi_virtqueue {
struct vhost_virtqueue vq;
+   struct vhost_scsi *vs;
/*
 * Reference counting for inflight reqs, used for flush operation. At
 * each time, one reference tracks new commands submitted, while we
@@ -181,6 +182,9 @@ struct vhost_scsi_virtqueue {
struct vhost_scsi_cmd *scsi_cmds;
struct sbitmap scsi_tags;
int max_cmds;
+
+   struct vhost_work completion_work;
+   struct llist_head completion_list;
 };
 
 struct vhost_scsi {
@@ -190,12 +194,8 @@ struct vhost_scsi {
 
struct vhost_dev dev;
struct vhost_scsi_virtqueue *vqs;
-   unsigned long *compl_bitmap;
struct vhost_scsi_inflight **old_inflight;
 
-   struct vhost_work vs_completion_work; /* cmd completion work item */
-   struct llist_head vs_completion_list; /* cmd completion queue */
-
struct vhost_work vs_event_work; /* evt injection work item */
struct llist_head vs_event_list; /* evt injection queue */
 
@@ -358,10 +358,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
} else {
struct vhost_scsi_cmd *cmd = container_of(se_cmd,
struct vhost_scsi_cmd, tvc_se_cmd);
-   struct vhost_scsi *vs = cmd->tvc_vhost;
+   struct vhost_scsi_virtqueue *svq =  container_of(cmd->tvc_vq,
+   struct vhost_scsi_virtqueue, vq);
 
-   llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
-   vhost_work_queue(&vs->dev, &vs->vs_completion_work);
+   llist_add(&cmd->tvc_completion_list, &svq->completion_list);
+   vhost_vq_work_queue(&svq->vq, &svq->completion_work);
}
 }
 
@@ -509,17 +510,17 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
  */
 static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 {
-   struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
-   vs_completion_work);
+   struct vhost_scsi_virtqueue *svq = container_of(work,
+   struct vhost_scsi_virtqueue, completion_work);
struct virtio_scsi_cmd_resp v_rsp;
struct vhost_scsi_cmd *cmd, *t;
struct llist_node *llnode;
struct se_cmd *se_cmd;
struct iov_iter iov_iter;
-   int ret, vq;
+   bool signal = false;
+   int ret;
 
-   bitmap_zero(vs->compl_bitmap, vs->dev.nvqs);
-   llnode = llist_del_all(&vs->vs_completion_list);
+   llnode = llist_del_all(&svq->completion_list);
llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) {
se_cmd = &cmd->tvc_se_cmd;
 
@@ -539,21 +540,17 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
  cmd->tvc_in_iovs, sizeof(v_rsp));
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
-   struct vhost_scsi_virtqueue *q;
+   signal = true;
+
vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
-   q = container_of(cmd->tvc_vq, struct 
vhost_scsi_virtqueue, vq);
-   vq = q - vs->vqs;
-   __set_bit(vq, vs->compl_bitmap);
} else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
vhost_scsi_release_cmd_res(se_cmd);
}
 
-   vq = -1;
-   while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1))
-   < vs->dev.nvqs)
-   vhost_signal(&vs->dev, &vs->vqs[vq].vq);
+   if (signal)
+   vhost_signal(&svq->vs->dev, &svq->vq);
 }
 
 static struct vhost_scsi_cmd *
@@ -1770,6 +1767,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
+   struct vhost_scsi_virtqueue *svq;
struct vhost_scsi *vs;
struct vhost_virtqueue **vqs;
int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs;
@@ -1788,10 +1786,6 @@ static int vhost_scsi_open(

[PATCH v9 04/17] vhost, vhost_net: add helper to check if vq has work

2023-06-26 Thread Mike Christie
In the next patches each vq might have a different worker, so one could
have work while others do not. For net, we only want to check specific vqs,
so this adds a helper to check if a vq has work pending and converts
vhost-net to use it.

Signed-off-by: Mike Christie 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c   | 2 +-
 drivers/vhost/vhost.c | 6 +++---
 drivers/vhost/vhost.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ae2273196b0c..98bb957eb3b9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -546,7 +546,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,
endtime = busy_clock() + busyloop_timeout;
 
while (vhost_can_busy_poll(endtime)) {
-   if (vhost_has_work(&net->dev)) {
+   if (vhost_vq_has_work(vq)) {
*busyloop_intr = true;
break;
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db88464c1691..aafb23e12477 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -266,11 +266,11 @@ bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
 /* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
+bool vhost_vq_has_work(struct vhost_virtqueue *vq)
 {
-   return !llist_empty(&dev->worker->work_list);
+   return !llist_empty(&vq->worker->work_list);
 }
-EXPORT_SYMBOL_GPL(vhost_has_work);
+EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0baacc245934..d5728f986744 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,6 @@ struct vhost_poll {
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
-bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 __poll_t mask, struct vhost_dev *dev);
@@ -199,6 +198,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
 int vhost_vq_init_access(struct vhost_virtqueue *);
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
2.25.1



[PATCH v9 06/17] vhost: take worker or vq for flushing

2023-06-26 Thread Mike Christie
This patch has the core work flush function take a worker. When we
support multiple workers we can then flush each worker during device
removal, stoppage, etc. It also adds a helper to flush specific
virtqueues, so vhost-scsi can flush IO vqs from its ctl vq.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 15 +--
 drivers/vhost/vhost.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 611e495eeb3c..2ea107e867a1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -265,16 +265,27 @@ bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
 
-void vhost_dev_flush(struct vhost_dev *dev)
+static void vhost_worker_flush(struct vhost_worker *worker)
 {
struct vhost_flush_struct flush;
 
init_completion(&flush.wait_event);
vhost_work_init(&flush.work, vhost_flush_work);
 
-   if (vhost_work_queue(dev, &flush.work))
+   if (vhost_worker_queue(worker, &flush.work))
wait_for_completion(&flush.wait_event);
 }
+
+void vhost_vq_flush(struct vhost_virtqueue *vq)
+{
+   vhost_worker_flush(vq->worker);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_flush);
+
+void vhost_dev_flush(struct vhost_dev *dev)
+{
+   vhost_worker_flush(dev->worker);
+}
 EXPORT_SYMBOL_GPL(vhost_dev_flush);
 
 /* A lockless hint for busy polling code to exit the loop */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e2dd4558a1f9..f208f9923c88 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct vhost_log *log, unsigned int *log_num);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
+void vhost_vq_flush(struct vhost_virtqueue *vq);
 bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 bool vhost_vq_has_work(struct vhost_virtqueue *vq);
 bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
-- 
2.25.1



[PATCH v9 01/17] vhost: create worker at end of vhost_dev_set_owner

2023-06-26 Thread Mike Christie
vsock can start queueing work after VHOST_VSOCK_SET_GUEST_CID, so
after we have called vhost_worker_create it can be calling
vhost_work_queue and trying to access the vhost worker/task. If
vhost_dev_alloc_iovecs fails, then vhost_worker_free could free
the worker/task from under vsock.

This moves vhost_worker_create to the end of vhost_dev_set_owner
where we know we can no longer fail in that path. If it fails
after the VHOST_SET_OWNER and userspace closes the device, then
the normal vsock release handling will do the right thing.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 60c9ebd629dd..82966ffb4a5c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -572,20 +572,27 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
vhost_attach_mm(dev);
 
+   err = vhost_dev_alloc_iovecs(dev);
+   if (err)
+   goto err_iovecs;
+
if (dev->use_worker) {
+   /*
+* This should be done last, because vsock can queue work
+* before VHOST_SET_OWNER so it simplifies the failure path
+* below since we don't have to worry about vsock queueing
+* while we free the worker.
+*/
err = vhost_worker_create(dev);
if (err)
goto err_worker;
}
 
-   err = vhost_dev_alloc_iovecs(dev);
-   if (err)
-   goto err_iovecs;
-
return 0;
-err_iovecs:
-   vhost_worker_free(dev);
+
 err_worker:
+   vhost_dev_free_iovecs(dev);
+err_iovecs:
vhost_detach_mm(dev);
 err_mm:
return err;
-- 
2.25.1



[PATCH v9 00/17] vhost: multiple worker support

2023-06-26 Thread Mike Christie
The following patches were built over Linux's tree. They also apply over
the mst vhost branch if you revert the previous vhost worker patchset.

The patches allow us to support multiple vhost worker tasks per device.
The design is a modified version of Stefan's original idea where userspace
has the kernel create a worker and we pass back the pid. In this version,
instead of passing the pid between user/kernel space, we use a worker_id,
which is just an integer managed by the vhost driver, and we allow
userspace to create and free workers and then attach them to virtqueues
at setup time.
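
For example, a rough userspace sketch of that flow (the worker ioctls and
struct names are from this series; the error handling, field layout and
the assumption that the uapi header exposes them are illustrative only):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Give each IO virtqueue its own worker. Must be done after
 * VHOST_SET_OWNER but before the vqs are set up/started, since a
 * worker can only be attached to an inactive vq.
 */
static int setup_per_vq_workers(int vhost_fd, unsigned int first_vq,
                                unsigned int nr_vqs)
{
        unsigned int i;

        for (i = 0; i < nr_vqs; i++) {
                struct vhost_worker_state state = { 0 };
                struct vhost_vring_worker w = { 0 };

                if (ioctl(vhost_fd, VHOST_NEW_WORKER, &state) < 0)
                        return -1;

                w.index = first_vq + i;
                w.worker_id = state.worker_id;
                if (ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &w) < 0)
                        return -1;
        }
        return 0;
}

/* Usage:
 *   fd = open("/dev/vhost-scsi", O_RDWR);
 *   ioctl(fd, VHOST_SET_OWNER, NULL);
 *   setup_per_vq_workers(fd, 2 /_* first IO vq *_/, 16);
 */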

All review comments from the past reviews should be handled. If I didn't
reply to a review comment, I agreed with the comment and should have
handled it in this posting. Let me know if I missed one.

Results:


fio jobs        1      2      4      8      12     16
------------------------------------------------------
1 worker        160k   488k   -      -      -      -
worker per vq   160k   310k   620k   1182K  1564K  2002k

Notes:
0. This used a simple fio command:

fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
--ioengine=libaio --iodepth=128  --numjobs=$JOBS_ABOVE

and I used a VM with 16 vCPUs and 16 virtqueues.

1. The patches were tested with LIO's emulate_pr=0 which drops the
LIO PR lock use. This was a bottleneck at around 12 vqs/jobs.

2. Because we have a hard limit of 1024 cmds, if the num jobs * iodepth
was greater than 1024, I would decrease the iodepth. So 12 jobs used an
iodepth of 85, and 16 used 64.

3. The perf issue above at 2 jobs is because when we only have 1 worker
we execute more cmds per vhost_work due to all vqs funneling to one worker.
I have 2 patches that fix this. One just submits from the worker thread
instead of kicking off to a workqueue like the vhost block patches do.
I'll post this after the worker support is merged. I'm also working on
patches to add back batching during lio completion and do polling on the
submission side.

We will still want the threading patches, because if we batch at the fio
level plus use the vhost threading patches, we can see a big boost like
below. So hopefully doing it in the kernel will allow apps to just work
without having to be smart like fio.

fio using io_uring and batching with the iodepth_batch* settings:

fio jobs        1      2      4      8      12     16
------------------------------------------------------
1 worker        494k   520k   -      -      -      -
worker per vq   496k   878k   1542k  2436k  2304k  2590k

V9:
- Fix vhost_dev_reset_owner handling. Make sure a virtqueue's
worker is cleared so when the vhost_dev is reused the old worker is
not used.
- Remove old/unused attach ioctl copy to user code.

V8:
- Rebase.
- Move comments about assumptions so they're in the body of the code
instead of at the top of the function, so they're clearer.
- Added patch for RCU support and swapping workers while works are
running.

V7:
- Add more comments about assumptions.
- Drop refcounting and just use an attachment_cnt variable, so there
is no confusion about when a worker is freed.
- Do an opt-in model, because vsock has an issue where it can queue works
before it's running and it doesn't even need multiple workers, so there
is no point in changing the driver or core code.
- Add back get worker ioctl.
- Broke up last patches to make it easier to read.

V6:
- Rebase against vhost_task patchset.
- Used xa instead of idr.

V5:
- Rebase against user_worker patchset.
- Rebase against flush patchset.
- Redo vhost-scsi tmf flush handling so it doesn't access vq->worker.

V4:
- fix vhost-sock VSOCK_VQ_RX use.
- name functions called directly by ioctl cmd's to match the ioctl cmd.
- break up VHOST_SET_VRING_WORKER into a new, free and attach cmd.
- document worker lifetime, and cgroup, namespace, mm, rlimit
inheritance, make it clear we currently only support sharing within the
device.
- add support to attach workers while IO is running.
- instead of passing a pid_t of the kernel thread, pass a int allocated
by the vhost layer with an idr.

V3:
- fully convert vhost code to use vq based APIs instead of leaving it
half per dev and half per vq.
- rebase against kernel worker API.
- Drop delayed worker creation. We always create the default worker at
VHOST_SET_OWNER time. Userspace can create and bind workers after that.

V2:
- change loop that we take a refcount to the worker in
- replaced pid == -1 with define.
- fixed tabbing/spacing coding style issue
- use hash instead of list to lookup workers.
- I dropped the patch that added an ioctl cmd to get a vq's worker's
pid. I saw we might do a generic netlink interface instead.






[PATCH v9 07/17] vhost: convert poll work to be vq based

2023-06-26 Thread Mike Christie
This has the drivers pass in their poll to vq mapping and then converts
the core poll code to use the vq based helpers. In the next patches we
will allow vqs to be handled by different workers, so to allow drivers
to execute operations like queue, stop, flush, etc on specific polls/vqs
we need to know the mappings.

Signed-off-by: Mike Christie 
---
 drivers/vhost/net.c   | 6 --
 drivers/vhost/vhost.c | 8 +---
 drivers/vhost/vhost.h | 4 +++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 98bb957eb3b9..f2ed7167c848 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1347,8 +1347,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
   VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
   NULL);
 
-   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, 
dev);
-   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
+   vqs[VHOST_NET_VQ_TX]);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
+   vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
n->page_frag.page = NULL;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ea107e867a1..b6149ed8acb4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -187,13 +187,15 @@ EXPORT_SYMBOL_GPL(vhost_work_init);
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev)
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq)
 {
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
poll->dev = dev;
poll->wqh = NULL;
+   poll->vq = vq;
 
vhost_work_init(&poll->work, fn);
 }
@@ -297,7 +299,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-   vhost_work_queue(poll->dev, &poll->work);
+   vhost_vq_work_queue(poll->vq, &poll->work);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
@@ -508,7 +510,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
-   EPOLLIN, dev);
+   EPOLLIN, dev, vq);
}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f208f9923c88..3882266fbbf3 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -41,13 +41,15 @@ struct vhost_poll {
struct vhost_work   work;
__poll_tmask;
struct vhost_dev*dev;
+   struct vhost_virtqueue  *vq;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 bool vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev);
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
-- 
2.25.1



Re: [RFC PATCHES 00/17] IOMMUFD: Deliver IO page faults to user space

2023-06-26 Thread Jason Gunthorpe
On Sun, Jun 25, 2023 at 02:30:46PM +0800, Baolu Lu wrote:

> Agreed. We should avoid workqueue in sva iopf framework. Perhaps we
> could go ahead with below code? It will be registered to device with
> iommu_register_device_fault_handler() in IOMMU_DEV_FEAT_IOPF enabling
> path. Un-registering in the disable path of course.

This maze needs to be undone as well.

It makes no sense that all the drivers are calling 

 iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);

The driver should RX a PRI fault and deliver it to some core code
function, this looks like a good start:

> static int io_pgfault_handler(struct iommu_fault *fault, void *cookie)
> {
> ioasid_t pasid = fault->prm.pasid;
> struct device *dev = cookie;
> struct iommu_domain *domain;
> 
> if (fault->type != IOMMU_FAULT_PAGE_REQ)
> return -EOPNOTSUPP;
> 
> if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
> domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
> else
> domain = iommu_get_domain_for_dev(dev);
> 
> if (!domain || !domain->iopf_handler)
> return -ENODEV;
> 
> if (domain->type == IOMMU_DOMAIN_SVA)
> return iommu_queue_iopf(fault, cookie);
> 
> return domain->iopf_handler(fault, dev, domain->fault_data);

Then we find the domain that owns the translation and invoke its
domain->ops->iopf_handler()

If the driver created a SVA domain then the op should point to some
generic 'handle sva fault' function. There shouldn't be weird SVA
stuff in the core code.

The weird SVA stuff is really just a generic per-device workqueue
dispatcher, so if we think that is valuable then it should be
integrated into the iommu_domain (domain->ops->use_iopf_workqueue =
true for instance). Then it could route the fault through the
workqueue and still invoke domain->ops->iopf_handler.

The word "SVA" should not appear in any of this.

Not sure what iommu_register_device_fault_handler() has to do with all
of this.. Setting up the dev_iommu stuff to allow for the workqueue
should happen dynamically during domain attach, ideally in the core
code before calling to the driver.

Also, I can understand there is a need to turn on PRI support really
early, and it can make sense to have some IOMMU_DEV_FEAT_IOPF/SVA to
ask to turn it on.. But that should really only be needed if the HW
cannot turn it on dynamically during domain attach of a PRI enabled
domain.

It needs cleaning up..

Jason


Re: [RFC PATCH v1 0/4] virtio/vsock: some updates for MSG_PEEK flag

2023-06-26 Thread Stefano Garzarella

On Sun, Jun 18, 2023 at 09:24:47AM +0300, Arseniy Krasnov wrote:

Hello,

This patchset does several things around MSG_PEEK flag support. In
general, it reworks the MSG_PEEK test and adds support for this flag
in the SOCK_SEQPACKET logic. Here is a per-patch description:

1) This is a cosmetic change for the SOCK_STREAM implementation of MSG_PEEK:
  1) There is no need for the "safe" walk here, as no skb is unlinked
     inside the loop (it is MSG_PEEK mode - we don't change the queue).
  2) The nested while loop is removed: in the MSG_PEEK case we just walk
     over the skbs and copy data from each one. The nested loop didn't
     really behave as a loop anyway - it always executed a single
     iteration.

2) This adds MSG_PEEK support for SOCK_SEQPACKET. It could be implemented
  by reworking the SOCK_STREAM MSG_PEEK callback to also support
  SOCK_SEQPACKET, but I think it is simpler and less prone to bugs to
  implement it as a separate function rather than mixing the logic for
  both socket types. So I've added it as a dedicated function.

3) This is the reworked MSG_PEEK test for SOCK_STREAM. The previous version
  just sent a single byte, then tried to read it with the MSG_PEEK flag, and
  then read it in the normal way. The new version is more complex: the sender
  now uses a buffer instead of a single byte, and this buffer is initialized
  with random values. The receiver tests several things (see the sketch after
  this list):
  1) Read an empty socket with the MSG_PEEK flag.
  2) Read part of the buffer with the MSG_PEEK flag.
  3) Read the whole buffer with the MSG_PEEK flag, then check that it matches
     the buffer from 2) (limited by the size of the buffer from 2), of course).
  4) Read the whole buffer without any flags, then check that it matches the
     buffer from 3).

4) This is the MSG_PEEK test for SOCK_SEQPACKET. It works in the same way
  as for SOCK_STREAM, except it also checks the combination of MSG_TRUNC
  and MSG_PEEK.
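
A minimal sketch of the receiver-side checks in 3) (synchronization with the
sender, error handling and the test harness are omitted; BUF_SIZE is
illustrative):

	unsigned char peek_short[BUF_SIZE / 2];
	unsigned char peek_full[BUF_SIZE];
	unsigned char data[BUF_SIZE];

	/* 1) Peek on an empty socket (non-blocking here): expect EAGAIN. */
	if (recv(fd, peek_full, sizeof(peek_full), MSG_PEEK | MSG_DONTWAIT) != -1 ||
	    errno != EAGAIN)
		exit(EXIT_FAILURE);

	/* ... sender now writes BUF_SIZE bytes of random data ... */

	/* 2) Peek part of the buffer. */
	recv(fd, peek_short, sizeof(peek_short), MSG_PEEK);

	/* 3) Peek the whole buffer; its prefix must match the partial peek. */
	recv(fd, peek_full, sizeof(peek_full), MSG_PEEK);
	if (memcmp(peek_full, peek_short, sizeof(peek_short)))
		exit(EXIT_FAILURE);

	/* 4) Plain read; it must match the full peek. */
	recv(fd, data, sizeof(data), 0);
	if (memcmp(data, peek_full, sizeof(data)))
		exit(EXIT_FAILURE);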

Head is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=d20dd0ea14072e8a90ff864b2c1603bd68920b4b


Nice cleanup, LGTM, but I'd like a comment from Bobby.

Thanks,
Stefano



Re: [RFC PATCH v1 2/4] virtio/vsock: support MSG_PEEK for SOCK_SEQPACKET

2023-06-26 Thread Stefano Garzarella

On Sun, Jun 18, 2023 at 09:24:49AM +0300, Arseniy Krasnov wrote:

This adds support of the MSG_PEEK flag for the SOCK_SEQPACKET socket type.
The difference from SOCK_STREAM is that this callback returns either the
length of the message or an error.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 63 +++--
1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 2ee40574c339..352d042b130b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -460,6 +460,63 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static ssize_t
+virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
+  struct msghdr *msg)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct sk_buff *skb;
+   size_t total, len;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   if (!vvs->msg_count) {
+   spin_unlock_bh(&vvs->rx_lock);
+   return 0;
+   }
+
+   total = 0;
+   len = msg_data_left(msg);
+
+   skb_queue_walk(&vvs->rx_queue, skb) {
+   struct virtio_vsock_hdr *hdr;
+
+   if (total < len) {
+   size_t bytes;
+   int err;
+
+   bytes = len - total;
+   if (bytes > skb->len)
+   bytes = skb->len;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   err = memcpy_to_msg(msg, skb->data, bytes);
+   if (err)
+   return err;
+
+   spin_lock_bh(&vvs->rx_lock);
+   }
+
+   total += skb->len;
+   hdr = virtio_vsock_hdr(skb);
+
+   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+   msg->msg_flags |= MSG_EOR;
+
+   break;
+   }
+   }
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   return total;


Should we return the minimum between total and len?

Thanks,
Stefano
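
(If so, purely as an illustration, the return would presumably become
something like:

	return min_t(size_t, total, len);

so the caller is never told more than it asked for.)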


+}
+
static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 struct msghdr *msg,
 int flags)
@@ -554,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
   int flags)
{
if (flags & MSG_PEEK)
-   return -EOPNOTSUPP;
-
-   return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
+   return virtio_transport_seqpacket_do_peek(vsk, msg);
+   else
+   return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags);
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);

--
2.25.1





Re: [RFC PATCH v1 1/4] virtio/vsock: rework MSG_PEEK for SOCK_STREAM

2023-06-26 Thread Stefano Garzarella

On Sun, Jun 18, 2023 at 09:24:48AM +0300, Arseniy Krasnov wrote:

This reworks the current implementation of the MSG_PEEK logic:
1) Replaces 'skb_queue_walk_safe()' with 'skb_queue_walk()'. There is
  no need for the former, as no skb is removed inside the loop.
2) Removes the nested while loop - the MSG_PEEK logic can be implemented
  without it: just iterate over the skbs without removing them and copy
  data from each one until the destination buffer is full.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 41 -
1 file changed, 19 insertions(+), 22 deletions(-)


Great clean up!

LGTM, but @Bobby can you also take a look?

Thanks,
Stefano



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index b769fc258931..2ee40574c339 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
size_t len)
{
struct virtio_vsock_sock *vvs = vsk->trans;
-   size_t bytes, total = 0, off;
-   struct sk_buff *skb, *tmp;
-   int err = -EFAULT;
+   struct sk_buff *skb;
+   size_t total = 0;
+   int err;

spin_lock_bh(&vvs->rx_lock);

-   skb_queue_walk_safe(&vvs->rx_queue, skb,  tmp) {
-   off = 0;
+   skb_queue_walk(&vvs->rx_queue, skb) {
+   size_t bytes;

-   if (total == len)
-   break;
+   bytes = len - total;
+   if (bytes > skb->len)
+   bytes = skb->len;

-   while (total < len && off < skb->len) {
-   bytes = len - total;
-   if (bytes > skb->len - off)
-   bytes = skb->len - off;
+   spin_unlock_bh(&vvs->rx_lock);

-   /* sk_lock is held by caller so no one else can dequeue.
-* Unlock rx_lock since memcpy_to_msg() may sleep.
-*/
-   spin_unlock_bh(&vvs->rx_lock);
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   err = memcpy_to_msg(msg, skb->data, bytes);
+   if (err)
+   goto out;

-   err = memcpy_to_msg(msg, skb->data + off, bytes);
-   if (err)
-   goto out;
+   total += bytes;

-   spin_lock_bh(&vvs->rx_lock);
+   spin_lock_bh(&vvs->rx_lock);

-   total += bytes;
-   off += bytes;
-   }
+   if (total == len)
+   break;
}

spin_unlock_bh(&vvs->rx_lock);
--
2.25.1





Re: [RFC PATCH v4 00/17] vsock: MSG_ZEROCOPY flag support

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:22PM +0300, Arseniy Krasnov wrote:

Hello,

  DESCRIPTION

this is MSG_ZEROCOPY feature support for virtio/vsock. I tried to follow
the current TCP implementation as much as possible:

1) The sender must enable the SO_ZEROCOPY socket option to use this feature.
  Without it, data will be sent in the "classic" copy manner and the
  MSG_ZEROCOPY flag will be ignored (i.e. no completion is generated).

2) The kernel uses completions from the socket's error queue: a single
  completion per tx syscall (or several completions may be merged into a
  single one). I reused the already implemented MSG_ZEROCOPY logic:
  'msg_zerocopy_realloc()' etc. A rough userspace flow is sketched below.
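
A sketch of the userspace flow from 1) and 2), following the generic
MSG_ZEROCOPY pattern (error handling, includes and cmsg parsing omitted;
details may differ for AF_VSOCK):

	int one = 1;
	struct msghdr msg = { 0 };
	char control[128];

	/* 1) opt in once per socket */
	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));

	/* user pages are pinned instead of copied */
	send(fd, buf, len, MSG_ZEROCOPY);

	/* 2) completion is read from the error queue once the kernel is done */
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);
	recvmsg(fd, &msg, MSG_ERRQUEUE);
	/* the SO_EE_ORIGIN_ZEROCOPY cmsg carries the completed range */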

The difference from the copy path is not significant. During packet
allocation, a non-linear skb is created and filled with pinned user pages.
There are also some updates for the vhost and guest parts of the transport -
in both cases I've added handling of non-linear skbs in the virtio part.
vhost copies data from such skbs into the guest's rx virtio buffers. In the
guest, the virtio transport fills the tx virtio queue with pages from the skb.

Head of this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=d20dd0ea14072e8a90ff864b2c1603bd68920b4b


This version has several limits/problems:

1) As this feature depends entirely on the transport, there is no way (or
  it is difficult) to check whether the transport can handle it when
  SO_ZEROCOPY is being set. It seems I need to call the AF_VSOCK specific
  setsockopt callback from the SOL_SOCKET setsockopt callback, but this
  leads to a locking problem, because the AF_VSOCK and SOL_SOCKET callbacks
  are not designed to be called from each other. So in the current version
  SO_ZEROCOPY is set successfully on any type (i.e. transport) of AF_VSOCK
  socket, but if the transport does not support MSG_ZEROCOPY, the tx
  routine will fail with EOPNOTSUPP.

  ^^^
  This is still not resolved :(

2) When MSG_ZEROCOPY is used, we need to enqueue one completion for each tx
  system call. Each completion carries a flag which shows how the tx was
  performed: zerocopy or copy. This means the whole message must be sent
  either in zerocopy or in copy mode - we can't send part of a message by
  copying and the rest of it in zerocopy mode (or vice versa). We also have
  to respect the vsock credit logic: we can't send all the data at once,
  only the allowed number of bytes at any moment. In the copy case this is
  not a problem, as in the worst case we can send single bytes, but zerocopy
  is more complex because the smallest transmission unit is a single page.
  So if there is not enough space at the peer's side to send an integer
  number of pages (at least one), we will wait, thus stalling the tx side.
  To overcome this problem I've added a simple rule - zerocopy is possible
  only when there is enough space at the other side for the whole message
  (to check whether the current 'msghdr' was already used in previous tx
  iterations, I use the 'iov_offset' field of its iov iter).

  ^^^
  Discussed as ok during v2. Link:
  
https://lore.kernel.org/netdev/23guh3txkghxpgcrcjx7h62qsoj3xgjhfzgtbmqp2slrz3rxr4@zya2z7kwt75l/

3) The loopback transport is not supported, because it requires implementing
  non-linear skb handling in the dequeue logic (as we "send" a fragmented
  skb and "receive" it from the same queue). I'm going to implement it in
  the next versions.

  ^^^ fixed in v2

4) The current implementation sets the max packet length to 64KB. IIUC this
  is due to the 'kmalloc()' allocated data buffers. I think in the
  MSG_ZEROCOPY case this value could be increased, because 'kmalloc()' is
  not used for the data - user space pages are used as buffers. Also, this
  limit trims every message that is > 64KB, so such messages will be sent
  in copy mode due to the 'iov_offset' check in 2).

  ^^^ fixed in v2

PATCHSET STRUCTURE

Patchset has the following structure:
1) Handle non-linear skbuff on receive in virtio/vhost.
2) Handle non-linear skbuff on send in virtio/vhost.
3) Updates for AF_VSOCK.
4) Enable MSG_ZEROCOPY support on transports.
5) Tests/tools/docs updates.

   PERFORMANCE

Performance: it is a little bit tricky to compare performance between
copy and zerocopy transmissions. In the zerocopy case we need to wait until
the user buffers are released by the kernel, so it is like a synchronous
path (wait until the device driver has processed it), while in the copy case
we can feed as much data to the kernel as we want, without caring about the
device driver. So I compared only the time spent in the 'send()' syscall.
If this value is then combined with the total number of transmitted bytes,
we can derive a Gbit/s figure. Also, to avoid tx stalls due to insufficient
credit, the receiver allocates the same amount of space as the sender needs.

Sender:
./vsock_perf --sender  --buf-size  --bytes 256M [--zc]

Receiver:
./vsock_perf --vsk-size 256M

I run tests on two setups: desktop with Core i7 - I use this PC for
development and in this case guest is nested guest, and host is norma

Re: [RFC PATCH v4 12/17] vsock/loopback: support MSG_ZEROCOPY for transport

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:34PM +0300, Arseniy Krasnov wrote:

Add 'msgzerocopy_allow()' callback for loopback transport.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/vsock_loopback.c | 8 
1 file changed, 8 insertions(+)

diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 5c6360df1f31..a2e4aeda2d92 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -47,6 +47,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
}

static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_msgzerocopy_allow(void);


I don't know why we did this for `vsock_loopback_seqpacket_allow`, but
can we just put the implementation here?
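
Something along these lines, abbreviated, just to illustrate the suggested
layout (define the callback above the struct and drop the forward
declaration):

static bool vsock_loopback_msgzerocopy_allow(void)
{
	return true;
}

static struct virtio_transport loopback_transport = {
	.transport = {
		/* ... */
		.msgzerocopy_allow        = vsock_loopback_msgzerocopy_allow,
		/* ... */
	},

	.send_pkt = vsock_loopback_send_pkt,
};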



static struct virtio_transport loopback_transport = {
.transport = {
@@ -92,11 +93,18 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .msgzerocopy_allow= vsock_loopback_msgzerocopy_allow,


Ditto the moving.


},

.send_pkt = vsock_loopback_send_pkt,
};

+static bool vsock_loopback_msgzerocopy_allow(void)
+{
+   return true;
+}
+
static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
{
return true;
--
2.25.1





Re: [RFC PATCH v4 11/17] vsock/virtio: support MSG_ZEROCOPY for transport

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:33PM +0300, Arseniy Krasnov wrote:

Add 'msgzerocopy_allow()' callback for virtio transport.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport.c | 7 +++
1 file changed, 7 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 6053d8341091..d9ffa16dda69 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -438,6 +438,11 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+static bool virtio_transport_msgzerocopy_allow(void)
+{
+   return true;
+}
+
static bool virtio_transport_seqpacket_allow(u32 remote_cid);

static struct virtio_transport virtio_transport = {
@@ -484,6 +489,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .msgzerocopy_allow= virtio_transport_msgzerocopy_allow,


Ditto.


},

.send_pkt = virtio_transport_send_pkt,
--
2.25.1





Re: [RFC PATCH v4 10/17] vhost/vsock: support MSG_ZEROCOPY for transport

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:32PM +0300, Arseniy Krasnov wrote:

Add 'msgzerocopy_allow()' callback for vhost transport.

Signed-off-by: Arseniy Krasnov 
---
drivers/vhost/vsock.c | 6 ++
1 file changed, 6 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index b254aa4b756a..318866713ef7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -396,6 +396,11 @@ static bool vhost_vsock_more_replies(struct vhost_vsock 
*vsock)
return val < vq->num;
}

+static bool vhost_transport_msgzerocopy_allow(void)
+{
+   return true;
+}
+
static bool vhost_transport_seqpacket_allow(u32 remote_cid);

static struct virtio_transport vhost_transport = {
@@ -442,6 +447,7 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+   .msgzerocopy_allow= vhost_transport_msgzerocopy_allow,


Can we move this after .seqpacket section?


},

.send_pkt = vhost_transport_send_pkt,
--
2.25.1





Re: [RFC PATCH v4 07/17] vsock: read from socket's error queue

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:29PM +0300, Arseniy Krasnov wrote:

This adds handling of the MSG_ERRQUEUE input flag in the receive call. This
flag is used to read the socket's error queue instead of the data queue. A
possible scenario for error queue usage is receiving completions for a
transmission with the MSG_ZEROCOPY flag.

Signed-off-by: Arseniy Krasnov 
---
include/linux/socket.h   | 1 +
net/vmw_vsock/af_vsock.c | 5 +
2 files changed, 6 insertions(+)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index bd1cc3238851..d79efd026880 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -382,6 +382,7 @@ struct ucred {
#define SOL_MPTCP   284
#define SOL_MCTP285
#define SOL_SMC 286
+#define SOL_VSOCK  287


Maybe this change should go in another patch where we describe that
we need to support setsockopt()



/* IPX options */
#define IPX_TYPE1
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 45fd20c4ed50..07803d9fbf6d 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -110,6 +110,7 @@
#include 
#include 
#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -2135,6 +2136,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len,
int err;

sk = sock->sk;
+
+   if (unlikely(flags & MSG_ERRQUEUE))
+   return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, 0);
+
vsk = vsock_sk(sk);
err = 0;

--
2.25.1





Re: [RFC PATCH v4 06/17] vsock: check error queue to set EPOLLERR

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:28PM +0300, Arseniy Krasnov wrote:

If the socket's error queue is not empty, EPOLLERR must be set. Otherwise,
a reader of the error queue won't detect data in it using the EPOLLERR bit.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)


This patch looks like it can go even without this series.

Is it a fix? Should we add a fixes tag?

Thanks,
Stefano



diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index efb8a0937a13..45fd20c4ed50 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1030,7 +1030,7 @@ static __poll_t vsock_poll(struct file *file, struct 
socket *sock,
poll_wait(file, sk_sleep(sk), wait);
mask = 0;

-   if (sk->sk_err)
+   if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;

--
2.25.1





Re: [RFC PATCH v4 05/17] vsock/virtio: MSG_ZEROCOPY flag support

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:

This adds handling of the MSG_ZEROCOPY flag on the transmission path: if
this flag is set and zerocopy transmission is possible, then a non-linear
skb will be created and filled with the pages of the user's buffer. The
pages of the user's buffer are pinned in memory by 'get_user_pages()'.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 270 ++--
1 file changed, 208 insertions(+), 62 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 0de562c1dc4b..f1ec38c72db7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
return container_of(t, struct virtio_transport, transport);
}

-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-  size_t len,
-  u32 src_cid,
-  u32 src_port,
-  u32 dst_cid,
-  u32 dst_port)
-{
-   const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-   struct virtio_vsock_hdr *hdr;
-   struct sk_buff *skb;
+static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
+  size_t max_to_send)
+{
+   struct iov_iter *iov_iter;
+
+   if (!info->msg)
+   return false;
+
+   iov_iter = &info->msg->msg_iter;
+
+   /* Data is simple buffer. */
+   if (iter_is_ubuf(iov_iter))
+   return true;
+
+   if (!iter_is_iovec(iov_iter))
+   return false;
+
+   if (iov_iter->iov_offset)
+   return false;
+
+   /* We can't send whole iov. */
+   if (iov_iter->count > max_to_send)
+   return false;
+
+   return true;
+}
+
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+  struct sk_buff *skb,
+  struct msghdr *msg,
+  bool zerocopy)
+{
+   struct ubuf_info *uarg;
+
+   if (msg->msg_ubuf) {
+   uarg = msg->msg_ubuf;
+   net_zcopy_get(uarg);
+   } else {
+   struct iov_iter *iter = &msg->msg_iter;
+   struct ubuf_info_msgzc *uarg_zc;
+   int len;
+
+   /* Only ITER_IOVEC or ITER_UBUF are allowed and
+* checked before.
+*/
+   if (iter_is_iovec(iter))
+   len = iov_length(iter->__iov, iter->nr_segs);
+   else
+   len = iter->count;
+
+   uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+   len,
+   NULL);
+
+   if (!uarg)
+   return -1;
+
+   uarg_zc = uarg_to_msgzc(uarg);
+   uarg_zc->zerocopy = zerocopy ? 1 : 0;
+   }
+
+   skb_zcopy_init(skb, uarg);
+
+   return 0;
+}
+
+static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
+   struct vsock_sock *vsk,


`vsk` seems unused


+   struct virtio_vsock_pkt_info *info,
+   size_t len)
+{
void *payload;
int err;

-   skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
-   if (!skb)
-   return NULL;
+   payload = skb_put(skb, len);
+   err = memcpy_from_msg(payload, info->msg, len);
+   if (err)
+   return -1;
+
+   if (msg_data_left(info->msg))
+   return 0;
+
+   return 0;
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+ struct virtio_vsock_pkt_info *info,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port,
+ size_t len)
+{
+   struct virtio_vsock_hdr *hdr;

hdr = virtio_vsock_hdr(skb);
hdr->type= cpu_to_le16(info->type);
@@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info 
*info,
hdr->dst_port= cpu_to_le32(dst_port);
hdr->flags   = cpu_to_le32(info->flags);
hdr->len = cpu_to_le32(len);
-
-   if (info->msg && len > 0) {
-   payload = skb_put(skb, len);
-   err = memcpy_from_msg(payload, info->msg, len);
-   if (err)
-   goto out;
-
-   if (msg_data_left(info->msg) == 0 &&
-  

Re: [RFC PATCH v4 04/17] vsock/virtio: non-linear skb handling for tap

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:26PM +0300, Arseniy Krasnov wrote:

For the tap device a new skb is created and data from the current skb is
copied to it. This adds copying of data from a non-linear skb to the
new skb.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 31 ++---
1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 5819a9cd4515..0de562c1dc4b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -106,6 +106,27 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info 
*info,
return NULL;
}

+static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,


`const struct sk_buff *skb` would be better, also to make it clear that
the function copies data from *skb to *dst.


+   void *dst,
+   size_t len)
+{
+   struct iov_iter iov_iter = { 0 };
+   struct kvec kvec;
+   size_t to_copy;
+
+   kvec.iov_base = dst;
+   kvec.iov_len = len;
+
+   iov_iter.iter_type = ITER_KVEC;
+   iov_iter.kvec = &kvec;
+   iov_iter.nr_segs = 1;
+
+   to_copy = min_t(size_t, len, skb->len);
+
+   skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->frag_off,
+  &iov_iter, to_copy);
+}
+
/* Packet capture */
static struct sk_buff *virtio_transport_build_skb(void *opaque)
{
@@ -114,7 +135,6 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)
struct af_vsockmon_hdr *hdr;
struct sk_buff *skb;
size_t payload_len;
-   void *payload_buf;

/* A packet could be split to fit the RX buffer, so we can retrieve
 * the payload length from the header and the buffer pointer taking
@@ -122,7 +142,6 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)
 */
pkt_hdr = virtio_vsock_hdr(pkt);
payload_len = pkt->len;
-   payload_buf = pkt->data;

skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len,
GFP_ATOMIC);
@@ -165,7 +184,13 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)
skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr));

if (payload_len) {
-   skb_put_data(skb, payload_buf, payload_len);
+   if (skb_is_nonlinear(pkt)) {
+   void *data = skb_put(skb, payload_len);
+
+			virtio_transport_copy_nonlinear_skb(pkt, data, payload_len);
+   } else {
+   skb_put_data(skb, pkt->data, payload_len);
+   }
}

return skb;
--
2.25.1





Re: [RFC PATCH v4 03/17] vsock/virtio: support to send non-linear skb

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:25PM +0300, Arseniy Krasnov wrote:

For a non-linear skb, use its pages from the fragment array as buffers in
the virtio tx queue. These pages are already pinned by 'get_user_pages()'
during the creation of such an skb.

Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport.c | 37 ++--
1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index e95df847176b..6053d8341091 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -100,7 +100,9 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];

for (;;) {
-   struct scatterlist hdr, buf, *sgs[2];
+   /* +1 is for packet header. */
+   struct scatterlist *sgs[MAX_SKB_FRAGS + 1];
+   struct scatterlist bufs[MAX_SKB_FRAGS + 1];
int ret, in_sg = 0, out_sg = 0;
struct sk_buff *skb;
bool reply;
@@ -111,12 +113,35 @@ virtio_transport_send_pkt_work(struct work_struct *work)

virtio_transport_deliver_tap_pkt(skb);
reply = virtio_vsock_skb_reply(skb);
+   sg_init_one(&bufs[0], virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
+   sgs[out_sg++] = &bufs[0];


Can we use out_sg also to index bufs (here and in the rest of the code)?

E.g.

sg_init_one(&bufs[out_sg], ...)
sgs[out_sg] = &bufs[out_sg];
++out_sg;

...
if (skb->len > 0) {
sg_init_one(&bufs[out_sg], skb->data, skb->len);
sgs[out_sg] = &bufs[out_sg];
++out_sg;
}

etc...


+


For readability, I would move the smaller branch above:

if (!skb_is_nonlinear(skb)) {
// small block
...
} else {
// big block
...
}


+   if (skb_is_nonlinear(skb)) {
+   struct skb_shared_info *si;
+   int i;
+
+   si = skb_shinfo(skb);
+
+   for (i = 0; i < si->nr_frags; i++) {
+   skb_frag_t *skb_frag = &si->frags[i];
+   void *va = page_to_virt(skb_frag->bv_page);
+
+   /* We will use 'page_to_virt()' for userspace page here,
+* because virtio layer will call 'virt_to_phys()' later
+* to fill buffer descriptor. We don't touch memory at
+* "virtual" address of this page.
+*/
+   sg_init_one(&bufs[i + 1],
+   va + skb_frag->bv_offset,
+   skb_frag->bv_len);
+   sgs[out_sg++] = &bufs[i + 1];
+   }
+   } else {
+   if (skb->len > 0) {


Should we do the same check (skb->len > 0) for nonlinear skb as well?
Or do the nonlinear ones necessarily have len > 0?


+   sg_init_one(&bufs[1], skb->data, skb->len);
+   sgs[out_sg++] = &bufs[1];
+   }


   ^
Blank line that we can remove.

Stefano


-   sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
-   sgs[out_sg++] = &hdr;
-   if (skb->len > 0) {
-   sg_init_one(&buf, skb->data, skb->len);
-   sgs[out_sg++] = &buf;
}

ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, 
GFP_KERNEL);
--
2.25.1





Re: [syzbot] [net?] [virt?] [kvm?] KASAN: slab-use-after-free Read in __vhost_vq_attach_worker

2023-06-26 Thread Michael S. Tsirkin
On Mon, Jun 26, 2023 at 10:03:25AM -0500, Mike Christie wrote:
> On 6/26/23 2:15 AM, Michael S. Tsirkin wrote:
> > On Mon, Jun 26, 2023 at 12:06:54AM -0700, syzbot wrote:
> >> Hello,
> >>
> >> syzbot found the following issue on:
> >>
> >> HEAD commit:8d2be868b42c Add linux-next specific files for 20230623
> >> git tree:   linux-next
> >> console+strace: https://syzkaller.appspot.com/x/log.txt?x=12872950a8
> >> kernel config:  https://syzkaller.appspot.com/x/.config?x=d8ac8dd33677e8e0
> >> dashboard link: 
> >> https://syzkaller.appspot.com/bug?extid=8540db210d403f1aa214
> >> compiler:   gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU 
> >> Binutils for Debian) 2.35.2
> >> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=15c1b70f28
> >> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=122ee4cb28
> >>
> >> Downloadable assets:
> >> disk image: 
> >> https://storage.googleapis.com/syzbot-assets/2a004483aca3/disk-8d2be868.raw.xz
> >> vmlinux: 
> >> https://storage.googleapis.com/syzbot-assets/5688cb13b277/vmlinux-8d2be868.xz
> >> kernel image: 
> >> https://storage.googleapis.com/syzbot-assets/76de0b63bc53/bzImage-8d2be868.xz
> >>
> >> The issue was bisected to:
> >>
> >> commit 21a18f4a51896fde11002165f0e7340f4131d6a0
> >> Author: Mike Christie 
> >> Date:   Tue Jun 13 01:32:46 2023 +
> >>
> >> vhost: allow userspace to create workers
> >>
> >> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=130850bf28
> >> final oops: https://syzkaller.appspot.com/x/report.txt?x=108850bf28
> >> console output: https://syzkaller.appspot.com/x/log.txt?x=170850bf28
> >>
> >> IMPORTANT: if you fix the issue, please add the following tag to the 
> >> commit:
> >> Reported-by: syzbot+8540db210d403f1aa...@syzkaller.appspotmail.com
> >> Fixes: 21a18f4a5189 ("vhost: allow userspace to create workers")
> > 
> > Mike, would appreciate prompt attention to this as I am preparing
> > a pull request for the merge window and need to make a
> > decision on whether to include your userspace-controlled
> > threading patchset.
> > 
> 
> Do you want me to resubmit the patchset or submit a patch against your vhost
> branch?

Resubmit pls.

> The bug is that vhost-net can call vhost_dev_reset_owner and that will
> free the workers. However, I leave the virtqueue->worker pointer set so
> we end up referencing the freed workers later on. When I handled a
> review comment between v5 and v6, I deleted that code thinking it was
> also not needed.
> 
> So the fix is:
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ab79b064aade..5a07e220e46d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -607,6 +607,10 @@ static void vhost_workers_free(struct vhost_dev *dev)
>  
>   if (!dev->use_worker)
>   return;
> +
> + for (i = 0; i < dev->nvqs; i++)
> + rcu_assign_pointer(dev->vqs[i]->worker, NULL);
> +
>   /*
>* Free the default worker we created and cleanup workers userspace
>* created but couldn't clean up (it forgot or crashed).
> 
> 



Re: [RFC PATCH v4 02/17] vhost/vsock: read data from non-linear skb

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:24PM +0300, Arseniy Krasnov wrote:

This adds copying to the guest's virtio buffers from non-linear skbs. Such
skbs are created by the protocol layer when the MSG_ZEROCOPY flag is used.
It changes the call of 'copy_to_iter()' to 'skb_copy_datagram_iter()'. The
latter can read data from a non-linear skb.

See the commit to 'net/vmw_vsock/virtio_transport_common.c' with the same
name for more details.


I think it's okay if we report the same details here.



Signed-off-by: Arseniy Krasnov 
---
drivers/vhost/vsock.c | 12 +++-
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6578db78f0ae..b254aa4b756a 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -156,7 +156,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
}

iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
-   payload_len = skb->len;
+   payload_len = skb->len - VIRTIO_VSOCK_SKB_CB(skb)->frag_off;


Also here a variable should make the code more readable.

Stefano
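
(Roughly, as an illustration of the suggestion:

	u32 frag_off = VIRTIO_VSOCK_SKB_CB(skb)->frag_off;

	payload_len = skb->len - frag_off;

and then reuse frag_off in the later calls.)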


hdr = virtio_vsock_hdr(skb);

/* If the packet is greater than the space available in the
@@ -197,8 +197,10 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}

-   nbytes = copy_to_iter(skb->data, payload_len, &iov_iter);
-   if (nbytes != payload_len) {
+   if (skb_copy_datagram_iter(skb,
+  VIRTIO_VSOCK_SKB_CB(skb)->frag_off,
+  &iov_iter,
+  payload_len)) {
kfree_skb(skb);
vq_err(vq, "Faulted on copying pkt buf\n");
break;
@@ -212,13 +214,13 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
added = true;

-   skb_pull(skb, payload_len);
+   VIRTIO_VSOCK_SKB_CB(skb)->frag_off += payload_len;
total_len += payload_len;

/* If we didn't send all the payload we can requeue the packet
 * to send it with the next available buffer.
 */
-   if (skb->len > 0) {
+   if (VIRTIO_VSOCK_SKB_CB(skb)->frag_off < skb->len) {
hdr->flags |= cpu_to_le32(flags_to_restore);

/* We are queueing the same skb to handle
--
2.25.1





Re: [RFC PATCH v4 01/17] vsock/virtio: read data from non-linear skb

2023-06-26 Thread Stefano Garzarella

On Sat, Jun 03, 2023 at 11:49:23PM +0300, Arseniy Krasnov wrote:

This is a preparation patch for non-linear skbuff handling. It replaces
direct calls of 'memcpy_to_msg()' with 'skb_copy_datagram_iter()'. The main
advantage of the latter is that it can handle the paged part of the skb
by using 'kmap()' on each page, but if there are no pages in the skb, it
behaves like a simple copy to the iov iterator. This patch also adds a new
field to the control block of the skb - this value holds the current offset
in the skb from which to read the next portion of data (regardless of
whether the skb is linear or not). The idea is that
'skb_copy_datagram_iter()' handles both types of skb internally - it just
needs an offset from which to copy data from the given skb. This offset is
incremented on each read from the skb. This approach allows us to avoid the
special handling that non-linear skbs would otherwise require:
1) we can't call 'skb_pull()' on them, because it updates the 'data' pointer;
2) we would also need to update 'data_len' on each read from such an skb.

Signed-off-by: Arseniy Krasnov 
---
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport_common.c | 26 +
2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c58453699ee9..17dbb7176e37 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
struct virtio_vsock_skb_cb {
bool reply;
bool tap_delivered;
+   u32 frag_off;
};

#define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb))
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index b769fc258931..5819a9cd4515 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -355,7 +355,7 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
spin_lock_bh(&vvs->rx_lock);

skb_queue_walk_safe(&vvs->rx_queue, skb,  tmp) {
-   off = 0;
+   off = VIRTIO_VSOCK_SKB_CB(skb)->frag_off;

if (total == len)
break;
@@ -370,7 +370,10 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
 */
spin_unlock_bh(&vvs->rx_lock);

-   err = memcpy_to_msg(msg, skb->data + off, bytes);
+   err = skb_copy_datagram_iter(skb, off,
+&msg->msg_iter,
+bytes);
+
if (err)
goto out;

@@ -414,24 +417,28 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
skb = skb_peek(&vvs->rx_queue);

bytes = len - total;
-   if (bytes > skb->len)
-   bytes = skb->len;
+   if (bytes > skb->len - VIRTIO_VSOCK_SKB_CB(skb)->frag_off)
+   bytes = skb->len - VIRTIO_VSOCK_SKB_CB(skb)->frag_off;


What about storing `VIRTIO_VSOCK_SKB_CB(skb)->frag_off` in a variable?
More for readability than optimization, which I hope the compiler
already does on its own.

The rest LGTM.

Stefano
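
(For illustration, something like:

	u32 frag_off = VIRTIO_VSOCK_SKB_CB(skb)->frag_off;

	bytes = len - total;
	if (bytes > skb->len - frag_off)
		bytes = skb->len - frag_off;
)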



/* sk_lock is held by caller so no one else can dequeue.
 * Unlock rx_lock since memcpy_to_msg() may sleep.
 */
spin_unlock_bh(&vvs->rx_lock);

-   err = memcpy_to_msg(msg, skb->data, bytes);
+   err = skb_copy_datagram_iter(skb,
+VIRTIO_VSOCK_SKB_CB(skb)->frag_off,
+&msg->msg_iter, bytes);
+
if (err)
goto out;

spin_lock_bh(&vvs->rx_lock);

total += bytes;
-   skb_pull(skb, bytes);

-   if (skb->len == 0) {
+   VIRTIO_VSOCK_SKB_CB(skb)->frag_off += bytes;
+
+   if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->frag_off) {
u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);

virtio_transport_dec_rx_pkt(vvs, pkt_len);
@@ -503,7 +510,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct 
vsock_sock *vsk,
 */
spin_unlock_bh(&vvs->rx_lock);

-   err = memcpy_to_msg(msg, skb->data, bytes_to_copy);
+   err = skb_copy_datagram_iter(skb, 0,
+&msg->msg_iter,
+bytes_to_copy);
+
if (err) {
/* Copy of message failed. Rest of
 * fragments will be freed without copy.
--
2.25.1




Re: [syzbot] [net?] [virt?] [kvm?] KASAN: slab-use-after-free Read in __vhost_vq_attach_worker

2023-06-26 Thread Mike Christie
On 6/26/23 2:15 AM, Michael S. Tsirkin wrote:
> On Mon, Jun 26, 2023 at 12:06:54AM -0700, syzbot wrote:
>> Hello,
>>
>> syzbot found the following issue on:
>>
>> HEAD commit:8d2be868b42c Add linux-next specific files for 20230623
>> git tree:   linux-next
>> console+strace: https://syzkaller.appspot.com/x/log.txt?x=12872950a8
>> kernel config:  https://syzkaller.appspot.com/x/.config?x=d8ac8dd33677e8e0
>> dashboard link: https://syzkaller.appspot.com/bug?extid=8540db210d403f1aa214
>> compiler:   gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils 
>> for Debian) 2.35.2
>> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=15c1b70f28
>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=122ee4cb28
>>
>> Downloadable assets:
>> disk image: 
>> https://storage.googleapis.com/syzbot-assets/2a004483aca3/disk-8d2be868.raw.xz
>> vmlinux: 
>> https://storage.googleapis.com/syzbot-assets/5688cb13b277/vmlinux-8d2be868.xz
>> kernel image: 
>> https://storage.googleapis.com/syzbot-assets/76de0b63bc53/bzImage-8d2be868.xz
>>
>> The issue was bisected to:
>>
>> commit 21a18f4a51896fde11002165f0e7340f4131d6a0
>> Author: Mike Christie 
>> Date:   Tue Jun 13 01:32:46 2023 +
>>
>> vhost: allow userspace to create workers
>>
>> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=130850bf28
>> final oops: https://syzkaller.appspot.com/x/report.txt?x=108850bf28
>> console output: https://syzkaller.appspot.com/x/log.txt?x=170850bf28
>>
>> IMPORTANT: if you fix the issue, please add the following tag to the commit:
>> Reported-by: syzbot+8540db210d403f1aa...@syzkaller.appspotmail.com
>> Fixes: 21a18f4a5189 ("vhost: allow userspace to create workers")
> 
> Mike, would appreciate prompt attention to this as I am preparing
> a pull request for the merge window and need to make a
> decision on whether to include your userspace-controlled
> threading patchset.
> 

Do you want me to resubmit the patchset or submit a patch against your vhost
branch?

The bug is that vhost-net can call vhost_dev_reset_owner and that will
free the workers. However, I leave the virtqueue->worker pointer set so
we end up referencing the freed workers later on. When I handled a
review comment between v5 and v6, I deleted that code thinking it was
also not needed.

So the fix is:

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ab79b064aade..5a07e220e46d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -607,6 +607,10 @@ static void vhost_workers_free(struct vhost_dev *dev)
 
if (!dev->use_worker)
return;
+
+   for (i = 0; i < dev->nvqs; i++)
+   rcu_assign_pointer(dev->vqs[i]->worker, NULL);
+
/*
 * Free the default worker we created and cleanup workers userspace
 * created but couldn't clean up (it forgot or crashed).





Re: [PATCH RFC net-next v4 6/8] virtio/vsock: support dgrams

2023-06-26 Thread Stefano Garzarella

On Fri, Jun 23, 2023 at 04:37:55AM +, Bobby Eshleman wrote:

On Thu, Jun 22, 2023 at 06:09:12PM +0200, Stefano Garzarella wrote:

On Sun, Jun 11, 2023 at 11:49:02PM +0300, Arseniy Krasnov wrote:
> Hello Bobby!
>
> On 10.06.2023 03:58, Bobby Eshleman wrote:
> > This commit adds support for datagrams over virtio/vsock.
> >
> > Message boundaries are preserved on a per-skb and per-vq entry basis.
>
> I'm a little bit confused about the following case: suppose vhost sends a
> 4097 byte datagram to the guest. The guest uses 4096-byte RX buffers in its
> virtio queue, each buffer with an empty skb attached to it. Vhost places the
> first 4096 bytes into the first buffer of the guest's RX queue, and the last
> byte into the second buffer. Now IIUC the guest has two skbs in its rx queue,
> and a user in the guest wants to read data - does it read 4097 bytes, while
> the guest has two skbs of 4096 bytes and 1 byte? In seqpacket there is a
> special marker in the header which shows where the message ends; how does
> it work here?

I think the main difference is that DGRAM is not connection-oriented, so
we don't have a stream and we can't split the packet into 2 (maybe we
could, but we have no guarantee that the second one, for example, will
not be discarded because there is no space).

So I think it is acceptable as a restriction to keep it simple.

My only doubt is, should we make the RX buffer size configurable,
instead of always using 4k?


I think that is a really good idea. What mechanism do you imagine?


Some parameter in sysfs?



For sendmsg() with buflen > VQ_BUF_SIZE, I think I'd like -ENOBUFS


For the guest it should be easy since it allocates the buffers, but for
the host?

Maybe we should add a field in the configuration space that reports some
sort of MTU.

Something in addition to what Laura had proposed here:
https://markmail.org/message/ymhz7wllutdxji3e
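
Purely as an illustration of that idea (this field is not part of the
current spec), the device config space could grow something like:

struct virtio_vsock_config {
	__le64 guest_cid;
	__le32 dgram_mtu;	/* hypothetical: max datagram payload the device accepts */
} __attribute__((packed));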


returned even though it is uncharacteristic of Linux sockets.
Alternatively, silently dropping is okay... but seems needlessly
unhelpful.


UDP takes advantage of IP fragmentation, right?
But what happens if a fragment is lost?

We should try to behave in a similar way.



FYI, this patch is broken for h2g because it requeues partially sent
skbs, so probably doesn't need much code review until we decided on the
policy.


Got it.

Thanks,
Stefano



Re: [PATCH RFC net-next v4 3/8] vsock: support multi-transport datagrams

2023-06-26 Thread Stefano Garzarella

On Fri, Jun 23, 2023 at 02:59:23AM +, Bobby Eshleman wrote:

On Fri, Jun 23, 2023 at 02:50:01AM +, Bobby Eshleman wrote:

On Thu, Jun 22, 2023 at 05:19:08PM +0200, Stefano Garzarella wrote:
> On Sat, Jun 10, 2023 at 12:58:30AM +, Bobby Eshleman wrote:
> > This patch adds support for multi-transport datagrams.
> >
> > This includes:
> > - Per-packet lookup of transports when using sendto(sockaddr_vm)
> > - Selecting H2G or G2H transport using VMADDR_FLAG_TO_HOST and CID in
> >  sockaddr_vm
> >
> > To preserve backwards compatibility with VMCI, some important changes
> > were made. The "transport_dgram" / VSOCK_TRANSPORT_F_DGRAM is changed to
> > be used for dgrams iff there is not yet a g2h or h2g transport that has
>
> s/iff/if
>
> > been registered that can transmit the packet. If there is a g2h/h2g
> > transport for that remote address, then that transport will be used and
> > not "transport_dgram". This essentially makes "transport_dgram" a
> > fallback transport for when h2g/g2h has not yet gone online, which
> > appears to be the exact use case for VMCI.
> >
> > This design makes sense, because there is no reason that the
> > transport_{g2h,h2g} cannot also service datagrams, which makes the role
> > of transport_dgram difficult to understand outside of the VMCI context.
> >
> > The logic around "transport_dgram" had to be retained to prevent
> > breaking VMCI:
> >
> > 1) VMCI datagrams appear to function outside of the h2g/g2h
> >   paradigm. When the vmci transport becomes online, it registers itself
> >   with the DGRAM feature, but not H2G/G2H. Only later when the
> >   transport has more information about its environment does it register
> >   H2G or G2H. In the case that a datagram socket becomes active
> >   after DGRAM registration but before G2H/H2G registration, the
> >   "transport_dgram" transport needs to be used.
>
> IIRC we did this, because at that time only VMCI supported DGRAM. Now that
> there are more transports, maybe DGRAM can follow the h2g/g2h paradigm.
>

Totally makes sense. I'll add the detail above that the prior design was
a result of chronology.

> >
> > 2) VMCI seems to require special message be sent by the transport when a
> >   datagram socket calls bind(). Under the h2g/g2h model, the transport
> >   is selected using the remote_addr which is set by connect(). At
> >   bind time there is no remote_addr because often no connect() has been
> >   called yet: the transport is null. Therefore, with a null transport
> >   there doesn't seem to be any good way for a datagram socket to tell the
> >   VMCI transport that it has just had bind() called upon it.
>
> @Vishnu, @Bryan do you think we can avoid this in some way?
>
> >
> > Only transports with a special datagram fallback use-case such as VMCI
> > need to register VSOCK_TRANSPORT_F_DGRAM.
>
> Maybe we should rename it in VSOCK_TRANSPORT_F_DGRAM_FALLBACK or
> something like that.
>
> In any case, we definitely need to update the comment in
> include/net/af_vsock.h on top of VSOCK_TRANSPORT_F_DGRAM mentioning
> this.
>

Agreed. I'll rename to VSOCK_TRANSPORT_F_DGRAM_FALLBACK, unless we find
there is a better way altogether.
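
For illustration, the renamed flag could be documented roughly like this in
include/net/af_vsock.h (value and wording are just a sketch):

/* Transport that is only used as a datagram fallback while no h2g/g2h
 * transport is registered for the remote CID (e.g. VMCI before it knows
 * whether it acts as host or guest).
 */
#define VSOCK_TRANSPORT_F_DGRAM_FALLBACK	0x00000004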

> >
> > Signed-off-by: Bobby Eshleman 
> > ---
> > drivers/vhost/vsock.c   |  1 -
> > include/linux/virtio_vsock.h|  2 -
> > net/vmw_vsock/af_vsock.c| 78 
+
> > net/vmw_vsock/hyperv_transport.c|  6 ---
> > net/vmw_vsock/virtio_transport.c|  1 -
> > net/vmw_vsock/virtio_transport_common.c |  7 ---
> > net/vmw_vsock/vsock_loopback.c  |  1 -
> > 7 files changed, 60 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > index c8201c070b4b..8f0082da5e70 100644
> > --- a/drivers/vhost/vsock.c
> > +++ b/drivers/vhost/vsock.c
> > @@ -410,7 +410,6 @@ static struct virtio_transport vhost_transport = {
> >   .cancel_pkt   = vhost_transport_cancel_pkt,
> >
> >   .dgram_enqueue= virtio_transport_dgram_enqueue,
> > - .dgram_bind   = virtio_transport_dgram_bind,
> >   .dgram_allow  = virtio_transport_dgram_allow,
> >   .dgram_get_cid= virtio_transport_dgram_get_cid,
> >   .dgram_get_port   = virtio_transport_dgram_get_port,
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index 23521a318cf0..73afa09f4585 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -216,8 +216,6 @@ void virtio_transport_notify_buffer_size(struct 
vsock_sock *vsk, u64 *val);
> > u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
> > bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
> > bool virtio_transport_stream_allow(u32 cid, u32 port);
> > -int virtio_transport_dgram_bind(struct vsock_sock *vsk,
> > - struct sockaddr_vm *

Re: [PATCH 08/26] virtio-mem: use array_size

2023-06-26 Thread Michael S. Tsirkin
On Fri, Jun 23, 2023 at 11:14:39PM +0200, Julia Lawall wrote:
> Use array_size to protect against multiplication overflows.
> 
> The changes were done using the following Coccinelle semantic patch:
> 
> // 
> @@
> expression E1, E2;
> constant C1, C2;
> identifier alloc = {vmalloc,vzalloc};
> @@
> 
> (
>   alloc(C1 * C2,...)
> |
>   alloc(
> -   (E1) * (E2)
> +   array_size(E1, E2)
>   ,...)
> )
> // 
> 
> Signed-off-by: Julia Lawall 
> 
> ---
>  drivers/virtio/virtio_mem.c |6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)

can't hurt I guess.

Acked-by: Michael S. Tsirkin 
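
(For reference, the protection comes from array_size() saturating to
SIZE_MAX on overflow instead of wrapping, so the allocation fails cleanly:

	vzalloc(new_pages * PAGE_SIZE);                /* may wrap around */
	vzalloc(array_size(new_pages, PAGE_SIZE));     /* SIZE_MAX on overflow */
)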


> diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
> index 835f6cc2fb66..a4dfe7aab288 100644
> --- a/drivers/virtio/virtio_mem.c
> +++ b/drivers/virtio/virtio_mem.c
> @@ -399,7 +399,7 @@ static int 
> virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
>   if (vm->bbm.bb_states && old_pages == new_pages)
>   return 0;
>  
> - new_array = vzalloc(new_pages * PAGE_SIZE);
> + new_array = vzalloc(array_size(new_pages, PAGE_SIZE));
>   if (!new_array)
>   return -ENOMEM;
>
> @@ -465,7 +465,7 @@ static int 
> virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
>   if (vm->sbm.mb_states && old_pages == new_pages)
>   return 0;
>  
> - new_array = vzalloc(new_pages * PAGE_SIZE);
> + new_array = vzalloc(array_size(new_pages, PAGE_SIZE));
>   if (!new_array)
>   return -ENOMEM;
>  
> @@ -588,7 +588,7 @@ static int 
> virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
>   if (vm->sbm.sb_states && old_pages == new_pages)
>   return 0;
>  
> - new_bitmap = vzalloc(new_pages * PAGE_SIZE);
> + new_bitmap = vzalloc(array_size(new_pages, PAGE_SIZE));
>   if (!new_bitmap)
>   return -ENOMEM;
>  



Re: [RFC PATCHES 00/17] IOMMUFD: Deliver IO page faults to user space

2023-06-26 Thread Jean-Philippe Brucker
On Mon, Jun 19, 2023 at 11:35:50AM +0800, Baolu Lu wrote:
> > Another outstanding issue was what to do for PASID stop. When the guest
> > device driver stops using a PASID it issues a PASID stop request to the
> > device (a device-specific mechanism). If the device is not using PRI stop
> > markers it waits for pending PRs to complete and we're fine. Otherwise it
> > sends a stop marker which is flushed to the PRI queue, but does not wait
> > for pending PRs.
> > 
> > Handling stop markers is annoying. If the device issues one, then the PRI
> > queue contains stale faults, a stop marker, followed by valid faults for
> > the next address space bound to this PASID. The next address space will
> > get all the spurious faults because the fault handler doesn't know that
> > there is a stop marker coming. Linux is probably alright with spurious
> > faults, though maybe not in all cases, and other guests may not support
> > them at all.
> > 
> > We might need to revisit supporting stop markers: request that each device
> > driver declares whether their device uses stop markers on unbind() ("This
> > mechanism must indicate that a Stop Marker Message will be generated."
> > says the spec, but doesn't say if the function always uses one or the
> > other mechanism so it's per-unbind). Then we still have to synchronize
> > unbind() with the fault handler to deal with the pending stop marker,
> > which might have already gone through or be generated later.
> 
> I don't quite follow here. Once a PASID is unbound from the device, the
> device driver should be free to release the PASID. The PASID could then
> be used for any other purpose. The device driver has no idea when the
> pending page requests are flushed after unbind(), so it cannot decide
> how long should the PASID be delayed for reuse. Therefore, I understand
> that a successful return from the unbind() function denotes that all
> pending page requests have been flushed and the PASID is viable for
> other use.

Yes that's the contract for unbind() at the moment

> 
> > 
> > Currently we ignore all that and just flush the PRI queue, followed by the
> > IOPF queue, to get rid of any stale fault before reassigning the PASID. A
> > guest however would also need to first flush the HW PRI queue, but doesn't
> > have a direct way to do that. If we want to support guests that don't deal
> > with stop markers, the host needs to flush the PRI queue when a PASID is
> > detached. I guess on Intel detaching the PASID goes through the host which
> > can flush the host queue. On Arm we'll probably need to flush the queue
> > when receiving a PASID cache invalidation, which the guest issues after
> > clearing a PASID table entry.
> 
> The Intel VT-d driver follows below steps to drain pending page requests
> when a PASID is unbound from a device.
> 
> - Tear down the device's pasid table entry for the stopped pasid.
>   This ensures that ATS/PRI will stop putting more page requests for the
>   pasid in VT-d PRQ.

Oh that's interesting, I didn't know about the implicit TLB invalidations
on page requests for VT-d.

For Arm SMMU, clearing the PASID table entry does cause ATS Translation
Requests to return with Completer Abort, but does not affect PRI. The SMMU
pushes page requests directly into the PRI queue without reading any table
(unless the queue overflows).

We're counting on the device driver to perform the PASID stop request
before calling unbind(), described in PCIe 6.20.1 (Managing PASID Usage)
and 10.4.1.2 (Managing PASID Usage on PRG Requests). This ensures that
when unbind() is called, no more page request for the PASID is pushed into
the PRI queue. But some may still be in the queue if the device uses stop
markers.

> - Sync with the PRQ handling thread until all related page requests in
>   PRQ have been delivered.

This is what I'm concerned about. For VT-d this happens in the host which
is in charge of modifying the PASID table. For SMMU, the guest writes the
PASID table. It flushes its virtual PRI queue, but not the physical queue
that is managed by the host.

One synchronization point where the host could flush the physical PRI
queue is the PASID config invalidation (CMD_CFGI_CD). As Jason pointed out
the host may not be able to observe those if a command queue is assigned
directly to the guest (a theoretical SMMU extension), though in that case
the guest may also have direct access to a PRI queue (like the AMD vIOMMU
extension) and be able to flush the queue directly.

But we can just wait for PRI implementations and see what the drivers
need. Maybe no device will implement stop markers.

Thanks,
Jean

> - Flush the iopf queue with iopf_queue_flush_dev().
> - Follow the steps defined in VT-d spec section 7.10 to drain all page
>   requests and responses between VT-d and the endpoint device.

Re: [PATCH v3] virtio_pmem: add the missing REQ_OP_WRITE for flush bio

2023-06-26 Thread Christoph Hellwig
Looks good:

Reviewed-by: Christoph Hellwig 


Re: [PATCH 08/26] virtio-mem: use array_size

2023-06-26 Thread David Hildenbrand

On 23.06.23 23:14, Julia Lawall wrote:

Use array_size to protect against multiplication overflows.

The changes were done using the following Coccinelle semantic patch:

// 
@@
 expression E1, E2;
 constant C1, C2;
 identifier alloc = {vmalloc,vzalloc};
@@
 
(

   alloc(C1 * C2,...)
|
   alloc(
-   (E1) * (E2)
+   array_size(E1, E2)
   ,...)
)
// 

Signed-off-by: Julia Lawall 

---
  drivers/virtio/virtio_mem.c |6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)




Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [syzbot] [net?] [virt?] [kvm?] KASAN: slab-use-after-free Read in __vhost_vq_attach_worker

2023-06-26 Thread Michael S. Tsirkin
On Mon, Jun 26, 2023 at 12:06:54AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:8d2be868b42c Add linux-next specific files for 20230623
> git tree:   linux-next
> console+strace: https://syzkaller.appspot.com/x/log.txt?x=12872950a8
> kernel config:  https://syzkaller.appspot.com/x/.config?x=d8ac8dd33677e8e0
> dashboard link: https://syzkaller.appspot.com/bug?extid=8540db210d403f1aa214
> compiler:   gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils 
> for Debian) 2.35.2
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=15c1b70f28
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=122ee4cb28
> 
> Downloadable assets:
> disk image: 
> https://storage.googleapis.com/syzbot-assets/2a004483aca3/disk-8d2be868.raw.xz
> vmlinux: 
> https://storage.googleapis.com/syzbot-assets/5688cb13b277/vmlinux-8d2be868.xz
> kernel image: 
> https://storage.googleapis.com/syzbot-assets/76de0b63bc53/bzImage-8d2be868.xz
> 
> The issue was bisected to:
> 
> commit 21a18f4a51896fde11002165f0e7340f4131d6a0
> Author: Mike Christie 
> Date:   Tue Jun 13 01:32:46 2023 +
> 
> vhost: allow userspace to create workers
> 
> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=130850bf28
> final oops: https://syzkaller.appspot.com/x/report.txt?x=108850bf28
> console output: https://syzkaller.appspot.com/x/log.txt?x=170850bf28
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+8540db210d403f1aa...@syzkaller.appspotmail.com
> Fixes: 21a18f4a5189 ("vhost: allow userspace to create workers")

Mike, would appreciate prompt attention to this as I am preparing
a pull request for the merge window and need to make a
decision on whether to include your userspace-controlled
threading patchset.

Thanks!


> ==
> BUG: KASAN: slab-use-after-free in __mutex_lock_common 
> kernel/locking/mutex.c:582 [inline]
> BUG: KASAN: slab-use-after-free in __mutex_lock+0x1029/0x1350 
> kernel/locking/mutex.c:747
> Read of size 8 at addr 8880703fff68 by task syz-executor204/5105
> 
> CPU: 0 PID: 5105 Comm: syz-executor204 Not tainted 
> 6.4.0-rc7-next-20230623-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 05/27/2023
> Call Trace:
>  
>  __dump_stack lib/dump_stack.c:88 [inline]
>  dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
>  print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:364
>  print_report mm/kasan/report.c:475 [inline]
>  kasan_report+0x11d/0x130 mm/kasan/report.c:588
>  __mutex_lock_common kernel/locking/mutex.c:582 [inline]
>  __mutex_lock+0x1029/0x1350 kernel/locking/mutex.c:747
>  __vhost_vq_attach_worker+0xe7/0x390 drivers/vhost/vhost.c:678
>  vhost_dev_set_owner+0x670/0xa60 drivers/vhost/vhost.c:892
>  vhost_net_set_owner drivers/vhost/net.c:1687 [inline]
>  vhost_net_ioctl+0x668/0x16a0 drivers/vhost/net.c:1737
>  vfs_ioctl fs/ioctl.c:51 [inline]
>  __do_sys_ioctl fs/ioctl.c:870 [inline]
>  __se_sys_ioctl fs/ioctl.c:856 [inline]
>  __x64_sys_ioctl+0x19d/0x210 fs/ioctl.c:856
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> RIP: 0033:0x7fe7a9715629
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 21 19 00 00 90 48 89 f8 48 89 f7 48 
> 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 
> 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:7fe7a96ba208 EFLAGS: 0246 ORIG_RAX: 0010
> RAX: ffda RBX: 000b RCX: 7fe7a9715629
> RDX:  RSI: af01 RDI: 0003
> RBP: 7fe7a979e240 R08: 7fe7a979e248 R09: 7fe7a979e248
> R10: 7fe7a979e248 R11: 0246 R12: 7fe7a979e24c
> R13: 7ffcfa04d48f R14: 7fe7a96ba300 R15: 00022000
>  
> 
> Allocated by task 5105:
>  kasan_save_stack+0x22/0x40 mm/kasan/common.c:45
>  kasan_set_track+0x25/0x30 mm/kasan/common.c:52
>  kasan_kmalloc mm/kasan/common.c:374 [inline]
>  kasan_kmalloc mm/kasan/common.c:333 [inline]
>  __kasan_kmalloc+0xa2/0xb0 mm/kasan/common.c:383
>  kmalloc include/linux/slab.h:579 [inline]
>  kzalloc include/linux/slab.h:700 [inline]
>  vhost_worker_create+0x9c/0x320 drivers/vhost/vhost.c:627
>  vhost_dev_set_owner+0x5b9/0xa60 drivers/vhost/vhost.c:885
>  vhost_net_set_owner drivers/vhost/net.c:1687 [inline]
>  vhost_net_ioctl+0x668/0x16a0 drivers/vhost/net.c:1737
>  vfs_ioctl fs/ioctl.c:51 [inline]
>  __do_sys_ioctl fs/ioctl.c:870 [inline]
>  __se_sys_ioctl fs/ioctl.c:856 [inline]
>  __x64_sys_ioctl+0x19d/0x210 fs/ioctl.c:856
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> Freed by task 5108:
>  kasan_save_stack+0x22/0x40 mm/kasan/common