Re: [PATCH] vhost: move is_le setup to the backend

2015-11-12 Thread Greg Kurz
On Fri, 30 Oct 2015 12:42:35 +0100
Greg Kurz  wrote:

> The vq->is_le field is used to fix endianness when accessing the vring via
> the cpu_to_vhost16() and vhost16_to_cpu() helpers in the following cases:
> 
> 1) host is big endian and device is modern virtio
> 
> 2) host has cross-endian support and device is legacy virtio with a different
>    endianness than the host
> 
> Both cases rely on the VHOST_SET_FEATURES ioctl, but 2) also needs the
> VHOST_SET_VRING_ENDIAN ioctl to be called by userspace. Since vq->is_le
> is only needed when the backend is active, it was decided to set it at
> backend start.
> 
> This is currently done in vhost_init_used()->vhost_init_is_le() but it
> obfuscates the core vhost code. This patch moves the is_le setup to a
> dedicated function that is called from the backend code.
> 
> Note vhost_net is the only backend that can pass vq->private_data == NULL to
> vhost_init_used(), hence the "if (sock)" branch.
> 
> No behaviour change.
> 
> Signed-off-by: Greg Kurz 
> ---

Ping ?

>  drivers/vhost/net.c   |    6 ++++++
>  drivers/vhost/scsi.c  |    3 +++
>  drivers/vhost/test.c  |    2 ++
>  drivers/vhost/vhost.c |   12 +++++++-----
>  drivers/vhost/vhost.h |    1 +
>  5 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9eda69e40678..d6319cb2664c 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -917,6 +917,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> 
>   vhost_net_disable_vq(n, vq);
>   vq->private_data = sock;
> +
> + if (sock)
> + vhost_set_is_le(vq);
> + else
> + vq->is_le = virtio_legacy_is_little_endian();
> +
>   r = vhost_init_used(vq);
>   if (r)
>   goto err_used;
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index e25a23692822..e2644a301fa5 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1276,6 +1276,9 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
>   vq = &vs->vqs[i].vq;
>   mutex_lock(&vq->mutex);
>   vq->private_data = vs_tpg;
> +
> + vhost_set_is_le(vq);
> +
>   vhost_init_used(vq);
>   mutex_unlock(&vq->mutex);
>   }
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index f2882ac98726..b1c7df502211 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -196,6 +196,8 @@ static long vhost_test_run(struct vhost_test *n, int test)
>   oldpriv = vq->private_data;
>   vq->private_data = priv;
> 
> + vhost_set_is_le(vq);
> +
>   r = vhost_init_used(&n->vqs[index]);
> 
>   mutex_unlock(&vq->mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index eec2f11809ff..6be863dcbd13 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -113,6 +113,12 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
>  }
>  #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
> 
> +void vhost_set_is_le(struct vhost_virtqueue *vq)
> +{
> + vhost_init_is_le(vq);
> +}
> +EXPORT_SYMBOL_GPL(vhost_set_is_le);
> +
>  static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
>   poll_table *pt)
>  {
> @@ -1156,12 +1162,8 @@ int vhost_init_used(struct vhost_virtqueue *vq)
>  {
>   __virtio16 last_used_idx;
>   int r;
> - if (!vq->private_data) {
> - vq->is_le = virtio_legacy_is_little_endian();
> + if (!vq->private_data)
>   return 0;
> - }
> -
> - vhost_init_is_le(vq);
> 
>   r = vhost_update_used_flags(vq);
>   if (r)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 4772862b71a7..8a62041959fe 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -162,6 +162,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
> 
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>   unsigned int log_num, u64 len);
> +void vhost_set_is_le(struct vhost_virtqueue *vq);
> 
>  #define vq_err(vq, fmt, ...) do {  \
>   pr_debug(pr_fmt(fmt), ##__VA_ARGS__);   \
> 
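For reference, the cpu_to_vhost16()/vhost16_to_cpu() helpers named in the
changelog dispatch on vq->is_le roughly as follows. This is a simplified
sketch of the in-tree helpers (drivers/vhost/vhost.h and
include/linux/virtio_byteorder.h); exact definitions may differ by kernel
version:

    /* Simplified sketch, not the verbatim in-tree code. */
    static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
    {
        if (little_endian)
            return le16_to_cpu((__force __le16)val);
        else
            return be16_to_cpu((__force __be16)val);
    }

    static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val)
    {
        /* vq->is_le is fixed once at backend start by vhost_set_is_le() */
        return __virtio16_to_cpu(vq->is_le, val);
    }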


[PATCH] VMCI: Use 32bit atomics for queue headers on X86_32

2015-11-12 Thread Jorgen Hansen
This change restricts the reading and setting of the head and tail
pointers on 32bit X86 to 32bit for both correctness and
performance reasons. On uniprocessor X86_32, the atomic64_read
may be implemented as a non-locked cmpxchg8b. This may result in
updates to the pointers done by the VMCI device being overwritten.
On MP systems, there is no such correctness issue, but using 32bit
atomics avoids the overhead of the locked 64bit operation. All this
is safe because the queue size on 32bit systems will never exceed
a 32bit value.

Reviewed-by: Thomas Hellstrom 
Signed-off-by: Jorgen Hansen 
---
 drivers/misc/vmw_vmci/vmci_driver.c |    2 +-
 include/linux/vmw_vmci_defs.h       |   43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/vmw_vmci/vmci_driver.c b/drivers/misc/vmw_vmci/vmci_driver.c
index b823f9a..896be15 100644
--- a/drivers/misc/vmw_vmci/vmci_driver.c
+++ b/drivers/misc/vmw_vmci/vmci_driver.c
@@ -113,5 +113,5 @@ module_exit(vmci_drv_exit);
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMware Virtual Machine Communication Interface.");
-MODULE_VERSION("1.1.3.0-k");
+MODULE_VERSION("1.1.4.0-k");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index 65ac54c..1bd31a3 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -734,6 +734,41 @@ static inline void *vmci_event_data_payload(struct vmci_event_data *ev_data)
 }
 
 /*
+ * Helper to read a value from a head or tail pointer. For X86_32, the
+ * pointer is treated as a 32bit value, since the pointer value
+ * never exceeds a 32bit value in this case. Also, doing an
+ * atomic64_read on X86_32 uniprocessor systems may be implemented
+ * as a non locked cmpxchg8b, that may end up overwriting updates done
+ * by the VMCI device to the memory location. On 32bit SMP, the lock
+ * prefix will be used, so correctness isn't an issue, but using a
+ * 64bit operation still adds unnecessary overhead.
+ */
+static inline u64 vmci_q_read_pointer(atomic64_t *var)
+{
+#if defined(CONFIG_X86_32)
+   return atomic_read((atomic_t *)var);
+#else
+   return atomic64_read(var);
+#endif
+}
+
+/*
+ * Helper to set the value of a head or tail pointer. For X86_32, the
+ * pointer is treated as a 32bit value, since the pointer value
+ * never exceeds a 32bit value in this case. On 32bit SMP, using a
+ * locked cmpxchg8b adds unnecessary overhead.
+ */
+static inline void vmci_q_set_pointer(atomic64_t *var,
+ u64 new_val)
+{
+#if defined(CONFIG_X86_32)
+   return atomic_set((atomic_t *)var, (u32)new_val);
+#else
+   return atomic64_set(var, new_val);
+#endif
+}
+
+/*
  * Helper to add a given offset to a head or tail pointer. Wraps the
  * value of the pointer around the max size of the queue.
  */
@@ -741,14 +776,14 @@ static inline void vmci_qp_add_pointer(atomic64_t *var,
   size_t add,
   u64 size)
 {
-   u64 new_val = atomic64_read(var);
+   u64 new_val = vmci_q_read_pointer(var);
 
if (new_val >= size - add)
new_val -= size;
 
new_val += add;
 
-   atomic64_set(var, new_val);
+   vmci_q_set_pointer(var, new_val);
 }
 
 /*
@@ -758,7 +793,7 @@ static inline u64
 vmci_q_header_producer_tail(const struct vmci_queue_header *q_header)
 {
struct vmci_queue_header *qh = (struct vmci_queue_header *)q_header;
-   return atomic64_read(&qh->producer_tail);
+   return vmci_q_read_pointer(&qh->producer_tail);
 }
 
 /*
@@ -768,7 +803,7 @@ static inline u64
 vmci_q_header_consumer_head(const struct vmci_queue_header *q_header)
 {
struct vmci_queue_header *qh = (struct vmci_queue_header *)q_header;
-   return atomic64_read(&qh->consumer_head);
+   return vmci_q_read_pointer(&qh->consumer_head);
 }
 
 /*
-- 
1.7.0
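
As a sanity check of the wrap-around arithmetic in vmci_qp_add_pointer()
above: the subtract-then-add is equivalent to a modulo without a 64-bit
division. A hypothetical userspace test (assumes add < size and
val < size; the intermediate unsigned underflow cancels out):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t add_pointer(uint64_t val, uint64_t add, uint64_t size)
    {
        if (val >= size - add)
            val -= size;    /* pre-wrap; unsigned wraparound cancels below */
        return val + add;
    }

    int main(void)
    {
        assert(add_pointer(4000, 200, 4096) == (4000 + 200) % 4096); /* 104 */
        assert(add_pointer(100, 200, 4096) == 300);
        return 0;
    }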



[PATCH net-next RFC V3 0/3] basic busy polling support for vhost_net

2015-11-12 Thread Jason Wang
Hi all:

This series tries to add basic busy polling for vhost net. The idea is
simple: at the end of tx/rx processing, busy poll for newly added tx
descriptors and for rx socket traffic for a while. The maximum amount of
time (in us) that can be spent on busy polling is specified through an
ioctl.
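
A rough sketch of that loop is below (hedged: vhost_has_work() and
vhost_vq_more_avail() are the helpers from patches 1-2; the rest is
simplified and not the exact patch 3 code):

    static bool vhost_busy_poll(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq,
                                unsigned long timeout_us)
    {
        u64 endtime;
        bool found = false;

        preempt_disable();    /* keep local_clock() readings coherent */
        endtime = local_clock() + timeout_us * NSEC_PER_USEC;
        while (local_clock() < endtime &&
               !vhost_has_work(dev) && !signal_pending(current)) {
            if (vhost_vq_more_avail(dev, vq)) {
                found = true;    /* a new descriptor showed up */
                break;
            }
            cpu_relax();
        }
        preempt_enable();
        return found;
    }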

Tests were done with:

- 50 us as busy loop timeout
- Netperf 2.6
- Two machines connected back to back with ixgbe NICs
- Guest with 1 vcpu and 1 queue

Results:
- For stream workloads, ioexits were reduced dramatically for medium
  tx sizes (1024-2048, at most -39%) and for almost all rx sizes (at
  most -79%) as a result of polling. This more or less compensates for
  the possibly wasted cpu cycles, which is probably why we can still
  see some increase in the normalized throughput in some cases.
- Tx throughput was increased (at most +105%) except for the huge
  write (16384). And we can send more packets in that case (+tpkts
  increased).
- Very minor rx regression in some cases.
- Improvement on TCP_RR (at most +16%).

size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
   64/ 1/   +9%/  -17%/   +5%/  +10%/   -2%
   64/ 2/   +8%/  -18%/   +6%/  +10%/   -1%
   64/ 4/   +4%/  -21%/   +6%/  +10%/   -1%
   64/ 8/   +9%/  -17%/   +6%/   +9%/   -2%
  256/ 1/  +20%/   -1%/  +15%/  +11%/   -9%
  256/ 2/  +15%/   -6%/  +15%/   +8%/   -8%
  256/ 4/  +17%/   -4%/  +16%/   +8%/   -8%
  256/ 8/  -61%/  -69%/  +16%/  +10%/  -10%
  512/ 1/  +15%/   -3%/  +19%/  +18%/  -11%
  512/ 2/  +19%/0%/  +19%/  +13%/  -10%
  512/ 4/  +18%/   -2%/  +18%/  +15%/  -10%
  512/ 8/  +17%/   -1%/  +18%/  +15%/  -11%
 1024/ 1/  +25%/   +4%/  +27%/  +16%/  -21%
 1024/ 2/  +28%/   +8%/  +25%/  +15%/  -22%
 1024/ 4/  +25%/   +5%/  +25%/  +14%/  -21%
 1024/ 8/  +27%/   +7%/  +25%/  +16%/  -21%
 2048/ 1/  +32%/  +12%/  +31%/  +22%/  -38%
 2048/ 2/  +33%/  +12%/  +30%/  +23%/  -36%
 2048/ 4/  +31%/  +10%/  +31%/  +24%/  -37%
 2048/ 8/ +105%/  +75%/  +33%/  +23%/  -39%
16384/ 1/0%/  -14%/   +2%/0%/  +19%
16384/ 2/0%/  -13%/  +19%/  -13%/  +17%
16384/ 4/0%/  -12%/   +3%/0%/   +2%
16384/ 8/0%/  -11%/   -2%/   +1%/   +1%
size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
   64/ 1/   -7%/  -23%/   +4%/   +6%/  -74%
   64/ 2/   -2%/  -12%/   +2%/   +2%/  -55%
   64/ 4/   +2%/   -5%/  +10%/   -2%/  -43%
   64/ 8/   -5%/   -5%/  +11%/  -34%/  -59%
  256/ 1/   -6%/  -16%/   +9%/  +11%/  -60%
  256/ 2/   +3%/   -4%/   +6%/   -3%/  -28%
  256/ 4/0%/   -5%/   -9%/   -9%/  -10%
  256/ 8/   -3%/   -6%/  -12%/   -9%/  -40%
  512/ 1/   -4%/  -17%/  -10%/  +21%/  -34%
  512/ 2/0%/   -9%/  -14%/   -3%/  -30%
  512/ 4/0%/   -4%/  -18%/  -12%/   -4%
  512/ 8/   -1%/   -4%/   -1%/   -5%/   +4%
 1024/ 1/0%/  -16%/  +12%/  +11%/  -10%
 1024/ 2/0%/  -11%/0%/   +5%/  -31%
 1024/ 4/0%/   -4%/   -7%/   +1%/  -22%
 1024/ 8/   -5%/   -6%/  -17%/  -29%/  -79%
 2048/ 1/0%/  -16%/   +1%/   +9%/  -10%
 2048/ 2/0%/  -12%/   +7%/   +9%/  -26%
 2048/ 4/0%/   -7%/   -4%/   +3%/  -64%
 2048/ 8/   -1%/   -5%/   -6%/   +4%/  -20%
16384/ 1/0%/  -12%/  +11%/   +7%/  -20%
16384/ 2/0%/   -7%/   +1%/   +5%/  -26%
16384/ 4/0%/   -5%/  +12%/  +22%/  -23%
16384/ 8/0%/   -1%/   -8%/   +5%/   -3%
size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
1/ 1/   +9%/  -29%/   +9%/   +9%/   +9%
1/25/   +6%/  -18%/   +6%/   +6%/   -1%
1/50/   +6%/  -19%/   +5%/   +5%/   -2%
1/   100/   +5%/  -19%/   +4%/   +4%/   -3%
   64/ 1/  +10%/  -28%/  +10%/  +10%/  +10%
   64/25/   +8%/  -18%/   +7%/   +7%/   -2%
   64/50/   +8%/  -17%/   +8%/   +8%/   -1%
   64/   100/   +8%/  -17%/   +8%/   +8%/   -1%
  256/ 1/  +10%/  -28%/  +10%/  +10%/  +10%
  256/25/  +15%/  -13%/  +15%/  +15%/0%
  256/50/  +16%/  -14%/  +18%/  +18%/   +2%
  256/   100/  +15%/  -13%/  +12%/  +12%/   -2%

Changes from V2:
- poll also at the end of rx handling
- factor out the polling logic and optimize the code a little bit
- add two ioctls to get and set the busy poll timeout
- test on ixgbe (which can give more stable and reproducible numbers)
  instead of mlx4.

Changes from V1:
- Add a comment for vhost_has_work() to explain why it could be
  lockless
- Add param description for busyloop_timeout
- Split out the busy polling logic into a new helper
- Check and exit the loop when there's a pending signal
- Disable preemption during busy looping to make sure local_clock() was
  correctly used.

Jason Wang (3):
  vhost: introduce vhost_has_work()
  vhost: introduce vhost_vq_more_avail()
  vhost_net: basic polling support

 drivers/vhost/net.c| 77 +++---
 drivers/vhost/vhost.c  | 48 +++--
 drivers/vhost/vhost.h  |  3 ++
 

[PATCH net-next RFC V3 1/3] vhost: introduce vhost_has_work()

2015-11-12 Thread Jason Wang
This patch introduces a helper which can give a hint for whether or not
there's work queued in the work list.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 7 +++++++
 drivers/vhost/vhost.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index eec2f11..163b365 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -245,6 +245,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
+/* A lockless hint for busy polling code to exit the loop */
+bool vhost_has_work(struct vhost_dev *dev)
+{
+   return !list_empty(&dev->work_list);
+}
+EXPORT_SYMBOL_GPL(vhost_has_work);
+
 void vhost_poll_queue(struct vhost_poll *poll)
 {
 vhost_work_queue(poll->dev, &poll->work);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 4772862..ea0327d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -37,6 +37,7 @@ struct vhost_poll {
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
+bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 unsigned long mask, struct vhost_dev *dev);
-- 
2.1.4
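
A hedged note on the lockless hint: the list_empty() read races with
vhost_work_queue(), but a stale answer only costs one more loop
iteration in a caller such as (illustrative, not from this series):

    /* bail out of a busy loop so queued work is not starved;
     * a racy false negative here is harmless */
    if (vhost_has_work(dev))
        break;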



[PATCH net-next RFC V3 2/3] vhost: introduce vhost_vq_more_avail()

2015-11-12 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 26 +++++++++++++++++---------
 drivers/vhost/vhost.h |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 163b365..b86c5aa 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1633,10 +1633,25 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev,
 }
 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 
+bool vhost_vq_more_avail(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+   __virtio16 avail_idx;
+   int r;
+
+   r = __get_user(avail_idx, &vq->avail->idx);
+   if (r) {
+   vq_err(vq, "Failed to check avail idx at %p: %d\n",
+  &vq->avail->idx, r);
+   return false;
+   }
+
+   return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
+}
+EXPORT_SYMBOL_GPL(vhost_vq_more_avail);
+
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
int r;
 
if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
@@ -1660,14 +1675,7 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
/* They could have slipped one in as we were doing that: make
 * sure it's written, then check again. */
smp_mb();
-   r = __get_user(avail_idx, &vq->avail->idx);
-   if (r) {
-   vq_err(vq, "Failed to check avail idx at %p: %d\n",
-  &vq->avail->idx, r);
-   return false;
-   }
-
-   return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
+   return vhost_vq_more_avail(dev, vq);
 }
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ea0327d..5983a13 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -159,6 +159,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
   struct vring_used_elem *heads, unsigned count);
 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
+bool vhost_vq_more_avail(struct vhost_dev *, struct vhost_virtqueue *);
 bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
-- 
2.1.4
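
For context, the helper compares the avail index the guest publishes in
the ring with vq->avail_idx, the point up to which vhost has already
fetched descriptors; they differ exactly when new buffers are pending.
A hypothetical caller (names assumed for illustration):

    /* poll for buffers made available since the last pass */
    if (vhost_vq_more_avail(&net->dev, vq))
        vhost_poll_queue(&vq->poll);    /* re-run the handler */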



Re: [PATCH v3 0/3] virtio DMA API core stuff

2015-11-12 Thread David Woodhouse
On Thu, 2015-11-12 at 13:09 +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 11, 2015 at 11:30:27PM +0100, David Woodhouse wrote:
> > 
> > If the IOMMU is exposed, and enabled, and telling the guest kernel that
> > it *does* cover the virtio devices, then those virtio devices will
> > *not* be in passthrough mode.
> 
> This we need to fix. Because in most configurations if you are
> using kernel drivers, then you don't want IOMMU with virtio,
> but if you are using VFIO then you do.

This is *absolutely* not specific to virtio. There are *plenty* of
other users (especially networking) where we only really care about the
existence of the IOMMU for VFIO purposes and assigning devices to
guests, and we are willing to dispense with the protection that it
offers for native in-kernel drivers. For that, boot with iommu=pt.

There is no way, currently, to enable the passthrough mode on a per-
device basis. Although it has been discussed right here, very recently.

Let's not conflate those issues.

> > You choosing to use the DMA API in the virtio device drivers instead of
> > being buggy, has nothing to do with whether it's actually in
> > passthrough mode or not. Whether it's in passthrough mode or not, using
> > the DMA API is technically the right thing to do — because it should
> > either *do* the translation, or return a 1:1 mapped IOVA, as
> > appropriate.
> 
> Right but first we need to actually make DMA API do the right thing
> at least on x86,ppc and arm.

It already does the right thing on x86, modulo BIOS bugs (including the
qemu ACPI table bug that you said you're not too worried about).

> I'm not worried about qemu bugs that much.  I am interested in being
> able to use both VFIO and kernel drivers with virtio devices with good
> performance and without tweaking kernel parameters.

OK, then you are interested in the semi-orthogonal discussion about
DMA_ATTR_IOMMU_BYPASS. Either way, device drivers SHALL use the DMA
API.


> > Having said that, if this were real hardware I'd just be blacklisting
> > it and saying "Another BIOS with broken DMAR tables --> IOMMU
> > completely disabled". So perhaps we should just do that.
> > 
> Yes, once there is new QEMU where virtio is covered by the IOMMU,
> that would be one way to address existing QEMU bugs. 

No, that's not required. All that's required is to fix the currently-
broken ACPI table so that it *admits* that the virtio devices aren't
covered by the IOMMU. And I've never waited for a fix to be available
before blacklisting *other* broken firmwares...

The only reason I'm holding off for now is because ARM and PPC also
need a quirk for their platform code to realise that certain devices
actually *aren't* covered by the IOMMU, and I might be able to just use
the same thing and still enable the IOMMU in the offending qemu
versions.

Although as noted, it would need to cover assigned devices as well as
virtio — qemu currently lies to us and tells us that the emulated IOMMU
in the guest does cover *those* too.

> With virt, each device can have different priveledges:
> some are part of hypervisor so with a kernel driver
> trying to get protection from them using an IOMMU which is also
> part of hypervisor makes no sense
>  - but when using a
> userspace driver then getting protection from the userspace
> driver does make sense. Others are real devices so
> getting protection from them makes some sense.
> 
> Which is which? It's easiest for the device driver itself to
> gain that knowledge. Please note this is *not* the same
> question as whether a specific device is covered by an IOMMU.

OK. How does your device driver know whether the virtio PCI device it's
talking to is actually implemented by the hypervisor, or whether it's
one of the real PCI implementations that apparently exist?

> Linux doesn't seem to support that usecase at the moment, if this is a
> generic problem then we need to teach Linux to solve it, but if virtio
> is unique in this requirement, then we should just keep doing virtio
> specific things to solve it.

It is a generic problem. There is a discussion elsewhere about how (or
indeed whether) to solve it. It absolutely isn't virtio-specific, and
we absolutely shouldn't be doing virtio-specific things to solve it.

Nothing excuses just eschewing the correct DMA API. That's just broken,
and only ever worked in conjunction with *other* bugs elsewhere in the
platform.
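
As a concrete sketch of that pattern (illustrative, not any particular
driver's code): the driver asks the DMA API for a bus address and does
not care whether an IOMMU translates it or the platform hands back a
1:1 mapped IOVA.

    dma_addr_t addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
    if (dma_mapping_error(dev, addr))
        return -ENOMEM;
    /* ... hand 'addr' to the device, let it DMA ... */
    dma_unmap_single(dev, addr, len, DMA_TO_DEVICE);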


-- 
dwmw2




Re: [PATCH net-next RFC V3 0/3] basic busy polling support for vhost_net

2015-11-12 Thread Jason Wang


On 11/12/2015 06:16 PM, Jason Wang wrote:
> Hi all:
>
> This series tries to add basic busy polling for vhost net. The idea is
> simple: at the end of tx/rx processing, busy poll for newly added tx
> descriptors and for rx socket traffic for a while. The maximum amount of
> time (in us) that can be spent on busy polling is specified through an
> ioctl.
>
> Tests were done with:
>
> - 50 us as busy loop timeout
> - Netperf 2.6
> - Two machines connected back to back with ixgbe NICs
> - Guest with 1 vcpu and 1 queue
>
> Results:
> - For stream workloads, ioexits were reduced dramatically for medium
>   tx sizes (1024-2048, at most -39%) and for almost all rx sizes (at
>   most -79%) as a result of polling. This more or less compensates for
>   the possibly wasted cpu cycles, which is probably why we can still
>   see some increase in the normalized throughput in some cases.
> - Tx throughput was increased (at most +105%) except for the huge
>   write (16384). And we can send more packets in that case (+tpkts
>   increased).
> - Very minor rx regression in some cases.
> - Improvement on TCP_RR (at most +16%).

Forgot to mention, the following test results are, in order:

1) Guest TX
2) Guest RX
3) TCP_RR

> size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
>64/ 1/   +9%/  -17%/   +5%/  +10%/   -2%
>64/ 2/   +8%/  -18%/   +6%/  +10%/   -1%
>64/ 4/   +4%/  -21%/   +6%/  +10%/   -1%
>64/ 8/   +9%/  -17%/   +6%/   +9%/   -2%
>   256/ 1/  +20%/   -1%/  +15%/  +11%/   -9%
>   256/ 2/  +15%/   -6%/  +15%/   +8%/   -8%
>   256/ 4/  +17%/   -4%/  +16%/   +8%/   -8%
>   256/ 8/  -61%/  -69%/  +16%/  +10%/  -10%
>   512/ 1/  +15%/   -3%/  +19%/  +18%/  -11%
>   512/ 2/  +19%/0%/  +19%/  +13%/  -10%
>   512/ 4/  +18%/   -2%/  +18%/  +15%/  -10%
>   512/ 8/  +17%/   -1%/  +18%/  +15%/  -11%
>  1024/ 1/  +25%/   +4%/  +27%/  +16%/  -21%
>  1024/ 2/  +28%/   +8%/  +25%/  +15%/  -22%
>  1024/ 4/  +25%/   +5%/  +25%/  +14%/  -21%
>  1024/ 8/  +27%/   +7%/  +25%/  +16%/  -21%
>  2048/ 1/  +32%/  +12%/  +31%/  +22%/  -38%
>  2048/ 2/  +33%/  +12%/  +30%/  +23%/  -36%
>  2048/ 4/  +31%/  +10%/  +31%/  +24%/  -37%
>  2048/ 8/ +105%/  +75%/  +33%/  +23%/  -39%
> 16384/ 1/0%/  -14%/   +2%/0%/  +19%
> 16384/ 2/0%/  -13%/  +19%/  -13%/  +17%
> 16384/ 4/0%/  -12%/   +3%/0%/   +2%
> 16384/ 8/0%/  -11%/   -2%/   +1%/   +1%
> size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
>64/ 1/   -7%/  -23%/   +4%/   +6%/  -74%
>64/ 2/   -2%/  -12%/   +2%/   +2%/  -55%
>64/ 4/   +2%/   -5%/  +10%/   -2%/  -43%
>64/ 8/   -5%/   -5%/  +11%/  -34%/  -59%
>   256/ 1/   -6%/  -16%/   +9%/  +11%/  -60%
>   256/ 2/   +3%/   -4%/   +6%/   -3%/  -28%
>   256/ 4/0%/   -5%/   -9%/   -9%/  -10%
>   256/ 8/   -3%/   -6%/  -12%/   -9%/  -40%
>   512/ 1/   -4%/  -17%/  -10%/  +21%/  -34%
>   512/ 2/0%/   -9%/  -14%/   -3%/  -30%
>   512/ 4/0%/   -4%/  -18%/  -12%/   -4%
>   512/ 8/   -1%/   -4%/   -1%/   -5%/   +4%
>  1024/ 1/0%/  -16%/  +12%/  +11%/  -10%
>  1024/ 2/0%/  -11%/0%/   +5%/  -31%
>  1024/ 4/0%/   -4%/   -7%/   +1%/  -22%
>  1024/ 8/   -5%/   -6%/  -17%/  -29%/  -79%
>  2048/ 1/0%/  -16%/   +1%/   +9%/  -10%
>  2048/ 2/0%/  -12%/   +7%/   +9%/  -26%
>  2048/ 4/0%/   -7%/   -4%/   +3%/  -64%
>  2048/ 8/   -1%/   -5%/   -6%/   +4%/  -20%
> 16384/ 1/0%/  -12%/  +11%/   +7%/  -20%
> 16384/ 2/0%/   -7%/   +1%/   +5%/  -26%
> 16384/ 4/0%/   -5%/  +12%/  +22%/  -23%
> 16384/ 8/0%/   -1%/   -8%/   +5%/   -3%
> size/session/+thu%/+normalize%/+tpkts%/+rpkts%/+ioexits%/
> 1/ 1/   +9%/  -29%/   +9%/   +9%/   +9%
> 1/25/   +6%/  -18%/   +6%/   +6%/   -1%
> 1/50/   +6%/  -19%/   +5%/   +5%/   -2%
> 1/   100/   +5%/  -19%/   +4%/   +4%/   -3%
>64/ 1/  +10%/  -28%/  +10%/  +10%/  +10%
>64/25/   +8%/  -18%/   +7%/   +7%/   -2%
>64/50/   +8%/  -17%/   +8%/   +8%/   -1%
>64/   100/   +8%/  -17%/   +8%/   +8%/   -1%
>   256/ 1/  +10%/  -28%/  +10%/  +10%/  +10%
>   256/25/  +15%/  -13%/  +15%/  +15%/0%
>   256/50/  +16%/  -14%/  +18%/  +18%/   +2%
>   256/   100/  +15%/  -13%/  +12%/  +12%/   -2%
>
> Changes from V2:
> - poll also at the end of rx handling
> - factor out the polling logic and optimize the code a little bit
> - add two ioctls to get and set the busy poll timeout
> - test on ixgbe (which can give more stable and reproducible numbers)
>   instead of mlx4.
>
> Changes from V1:
> - Add a comment for vhost_has_work() to explain why it could be
>   lockless
> - Add param description for busyloop_timeout
> - Split out the busy polling logic into a new helper
> - Check and exit the loop when there's a pending signal
> - Disable preemption during busy looping to make sure local_clock() was
>   correctly used.

Re: [PATCH v3 0/3] virtio DMA API core stuff

2015-11-12 Thread Michael S. Tsirkin
On Wed, Nov 11, 2015 at 11:30:27PM +0100, David Woodhouse wrote:
> On Wed, 2015-11-11 at 07:56 -0800, Andy Lutomirski wrote:
> > 
> > Can you flesh out this trick?
> > 
> > On x86 IIUC the IOMMU more-or-less defaults to passthrough.  If the
> > kernel wants, it can switch it to a non-passthrough mode.  My patches
> > cause the virtio driver to do exactly this, except that the host
> > implementation doesn't actually exist yet, so the patches will instead
> > have no particular effect.
> 
> At some level, yes — we're compatible with a 1982 IBM PC and thus the
> IOMMU is entirely disabled at boot until the kernel turns it on —
> except in TXT mode where we abandon that compatibility.
> 
> But no, the virtio driver has *nothing* to do with switching the device
> out of passthrough mode. It is either in passthrough mode, or it isn't.
> 
> If the VMM *doesn't* expose an IOMMU to the guest, obviously the
> devices are in passthrough mode. If the guest kernel doesn't have IOMMU
> support enabled, then obviously the devices are in passthrough mode.
> And if the ACPI tables exposed to the guest kernel *tell* it that the
> virtio devices are not actually behind the IOMMU (which qemu gets
> wrong), then it'll be in passthrough mode.
> 
> If the IOMMU is exposed, and enabled, and telling the guest kernel that
> it *does* cover the virtio devices, then those virtio devices will
> *not* be in passthrough mode.

This we need to fix. Because in most configurations if you are
using kernel drivers, then you don't want IOMMU with virtio,
but if you are using VFIO then you do.

Intel's iommu can be programmed to still
do a kind of passthrough (1:1) mapping; it's
just a matter of doing this for virtio devices
when not using VFIO.

> You choosing to use the DMA API in the virtio device drivers instead of
> being buggy, has nothing to do with whether it's actually in
> passthrough mode or not. Whether it's in passthrough mode or not, using
> the DMA API is technically the right thing to do — because it should
> either *do* the translation, or return a 1:1 mapped IOVA, as
> appropriate.

Right but first we need to actually make DMA API do the right thing
at least on x86,ppc and arm.

> > On powerpc and sparc, we *already* screwed up.  The host already tells
> > the guest that there's an IOMMU and that it's *enabled* because those
> > platforms don't have selective IOMMU coverage the way that x86 does.
> > So we need to work around it.
> 
> No, we need it on x86 too because once we fix the virtio device driver
> bug and make it start using the DMA API, then we start to trip up on
> the qemu bug where it lies about which devices are covered by the
> IOMMU.
> 
> Of course, we still have that same qemu bug w.r.t. assigned devices,
> which it *also* claims are behind its IOMMU when they're not...

I'm not worried about qemu bugs that much.  I am interested in being
able to use both VFIO and kernel drivers with virtio devices with good
performance and without tweaking kernel parameters.


> > I think that, if we want fancy virt-friendly IOMMU stuff like you're
> > talking about, then the right thing to do is to create a virtio bus
> > instead of pretending to be PCI.  That bus could have a virtio IOMMU
> > and its own cross-platform enumeration mechanism for devices on the
> > bus, and everything would be peachy.
> 
> That doesn't really help very much for the x86 case where the problem
> is compatibility with *existing* (arguably broken) qemu
> implementations.
> 
> Having said that, if this were real hardware I'd just be blacklisting
> it and saying "Another BIOS with broken DMAR tables --> IOMMU
> completely disabled". So perhaps we should just do that.
> 

Yes, once there is new QEMU where virtio is covered by the IOMMU,
that would be one way to address existing QEMU bugs. 

> > I still don't understand what trick.  If we want virtio devices to be
> > assignable, then they should be translated through the IOMMU, and the
> > DMA API is the right interface for that.
> 
> The DMA API is the right interface *regardless* of whether there's
> actual translation to be done. The device driver itself should not be
> involved in any way with that decision.

With virt, each device can have different privileges:
some are part of the hypervisor, so with a kernel driver,
trying to get protection from them using an IOMMU which is
also part of the hypervisor makes no sense - but when using a
userspace driver, getting protection from the userspace
driver does make sense. Others are real devices, so
getting protection from them makes some sense.

Which is which? It's easiest for the device driver itself to
gain that knowledge. Please note this is *not* the same
question as whether a specific device is covered by an IOMMU.

> When you want to access MMIO, you use ioremap() and writel() instead of
> doing random crap for yourself. When you want DMA, you use the DMA API
> to get a bus address for your device *even* if you expect there to be
> no IOMMU and 

Re: [PATCH] vhost: move is_le setup to the backend

2015-11-12 Thread Cornelia Huck
On Fri, 30 Oct 2015 12:42:35 +0100
Greg Kurz  wrote:

> The vq->is_le field is used to fix endianness when accessing the vring via
> the cpu_to_vhost16() and vhost16_to_cpu() helpers in the following cases:
> 
> 1) host is big endian and device is modern virtio
> 
> 2) host has cross-endian support and device is legacy virtio with a different
>    endianness than the host
> 
> Both cases rely on the VHOST_SET_FEATURES ioctl, but 2) also needs the
> VHOST_SET_VRING_ENDIAN ioctl to be called by userspace. Since vq->is_le
> is only needed when the backend is active, it was decided to set it at
> backend start.
> 
> This is currently done in vhost_init_used()->vhost_init_is_le() but it
> obfuscates the core vhost code. This patch moves the is_le setup to a
> dedicated function that is called from the backend code.
> 
> Note vhost_net is the only backend that can pass vq->private_data == NULL to
> vhost_init_used(), hence the "if (sock)" branch.
> 
> No behaviour change.
> 
> Signed-off-by: Greg Kurz 
> ---
>  drivers/vhost/net.c   |    6 ++++++
>  drivers/vhost/scsi.c  |    3 +++
>  drivers/vhost/test.c  |    2 ++
>  drivers/vhost/vhost.c |   12 +++++++-----
>  drivers/vhost/vhost.h |    1 +
>  5 files changed, 19 insertions(+), 5 deletions(-)

Makes sense.

Reviewed-by: Cornelia Huck 



Re: [PATCH] vhost: move is_le setup to the backend

2015-11-12 Thread Michael S. Tsirkin
On Fri, Oct 30, 2015 at 12:42:35PM +0100, Greg Kurz wrote:
> The vq->is_le field is used to fix endianness when accessing the vring via
> the cpu_to_vhost16() and vhost16_to_cpu() helpers in the following cases:
> 
> 1) host is big endian and device is modern virtio
> 
> 2) host has cross-endian support and device is legacy virtio with a different
>    endianness than the host
> 
> Both cases rely on the VHOST_SET_FEATURES ioctl, but 2) also needs the
> VHOST_SET_VRING_ENDIAN ioctl to be called by userspace. Since vq->is_le
> is only needed when the backend is active, it was decided to set it at
> backend start.
> 
> This is currently done in vhost_init_used()->vhost_init_is_le() but it
> obfuscates the core vhost code. This patch moves the is_le setup to a
> dedicated function that is called from the backend code.
> 
> Note vhost_net is the only backend that can pass vq->private_data == NULL to
> vhost_init_used(), hence the "if (sock)" branch.
> 
> No behaviour change.
> 
> Signed-off-by: Greg Kurz 

I plan to look at this next week, busy with QEMU 2.5 now.

> ---
>  drivers/vhost/net.c   |    6 ++++++
>  drivers/vhost/scsi.c  |    3 +++
>  drivers/vhost/test.c  |    2 ++
>  drivers/vhost/vhost.c |   12 +++++++-----
>  drivers/vhost/vhost.h |    1 +
>  5 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9eda69e40678..d6319cb2664c 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -917,6 +917,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  
>   vhost_net_disable_vq(n, vq);
>   vq->private_data = sock;
> +
> + if (sock)
> + vhost_set_is_le(vq);
> + else
> + vq->is_le = virtio_legacy_is_little_endian();
> +
>   r = vhost_init_used(vq);
>   if (r)
>   goto err_used;
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index e25a23692822..e2644a301fa5 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1276,6 +1276,9 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
>   vq = &vs->vqs[i].vq;
>   mutex_lock(&vq->mutex);
>   vq->private_data = vs_tpg;
> +
> + vhost_set_is_le(vq);
> +
>   vhost_init_used(vq);
>   mutex_unlock(&vq->mutex);
>   }
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index f2882ac98726..b1c7df502211 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -196,6 +196,8 @@ static long vhost_test_run(struct vhost_test *n, int test)
>   oldpriv = vq->private_data;
>   vq->private_data = priv;
>  
> + vhost_set_is_le(vq);
> +
>   r = vhost_init_used(&n->vqs[index]);
>  
>   mutex_unlock(&vq->mutex);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index eec2f11809ff..6be863dcbd13 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -113,6 +113,12 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
>  }
>  #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
>  
> +void vhost_set_is_le(struct vhost_virtqueue *vq)
> +{
> + vhost_init_is_le(vq);
> +}
> +EXPORT_SYMBOL_GPL(vhost_set_is_le);
> +
>  static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
>   poll_table *pt)
>  {
> @@ -1156,12 +1162,8 @@ int vhost_init_used(struct vhost_virtqueue *vq)
>  {
>   __virtio16 last_used_idx;
>   int r;
> - if (!vq->private_data) {
> - vq->is_le = virtio_legacy_is_little_endian();
> + if (!vq->private_data)
>   return 0;
> - }
> -
> - vhost_init_is_le(vq);
>  
>   r = vhost_update_used_flags(vq);
>   if (r)
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 4772862b71a7..8a62041959fe 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -162,6 +162,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
>  
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>   unsigned int log_num, u64 len);
> +void vhost_set_is_le(struct vhost_virtqueue *vq);
>  
>  #define vq_err(vq, fmt, ...) do {  \
>   pr_debug(pr_fmt(fmt), ##__VA_ARGS__);   \