Re: [PATCH] net: vhost: improve performance when enabling busyloop

2018-06-20 Thread Jason Wang



On 2018/06/20 21:28, Tonghao Zhang wrote:

This patch improves the guest receive performance from
the host. On the handle_tx side, we poll the sock receive
queue at the same time; handle_rx does that in the same way.

We set poll-us=100 (us) and use iperf3 to test
its throughput. The iperf3 commands are shown below.

iperf3 -s -D
iperf3 -c 192.168.1.100 -i 1 -P 10 -t 10 -M 1400 --bandwidth 10M

* With the patch:    21.1 Gbits/sec
* Without the patch: 12.7 Gbits/sec


Thanks a lot for the patch. But it looks like it needs some work to avoid
e.g. deadlocks.


E.g. in vhost_process_iotlb_msg() we call vhost_dev_lock_vqs(), which does:

    for (i = 0; i < d->nvqs; ++i)
        mutex_lock_nested(&d->vqs[i]->mutex, i);

I believe we need to change the code to lock the vqs one by one, like the 
attached patch (compile-tested only).



Signed-off-by: Tonghao Zhang 
---
  drivers/vhost/net.c | 21 +
  1 file changed, 21 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..9364ede 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,22 +429,43 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
  }
  
+static int sk_has_rx_data(struct sock *sk);

+


How about moving sk_has_rx_data() here?

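(For reference, the helper in question looks roughly like the sketch below;
the body is a sketch and may differ from the actual net.c code in detail.)

static int sk_has_rx_data(struct sock *sk)
{
        struct socket *sock = sk->sk_socket;

        /* tun/tap sockets provide peek_len; otherwise fall back to
         * checking the socket receive queue directly.
         */
        if (sock->ops->peek_len)
                return sock->ops->peek_len(sock);

        return !skb_queue_empty(&sk->sk_receive_queue);
}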

  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
  {
unsigned long uninitialized_var(endtime);
+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+   struct vhost_virtqueue *rvq = &nvq->vq;
+   struct socket *sock = rvq->private_data;
+
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
  
  	if (r == vq->num && vq->busyloop_timeout) {

+   mutex_lock_nested(&rvq->mutex, 1);
+
+   vhost_disable_notify(&net->dev, rvq);
+
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(vq->dev, endtime) &&
+  !(sock && sk_has_rx_data(sock->sk)) &&
   vhost_vq_avail_empty(vq->dev, vq))
cpu_relax();
preempt_enable();
+
+   if (sock && sk_has_rx_data(sock->sk))
+   vhost_poll_queue(&rvq->poll);
+   else if (unlikely(vhost_enable_notify(&net->dev, rvq))) {
+   vhost_disable_notify(&net->dev, rvq);
+   vhost_poll_queue(&rvq->poll);
+   }
+
+   mutex_unlock(&rvq->mutex);


This duplicates some existing code; can we try to unify the two paths?

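A rough sketch of what a shared helper might look like (the name
vhost_net_busy_poll() and the exact structure are made up here; this is not
code from the posted patch, just one way the tx and rx busy loops might be
folded together):

static void vhost_net_busy_poll(struct vhost_net *net,
                                struct vhost_virtqueue *rvq,
                                struct vhost_virtqueue *tvq,
                                bool poll_rx)
{
        /* Busy wait on behalf of the *other* side: the tx path watches the
         * rx socket, the rx path watches the tx virtqueue.  The caller is
         * assumed to already hold the mutex of its own virtqueue.
         */
        struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
        struct socket *sock = rvq->private_data;
        unsigned long endtime;

        mutex_lock_nested(&vq->mutex, 1);
        vhost_disable_notify(&net->dev, vq);

        preempt_disable();
        endtime = busy_clock() + vq->busyloop_timeout;
        /* Spin until the guest posts tx buffers or the socket has rx data. */
        while (vhost_can_busy_poll(&net->dev, endtime) &&
               !(sock && sk_has_rx_data(sock->sk)) &&
               vhost_vq_avail_empty(&net->dev, tvq))
                cpu_relax();
        preempt_enable();

        /* Kick the paired handler if its work showed up while we spun. */
        if ((poll_rx && !vhost_vq_avail_empty(&net->dev, tvq)) ||
            (!poll_rx && sock && sk_has_rx_data(sock->sk)))
                vhost_poll_queue(&vq->poll);
        else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                vhost_disable_notify(&net->dev, vq);
                vhost_poll_queue(&vq->poll);
        }

        mutex_unlock(&vq->mutex);
}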
Btw, net-next is closed, so you need to resubmit after it reopens, using
"net-next" as the patch prefix.


Thanks


+
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
}


From 383fe9d98420d92a632dc554969b4b1716017ba2 Mon Sep 17 00:00:00 2001
From: Jason Wang 
Date: Thu, 21 Jun 2018 13:58:31 +0800
Subject: [PATCH] vhost: lock vqs one by one

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e5bc4bb..937252d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 {
 	int i;
 
-	for (i = 0; i < d->nvqs; ++i)
+	for (i = 0; i < d->nvqs; ++i) {
+		mutex_lock(&d->vqs[i]->mutex);
 		__vhost_vq_meta_reset(d->vqs[i]);
+		mutex_unlock(&d->vqs[i]->mutex);
+	}
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -855,20 +858,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
 	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
-	int i = 0;
-	for (i = 0; i < d->nvqs; ++i)
-		mutex_unlock(&d->vqs[i]->mutex);
-}
-
 static int vhost_new_umem_range(struct vhost_umem *umem,
 u64 start, u64 size, u64 end,
 u64 userspace_addr, int perm)
@@ -918,7 +907,9 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
 		if (msg->iova <= vq_msg->iova &&
 		msg->iova + msg->size - 1 > vq_msg->iova &&
 		vq_msg->type == VHOST_IOTLB_MISS) {
+			mutex_lock(&node->vq->mutex);
 			vhost_poll_queue(&node->vq->poll);
+			mutex_unlock(&node->vq->mutex);
 			list_del(&node->node);
 			kfree(node);
 		}
@@ -950,7 +941,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 	int ret = 0;
 
 	mutex_lock(&dev->mutex);
-	vhost_dev_lock_vqs(dev);
 	switch (msg->type) {
 	case 

[PATCH net] vhost_net: validate sock before trying to put its fd

2018-06-20 Thread Jason Wang
Sock will be NULL if we pass -1 to vhost_net_set_backend(), but when
we meet errors during ubuf allocation, the code does not check for
NULL before calling sockfd_put(); this will lead to a NULL
dereference. Fix this by checking the sock pointer first.

Fixes: bab632d69ee4 ("vhost: vhost TX zero-copy support")
Reported-by: Dan Carpenter 
Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a..b97a994 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1208,7 +1208,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
if (ubufs)
vhost_net_ubuf_put_wait_and_free(ubufs);
 err_ubufs:
-   sockfd_put(sock);
+   if (sock)
+   sockfd_put(sock);
 err_vq:
 	mutex_unlock(&vq->mutex);
 err:
-- 
2.7.4



Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net

2018-06-20 Thread Siwei Liu
On Wed, Jun 20, 2018 at 7:34 AM, Cornelia Huck  wrote:
> On Tue, 19 Jun 2018 13:09:14 -0700
> Siwei Liu  wrote:
>
>> On Tue, Jun 19, 2018 at 3:54 AM, Cornelia Huck  wrote:
>> > On Fri, 15 Jun 2018 10:06:07 -0700
>> > Siwei Liu  wrote:
>> >
>> >> On Fri, Jun 15, 2018 at 4:48 AM, Cornelia Huck  wrote:
>> >> > On Thu, 14 Jun 2018 18:57:11 -0700
>> >> > Siwei Liu  wrote:
>
>> >> > I'm a bit confused here. What, exactly, ties the two devices together?
>> >>
>> >> The group UUID. Since the QEMU VFIO device does not have insight into the
>> >> MAC address (which it doesn't have to), the association between the VFIO
>> >> passthrough and standby devices must be specified for QEMU to understand
>> >> the relationship in this model. Note, the standby feature is no longer
>> >> required to be exposed under this model.
>> >
>> > Isn't that a bit limiting, though?
>> >
>> > With this model, you can probably tie a vfio-pci device and a
>> > virtio-net-pci device together. But this will fail if you have
>> > different transports: Consider tying together a vfio-pci device and a
>> > virtio-net-ccw device on s390, for example. The standby feature bit is
>> > on the virtio-net level and should not have any dependency on the
>> > transport used.
>>
>> Probably we'd limit the support for grouping to the virtio-net-pci and
>> vfio-pci devices only. For virtio-net-pci, as you might see with Venu's
>> patch, we store the group UUID in the config space of virtio-pci, which
>> is only applicable to the PCI transport.
>>
>> If virtio-net-ccw needs to support the same, I think a similar grouping
>> interface should be defined on the VirtIO CCW transport. I think the
>> current implementation of the Linux failover driver assumes that it's an
>> SR-IOV VF with the same MAC address that the virtio-net-pci device needs
>> to pair with, and that the PV path is on the same PF, without needing to
>> update the network about the port-MAC association change. If we need to
>> extend the grouping mechanism to virtio-net-ccw, it has to pass such a
>> failover mode to the virtio driver through some other option, I guess.
>
> Hm, I've just spent some time reading the Linux failover code and I did
> not really find much pci-related magic in there (other than checking
> for a pci device in net_failover_slave_pre_register). We also seem to
> look for a matching device by MAC only. What magic am I missing?

The existing assumptions around SR-IOV VFs, and thus PCI, are implicit. A
lot of simplifications are built on the fact that the passthrough device
is specifically an SR-IOV Virtual Function rather than something else: the
MAC addresses of the coupled devices must be the same, changing the MAC
address is prohibited, programming the VLAN filter is challenging, and the
virtio-net datapath has to share the same physical function the VF belongs
to. There's no handshake during datapath switching at all to support a
normal passthrough device at this point. I'd imagine some work on that
lies ahead, which might be a bit more involved than just supporting a
simplified model for VF migration.

>
> Is the look-for-uuid handling supposed to happen in the host only?

The look-for-MAC matching scheme is not ideal in many respects. I don't
want to repeat them again, but once the group UUID is added to QEMU, the
failover driver is supposed to switch to the UUID-based matching scheme
in the guest.

>
>> >> > If libvirt already has the knowledge that it should manage the two as a
>> >> > couple, why do we need the group id (or something else for other
>> >> > architectures)? (Maybe I'm simply missing something because I'm not
>> >> > that familiar with pci.)
>> >>
>> >> The idea is to have QEMU control the visibility and enumeration order
>> >> of the passthrough VFIO for the failover scenario. Hotplug can be one
>> >> way to achieve it, and perhaps there's other way around also. The
>> >> group ID is not just for QEMU to couple devices, it's also helpful to
>> >> guest too as grouping using MAC address is just not safe.
>> >
>> > Sorry about dragging mainframes into this, but this will only work for
>> > homogenous device coupling, not for heterogenous. Consider my vfio-pci
>> > + virtio-net-ccw example again: The guest cannot find out that the two
>> > belong together by checking some group ID, it has to either use the MAC
>> > or some needs-to-be-architectured property.
>> >
>> > Alternatively, we could propose that mechanism as pci-only, which means
>> > we can rely on mechanisms that won't necessarily work on non-pci
>> > transports. (FWIW, I don't see a use case for using vfio-ccw to pass
>> > through a network card anytime in the near future, due to the nature of
>> > network cards currently in use on s390.)
>>
>> Yes, let's do this just for PCI transport (homogenous) for now.
>
> But why? Using pci for passthrough to make things easier (and because
> there's not really a use case), sure. But I really don't want to
> restrict this to virtio-pci only.

Of course, technically it doesn't have to be virtio-pci only. The
group UUID can even extend it 

Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net

2018-06-20 Thread Michael S. Tsirkin
On Wed, Jun 20, 2018 at 06:06:19PM +0200, Cornelia Huck wrote:
> In any case, I'm not sure anymore why we'd want the extra uuid.

It's mostly so we can have e.g. multiple devices with the same MAC
(which some people seem to want in order to then use them with
different containers).

But it is also handy for when you assign a PF, since then you
can't set the MAC.

-- 
MST


Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net

2018-06-20 Thread Cornelia Huck
On Wed, 20 Jun 2018 17:11:59 +0300
"Michael S. Tsirkin"  wrote:

> On Wed, Jun 20, 2018 at 11:53:59AM +0200, Cornelia Huck wrote:
> > On Tue, 19 Jun 2018 23:32:06 +0300
> > "Michael S. Tsirkin"  wrote:
> >   
> > > On Tue, Jun 19, 2018 at 12:54:53PM +0200, Cornelia Huck wrote:  
> > > > Sorry about dragging mainframes into this, but this will only work for
> > > > homogenous device coupling, not for heterogenous. Consider my vfio-pci
> > > > + virtio-net-ccw example again: The guest cannot find out that the two
> > > > belong together by checking some group ID, it has to either use the MAC
> > > > or some needs-to-be-architectured property.
> > > > 
> > > > Alternatively, we could propose that mechanism as pci-only, which means
> > > > we can rely on mechanisms that won't necessarily work on non-pci
> > > > transports. (FWIW, I don't see a use case for using vfio-ccw to pass
> > > > through a network card anytime in the near future, due to the nature of
> > > > network cards currently in use on s390.)
> > > 
> > > That's what it boils down to, yes.  If there's need to have this for
> > > non-pci devices, then we should put it in config space.
> > > Cornelia, what do you think?
> > >   
> > 
> > I think the only really useful config on s390 is the vfio-pci network
> > card coupled with a virtio-net-ccw device: Using an s390 network card
> > via vfio-ccw is out due to the nature of the s390 network cards, and
> > virtio-ccw is the default transport (virtio-pci is not supported on any
> > enterprise distro AFAIK).
> > 
> > For this, having a uuid in the config space could work (vfio-pci
> > devices have a config space by virtue of being pci devices, and
> > virtio-net-ccw devices have a config space by virtue of being virtio
> > devices -- ccw devices usually don't have that concept).  
> 
> OK so this calls for adding such a field generally (it's
> device agnostic right now).
> 
> How would you suggest doing that?

I hope that I'm not thoroughly confused at this point in time, so I'll
summarize my current understanding (also keep in mind that I haven't
looked at Venu's patches yet):

- The Linux guest initiates coupling from the virtio-net driver.
  Matching the other device is done via the MAC, and only pci devices
  are allowed for the failover device. (There does not seem to be any
  restriction on the transport of the virtio-net device.)
- The Linux guest virtio-net driver does not allow changing the MAC if
  standby has been negotiated (implying that the hypervisor needs to
  configure the correct MAC).
- In QEMU, we need to know which two devices (vfio-pci and virtio-net)
  go together, so that the virtio-net device gets the correct MAC. We
  also need the pairing so that we can make the vfio-pci device
  available once the guest has negotiated the standby feature.

We can tack the two devices together in QEMU by introducing new,
optional properties pointing from the virtio-net device to the vfio-pci
device (only offer standby if this is set) and the other way around
(don't make the device visible at the start if this is set); an
illustrative command line follows the problem list below. Problems:

- The admin needs to figure out the MAC by themselves and set it
  correctly. If this is incorrect, the vfio-pci device cannot be found
  in the guest. (Not sure how much of a problem this is in practice --
  and QEMU cannot figure out the MAC without poking at the vfio-pci
  device, and we probably want to avoid that.)
- This two-way pointing makes for interesting handling of the command
  line, and of the case when both devices are plugged later.
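
An illustrative command line for the two-way pointing sketched above (the
"standby-pair" and "primary-pair" property names are invented here and are
not existing QEMU properties):

  -device virtio-net-pci,netdev=hn0,id=net0,mac=52:54:00:12:34:56,standby-pair=hostdev0 \
  -device vfio-pci,host=0000:02:10.1,id=hostdev0,primary-pair=net0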

In any case, I'm not sure anymore why we'd want the extra uuid. Is
there any way QEMU (or libvirt) can figure it out without actually
looking at the vfio-pci device?


CFP SENSORNETS 2019 - 8th Int.l Conf. on Sensor Networks (Prague/Czech Republic)

2018-06-20 Thread sensorn...@insticc.info
SUBMISSION DEADLINE 

8th International Conference on Sensor Networks

Submission Deadline: October 1, 2018

http://www.sensornets.org/

February 26 - 27, 2019
Prague, Czech Republic.

 SENSORNETS is organized in 5 major tracks:

 - Sensor Networks Software, Architectures and Applications
 - Wireless Sensor Networks
 - Energy and Environment
 - Intelligent Data Analysis and Processing
 - Security and Privacy in Sensor Networks


In Cooperation with: EUROGRAPHICS and AFIG. 
Proceedings will be submitted for indexing by: DBLP, Thomson Reuters,
EI, SCOPUS and Semantic Scholar.
 
A short list of presented papers will be selected so that revised and extended 
versions of these papers will be published by Springer.
 
All papers presented at the congress venue will also be available at the 
SCITEPRESS Digital Library (http://www.scitepress.org/DigitalLibrary/).
  
Should you have any questions, please don't hesitate to contact me.
 

Kind regards,
SENSORNETS Secretariat

Address: Av. D. Manuel I, 27A, 2º esq.
2910-595 Setubal, Portugal
Tel: +351 265 100 033
Fax: +351 265 520 186
Web: http://www.sensornets.org/
e-mail: sensornets.secretar...@insticc.org


Re: [virtio-dev] Re: [PATCH v33 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

2018-06-20 Thread Michael S. Tsirkin
On Wed, Jun 20, 2018 at 09:11:39AM +, Wang, Wei W wrote:
> On Tuesday, June 19, 2018 10:43 PM, Michael S. Tsirkin wrote:
> > On Tue, Jun 19, 2018 at 08:13:37PM +0800, Wei Wang wrote:
> > > On 06/19/2018 11:05 AM, Michael S. Tsirkin wrote:
> > > > On Tue, Jun 19, 2018 at 01:06:48AM +, Wang, Wei W wrote:
> > > > > On Monday, June 18, 2018 10:29 AM, Michael S. Tsirkin wrote:
> > > > > > On Sat, Jun 16, 2018 at 01:09:44AM +, Wang, Wei W wrote:
> > > > > > > Not necessarily, I think. We have min(4m_page_blocks / 512,
> > > > > > > 1024) above,
> > > > > > so the maximum memory that can be reported is 2TB. For larger
> > guests, e.g.
> > > > > > 4TB, the optimization can still offer 2TB free memory (better
> > > > > > than no optimization).
> > > > > >
> > > > > > Maybe it's better, maybe it isn't. It certainly muddies the waters 
> > > > > > even
> > more.
> > > > > > I'd rather we had a better plan. From that POV I like what
> > > > > > Matthew Wilcox suggested for this which is to steal the necessary # 
> > > > > > of
> > entries off the list.
> > > > > Actually what Matthew suggested doesn't make a difference here.
> > > > > That method always steal the first free page blocks, and sure can
> > > > > be changed to take more. But all these can be achieved via kmalloc
> > > > I'd do get_user_pages really. You don't want pages split, etc.
> > 
> > Oops sorry. I meant get_free_pages .
> 
> Yes, we can use __get_free_pages, and the max allocation is MAX_ORDER - 1, 
> which can report up to 2TB free memory. 
> 
> "getting two pages isn't harder", do you mean passing two arrays (two 
> allocations by get_free_pages(,MAX_ORDER -1)) to the mm API?

Yes, or generally a list of pages with as many as needed.


> Please see if the following logic aligns to what you think:
> 
> uint32_t i, max_hints, hints_per_page, hints_per_array, total_arrays;
> unsigned long *arrays;
>  
>  /*
>  * Each array size is MAX_ORDER_NR_PAGES. If one array is not enough 
> to
>  * store all the hints, we need to allocate multiple arrays.
>  * max_hints: the max number of 4MB free page blocks
>  * hints_per_page: the number of hints each page can store
>  * hints_per_array: the number of hints an array can store
>  * total_arrays: the number of arrays we need
>  */
> max_hints = totalram_pages / MAX_ORDER_NR_PAGES;
> hints_per_page = PAGE_SIZE / sizeof(__le64);
> hints_per_array = hints_per_page * MAX_ORDER_NR_PAGES;
> total_arrays = max_hints /  hints_per_array +
>!!(max_hints % hints_per_array);
> arrays = kmalloc(total_arrays * sizeof(unsigned long), GFP_KERNEL);
> for (i = 0; i < total_arrays; i++) {
> arrays[i] = __get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC, 
> MAX_ORDER - 1);
> 
>   if (!arrays[i])
>   goto out;
> }
> 
> 
> - the mm API needs to be changed to support storing hints to multiple 
> separated arrays offered by the caller.
> 
> Best,
> Wei

Yes. And add an API to just count entries so we know how many arrays to 
allocate.
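
One hypothetical shape for such a counting helper (the name and exact
placement are made up here; this is not an existing mm API):

static unsigned long count_free_page_blocks(unsigned int order)
{
        struct zone *zone;
        unsigned long count = 0;

        /* Count free blocks of at least the given order in every zone. */
        for_each_populated_zone(zone) {
                unsigned long flags;
                unsigned int o;

                spin_lock_irqsave(&zone->lock, flags);
                for (o = order; o < MAX_ORDER; o++)
                        count += zone->free_area[o].nr_free;
                spin_unlock_irqrestore(&zone->lock, flags);
        }

        return count;
}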

-- 
MST


Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net

2018-06-20 Thread Michael S. Tsirkin
On Wed, Jun 20, 2018 at 11:53:59AM +0200, Cornelia Huck wrote:
> On Tue, 19 Jun 2018 23:32:06 +0300
> "Michael S. Tsirkin"  wrote:
> 
> > On Tue, Jun 19, 2018 at 12:54:53PM +0200, Cornelia Huck wrote:
> > > Sorry about dragging mainframes into this, but this will only work for
> > > homogenous device coupling, not for heterogenous. Consider my vfio-pci
> > > + virtio-net-ccw example again: The guest cannot find out that the two
> > > belong together by checking some group ID, it has to either use the MAC
> > > or some needs-to-be-architectured property.
> > > 
> > > Alternatively, we could propose that mechanism as pci-only, which means
> > > we can rely on mechanisms that won't necessarily work on non-pci
> > > transports. (FWIW, I don't see a use case for using vfio-ccw to pass
> > > through a network card anytime in the near future, due to the nature of
> > > network cards currently in use on s390.)  
> > 
> > That's what it boils down to, yes.  If there's need to have this for
> > non-pci devices, then we should put it in config space.
> > Cornelia, what do you think?
> > 
> 
> I think the only really useful config on s390 is the vfio-pci network
> card coupled with a virtio-net-ccw device: Using an s390 network card
> via vfio-ccw is out due to the nature of the s390 network cards, and
> virtio-ccw is the default transport (virtio-pci is not supported on any
> enterprise distro AFAIK).
> 
> For this, having a uuid in the config space could work (vfio-pci
> devices have a config space by virtue of being pci devices, and
> virtio-net-ccw devices have a config space by virtue of being virtio
> devices -- ccw devices usually don't have that concept).

OK so this calls for adding such a field generally (it's
device agnostic right now).

How would you suggest doing that?


-- 
MST


[PATCH] net: vhost: improve performance when enabling busyloop

2018-06-20 Thread Tonghao Zhang
This patch improves the guest receive performance from
the host. On the handle_tx side, we poll the sock receive
queue at the same time; handle_rx does that in the same way.

We set poll-us=100 (us) and use iperf3 to test
its throughput. The iperf3 commands are shown below.

iperf3 -s -D
iperf3 -c 192.168.1.100 -i 1 -P 10 -t 10 -M 1400 --bandwidth 10M

* With the patch:    21.1 Gbits/sec
* Without the patch: 12.7 Gbits/sec

Signed-off-by: Tonghao Zhang 
---
 drivers/vhost/net.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..9364ede 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,22 +429,43 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk);
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
unsigned long uninitialized_var(endtime);
+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+   struct vhost_virtqueue *rvq = &nvq->vq;
+   struct socket *sock = rvq->private_data;
+
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
 
if (r == vq->num && vq->busyloop_timeout) {
+   mutex_lock_nested(&rvq->mutex, 1);
+
+   vhost_disable_notify(&net->dev, rvq);
+
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(vq->dev, endtime) &&
+  !(sock && sk_has_rx_data(sock->sk)) &&
   vhost_vq_avail_empty(vq->dev, vq))
cpu_relax();
preempt_enable();
+
+   if (sock && sk_has_rx_data(sock->sk))
+   vhost_poll_queue(&rvq->poll);
+   else if (unlikely(vhost_enable_notify(&net->dev, rvq))) {
+   vhost_disable_notify(&net->dev, rvq);
+   vhost_poll_queue(&rvq->poll);
+   }
+
+   mutex_unlock(&rvq->mutex);
+
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
}
-- 
1.8.3.1



Re: [PATCH v5 0/9] x86: macrofying inline asm for better compilation

2018-06-20 Thread Peter Zijlstra
On Tue, Jun 19, 2018 at 12:48:45PM -0700, Nadav Amit wrote:
> Nadav Amit (9):
>   Makefile: Prepare for using macros for inline asm
>   x86: objtool: use asm macro for better compiler decisions
>   x86: refcount: prevent gcc distortions
>   x86: alternatives: macrofy locks for better inlining
>   x86: bug: prevent gcc distortions
>   x86: prevent inline distortion by paravirt ops
>   x86: extable: use macros instead of inline assembly
>   x86: cpufeature: use macros instead of inline assembly
>   x86: jump-labels: use macros instead of inline assembly
> 
>  Makefile   |  9 ++-
>  arch/x86/Makefile  | 11 ++-
>  arch/x86/include/asm/alternative-asm.h | 20 --
>  arch/x86/include/asm/alternative.h | 11 +--
>  arch/x86/include/asm/asm.h | 61 +++-
>  arch/x86/include/asm/bug.h | 98 +++---
>  arch/x86/include/asm/cpufeature.h  | 82 -
>  arch/x86/include/asm/jump_label.h  | 65 ++---
>  arch/x86/include/asm/paravirt_types.h  | 56 +++
>  arch/x86/include/asm/refcount.h| 74 +++
>  arch/x86/kernel/macros.S   | 16 +
>  include/asm-generic/bug.h  |  8 +--
>  include/linux/compiler.h   | 56 +++
>  scripts/Kbuild.include |  4 +-
>  scripts/mod/Makefile   |  2 +
>  15 files changed, 340 insertions(+), 233 deletions(-)
>  create mode 100644 arch/x86/kernel/macros.S

Aside from the one niggle:

Acked-by: Peter Zijlstra (Intel) 


Re: [PATCH v5 6/9] x86: prevent inline distortion by paravirt ops

2018-06-20 Thread Peter Zijlstra
On Tue, Jun 19, 2018 at 12:48:51PM -0700, Nadav Amit wrote:
> +#define paravirt_alt \
> + "PARAVIRT_CALL type=\"%c[paravirt_typenum]\""   \
> + " clobber=\"%c[paravirt_clobber]\"" \
> + " pv_opptr=\"%c[paravirt_opptr]\""

That wants to be:

+ " pv_opptr=\"%c[paravirt_opptr]\";"

And other than that I would suggest: 's/paravirt_alt/paravirt_call/g'.

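With both changes applied (the trailing ';' inside the last string and the
s/paravirt_alt/paravirt_call/ rename), the macro would presumably become:

#define paravirt_call                                                   \
        "PARAVIRT_CALL type=\"%c[paravirt_typenum]\""                   \
        " clobber=\"%c[paravirt_clobber]\""                             \
        " pv_opptr=\"%c[paravirt_opptr]\";"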


Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net

2018-06-20 Thread Cornelia Huck
On Tue, 19 Jun 2018 23:32:06 +0300
"Michael S. Tsirkin"  wrote:

> On Tue, Jun 19, 2018 at 12:54:53PM +0200, Cornelia Huck wrote:
> > Sorry about dragging mainframes into this, but this will only work for
> > homogenous device coupling, not for heterogenous. Consider my vfio-pci
> > + virtio-net-ccw example again: The guest cannot find out that the two
> > belong together by checking some group ID, it has to either use the MAC
> > or some needs-to-be-architectured property.
> > 
> > Alternatively, we could propose that mechanism as pci-only, which means
> > we can rely on mechanisms that won't necessarily work on non-pci
> > transports. (FWIW, I don't see a use case for using vfio-ccw to pass
> > through a network card anytime in the near future, due to the nature of
> > network cards currently in use on s390.)  
> 
> That's what it boils down to, yes.  If there's need to have this for
> non-pci devices, then we should put it in config space.
> Cornelia, what do you think?
> 

I think the only really useful config on s390 is the vfio-pci network
card coupled with a virtio-net-ccw device: Using an s390 network card
via vfio-ccw is out due to the nature of the s390 network cards, and
virtio-ccw is the default transport (virtio-pci is not supported on any
enterprise distro AFAIK).

For this, having a uuid in the config space could work (vfio-pci
devices have a config space by virtue of being pci devices, and
virtio-net-ccw devices have a config space by virtue of being virtio
devices -- ccw devices usually don't have that concept).
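
A purely hypothetical sketch of what such a field could look like in the
virtio-net config space (the field name is invented here and is not part of
the virtio spec):

struct virtio_net_config {
        /* ... existing fields (mac, status, max_virtqueue_pairs, mtu, ...) ... */

        /* invented for this sketch: group identifier shared with the
         * paired passthrough device, readable through any transport
         * that exposes a device config space
         */
        __u8 standby_uuid[16];
};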


RE: [virtio-dev] Re: [PATCH v33 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

2018-06-20 Thread Wang, Wei W
On Tuesday, June 19, 2018 10:43 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 19, 2018 at 08:13:37PM +0800, Wei Wang wrote:
> > On 06/19/2018 11:05 AM, Michael S. Tsirkin wrote:
> > > On Tue, Jun 19, 2018 at 01:06:48AM +, Wang, Wei W wrote:
> > > > On Monday, June 18, 2018 10:29 AM, Michael S. Tsirkin wrote:
> > > > > On Sat, Jun 16, 2018 at 01:09:44AM +, Wang, Wei W wrote:
> > > > > > Not necessarily, I think. We have min(4m_page_blocks / 512,
> > > > > > 1024) above,
> > > > > so the maximum memory that can be reported is 2TB. For larger
> guests, e.g.
> > > > > 4TB, the optimization can still offer 2TB free memory (better
> > > > > than no optimization).
> > > > >
> > > > > Maybe it's better, maybe it isn't. It certainly muddies the waters 
> > > > > even
> more.
> > > > > I'd rather we had a better plan. From that POV I like what
> > > > > Matthew Wilcox suggested for this which is to steal the necessary # of
> entries off the list.
> > > > Actually what Matthew suggested doesn't make a difference here.
> > > > That method always steal the first free page blocks, and sure can
> > > > be changed to take more. But all these can be achieved via kmalloc
> > > I'd do get_user_pages really. You don't want pages split, etc.
> 
> Oops sorry. I meant get_free_pages .

Yes, we can use __get_free_pages, and the max allocation is MAX_ORDER - 1, 
which can report up to 2TB free memory. 

"getting two pages isn't harder", do you mean passing two arrays (two 
allocations by get_free_pages(,MAX_ORDER -1)) to the mm API?

Please see if the following logic aligns to what you think:

uint32_t i, max_hints, hints_per_page, hints_per_array, total_arrays;
unsigned long *arrays;

/*
 * Each array size is MAX_ORDER_NR_PAGES. If one array is not enough to
 * store all the hints, we need to allocate multiple arrays.
 * max_hints: the max number of 4MB free page blocks
 * hints_per_page: the number of hints each page can store
 * hints_per_array: the number of hints an array can store
 * total_arrays: the number of arrays we need
 */
max_hints = totalram_pages / MAX_ORDER_NR_PAGES;
hints_per_page = PAGE_SIZE / sizeof(__le64);
hints_per_array = hints_per_page * MAX_ORDER_NR_PAGES;
total_arrays = max_hints / hints_per_array +
               !!(max_hints % hints_per_array);
arrays = kmalloc(total_arrays * sizeof(unsigned long), GFP_KERNEL);
for (i = 0; i < total_arrays; i++) {
        arrays[i] = __get_free_pages(__GFP_ATOMIC | __GFP_NOMEMALLOC,
                                     MAX_ORDER - 1);

        if (!arrays[i])
                goto out;
}


- the mm API needs to be changed to support storing hints to multiple separated 
arrays offered by the caller.
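
A hypothetical sketch of the kind of change meant above (the function name
and signature are invented here; this is not an existing mm API, and only
the MIGRATE_MOVABLE free list is scanned to keep the sketch short):

static unsigned long store_free_page_hints(unsigned int order, __le64 **bufs,
                                           unsigned int nr_bufs,
                                           unsigned long hints_per_buf)
{
        unsigned long stored = 0;
        unsigned long max = (unsigned long)nr_bufs * hints_per_buf;
        struct zone *zone;

        for_each_populated_zone(zone) {
                struct page *page;
                unsigned long flags;

                spin_lock_irqsave(&zone->lock, flags);
                list_for_each_entry(page,
                                    &zone->free_area[order].free_list[MIGRATE_MOVABLE],
                                    lru) {
                        if (stored == max)
                                break;
                        /* Spread the hints across the caller's buffers. */
                        bufs[stored / hints_per_buf][stored % hints_per_buf] =
                                cpu_to_le64(page_to_pfn(page));
                        stored++;
                }
                spin_unlock_irqrestore(&zone->lock, flags);
        }

        return stored;
}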

Best,
Wei