Re: [PATCH v2 0/4] vhost: Cleanup

2024-04-29 Thread Gavin Shan

On 4/30/24 04:50, Michael S. Tsirkin wrote:

On Mon, Apr 29, 2024 at 08:13:56PM +1000, Gavin Shan wrote:

This series was suggested by Michael S. Tsirkin in [1]. The goal is to
apply smp_rmb() inside vhost_get_avail_idx() when needed, so that the
callers of the function needn't worry about memory barriers. While
we're here, other cleanups are also applied.

[1] 
https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/



Patch 1 makes some sense, gave some comments. Rest I think we should
just drop.



Sure, v3 has been sent with PATCH[v2 2/3/4] dropped. Please take a look
when you get a chance.

v3: 
https://lore.kernel.org/virtualization/20240429232748.642356-1-gs...@redhat.com/T/#u

Thanks,
Gavin




[PATCH v3] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-29 Thread Gavin Shan
From: "Michael S. Tsirkin" 

All the callers of vhost_get_avail_idx() are concerned with the
memory barrier, imposed by smp_rmb(), which ensures that the
available ring entries are read only after avail_idx has been read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is accessed. With it, the callers needn't worry
about the memory barrier. As a side benefit, we also validate the
index on all paths now, which will hopefully help to catch future
errors earlier.

Note that the current code is inconsistent in how errors are handled.
They are treated as an empty ring in some places, but as a non-empty
ring in other places. This patch doesn't attempt to change the
existing behaviour.

No functional change intended.
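
For illustration only (not part of the patch), a caller is expected to
consume the new return contract roughly like the vhost_get_vq_desc()
hunk below: a negative errno on failure, 0 when there is nothing new,
and 1 when new entries are available and smp_rmb() has already been
executed:

	if (vq->avail_idx == vq->last_avail_idx) {
		ret = vhost_get_avail_idx(vq);
		if (unlikely(ret < 0))
			return ret;	/* access failure or bogus index */
		if (!ret)
			return vq->num;	/* nothing new since last look */
		/* ret == 1: the avail ring entries can be read safely now */
	}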

Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Gavin Shan 
Acked-by: Will Deacon 
---
v3: Improved commit log and comments as Michael suggested
---
 drivers/vhost/vhost.c | 105 +-
 1 file changed, 42 insertions(+), 63 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8995730ce0bf..60d9592eff7b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
	mutex_unlock(&d->vqs[i]->mutex);
 }
 
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
- __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+   __virtio16 idx;
+   int r;
+
+   r = vhost_get_avail(vq, idx, &vq->avail->idx);
+   if (unlikely(r < 0)) {
+   vq_err(vq, "Failed to access available index at %p (%d)\n",
+  &vq->avail->idx, r);
+   return r;
+   }
+
+   /* Check it isn't doing very strange thing with available indexes */
+   vq->avail_idx = vhost16_to_cpu(vq, idx);
+   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
+   vq_err(vq, "Invalid available index change from %u to %u",
+  vq->last_avail_idx, vq->avail_idx);
+   return -EINVAL;
+   }
+
+   /* We're done if there is nothing new */
+   if (vq->avail_idx == vq->last_avail_idx)
+   return 0;
+
+   /*
+* We updated vq->avail_idx so we need a memory barrier between
+* the index read above and the caller reading avail ring entries.
+*/
+   smp_rmb();
+   return 1;
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
@@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx;
-   __virtio16 avail_idx;
+   u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq->last_avail_idx;
-
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-   &vq->avail->idx);
-   return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
-   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved avail index from %u to %u",
-   last_avail_idx, vq->avail_idx);
-   return -EFAULT;
-   }
+   ret = vhost_get_avail_idx(vq);
+   if (unlikely(ret < 0))
+   return ret;
 
-   /* If there's nothing new since last we looked, return
-* invalid.
-*/
-   if (vq->avail_idx == last_avail_idx)
+   if (!ret)
return vq->num;
-
-   /* Only get avail ring entries after they have been
-* exposed by guest.
-*/
-   smp_rmb();
}
 
/* Grab the next descriptor number they're advertising, and increment
@@ -2790,35 +2795,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
int r;
 
if (vq->avail_idx != vq->last_avail_idx)
return false;
 
-   r = vhost_get_avail_idx(vq, &avail_idx);
-   if (unlikely(r))
-   return false;
-
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-   if (vq->avail_idx != vq->last_avail_idx) {

Re: [PATCH v2 1/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-29 Thread Gavin Shan

On 4/30/24 04:44, Michael S. Tsirkin wrote:

On Mon, Apr 29, 2024 at 08:13:57PM +1000, Gavin Shan wrote:

From: "Michael S. Tsirkin" 

All the callers of vhost_get_avail_idx() are concerned to the memory


*with* the memory barrier



Thanks, will be corrected in v3.


barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced.


accessed, not advanced. guest advances it.



smp_rmb() is executed only when vq->last_avail_idx != vq->avail_idx.
I used 'advanced' to indicate that condition. 'accessed' is also
correct since the 'advanced' case is included in the 'accessed' case.


With it, the callers needn't to worry
about the memory barrier.

No functional change intended.


I'd add:

As a side benefit, we also validate the index on all paths now, which
will hopefully help catch future errors earlier.

Note: current code is inconsistent in how it handles errors:
some places treat it as an empty ring, others - non empty.
This patch does not attempt to change the existing behaviour.



Ok, I will integrate this into v3's commit log.





Signed-off-by: Michael S. Tsirkin 
[gshan: repainted vhost_get_avail_idx()]


?repainted?



It's just an indicator to say the changes aren't simply copied from
[1]. Some follow-up changes are also applied, so the patch needs to be
reviewed. I will drop this in v3.


Reviewed-by: Gavin Shan 
Acked-by: Will Deacon 
---
  drivers/vhost/vhost.c | 106 +-
  1 file changed, 42 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8995730ce0bf..7aa623117aab 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
	mutex_unlock(&d->vqs[i]->mutex);
  }
  
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,

- __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
  {
-   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+   __virtio16 idx;
+   int r;
+
+   r = vhost_get_avail(vq, idx, &vq->avail->idx);
+   if (unlikely(r < 0)) {
+   vq_err(vq, "Failed to access available index at %p (%d)\n",
+  &vq->avail->idx, r);
+   return r;
+   }
+
+   /* Check it isn't doing very strange thing with available indexes */
+   vq->avail_idx = vhost16_to_cpu(vq, idx);
+   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
+   vq_err(vq, "Invalid available index change from %u to %u",
+  vq->last_avail_idx, vq->avail_idx);
+   return -EINVAL;
+   }
+
+   /* We're done if there is nothing new */
+   if (vq->avail_idx == vq->last_avail_idx)
+   return 0;
+
+   /*
+* We updated vq->avail_idx so we need a memory barrier between
+* the index read above and the caller reading avail ring entries.
+*/
+   smp_rmb();
+   return 1;
  }
  
  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,

@@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx;
-   __virtio16 avail_idx;
+   u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
  
-	/* Check it isn't doing very strange things with descriptor numbers. */

-   last_avail_idx = vq->last_avail_idx;
-
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-   &vq->avail->idx);
-   return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
-   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved avail index from %u to %u",
-   last_avail_idx, vq->avail_idx);
-   return -EFAULT;
-   }
+   ret = vhost_get_avail_idx(vq);
+   if (unlikely(ret < 0))
+   return ret;
  
-		/* If there's nothing new since last we looked, return

-* invalid.
-*/
-   if (vq->avail_idx == last_avail_idx)
+   if (!ret)
return vq->num;
-
-   /* Only get avail ring entries after they have been
-* exposed by guest.
-*/
-   smp_rmb();
}
  
  	/* Grab the next d

Re: [PATCH v2 2/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()

2024-04-29 Thread Gavin Shan

On 4/30/24 04:45, Michael S. Tsirkin wrote:

On Mon, Apr 29, 2024 at 08:13:58PM +1000, Gavin Shan wrote:

The local variable @last_avail_idx is equivalent to vq->last_avail_idx.
So the code can be simplified a bit by dropping the local variable
@last_avail_idx.

No functional change intended.

Signed-off-by: Gavin Shan 
---
  drivers/vhost/vhost.c | 7 +++
  1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7aa623117aab..b278c0333a66 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2524,7 +2524,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
  
@@ -2539,10 +2538,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  
  	/* Grab the next descriptor number they're advertising, and increment

 * the index we've seen. */
-   if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
+   if (unlikely(vhost_get_avail_head(vq, &ring_head, vq->last_avail_idx))) {
		vq_err(vq, "Failed to read head: idx %d address %p\n",
-  last_avail_idx,
-  &vq->avail->ring[last_avail_idx % vq->num]);
+  vq->last_avail_idx,
+  &vq->avail->ring[vq->last_avail_idx % vq->num]);
return -EFAULT;
}


I don't see the big advantage and the line is long now.



The point is to avoid the local variable @last_avail_idx since it's equivalent
to vq->last_avail_idx, as stated in the commit log. Besides, it paves the way
for PATCH[v2 3/4], where the whole logic of fetching the head and the sanity
checks is moved to vhost_get_avail_head(), so that vhost_get_vq_desc() is
further simplified.

I will drop PATCH[2, 3, 4] as you suggested.

Thanks,
Gavin




Re: [PATCH 0/4] vhost: Cleanup

2024-04-29 Thread Gavin Shan

On 4/29/24 17:02, Michael S. Tsirkin wrote:

On Tue, Apr 23, 2024 at 01:24:03PM +1000, Gavin Shan wrote:

This series was suggested by Michael S. Tsirkin in [1]. The goal is to
apply smp_rmb() inside vhost_get_avail_idx() when needed, so that the
callers of the function needn't worry about memory barriers. While
we're here, other cleanups are also applied.



Gavin I suggested another approach.
1. Start with the patch I sent (vhost: order avail ring reads after
index updates) just do a diff against latest.
simplify error handling a bit.
2. Do any other cleanups on top.



My apologies, Michael. I didn't see your patch until now [1]

  [1] 
https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/

v2 was sent with your changes integrated and other cleanups applied on
top of it. Please take a look when you get a chance.

  v2: 
https://lore.kernel.org/virtualization/20240429101400.617007-1-gs...@redhat.com/T/#t

Thanks,
Gavin




[PATCH v2 4/4] vhost: Reformat vhost_{get, put}_user()

2024-04-29 Thread Gavin Shan
Reformat the macros to use a tab as the terminator for each line so
that they look clean.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 60 +--
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4ddb9ec2fe46..c1ed5e750521 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr) \
-({ \
-   int ret; \
-   if (!vq->iotlb) { \
-   ret = __put_user(x, ptr); \
-   } else { \
-   __typeof__(ptr) to = \
+#define vhost_put_user(vq, x, ptr) \
+({ \
+   int ret;\
+   if (!vq->iotlb) {   \
+   ret = __put_user(x, ptr);   \
+   } else {\
+   __typeof__(ptr) to =\
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
-   if (to != NULL) \
-   ret = __put_user(x, to); \
-   else \
-   ret = -EFAULT;  \
-   } \
-   ret; \
+   sizeof(*ptr),   \
+   VHOST_ADDR_USED);   \
+   if (to != NULL) \
+   ret = __put_user(x, to);\
+   else\
+   ret = -EFAULT;  \
+   }   \
+   ret;\
 })
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
@@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
   &vq->used->idx);
 }
 
-#define vhost_get_user(vq, x, ptr, type)   \
-({ \
-   int ret; \
-   if (!vq->iotlb) { \
-   ret = __get_user(x, ptr); \
-   } else { \
-   __typeof__(ptr) from = \
-   (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
-  sizeof(*ptr), \
-  type); \
-   if (from != NULL) \
-   ret = __get_user(x, from); \
-   else \
-   ret = -EFAULT; \
-   } \
-   ret; \
+#define vhost_get_user(vq, x, ptr, type)   \
+({ \
+   int ret;\
+   if (!vq->iotlb) {   \
+   ret = __get_user(x, ptr);   \
+   } else {\
+   __typeof__(ptr) from =  \
+   (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
+   sizeof(*ptr), type);\
+   if (from != NULL)   \
+   ret = __get_user(x, from);  \
+   else\
+   ret = -EFAULT;  \
+   }   \
+   ret;\
 })
 
 #define vhost_get_avail(vq, x, ptr) \
-- 
2.44.0




[PATCH v2 3/4] vhost: Improve vhost_get_avail_head()

2024-04-29 Thread Gavin Shan
Improve vhost_get_avail_head() so that the head or errno is returned.
With it, the relevant sanity checks are squeezed to vhost_get_avail_head()
and vhost_get_vq_desc() is further simplified.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 50 ++-
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b278c0333a66..4ddb9ec2fe46 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1322,11 +1322,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
return 1;
 }
 
-static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
-  __virtio16 *head, int idx)
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *head,
-  &vq->avail->ring[idx & (vq->num - 1)]);
+   __virtio16 head;
+   int r;
+
+   r = vhost_get_avail(vq, head,
+   &vq->avail->ring[vq->last_avail_idx & (vq->num - 1)]);
+   if (unlikely(r)) {
+   vq_err(vq, "Failed to read head: index %u address %p\n",
+  vq->last_avail_idx,
+  &vq->avail->ring[vq->last_avail_idx & (vq->num - 1)]);
+   return r;
+   }
+
+   r = vhost16_to_cpu(vq, head);
+   if (unlikely(r >= vq->num)) {
+   vq_err(vq, "Invalid head %d (%u)\n", r, vq->num);
+   return -EINVAL;
+   }
+
+   return r;
 }
 
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
@@ -2523,9 +2539,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  struct vhost_log *log, unsigned int *log_num)
 {
struct vring_desc desc;
-   unsigned int i, head, found = 0;
-   __virtio16 ring_head;
-   int ret, access;
+   unsigned int i, found = 0;
+   int head, ret, access;
 
if (vq->avail_idx == vq->last_avail_idx) {
ret = vhost_get_avail_idx(vq);
@@ -2536,23 +2551,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
return vq->num;
}
 
-   /* Grab the next descriptor number they're advertising, and increment
-* the index we've seen. */
-   if (unlikely(vhost_get_avail_head(vq, &ring_head, vq->last_avail_idx))) {
-   vq_err(vq, "Failed to read head: idx %d address %p\n",
-  vq->last_avail_idx,
-  &vq->avail->ring[vq->last_avail_idx % vq->num]);
-   return -EFAULT;
-   }
-
-   head = vhost16_to_cpu(vq, ring_head);
-
-   /* If their number is silly, that's an error. */
-   if (unlikely(head >= vq->num)) {
-   vq_err(vq, "Guest says index %u > %u is available",
-  head, vq->num);
-   return -EINVAL;
-   }
+   /* Grab the next descriptor number they're advertising */
+   head = vhost_get_avail_head(vq);
+   if (unlikely(head < 0))
+   return head;
 
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
-- 
2.44.0




[PATCH v2 2/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()

2024-04-29 Thread Gavin Shan
The local variable @last_avail_idx is equivalent to vq->last_avail_idx.
So the code can be simplified a bit by dropping the local variable
@last_avail_idx.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7aa623117aab..b278c0333a66 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2524,7 +2524,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
 
@@ -2539,10 +2538,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
+   if (unlikely(vhost_get_avail_head(vq, &ring_head, vq->last_avail_idx))) {
		vq_err(vq, "Failed to read head: idx %d address %p\n",
-  last_avail_idx,
-  &vq->avail->ring[last_avail_idx % vq->num]);
+  vq->last_avail_idx,
+  &vq->avail->ring[vq->last_avail_idx % vq->num]);
return -EFAULT;
}
 
-- 
2.44.0




[PATCH v2 1/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-29 Thread Gavin Shan
From: "Michael S. Tsirkin" 

All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

No functional change intended.

Signed-off-by: Michael S. Tsirkin 
[gshan: repainted vhost_get_avail_idx()]
Reviewed-by: Gavin Shan 
Acked-by: Will Deacon 
---
 drivers/vhost/vhost.c | 106 +-
 1 file changed, 42 insertions(+), 64 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8995730ce0bf..7aa623117aab 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
	mutex_unlock(&d->vqs[i]->mutex);
 }
 
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
- __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+   __virtio16 idx;
+   int r;
+
+   r = vhost_get_avail(vq, idx, &vq->avail->idx);
+   if (unlikely(r < 0)) {
+   vq_err(vq, "Failed to access available index at %p (%d)\n",
+  &vq->avail->idx, r);
+   return r;
+   }
+
+   /* Check it isn't doing very strange thing with available indexes */
+   vq->avail_idx = vhost16_to_cpu(vq, idx);
+   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
+   vq_err(vq, "Invalid available index change from %u to %u",
+  vq->last_avail_idx, vq->avail_idx);
+   return -EINVAL;
+   }
+
+   /* We're done if there is nothing new */
+   if (vq->avail_idx == vq->last_avail_idx)
+   return 0;
+
+   /*
+* We updated vq->avail_idx so we need a memory barrier between
+* the index read above and the caller reading avail ring entries.
+*/
+   smp_rmb();
+   return 1;
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
@@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx;
-   __virtio16 avail_idx;
+   u16 last_avail_idx = vq->last_avail_idx;
__virtio16 ring_head;
int ret, access;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq->last_avail_idx;
-
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-   &vq->avail->idx);
-   return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
-   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved avail index from %u to %u",
-   last_avail_idx, vq->avail_idx);
-   return -EFAULT;
-   }
+   ret = vhost_get_avail_idx(vq);
+   if (unlikely(ret < 0))
+   return ret;
 
-   /* If there's nothing new since last we looked, return
-* invalid.
-*/
-   if (vq->avail_idx == last_avail_idx)
+   if (!ret)
return vq->num;
-
-   /* Only get avail ring entries after they have been
-* exposed by guest.
-*/
-   smp_rmb();
}
 
/* Grab the next descriptor number they're advertising, and increment
@@ -2790,35 +2795,20 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
int r;
 
if (vq->avail_idx != vq->last_avail_idx)
return false;
 
-   r = vhost_get_avail_idx(vq, &avail_idx);
-   if (unlikely(r))
-   return false;
-
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-   if (vq->avail_idx != vq->last_avail_idx) {
-   /* Since we have updated avail_idx, the following
-* call to vhost_get_vq_desc() will read available
-* ring entries. Make sure that read happens after
-* the avail_idx read.
-*/
-   smp_rmb();
-   return false;
-   }
-
-   return true;
+   /* Tre

[PATCH v2 0/4] vhost: Cleanup

2024-04-29 Thread Gavin Shan
This series was suggested by Michael S. Tsirkin in [1]. The goal is to
apply smp_rmb() inside vhost_get_avail_idx() when needed, so that the
callers of the function needn't worry about memory barriers. While
we're here, other cleanups are also applied.

[1] 
https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/

PATCH[1] improves vhost_get_avail_idx() so that smp_rmb() is applied if
 needed. Besides, the sanity checks on the retrieved available
 queue index are also squeezed to vhost_get_avail_idx()
PATCH[2] drops the local variable @last_avail_idx since it's equivalent
 to vq->last_avail_idx
PATCH[3] improves vhost_get_avail_head(), similar to what we're doing
 for vhost_get_avail_idx(), so that the relevant sanity checks
 on the head are squeezed to vhost_get_avail_head()
PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space
 as the terminator for each line

Gavin Shan (3):
  vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
  vhost: Improve vhost_get_avail_head()
  vhost: Reformat vhost_{get, put}_user()

Michael S. Tsirkin (1):
  vhost: Improve vhost_get_avail_idx() with smp_rmb()

 drivers/vhost/vhost.c | 215 +++---
 1 file changed, 97 insertions(+), 118 deletions(-)

Changelog
=
v2:
  * Improve vhost_get_avail_idx() as Michael suggested in [1]
as above (Michael)
  * Correct @head's type from 'unsigned int' to 'int'
(l...@intel.com)

-- 
2.44.0




Re: [PATCH 3/4] vhost: Improve vhost_get_avail_head()

2024-04-25 Thread Gavin Shan

On 4/26/24 06:42, kernel test robot wrote:

kernel test robot noticed the following build warnings:


[auto build test WARNING on mst-vhost/linux-next]
[also build test WARNING on linus/master v6.9-rc5 next-20240424]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Gavin-Shan/vhost-Drop-variable-last_avail_idx-in-vhost_get_vq_desc/20240423-112803
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
patch link:
https://lore.kernel.org/r/20240423032407.262329-4-gshan%40redhat.com
patch subject: [PATCH 3/4] vhost: Improve vhost_get_avail_head()
config: i386-randconfig-141-20240426 
(https://download.01.org/0day-ci/archive/20240426/202404260448.g7f06v7m-...@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 
6009708b4367171ccdbf4b5905cb6a803753fe18)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202404260448.g7f06v7m-...@intel.com/

smatch warnings:
drivers/vhost/vhost.c:2614 vhost_get_vq_desc() warn: unsigned 'head' is never less than zero.
drivers/vhost/vhost.c:2614 vhost_get_vq_desc() warn: error code type promoted to positive: 'head'

vim +/head +2614 drivers/vhost/vhost.c

   2581 
   2582 /* This looks in the virtqueue and for the first available buffer, and converts
   2583  * it to an iovec for convenient access.  Since descriptors consist of some
   2584  * number of output then some number of input descriptors, it's actually two
   2585  * iovecs, but we pack them into one and note how many of each there were.
   2586  *
   2587  * This function returns the descriptor number found, or vq->num (which is
   2588  * never a valid descriptor number) if none was found.  A negative code is
   2589  * returned on error. */
   2590 int vhost_get_vq_desc(struct vhost_virtqueue *vq,
   2591   struct iovec iov[], unsigned int iov_size,
   2592   unsigned int *out_num, unsigned int *in_num,
   2593   struct vhost_log *log, unsigned int *log_num)
   2594 {
   2595 struct vring_desc desc;
   2596 unsigned int i, head, found = 0;
   2597 int ret, access;
   2598 
   2599 if (vq->avail_idx == vq->last_avail_idx) {
   2600 ret = vhost_get_avail_idx(vq);
   2601 if (unlikely(ret))
   2602 return ret;
   2603 
   2604 /* If there's nothing new since last we looked, return
   2605  * invalid.
   2606  */
   2607 if (vq->avail_idx == vq->last_avail_idx)
   2608 return vq->num;
   2609 }
   2610 
   2611 /* Grab the next descriptor number they're advertising, and increment
   2612  * the index we've seen. */
   2613 head = vhost_get_avail_head(vq);

  2614  if (unlikely(head < 0))

   2615 return head;


Thanks for the report. @head needs to be 'int' instead of 'unsigned int'
so that it can hold the error number returned by vhost_get_avail_head().
I will give it a bit more time to collect other review comments before
posting a revised version with the fix.
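
For reference, a minimal standalone example (not vhost code) of what
smatch is warning about: with an unsigned 'head', a negative errno
wraps to a huge positive value, so the 'head < 0' check can never fire.

	#include <stdio.h>

	int main(void)
	{
		unsigned int head = -22;	/* pretend -EINVAL was returned */

		if (head < 0)			/* always false: head is unsigned */
			printf("error path taken\n");
		else
			printf("error lost, head = %u\n", head);

		return 0;
	}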

Thanks,
Gavin




Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-22 Thread Gavin Shan

On 4/23/24 06:46, Michael S. Tsirkin wrote:

On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote:

On 3/30/24 19:02, Gavin Shan wrote:

On 3/28/24 19:31, Michael S. Tsirkin wrote:

On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote:

All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Gavin Shan 


Previous patches are ok. This one I feel needs more work -
first more code such as sanity checking should go into
this function, second there's actually a difference
between comparing to last_avail_idx and just comparing
to the previous value of avail_idx.
I will pick patches 1-2 and post a cleanup on top so you can
take a look, ok?



Thanks, Michael. It's fine to me.



A kindly ping.

If it's ok with you, could you please merge PATCH[1-2]? Our downstream
9.4 needs the fixes, especially for NVidia's grace-hopper and grace-grace
platforms.

For PATCH[3], I also can help with the improvement if you don't have time
for it. Please let me know.



1-2 are upstream go ahead and post the cleanup.



Michael, a cleanup series has been sent for review.

https://lore.kernel.org/virtualization/20240423032407.262329-1-gs...@redhat.com/T/#t

Thanks,
Gavin




[PATCH 4/4] vhost: Reformat vhost_{get, put}_user()

2024-04-22 Thread Gavin Shan
Reformat the macros to use a tab as the terminator for each line so
that they look clean.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 60 +--
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a3de9325175f..3be19877f9df 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr) \
-({ \
-   int ret; \
-   if (!vq->iotlb) { \
-   ret = __put_user(x, ptr); \
-   } else { \
-   __typeof__(ptr) to = \
+#define vhost_put_user(vq, x, ptr) \
+({ \
+   int ret;\
+   if (!vq->iotlb) {   \
+   ret = __put_user(x, ptr);   \
+   } else {\
+   __typeof__(ptr) to =\
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
-   if (to != NULL) \
-   ret = __put_user(x, to); \
-   else \
-   ret = -EFAULT;  \
-   } \
-   ret; \
+   sizeof(*ptr),   \
+   VHOST_ADDR_USED);   \
+   if (to != NULL) \
+   ret = __put_user(x, to);\
+   else\
+   ret = -EFAULT;  \
+   }   \
+   ret;\
 })
 
 static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
@@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
   &vq->used->idx);
 }
 
-#define vhost_get_user(vq, x, ptr, type)   \
-({ \
-   int ret; \
-   if (!vq->iotlb) { \
-   ret = __get_user(x, ptr); \
-   } else { \
-   __typeof__(ptr) from = \
-   (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
-  sizeof(*ptr), \
-  type); \
-   if (from != NULL) \
-   ret = __get_user(x, from); \
-   else \
-   ret = -EFAULT; \
-   } \
-   ret; \
+#define vhost_get_user(vq, x, ptr, type)   \
+({ \
+   int ret;\
+   if (!vq->iotlb) {   \
+   ret = __get_user(x, ptr);   \
+   } else {\
+   __typeof__(ptr) from =  \
+   (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
+   sizeof(*ptr), type);\
+   if (from != NULL)   \
+   ret = __get_user(x, from);  \
+   else\
+   ret = -EFAULT;  \
+   }   \
+   ret;\
 })
 
 #define vhost_get_avail(vq, x, ptr) \
-- 
2.44.0




[PATCH 3/4] vhost: Improve vhost_get_avail_head()

2024-04-22 Thread Gavin Shan
Improve vhost_get_avail_head() so that the head or errno is returned.
With it, the relevant sanity checks are squeezed to vhost_get_avail_head()
and vhost_get_vq_desc() is further simplified.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 43 +++
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index b3adc0bc9e72..a3de9325175f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1320,11 +1320,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
return 0;
 }
 
-static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
-  __virtio16 *head, int idx)
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *head,
-  &vq->avail->ring[idx & (vq->num - 1)]);
+   __virtio16 head;
+   int r;
+
+   r = vhost_get_avail(vq, head,
+   &vq->avail->ring[vq->last_avail_idx & (vq->num - 1)]);
+   if (unlikely(r)) {
+   vq_err(vq, "Failed to read head: idx %u address %p\n",
+  vq->last_avail_idx,
+  &vq->avail->ring[vq->last_avail_idx % vq->num]);
+   return r;
+   }
+
+   r = vhost16_to_cpu(vq, head);
+   if (unlikely(r >= vq->num)) {
+   vq_err(vq, "Invalid head %d (%u)\n", r, vq->num);
+   return -EINVAL;
+   }
+
+   return r;
 }
 
 static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
@@ -2522,7 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   __virtio16 ring_head;
int ret, access;
 
if (vq->avail_idx == vq->last_avail_idx) {
@@ -2539,21 +2554,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if (unlikely(vhost_get_avail_head(vq, &ring_head, vq->last_avail_idx))) {
-   vq_err(vq, "Failed to read head: idx %d address %p\n",
-  vq->last_avail_idx,
-  &vq->avail->ring[vq->last_avail_idx % vq->num]);
-   return -EFAULT;
-   }
-
-   head = vhost16_to_cpu(vq, ring_head);
-
-   /* If their number is silly, that's an error. */
-   if (unlikely(head >= vq->num)) {
-   vq_err(vq, "Guest says index %u > %u is available",
-  head, vq->num);
-   return -EINVAL;
-   }
+   head = vhost_get_avail_head(vq);
+   if (unlikely(head < 0))
+   return head;
 
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
-- 
2.44.0




[PATCH 2/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-22 Thread Gavin Shan
All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

No functional change intended.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 91 ---
 1 file changed, 34 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ef7942103232..b3adc0bc9e72 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1290,10 +1290,34 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
	mutex_unlock(&d->vqs[i]->mutex);
 }
 
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
- __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+   __virtio16 avail_idx;
+   int r;
+
+   r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+   if (unlikely(r)) {
+   vq_err(vq, "Failed to access avail idx at %p\n",
+  &vq->avail->idx);
+   return r;
+   }
+
+   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Ensure the available ring entry read happens
+* before the avail_idx read when the avail_idx
+* is advanced.
+*/
+   smp_rmb();
+   }
+
+   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
+   vq_err(vq, "Invalid avail index change from %u to %u",
+  vq->last_avail_idx, vq->avail_idx);
+   return -EINVAL;
+   }
+
+   return 0;
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
@@ -2498,35 +2522,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   __virtio16 avail_idx;
__virtio16 ring_head;
int ret, access;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-   &vq->avail->idx);
-   return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
-   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved avail index from %u to %u",
-   vq->last_avail_idx, vq->avail_idx);
-   return -EFAULT;
-   }
+   ret = vhost_get_avail_idx(vq);
+   if (unlikely(ret))
+   return ret;
 
/* If there's nothing new since last we looked, return
 * invalid.
 */
if (vq->avail_idx == vq->last_avail_idx)
return vq->num;
-
-   /* Only get avail ring entries after they have been
-* exposed by guest.
-*/
-   smp_rmb();
}
 
/* Grab the next descriptor number they're advertising, and increment
@@ -2787,35 +2795,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
-   int r;
-
if (vq->avail_idx != vq->last_avail_idx)
return false;
 
-   r = vhost_get_avail_idx(vq, &avail_idx);
-   if (unlikely(r))
-   return false;
-
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-   if (vq->avail_idx != vq->last_avail_idx) {
-   /* Since we have updated avail_idx, the following
-* call to vhost_get_vq_desc() will read available
-* ring entries. Make sure that read happens after
-* the avail_idx read.
-*/
-   smp_rmb();
+   if (unlikely(vhost_get_avail_idx(vq)))
return false;
-   }
 
-   return true;
+   return vq->avail_idx == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
int r;
 
if (!(vq->used_flags & VRING_USED_F_NO_N

[PATCH 1/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()

2024-04-22 Thread Gavin Shan
The local variable @last_avail_idx is equivalent to vq->last_avail_idx.
So the code can be simplified a bit by dropping the local variable
@last_avail_idx.

No functional change intended.

Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8995730ce0bf..ef7942103232 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2498,14 +2498,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 {
struct vring_desc desc;
unsigned int i, head, found = 0;
-   u16 last_avail_idx;
__virtio16 avail_idx;
__virtio16 ring_head;
int ret, access;
 
/* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq->last_avail_idx;
-
if (vq->avail_idx == vq->last_avail_idx) {
if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
@@ -2514,16 +2511,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
-   if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
+   if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
vq_err(vq, "Guest moved avail index from %u to %u",
-   last_avail_idx, vq->avail_idx);
+   vq->last_avail_idx, vq->avail_idx);
return -EFAULT;
}
 
/* If there's nothing new since last we looked, return
 * invalid.
 */
-   if (vq->avail_idx == last_avail_idx)
+   if (vq->avail_idx == vq->last_avail_idx)
return vq->num;
 
/* Only get avail ring entries after they have been
@@ -2534,10 +2531,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
+   if (unlikely(vhost_get_avail_head(vq, &ring_head, vq->last_avail_idx))) {
		vq_err(vq, "Failed to read head: idx %d address %p\n",
-  last_avail_idx,
-  &vq->avail->ring[last_avail_idx % vq->num]);
+  vq->last_avail_idx,
+  &vq->avail->ring[vq->last_avail_idx % vq->num]);
return -EFAULT;
}
 
-- 
2.44.0




[PATCH 0/4] vhost: Cleanup

2024-04-22 Thread Gavin Shan
This series was suggested by Michael S. Tsirkin in [1]. The goal is to
apply smp_rmb() inside vhost_get_avail_idx() when needed, so that the
callers of the function needn't worry about memory barriers. While
we're here, other cleanups are also applied.

[1] 
https://lore.kernel.org/virtualization/20240327075940-mutt-send-email-...@kernel.org/

PATCH[1] drops the local variable @last_avail_idx since it's equivalent
 to vq->last_avail_idx
PATCH[2] improves vhost_get_avail_idx() so that smp_rmb() is applied if
 needed. Besides, the sanity checks on the retrieved available
 queue index are also squeezed to vhost_get_avail_idx()
PATCH[3] improves vhost_get_avail_head(), similar to what we're doing
 for vhost_get_avail_idx(), so that the relevant sanity checks
 on the head are squeezed to vhost_get_avail_head()
PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space
 as the terminator for each line

Gavin Shan (4):
  vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
  vhost: Improve vhost_get_avail_idx() with smp_rmb()
  vhost: Improve vhost_get_avail_head()
  vhost: Reformat vhost_{get, put}_user()

 drivers/vhost/vhost.c | 199 +++---
 1 file changed, 88 insertions(+), 111 deletions(-)

-- 
2.44.0




Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-04-07 Thread Gavin Shan

Hi Michael,

On 3/30/24 19:02, Gavin Shan wrote:

On 3/28/24 19:31, Michael S. Tsirkin wrote:

On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote:

All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Gavin Shan 


Previous patches are ok. This one I feel needs more work -
first more code such as sanity checking should go into
this function, second there's actually a difference
between comparing to last_avail_idx and just comparing
to the previous value of avail_idx.
I will pick patches 1-2 and post a cleanup on top so you can
take a look, ok?



Thanks, Michael. It's fine to me.



A kindly ping.

If it's ok with you, could you please merge PATCH[1-2]? Our downstream
9.4 needs the fixes, especially for NVidia's grace-hopper and grace-grace
platforms.

For PATCH[3], I also can help with the improvement if you don't have time
for it. Please let me know.

Thanks,
Gavin




Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-03-30 Thread Gavin Shan

On 3/28/24 19:31, Michael S. Tsirkin wrote:

On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote:

All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Gavin Shan 


Previous patches are ok. This one I feel needs more work -
first more code such as sanity checking should go into
this function, second there's actually a difference
between comparing to last_avail_idx and just comparing
to the previous value of avail_idx.
I will pick patches 1-2 and post a cleanup on top so you can
take a look, ok?



Thanks, Michael. It's fine to me.

Thanks,
Gavin




Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-27 Thread Gavin Shan

On 3/27/24 17:42, Jason Wang wrote:

On Wed, Mar 27, 2024 at 3:35 PM Gavin Shan  wrote:


On 3/27/24 14:08, Gavin Shan wrote:

On 3/27/24 12:44, Jason Wang wrote:

On Wed, Mar 27, 2024 at 10:34 AM Jason Wang  wrote:

On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan  wrote:


A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

/home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
-accel kvm -machine virt,gic-version=host -cpu host  \
-smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
-m 4096M,slots=16,maxmem=64G \
-object memory-backend-ram,id=mem0,size=4096M\
 :   \
-netdev tap,id=vnet0,vhost=true  \
-device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
 :
guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it
should be safe until vq->avail_idx is changed by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac697 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
   drivers/vhost/vhost.c | 11 ++-
   1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..00445ab172b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
   r = vhost_get_avail_idx(vq, &avail_idx);
  if (unlikely(r))
  return false;
+
  vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/


We need to be more verbose here. For example, which load needs to be
ordered with which load.

The rmb in vhost_get_vq_desc() is used to order the load of avail idx
and the load of head. It is paired with e.g virtio_wmb() in
virtqueue_add_split().
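
As a rough sketch of the pairing being described (simplified
pseudocode, not the actual driver code):

	/* Guest (simplified from virtqueue_add_split()): publish the ring
	 * entry first, then the new index, with a write barrier in between
	 * so the host never observes the index before the entry.
	 */
	avail->ring[idx & (num - 1)] = head;
	virtio_wmb(vq->weak_barriers);
	avail->idx = idx + 1;

	/* Host (simplified from vhost_get_vq_desc()): read the index first,
	 * then the entry, with a read barrier in between so the entry load
	 * cannot be reordered before the index load.
	 */
	avail_idx = avail->idx;		/* the real code uses vhost_get_avail() */
	smp_rmb();
	head = avail->ring[last_avail_idx & (num - 1)];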

vhost_vq_avail_empty() is mostly used as a hint in
vhost_net_busy_poll(), which is under the protection of the vq mutex.

An exception is the tx_can_batch(), but in that case it doesn't even
want to read the head.


Ok, if it is needed only in that path, maybe we can move the barriers there.



[cc Will Deacon]

Jason, I appreciate your review and comments. I think PATCH[1/2] is
the fix for the hypothesis, meaning PATCH[2/2] is the real fix. However,
it would be nice to fix all of them in one shot. I will try with PATCH[2/2]
only to see if our issue disappears or not. Either way, the issue still
exists if PATCH[2/2] is missing.



Jason, PATCH[2/2] is sufficient to fix our current issue. I tried with
PATCH[2/2] only and was unable to hit the issue. However, PATCH[1/2] may be
needed by other scenarios, so it would be nice to fix them in one shot.


Yes, see below.





Firstly, we were failing on the transmit queue with {tvq, rvq}->busyloop_timeout
== false, if I remember correctly. So the added smp_rmb() in
vhost_vq_avail_empty() is only a concern for tx_can_batch(). A mutex isn't
enough to ensure the ordering between the available index and the available
ring entry (head). For example, vhost_vq_avail_empty() called by tx_can_batch()
can see the next available index, but its corresponding available ring entry
(head) may not be visible to vhost yet if smp_rmb() is missed. In the next
call to get_tx_bufs(), the available ring entry (head) may not have arrived
yet, leading to a stale available ring entry (head) being fetched.

handle_tx_copy
  get_tx_bufs              // smp_rmb() won't be executed when vq->avail_idx != vq->last_avail_idx
  tx_can_batch
    vhost_vq_avail_empty   // vq->avail_idx is updated from vq->avail->idx

The reason why I added smp_rmb() to vhost_vq_avail_empty() is that the function
is an exposed API, even though it's only used by drivers/vhost/net.c at present.
It means the API has been broken internally, so it seems more appropriate to fix
it up in vhost_vq_avail_empty() so that the API's users needn't worry about the
memory access ordering.
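
For context, the exposed API is consumed by drivers/vhost/net.c roughly
as follows (paraphrased sketch, not an exact quote of the source;
VHOST_NET_WEIGHT is the existing batching limit there), so any caller
that goes on to read ring entries relies on the barrier being inside
the helper:

	static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
	{
		return total_len < VHOST_NET_WEIGHT &&
		       !vhost_vq_avail_empty(vq->dev, vq);
	}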


When tx_can_batch returns true it means there are still pending tx
buffers. Since it might read indices, it can still bypass the
smp_rmb() in vhost_get_vq_desc().

I'd suggest adding those above to change log.

With this,

Acked-by

Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-27 Thread Gavin Shan

On 3/27/24 22:07, Michael S. Tsirkin wrote:

On Wed, Mar 27, 2024 at 09:38:45AM +1000, Gavin Shan wrote:

A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host -cpu host  \
   -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
   -m 4096M,slots=16,maxmem=64G \
   -object memory-backend-ram,id=mem0,size=4096M\
:   \
   -netdev tap,id=vnet0,vhost=true  \
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
:
   guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
   virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it
should be safe until vq->avail_idx is changed by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac697 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/vhost/vhost.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..00445ab172b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 	r = vhost_get_avail_idx(vq, &avail_idx);
if (unlikely(r))
return false;
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/


A slightly clearer comment:

/* Since we have updated avail_idx, the following call to
  * vhost_get_vq_desc will read available ring entries.
  * Make sure that read happens after the avail_idx read.
  */

Pls repost with that, and I will apply.

Also add suggested-by for will.



Sure, the suggested comments have been included in v3.




+   smp_rmb();
+   return false;
+   }
  
-	return vq->avail_idx == vq->last_avail_idx;

+   return true;
  }
  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);


As a follow-up patch, we should clean out code duplication that
accumulated with 3 places reading avail idx in essentially
the same way - this duplication is what causes the mess in
the 1st place.



Yes, nice idea. I've added PATCH[v3 3/3] to improve vhost_get_avail_idx()
to handle the memory barrier since all the callers have the concern.

v3: 
https://lore.kernel.org/virtualization/20240328002149.1141302-1-gs...@redhat.com/

Thanks,
Gavin




[PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()

2024-03-27 Thread Gavin Shan
All the callers of vhost_get_avail_idx() are concerned to the memory
barrier, imposed by smp_rmb() to ensure the order of the available
ring entry read and avail_idx read.

Improve vhost_get_avail_idx() so that smp_rmb() is executed when
the avail_idx is advanced. With it, the callers needn't to worry
about the memory barrier.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 75 +++
 1 file changed, 26 insertions(+), 49 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 32686c79c41d..e6882f4f6ce2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1290,10 +1290,28 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
	mutex_unlock(&d->vqs[i]->mutex);
 }
 
-static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
- __virtio16 *idx)
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
 {
-   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+   __virtio16 avail_idx;
+   int r;
+
+   r = vhost_get_avail(vq, avail_idx, &vq->avail->idx);
+   if (unlikely(r)) {
+   vq_err(vq, "Failed to access avail idx at %p\n",
+  &vq->avail->idx);
+   return r;
+   }
+
+   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Ensure the available ring entry read happens
+* before the avail_idx read when the avail_idx
+* is advanced.
+*/
+   smp_rmb();
+   }
+
+   return 0;
 }
 
 static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
@@ -2499,7 +2517,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
struct vring_desc desc;
unsigned int i, head, found = 0;
u16 last_avail_idx;
-   __virtio16 avail_idx;
__virtio16 ring_head;
int ret, access;
 
@@ -2507,12 +2524,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
last_avail_idx = vq->last_avail_idx;
 
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-   &vq->avail->idx);
+   if (unlikely(vhost_get_avail_idx(vq)))
return -EFAULT;
-   }
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
vq_err(vq, "Guest moved used index from %u to %u",
@@ -2525,11 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 */
if (vq->avail_idx == last_avail_idx)
return vq->num;
-
-   /* Only get avail ring entries after they have been
-* exposed by guest.
-*/
-   smp_rmb();
}
 
/* Grab the next descriptor number they're advertising, and increment
@@ -2790,35 +2798,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
 /* return true if we're sure that avaiable ring is empty */
 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
-   int r;
-
if (vq->avail_idx != vq->last_avail_idx)
return false;
 
-   r = vhost_get_avail_idx(vq, &avail_idx);
-   if (unlikely(r))
-   return false;
-
-   vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-   if (vq->avail_idx != vq->last_avail_idx) {
-   /* Since we have updated avail_idx, the following
-* call to vhost_get_vq_desc() will read available
-* ring entries. Make sure that read happens after
-* the avail_idx read.
-*/
-   smp_rmb();
+   if (unlikely(vhost_get_avail_idx(vq)))
return false;
-   }
 
-   return true;
+   return vq->avail_idx == vq->last_avail_idx;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
-   __virtio16 avail_idx;
int r;
 
if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
@@ -2842,25 +2834,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
/* They could have slipped one in as we were doing that: make
 * sure it's written, then check again. */
smp_mb();
-   r = vhost_get_avail_idx(vq, &avail_idx);
-   if (r) {
-   vq_err(vq, "Failed to check avail idx at %p: %d\n",
-  &vq->avail->idx, r);
+   if (unlikely(vhost_get_avail_idx(vq)))

[PATCH v3 2/3] vhost: Add smp_rmb() in vhost_enable_notify()

2024-03-27 Thread Gavin Shan
A smp_rmb() has been missed in vhost_enable_notify(), inspired by
Will. Otherwise, it is not guaranteed that the available ring entries
pushed by the guest can be observed by vhost in time, leading to stale
available ring entries being fetched by vhost in vhost_get_vq_desc(),
as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=vnet0,vhost=true  \
  -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
   :
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_enable_notify(). When it returns true,
it means there are still pending tx buffers. Since it might read indices,
it can still bypass the smp_rmb() in vhost_get_vq_desc(). Note that
it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb
("vhost: cache avail index in vhost_enable_notify()").

Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()")
Cc:  # v5.18+
Reported-by: Yihuang Yu 
Suggested-by: Will Deacon 
Signed-off-by: Gavin Shan 
Acked-by: Jason Wang 
---
 drivers/vhost/vhost.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 29df65b2ebf2..32686c79c41d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
      &vq->avail->idx, r);
return false;
}
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Since we have updated avail_idx, the following
+* call to vhost_get_vq_desc() will read available
+* ring entries. Make sure that read happens after
+* the avail_idx read.
+*/
+   smp_rmb();
+   return true;
+   }
 
-   return vq->avail_idx != vq->last_avail_idx;
+   return false;
 }
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
-- 
2.44.0




[PATCH v3 1/3] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-27 Thread Gavin Shan
An smp_rmb() is missing in vhost_vq_avail_empty(), as spotted by
Will. Without it, there is no guarantee that the available ring entries
pushed by the guest are observed by vhost in time, so vhost can fetch
stale available ring entries in vhost_get_vq_desc(), as reported by
Yihuang Yu on NVidia's grace-hopper (ARM64) platform.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=vnet0,vhost=true  \
  -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
   :
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

Add the missing smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch()
returns true, there are still pending tx buffers. Since the new avail_idx
has already been cached here, the subsequent vhost_get_vq_desc() bypasses
its own smp_rmb(). Note that the code was safe until vq->avail_idx started
to be cached by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac69 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Suggested-by: Will Deacon 
Signed-off-by: Gavin Shan 
Acked-by: Jason Wang 
---
 drivers/vhost/vhost.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..29df65b2ebf2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
r = vhost_get_avail_idx(vq, &avail_idx);
if (unlikely(r))
return false;
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Since we have updated avail_idx, the following
+* call to vhost_get_vq_desc() will read available
+* ring entries. Make sure that read happens after
+* the avail_idx read.
+*/
+   smp_rmb();
+   return false;
+   }
 
-   return vq->avail_idx == vq->last_avail_idx;
+   return true;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-- 
2.44.0




[PATCH v3 0/3] vhost: Fix stale available ring entries

2024-03-27 Thread Gavin Shan
The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64)
platform. A wrong head (available ring entry) is observed by the guest
when 'netperf' runs on the guest against 'netserver' running on another
NVidia grace-grace machine.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=tap0,vhost=true   \
  -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0
   :
  guest# ifconfig eth0 | grep 'inet addr'
  inet addr:10.26.1.220
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

An smp_rmb() is missing in vhost_{vq_avail_empty, enable_notify}().
Without it, vq->avail_idx can be advanced while the corresponding
available ring entries haven't reached the vhost side yet, so a stale
available ring entry can be fetched in vhost_get_vq_desc().

Fix it by adding smp_rmb() in those two functions. Note that two separate
patches are used so that they can be easily picked up by the stable kernels.
With the changes applied, I'm unable to hit the issue again. Besides,
vhost_get_avail_idx() is improved to handle the memory barrier itself so
that its callers needn't worry about it.

v2: 
https://lore.kernel.org/virtualization/46c6a9aa-821c-4013-afe7-61ec05fc9...@redhat.com
v1: 
https://lore.kernel.org/virtualization/66e12633-b2d6-4b9a-9103-bb79770fc...@redhat.com

Changelog
=
v3:
  Improved change log (Jason)
  Improved comments and added PATCH[v3 3/3] to execute
  smp_rmb() in vhost_get_avail_idx()  (Michael)


Gavin Shan (3):
  vhost: Add smp_rmb() in vhost_vq_avail_empty()
  vhost: Add smp_rmb() in vhost_enable_notify()
  vhost: Improve vhost_get_avail_idx() with smp_rmb()

 drivers/vhost/vhost.c | 51 ---
 1 file changed, 24 insertions(+), 27 deletions(-)

-- 
2.44.0




Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-27 Thread Gavin Shan

On 3/27/24 14:08, Gavin Shan wrote:

On 3/27/24 12:44, Jason Wang wrote:

On Wed, Mar 27, 2024 at 10:34 AM Jason Wang  wrote:

On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan  wrote:


A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host -cpu host  \
   -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
   -m 4096M,slots=16,maxmem=64G \
   -object memory-backend-ram,id=mem0,size=4096M    \
    :   \
   -netdev tap,id=vnet0,vhost=true  \
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
    :
   guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
   virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it
should be safe until vq->avail_idx is changed by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac697 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/vhost/vhost.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..00445ab172b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
 r = vhost_get_avail_idx(vq, _idx);
 if (unlikely(r))
 return false;
+
 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+    * to ensure the available ring entries have been exposed
+    * by guest.
+    */


We need to be more verbose here. For example, which load needs to be
ordered with which load.

The rmb in vhost_get_vq_desc() is used to order the load of avail idx
and the load of head. It is paired with e.g virtio_wmb() in
virtqueue_add_split().

vhost_vq_avail_empty() are mostly used as a hint in
vhost_net_busy_poll() which is under the protection of the vq mutex.

An exception is the tx_can_batch(), but in that case it doesn't even
want to read the head.


Ok, if it is needed only in that path, maybe we can move the barriers there.



[cc Will Deacon]

Jason, I appreciate your review and comments. I think PATCH[1/2] is
a fix for the hypothetical case, while PATCH[2/2] is the real fix for
our issue. However, it would be nice to fix both in one shot. I will try
with PATCH[2/2] only to see whether our issue disappears; it definitely
still exists if PATCH[2/2] is dropped.



Jason, PATCH[2/2] is sufficient to fix our current issue. I tried with
PATCH[2/2] only and was unable to hit the issue. However, PATCH[1/2] may
be needed by other scenarios, so it would be nice to fix both in one shot.



Firstly, we were failing on the transmit queue with {tvq, rvq}->busyloop_timeout
== false, if I remember correctly. So the added smp_rmb() in vhost_vq_avail_empty()
is only a concern for tx_can_batch(). A mutex isn't enough to order the load of
the available index against the load of the available ring entry (head). For
example, vhost_vq_avail_empty() called from tx_can_batch() can see the next
available index, but its corresponding available ring entry (head) may not be
visible to vhost yet if the smp_rmb() is missing. The next call to get_tx_bufs()
then runs before the available ring entry (head) has arrived, so a stale
available ring entry (head) is fetched.

   handle_tx_copy
     get_tx_bufs // smp_rmb() won't be executed when vq->avail_idx 
!= vq->last_avail_idx
     tx_can_batch
   vhost_vq_avail_empty  // vq->avail_idx is updated from vq->avail->idx

The reason why I added smp_rmb() to vhost_vq_avail_empty() is that the function
is an exported API, even though it's only used by drivers/vhost/net.c at present.
It means the API has been broken internally. So it seems more appropriate to fix
it up in vhost_vq_avail_empty() itself, so that the API's users needn't worry
about the memory access order.
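
To make the required pairing concrete, below is a minimal userspace litmus-style
sketch (the names and the pthread setup are made up for illustration; this is
not the kernel code and not a reliable reproducer). The release store stands in
for the guest's virtio_wmb() after filling the ring entry, and the acquire load
stands in for the smp_rmb() being added on the vhost side between the avail_idx
load and the ring entry load:

  /* litmus.c - build with: gcc -O2 -std=c11 -pthread litmus.c -o litmus */
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  #define QSZ 256                              /* mirrors vq->num here       */

  static _Atomic unsigned short ring[QSZ];     /* stands in for avail->ring[] */
  static _Atomic unsigned short avail_idx;     /* stands in for avail->idx    */

  /* Guest side: write the ring entry first, then publish the new index.  The
   * release store plays the role of virtio_wmb() in virtqueue_add_split(). */
  static void *guest_publish(void *arg)
  {
          for (unsigned short i = 0; i < QSZ; i++) {
                  atomic_store_explicit(&ring[i], i + 1, memory_order_relaxed);
                  atomic_store_explicit(&avail_idx, i + 1, memory_order_release);
          }
          return NULL;
  }

  /* Host side: load the index, then load the entries it covers.  The acquire
   * load plays the role of the smp_rmb() on the vhost side: without it, the
   * later entry load could be satisfied early and return a stale (zero) head. */
  static void *vhost_consume(void *arg)
  {
          unsigned short last = 0;

          while (last < QSZ) {
                  unsigned short idx =
                          atomic_load_explicit(&avail_idx, memory_order_acquire);
                  while (last != idx) {
                          unsigned short head =
                                  atomic_load_explicit(&ring[last], memory_order_relaxed);
                          if (head != last + 1)
                                  fprintf(stderr, "stale head %u at index %u\n", head, last);
                          last++;
                  }
          }
          return NULL;
  }

  int main(void)
  {
          pthread_t g, h;

          pthread_create(&h, NULL, vhost_consume, NULL);
          pthread_create(&g, NULL, guest_publish, NULL);
          pthread_join(g, NULL);
          pthread_join(h, NULL);
          return 0;
  }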





+   smp_rmb();
+   return false;
+   }

-   return vq->avail_idx == vq->last_avail_idx;
+   return true;
  }
  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);


Thanks,
Gavin




Re: [PATCH v2 2/2] vhost: Add smp_rmb() in vhost_enable_notify()

2024-03-26 Thread Gavin Shan

On 3/27/24 12:41, Jason Wang wrote:

On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan  wrote:


A smp_rmb() has been missed in vhost_enable_notify(), inspired by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host -cpu host  \
   -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
   -m 4096M,slots=16,maxmem=64G \
   -object memory-backend-ram,id=mem0,size=4096M\
:   \
   -netdev tap,id=vnet0,vhost=true  \
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
:
   guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
   virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_enable_notify(). Note that it
should be safe until vq->avail_idx is changed by commit d3bb267bbdcb
("vhost: cache avail index in vhost_enable_notify()").

Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()")
Cc:  # v5.18+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/vhost/vhost.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 00445ab172b3..58f9d6a435f0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2847,9 +2847,18 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
   &vq->avail->idx, r);
 return false;
 }
+
 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/
+   smp_rmb();
+   return true;
+   }

-   return vq->avail_idx != vq->last_avail_idx;
+   return false;


So we only care about the case when vhost_enable_notify() returns true.

In that case, I think you want to order with vhost_get_vq_desc():

last_avail_idx = vq->last_avail_idx;

if (vq->avail_idx == vq->last_avail_idx) { /* false */
}

vhost_get_avail_head(vq, &ring_head, last_avail_idx)

Assuming I understand the patch correctly.

Acked-by: Jason Wang 



Jason, thanks for your review and comments. Your understanding matches
mine exactly.
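
For reference, here is a commented sketch of that caller sequence (the helper
name is invented and this is not a verbatim copy of drivers/vhost/net.c; it
only illustrates which two loads the barrier orders and why it must run before
vhost_enable_notify() returns true):

  static int poll_tx_once(struct vhost_net *net, struct vhost_virtqueue *vq)
  {
          unsigned int out, in;
          int head;

          head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
                                   &out, &in, NULL, NULL);
          if (head == vq->num && unlikely(vhost_enable_notify(&net->dev, vq))) {
                  /*
                   * true: avail_idx moved past last_avail_idx, i.e. the guest
                   * queued new buffers while notification was being re-enabled.
                   * vhost_enable_notify() has already cached the new avail_idx,
                   * so the retry below skips its own avail_idx reload (and the
                   * smp_rmb() that follows that reload) and reads the ring head
                   * directly.  That is why the smp_rmb() must be executed by
                   * vhost_enable_notify() itself.
                   */
                  vhost_disable_notify(&net->dev, vq);
                  head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
                                           &out, &in, NULL, NULL);
          }

          return head;
  }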




  }
  EXPORT_SYMBOL_GPL(vhost_enable_notify);



Thanks,
Gavin




Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-26 Thread Gavin Shan

On 3/27/24 12:44, Jason Wang wrote:

On Wed, Mar 27, 2024 at 10:34 AM Jason Wang  wrote:

On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan  wrote:


A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host -cpu host  \
   -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
   -m 4096M,slots=16,maxmem=64G \
   -object memory-backend-ram,id=mem0,size=4096M\
:   \
   -netdev tap,id=vnet0,vhost=true  \
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
:
   guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
   virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it
should be safe until vq->avail_idx is changed by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac697 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/vhost/vhost.c | 11 ++-
  1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..00445ab172b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
 r = vhost_get_avail_idx(vq, _idx);
 if (unlikely(r))
 return false;
+
 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/


We need to be more verbose here. For example, which load needs to be
ordered with which load.

The rmb in vhost_get_vq_desc() is used to order the load of avail idx
and the load of head. It is paired with e.g virtio_wmb() in
virtqueue_add_split().

vhost_vq_avail_empty() are mostly used as a hint in
vhost_net_busy_poll() which is under the protection of the vq mutex.

An exception is the tx_can_batch(), but in that case it doesn't even
want to read the head.


Ok, if it is needed only in that path, maybe we can move the barriers there.



[cc Will Deacon]

Jason, I appreciate your review and comments. I think PATCH[1/2] is
a fix for the hypothetical case, while PATCH[2/2] is the real fix for
our issue. However, it would be nice to fix both in one shot. I will try
with PATCH[2/2] only to see whether our issue disappears; it definitely
still exists if PATCH[2/2] is dropped.

Firstly, we were failing on the transmit queue with {tvq, rvq}->busyloop_timeout
== false, if I remember correctly. So the added smp_rmb() in vhost_vq_avail_empty()
is only a concern for tx_can_batch(). A mutex isn't enough to order the load of
the available index against the load of the available ring entry (head). For
example, vhost_vq_avail_empty() called from tx_can_batch() can see the next
available index, but its corresponding available ring entry (head) may not be
visible to vhost yet if the smp_rmb() is missing. The next call to get_tx_bufs()
then runs before the available ring entry (head) has arrived, so a stale
available ring entry (head) is fetched.

  handle_tx_copy
get_tx_bufs // smp_rmb() won't be executed when vq->avail_idx 
!= vq->last_avail_idx
tx_can_batch
  vhost_vq_avail_empty  // vq->avail_idx is updated from vq->avail->idx

The reason why I added smp_rmb() to vhost_vq_avail_empty() is that the function
is an exported API, even though it's only used by drivers/vhost/net.c at present.
It means the API has been broken internally. So it seems more appropriate to fix
it up in vhost_vq_avail_empty() itself, so that the API's users needn't worry
about the memory access order.





+   smp_rmb();
+   return false;
+   }

-   return vq->avail_idx == vq->last_avail_idx;
+   return true;
  }
  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);


Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-26 Thread Gavin Shan



On 3/27/24 09:14, Gavin Shan wrote:

On 3/27/24 01:46, Will Deacon wrote:

On Tue, Mar 26, 2024 at 11:43:13AM +, Will Deacon wrote:

Ok, long shot after eyeballing the vhost code, but does the diff below
help at all? It looks like vhost_vq_avail_empty() can advance the value
saved in 'vq->avail_idx' but without the read barrier, possibly confusing
vhost_get_vq_desc() in polling mode.

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..87bff710331a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2801,6 +2801,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
 return false;
 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   smp_rmb();
 return vq->avail_idx == vq->last_avail_idx;
  }
  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);



Thanks, Will. I had already noticed that an smp_rmb() is missing in
vhost_vq_avail_empty(). The issue still exists after smp_rmb() is added there.
However, inspired by your suggestion, I rechecked the code and it seems another
smp_rmb() is missing in vhost_enable_notify().

With smp_rmb() added to both vhost_vq_avail_empty() and vhost_enable_notify(),
I'm unable to hit the issue. I will run more rounds of the test to make sure
the issue is really resolved. After that, I will post formal patches for review.



Thanks again, Will. The formal patches have been sent for review.

https://lkml.org/lkml/2024/3/27/40

Thanks,
Gavin




Re: [PATCH v2 0/2] vhost: Fix stale available ring entries

2024-03-26 Thread Gavin Shan

On 3/27/24 09:38, Gavin Shan wrote:

The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64)
platform. The wrong head (available ring entry) is seen by the guest
when running 'netperf' on the guest and running 'netserver' on another
NVidia's grace-grace machine.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host -cpu host  \
   -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
   -m 4096M,slots=16,maxmem=64G \
   -object memory-backend-ram,id=mem0,size=4096M\
:   \
   -netdev tap,id=tap0,vhost=true   \
   -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0
:
   guest# ifconfig eth0 | grep 'inet addr'
   inet addr:10.26.1.220
   guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
   virtio_net virtio0: output.0:id 100 is not a head!

There is missed smp_rmb() in vhost_vq_avail_empty() and vhost_enable_notify().
Without smp_rmb(), vq->avail_idx is increased but the available ring
entries aren't arriving to vhost side yet. So a stale available ring
entry can be fetched in vhost_get_vq_desc().

Fix it by adding smp_rmb() in those two functions. Note that I need
two patches so that they can be easily picked up by the stable kernel.
With the changes, I'm unable to hit the issue again.

Gavin Shan (2):
   vhost: Add smp_rmb() in vhost_vq_avail_empty()
   vhost: Add smp_rmb() in vhost_enable_notify()

  drivers/vhost/vhost.c | 22 --
  1 file changed, 20 insertions(+), 2 deletions(-)



Sorry, I was supposed to copy Will. Adding him now.

Thanks,
Gavin




[PATCH v2 2/2] vhost: Add smp_rmb() in vhost_enable_notify()

2024-03-26 Thread Gavin Shan
A smp_rmb() has been missed in vhost_enable_notify(), inspired by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=vnet0,vhost=true  \
  -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
   :
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_enable_notify(). Note that it
should be safe until vq->avail_idx is changed by commit d3bb267bbdcb
("vhost: cache avail index in vhost_enable_notify()").

Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()")
Cc:  # v5.18+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 00445ab172b3..58f9d6a435f0 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2847,9 +2847,18 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
     &vq->avail->idx, r);
return false;
}
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/
+   smp_rmb();
+   return true;
+   }
 
-   return vq->avail_idx != vq->last_avail_idx;
+   return false;
 }
 EXPORT_SYMBOL_GPL(vhost_enable_notify);
 
-- 
2.44.0




[PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-26 Thread Gavin Shan
A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will Deacon . Otherwise, it's not ensured the
available ring entries pushed by guest can be observed by vhost
in time, leading to stale available ring entries fetched by vhost
in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's
grace-hopper (ARM64) platform.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=vnet0,vhost=true  \
  -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
   :
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it
should be safe until vq->avail_idx is changed by commit 275bf960ac697
("vhost: better detection of available buffers").

Fixes: 275bf960ac697 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
 drivers/vhost/vhost.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..00445ab172b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
r = vhost_get_avail_idx(vq, &avail_idx);
if (unlikely(r))
return false;
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Similar to what's done in vhost_get_vq_desc(), we need
+* to ensure the available ring entries have been exposed
+* by guest.
+*/
+   smp_rmb();
+   return false;
+   }
 
-   return vq->avail_idx == vq->last_avail_idx;
+   return true;
 }
 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
 
-- 
2.44.0




[PATCH v2 0/2] vhost: Fix stale available ring entries

2024-03-26 Thread Gavin Shan
The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64)
platform. The wrong head (available ring entry) is seen by the guest
when running 'netperf' on the guest and running 'netserver' on another
NVidia's grace-grace machine.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host -cpu host  \
  -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
  -m 4096M,slots=16,maxmem=64G \
  -object memory-backend-ram,id=mem0,size=4096M\
   :   \
  -netdev tap,id=tap0,vhost=true   \
  -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0
   :
  guest# ifconfig eth0 | grep 'inet addr'
  inet addr:10.26.1.220
  guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
  virtio_net virtio0: output.0:id 100 is not a head!

There is missed smp_rmb() in vhost_vq_avail_empty() and vhost_enable_notify().
Without smp_rmb(), vq->avail_idx is increased but the available ring
entries aren't arriving to vhost side yet. So a stale available ring
entry can be fetched in vhost_get_vq_desc().

Fix it by adding smp_rmb() in those two functions. Note that I need
two patches so that they can be easily picked up by the stable kernel.
With the changes, I'm unable to hit the issue again.

Gavin Shan (2):
  vhost: Add smp_rmb() in vhost_vq_avail_empty()
  vhost: Add smp_rmb() in vhost_enable_notify()

 drivers/vhost/vhost.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

-- 
2.44.0




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-26 Thread Gavin Shan



On 3/27/24 01:46, Will Deacon wrote:

On Tue, Mar 26, 2024 at 11:43:13AM +, Will Deacon wrote:

Ok, long shot after eyeballing the vhost code, but does the diff below
help at all? It looks like vhost_vq_avail_empty() can advance the value
saved in 'vq->avail_idx' but without the read barrier, possibly confusing
vhost_get_vq_desc() in polling mode.

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..87bff710331a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2801,6 +2801,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
 return false;
 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
  
+   smp_rmb();

 return vq->avail_idx == vq->last_avail_idx;
  }
  EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);



Thanks, Will. I had already noticed that an smp_rmb() is missing in
vhost_vq_avail_empty(). The issue still exists after smp_rmb() is added there.
However, inspired by your suggestion, I rechecked the code and it seems another
smp_rmb() is missing in vhost_enable_notify().

With smp_rmb() added to both vhost_vq_avail_empty() and vhost_enable_notify(),
I'm unable to hit the issue. I will run more rounds of the test to make sure
the issue is really resolved. After that, I will post formal patches for review.

Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-25 Thread Gavin Shan



On 3/20/24 17:14, Michael S. Tsirkin wrote:

On Wed, Mar 20, 2024 at 03:24:16PM +1000, Gavin Shan wrote:

On 3/20/24 10:49, Michael S. Tsirkin wrote:>

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6f7e5010a673..79456706d0bd 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -685,7 +685,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
/* Put entry in available array (but don't update avail->idx until they
 * do sync). */
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
-   vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
+   u16 headwithflag = head | (vq->split.avail_idx_shadow & ~(vq->split.vring.num - 1));
+   vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, headwithflag);
/* Descriptors and available array need to be set before we expose the
 * new available array entries. */



Ok, Michael. I continued with my debugging code. It still looks like a
hardware bug on NVidia's grace-hopper, and I really think NVidia needs to be
involved in the discussion, as you suggested.

Firstly, I bound the vhost process and the vCPU thread to CPU#71 and CPU#70
respectively. Note that I have only one vCPU in my configuration.

Secondly, the debugging code is enhanced so that the available head for
(last_avail_idx - 1) is read twice and both values are recorded. In other
words, the available head for one specific available index is read twice in
a row, and I do see the two consecutive reads return different heads. More
details are shared below.

From the guest side
===

virtio_net virtio0: output.0:id 86 is not a head!
head to be released: 047 062 112

avail_idx:
000  49665
001  49666  <--
 :
015  49664

avail_head:
000  062
001  047  <--
 :
015  112

From the host side
==

avail_idx
000  49663
001  49666  <---
 :

avail_head
000  062  (062)
001  047  (047)  <---
 :
015  086  (112)  // head 086 is returned from the first read,
 // but head 112 is returned from the second read

vhost_get_vq_desc: Inconsistent head in two read (86 -> 112) for avail_idx 49664
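
For reference, a hypothetical debug helper along those lines (vhost_get_avail(),
vhost16_to_cpu() and vq_err() are the existing vhost primitives; the helper
itself is only an illustration of the double-read check, not the exact code
used for the dump above):

  static void vhost_debug_reread_head(struct vhost_virtqueue *vq, u16 idx)
  {
          __virtio16 raw1, raw2;
          u16 head1, head2;

          /* Read the same available ring slot twice in a row ... */
          if (vhost_get_avail(vq, raw1, &vq->avail->ring[idx & (vq->num - 1)]))
                  return;
          if (vhost_get_avail(vq, raw2, &vq->avail->ring[idx & (vq->num - 1)]))
                  return;

          /* ... and complain if the two reads disagree. */
          head1 = vhost16_to_cpu(vq, raw1);
          head2 = vhost16_to_cpu(vq, raw2);
          if (head1 != head2)
                  vq_err(vq, "Inconsistent head in two reads (%u -> %u) for avail_idx %u\n",
                         head1, head2, idx);
  }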

Thanks,
Gavin





Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-21 Thread Gavin Shan

On 3/21/24 03:15, Keir Fraser wrote:

On Wed, Mar 20, 2024 at 03:24:16PM +1000, Gavin Shan wrote:


Before this patch was posted, I had debugging code to record last 16 
transactions
to the available and used queue from guest and host side. It did reveal the 
wrong
head was fetched from the available queue.

[   11.785745]  virtqueue_get_buf_ctx_split 
[   11.786238] virtio_net virtio0: output.0:id 74 is not a head!
[   11.786655] head to be released: 036 077
[   11.786952]
[   11.786952] avail_idx:
[   11.787234] 000  63985  <--
[   11.787237] 001  63986
[   11.787444] 002  63987
[   11.787632] 003  63988
[   11.787821] 004  63989
[   11.788006] 005  63990
[   11.788194] 006  63991
[   11.788381] 007  63992
[   11.788567] 008  63993
[   11.788772] 009  63994
[   11.788957] 010  63995
[   11.789141] 011  63996
[   11.789327] 012  63997
[   11.789515] 013  63998
[   11.789701] 014  63999
[   11.789886] 015  64000


Does the error always occur at such a round idx value?

Here, 64000 == 0xFA00. Maybe coincidence but it's improbable enough to be 
interesting.

This debug code seems rather useful!



Keir, nope, it's just a coincidence. We don't see any such pattern.

Thanks,
Gavin





[   11.790068]
[   11.790068] avail_head:
[   11.790529] 000  075  <--
[   11.790718] 001  036
[   11.790890] 002  077
[   11.791061] 003  129
[   11.791231] 004  072
[   11.791400] 005  130
[   11.791574] 006  015
[   11.791748] 007  074
[   11.791918] 008  130
[   11.792094] 009  130
[   11.792263] 010  074
[   11.792437] 011  015
[   11.792617] 012  072
[   11.792788] 013  129
[   11.792961] 014  077// The last two heads from guest to host: 077, 036
[   11.793134] 015  036

[root@nvidia-grace-hopper-05 qemu.main]# cat /proc/vhost

avail_idx
000  63998
001  64000
002  63954  <---
003  63955
004  63956
005  63974
006  63981
007  63984
008  63986
009  63987
010  63988
011  63989
012  63992
013  63993
014  63995
015  63997

avail_head
000  074
001  015
002  072
003  129
004  074// The last two heads seen by vhost is: 074, 036
005  036
006  075  <---
007  036
008  077
009  129
010  072
011  130
012  015
013  074
014  130
015  130

used_idx
000  64000
001  63882  <---
002  63889
003  63891
004  63898
005  63936
006  63942
007  63946
008  63949
009  63953
010  63957
011  63981
012  63990
013  63992
014  63993
015  63999

used_head
000  072
001  129
002  074  // The last two heads published to guest is: 074, 036
003  036
004  075  <---
005  036
006  077
007  129
008  072
009  130
010  015
011  074
012  130
013  130
014  074
015  015

Thanks,
Gavin











Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan
On 3/20/24 10:49, Michael S. Tsirkin wrote:> 

I think you are wasting the time with these tests. Even if it helps what
does this tell us? Try setting a flag as I suggested elsewhere.
Then check it in vhost.
Or here's another idea - possibly easier. Copy the high bits from index
into ring itself. Then vhost can check that head is synchronized with
index.

Warning: completely untested, not even compiled. But should give you
the idea. If this works btw we should consider making this official in
the spec.


  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6f7e5010a673..79456706d0bd 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -685,7 +685,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
/* Put entry in available array (but don't update avail->idx until they
 * do sync). */
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
-   vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
+   u16 headwithflag = head | (vq->split.avail_idx_shadow & ~(vq->split.vring.num - 1));
+   vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, headwithflag);
  
  	/* Descriptors and available array need to be set before we expose the

 * new available array entries. */

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..bd8f7c763caa 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1299,8 +1299,15 @@ static inline int vhost_get_avail_idx(struct 
vhost_virtqueue *vq,
  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
   __virtio16 *head, int idx)
  {
-   return vhost_get_avail(vq, *head,
+   unsigned i = idx;
+   unsigned flag = i & ~(vq->num - 1);
+   unsigned val = vhost_get_avail(vq, *head,
   &vq->avail->ring[idx & (vq->num - 1)]);
+   unsigned valflag = val & ~(vq->num - 1);
+
+   WARN_ON(valflag != flag);
+
+   return val & (vq->num - 1);
  }
  


Thanks, Michael. The code is self-explanatory. Since vq->num is 256, I just
squeezed the low bits of the avail index into the high byte of the ring entry.
Unfortunately, I'm unable to hit the WARN_ON(). Does that mean the low byte is
stale (or corrupted) while the high byte is still correct and valid?

avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] =
cpu_to_virtio16(_vq->vdev, head | (avail << 8));


head = vhost16_to_cpu(vq, ring_head);
WARN_ON((head >> 8) != (vq->last_avail_idx % vq->num));
head = head & 0xff;

One question: does QEMU have any chance of writing data to the available queue
while vhost is enabled? My previous understanding is no, the queue is totally
owned by vhost rather than QEMU.
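
As a side note, the tagging scheme itself reduces to a few lines of arithmetic.
The standalone snippet below (made-up names, plain userspace C, nothing
vhost-specific) shows the encode/check round trip for a 256-entry queue:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  #define QSZ 256

  /* Guest side: put the low bits of the avail index into the high byte. */
  static uint16_t encode(uint16_t head, uint16_t avail_idx)
  {
          return (uint16_t)(head | ((avail_idx % QSZ) << 8));
  }

  int main(void)
  {
          uint16_t avail_idx = 63999;            /* index being published    */
          uint16_t entry = encode(74, avail_idx);

          /* Host side: the high byte must match the index being consumed. */
          assert((entry >> 8) == (avail_idx % QSZ));
          printf("head %u tagged with idx %u\n", entry & 0xff, entry >> 8);
          return 0;
  }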

Before this patch was posted, I had debugging code to record the last 16
transactions to the available and used queues from the guest and host sides.
It did reveal that a wrong head was fetched from the available queue.

[   11.785745]  virtqueue_get_buf_ctx_split 
[   11.786238] virtio_net virtio0: output.0:id 74 is not a head!
[   11.786655] head to be released: 036 077
[   11.786952]
[   11.786952] avail_idx:
[   11.787234] 000  63985  <--
[   11.787237] 001  63986
[   11.787444] 002  63987
[   11.787632] 003  63988
[   11.787821] 004  63989
[   11.788006] 005  63990
[   11.788194] 006  63991
[   11.788381] 007  63992
[   11.788567] 008  63993
[   11.788772] 009  63994
[   11.788957] 010  63995
[   11.789141] 011  63996
[   11.789327] 012  63997
[   11.789515] 013  63998
[   11.789701] 014  63999
[   11.789886] 015  64000
[   11.790068]
[   11.790068] avail_head:
[   11.790529] 000  075  <--
[   11.790718] 001  036
[   11.790890] 002  077
[   11.791061] 003  129
[   11.791231] 004  072
[   11.791400] 005  130
[   11.791574] 006  015
[   11.791748] 007  074
[   11.791918] 008  130
[   11.792094] 009  130
[   11.792263] 010  074
[   11.792437] 011  015
[   11.792617] 012  072
[   11.792788] 013  129
[   11.792961] 014  077// The last two heads from guest to host: 077, 036
[   11.793134] 015  036

[root@nvidia-grace-hopper-05 qemu.main]# cat /proc/vhost

avail_idx
000  63998
001  64000
002  63954  <---
003  63955
004  63956
005  63974
006  63981
007  63984
008  63986
009  63987
010  63988
011  63989
012  63992
013  63993
014  63995
015  63997

avail_head
000  074
001  015
002  072
003  129
004  074// The last two heads seen by vhost is: 074, 036
005  036
006  075  <---
007  036
008  077
009  129
010  072
011  130
012  015
013  074
014  130
015  130

used_idx
000  64000
001  63882  <---
002  63889
003  63891
004  63898
005  63936
006  63942
007  63946
008  63949
009  63953
010  63957
011  63981
012  63990
013  63992
014  63993
015  63999

used_head
000  072
001  129
002  074  // The 

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan

On 3/20/24 04:22, Will Deacon wrote:

On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote:

On 3/19/24 02:59, Will Deacon wrote:

   drivers/virtio/virtio_ring.c | 12 +---
   1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing different from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smb_mb()).

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works
from hardware level. I agree it's not the solution to replace DMB with DSB
before we fully understand the root cause.

I tried the possible replacement like below. __smp_mb() can avoid the issue like
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
 :
 /* Put entry in available array (but don't update avail->idx until they
  * do sync). */
 avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
 vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

 /* Descriptors and available array need to be set before we expose the
  * new available array entries. */
 // Broken: virtio_wmb(vq->weak_barriers);
 // Broken: __dma_mb();
 // Work:   __mb();
 // Work:   __smp_mb();


It's pretty weird that __dma_mb() is "broken" but __smp_mb() "works". How
confident are you in that result?



Yes, __dma_mb() is even stronger than __smp_mb(). I retried the test, showing
that both __dma_mb() and __smp_mb() work for us. I had too many tests yesterday
and something may have been messed up.

Instruction    Times the issue was hit in 10 runs
---------------------------------------------------
__smp_wmb()    8
__smp_mb()     0
__dma_wmb()    7
__dma_mb()     0
__mb()         0
__wmb()        0

It's strange that __smp_mb() works, but __smp_wmb() fails. It seems we need a
read barrier here. I will try WRITE_ONCE() + __smp_wmb() as suggested by Michael
in another reply. Will update the result soon.
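
For what it's worth, the variant I plan to try looks roughly like the fragment
below. This is only my reading of the suggestion, not Michael's exact code:
keep the store->store barrier, but make both publishing stores WRITE_ONCE() so
the compiler can neither tear nor reorder them.

          /* Sketch only: what WRITE_ONCE() + __smp_wmb() could look like in
           * virtqueue_add_split(); the exact form suggested may differ. */
          avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
          WRITE_ONCE(vq->split.vring.avail->ring[avail],
                     cpu_to_virtio16(_vq->vdev, head));

          /* Store->store barrier before exposing the new index. */
          __smp_wmb();

          vq->split.avail_idx_shadow++;
          WRITE_ONCE(vq->split.vring.avail->idx,
                     cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow));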

Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan

On 3/19/24 17:09, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 04:49:50PM +1000, Gavin Shan wrote:


On 3/19/24 16:43, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 04:38:49PM +1000, Gavin Shan wrote:

On 3/19/24 16:09, Michael S. Tsirkin wrote:


diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing different from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smb_mb()).

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works
from hardware level. I agree it's not the solution to replace DMB with DSB
before we fully understand the root cause.

I tried the possible replacement like below. __smp_mb() can avoid the issue like
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
   :
   /* Put entry in available array (but don't update avail->idx until 
they
* do sync). */
   avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
   vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, 
head);

   /* Descriptors and available array need to be set before we expose 
the
* new available array entries. */
   // Broken: virtio_wmb(vq->weak_barriers);
   // Broken: __dma_mb();
   // Work:   __mb();
   // Work:   __smp_mb();
   // Work:   __ndelay(100);
   // Work:   __ndelay(10);
   // Broken: __ndelay(9);

  vq->split.avail_idx_shadow++;
   vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
   vq->split.avail_idx_shadow);


What if you stick __ndelay here?



 /* Put entry in available array (but don't update avail->idx until they
   * do sync). */
  avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
  vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

  /* Descriptors and available array need to be set before we expose the
   * new available array entries. */
  virtio_wmb(vq->weak_barriers);
  vq->split.avail_idx_shadow++;
  vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
  vq->split.avail_idx_shadow);
  /* Try __ndelay(x) here as Michael suggested
   *
* Work:  __ndelay(200);    possibly makes it harder to reproduce
   * Broken:__ndelay(100);
   * Broken:__ndelay(20);
   * Broken:__ndelay(10);
   */
  __ndelay(200);


So we see that just changing the timing masks the race.
What are you using on the host side? vhost or qemu?



__ndelay(200) may make the issue harder to reproduce, as I understand it.
A longer delay here gives vhost more breathing room, reducing the chance of
hitting the race.

The issue is only reproducible when vhost is turned on. Otherwise, we
aren't able to hit the issue.

-netdev 
tap,id=vnet0,vhost=true,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \
-device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0



Given it's vhost, it's also possible that the is

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan

On 3/19/24 17:04, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 04:54:15PM +1000, Gavin Shan wrote:

On 3/19/24 16:10, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 02:09:34AM -0400, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote:

On 3/19/24 02:59, Will Deacon wrote:

[...]

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing different from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smb_mb()).

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works
from hardware level. I agree it's not the solution to replace DMB with DSB
before we fully understand the root cause.

I tried the possible replacement like below. __smp_mb() can avoid the issue like
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
  :
  /* Put entry in available array (but don't update avail->idx until 
they
   * do sync). */
  avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
  vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

  /* Descriptors and available array need to be set before we expose the
   * new available array entries. */
  // Broken: virtio_wmb(vq->weak_barriers);
  // Broken: __dma_mb();
  // Work:   __mb();
  // Work:   __smp_mb();


Did you try __smp_wmb ? And wmb?



virtio_wmb(false) is equivalent to __smp_wmb(), which is broken.

__wmb() works as well. No issue found with it.


Oh interesting. So how do smp_mb() and wmb() disassemble on this
platform? Can you please check?



I don't see them being translated wrongly on NVidia's grace-hopper:

===> virtio_wmb(vq->weak_barriers)

0x8000807b07c8 <+1168>:  ldrbw0, [x20, #66]
0x8000807b07cc <+1172>:  cbz w0, 0x8000807b089c 

0x8000807b07d0 <+1176>:  dmb ishst // same to __smp_wmb()
:
0x8000807b089c <+1380>:  dmb oshst // same to __dma_wmb()
0x8000807b08a0 <+1384>:  b   0x8000807b07d4 


===> wmb()

0x8000807b07c8 <+1168>:  dsb st





  // Work:   __ndelay(100);
  // Work:   __ndelay(10);
  // Broken: __ndelay(9);

 vq->split.avail_idx_shadow++;
  vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
  vq->split.avail_idx_shadow);


What if you stick __ndelay here?


And keep virtio_wmb above?



The result has been shared through a separate reply.




  vq->num_added++;

  pr_debug("Added buffer head %i to %p\n", head, vq);
  END_USE(vq);
  :
}

I also tried to measure the time consumed by various barrier-related
instructions using ktime_get_ns(), which itself likely accounts for most of
the measured time. __smp_mb() is slower than __smp_wmb() but faster than
__mb():

  Instruction   Range of used time in ns
  ----------------------------------------
  __smp_wmb()   [32, 1128032]
  __smp_mb()    [32, 1160096]
  __mb()        [32, 1162496]
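
The measurement itself is nothing more than wrapping the barrier in a pair of
ktime_get_ns() calls, along the lines of the sketch below (illustrative only);
the ~32ns lower bound in every row is presumably dominated by the clock reads
themselves:

  static void measure_barrier_cost(void)
  {
          u64 t0, t1;

          t0 = ktime_get_ns();
          __smp_mb();                     /* barrier under test */
          t1 = ktime_get_ns();

          pr_info("__smp_mb(): %llu ns (includes ktime_get_ns() overhead)\n",
                  (unsigned long long)(t1 - t0));
  }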



Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan

On 3/19/24 16:10, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 02:09:34AM -0400, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote:

On 3/19/24 02:59, Will Deacon wrote:

[...]

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing different from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smb_mb()).

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works
from hardware level. I agree it's not the solution to replace DMB with DSB
before we fully understand the root cause.

I tried the possible replacement like below. __smp_mb() can avoid the issue like
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
 :
 /* Put entry in available array (but don't update avail->idx until they
  * do sync). */
 avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
 vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

 /* Descriptors and available array need to be set before we expose the
  * new available array entries. */
 // Broken: virtio_wmb(vq->weak_barriers);
 // Broken: __dma_mb();
 // Work:   __mb();
 // Work:   __smp_mb();


Did you try __smp_wmb ? And wmb?



virtio_wmb(false) is equivalent to __smp_wmb(), which is broken.

__wmb() works as well. No issue found with it.


 // Work:   __ndelay(100);
 // Work:   __ndelay(10);
 // Broken: __ndelay(9);

vq->split.avail_idx_shadow++;
 vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
 vq->split.avail_idx_shadow);


What if you stick __ndelay here?


And keep virtio_wmb above?



The result has been shared through a separate reply.




 vq->num_added++;

 pr_debug("Added buffer head %i to %p\n", head, vq);
 END_USE(vq);
 :
}

I also tried to measure the time consumed by various barrier-related
instructions using ktime_get_ns(), which itself likely accounts for most of
the measured time. __smp_mb() is slower than __smp_wmb() but faster than
__mb():

 Instruction   Range of used time in ns
 ----------------------------------------
 __smp_wmb()   [32, 1128032]
 __smp_mb()    [32, 1160096]
 __mb()        [32, 1162496]



Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan



On 3/19/24 16:43, Michael S. Tsirkin wrote:

On Tue, Mar 19, 2024 at 04:38:49PM +1000, Gavin Shan wrote:

On 3/19/24 16:09, Michael S. Tsirkin wrote:


diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing different from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smb_mb()).

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works
from hardware level. I agree it's not the solution to replace DMB with DSB
before we fully understand the root cause.

I tried the possible replacement like below. __smp_mb() can avoid the issue like
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
  :
  /* Put entry in available array (but don't update avail->idx until 
they
   * do sync). */
  avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
  vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

  /* Descriptors and available array need to be set before we expose the
   * new available array entries. */
  // Broken: virtio_wmb(vq->weak_barriers);
  // Broken: __dma_mb();
  // Work:   __mb();
  // Work:   __smp_mb();
  // Work:   __ndelay(100);
  // Work:   __ndelay(10);
  // Broken: __ndelay(9);

 vq->split.avail_idx_shadow++;
  vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
  vq->split.avail_idx_shadow);


What if you stick __ndelay here?



/* Put entry in available array (but don't update avail->idx until they
  * do sync). */
 avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
 vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

 /* Descriptors and available array need to be set before we expose the
  * new available array entries. */
 virtio_wmb(vq->weak_barriers);
 vq->split.avail_idx_shadow++;
 vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
 vq->split.avail_idx_shadow);
 /* Try __ndelay(x) here as Michael suggested
  *
 * Work:  __ndelay(200);    possibly makes it harder to reproduce
  * Broken:__ndelay(100);
  * Broken:__ndelay(20);
  * Broken:__ndelay(10);
  */
 __ndelay(200);


So we see that just changing the timing masks the race.
What are you using on the host side? vhost or qemu?



__ndelay(200) may make the issue harder to reproduce, as I understand it.
A longer delay here gives vhost more breathing room, reducing the chance of
hitting the race.

The issue is only reproducible when vhost is turned on. Otherwise, we
aren't able to hit the issue.

   -netdev 
tap,id=vnet0,vhost=true,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0






  vq->num_added++;

  pr_debug("Added buffer head %i to %p\n", head, vq);
  END_USE(vq);
  :
}

I also tried to measure the consumed time fo

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-19 Thread Gavin Shan

On 3/19/24 16:09, Michael S. Tsirkin wrote:


diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing difference from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smp_mb())?

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB work
at the hardware level. I agree that replacing DMB with DSB isn't the solution
before we fully understand the root cause.

I tried the possible replacements below. __smp_mb() can avoid the issue, as
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
 :
 /* Put entry in available array (but don't update avail->idx until they
  * do sync). */
 avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
 vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

 /* Descriptors and available array need to be set before we expose the
  * new available array entries. */
 // Broken: virtio_wmb(vq->weak_barriers);
 // Broken: __dma_mb();
 // Work:   __mb();
 // Work:   __smp_mb();
 // Work:   __ndelay(100);
 // Work:   __ndelay(10);
 // Broken: __ndelay(9);

vq->split.avail_idx_shadow++;
 vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
 vq->split.avail_idx_shadow);


What if you stick __ndelay here?



   /* Put entry in available array (but don't update avail->idx until they
 * do sync). */
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

/* Descriptors and available array need to be set before we expose the
 * new available array entries. */
virtio_wmb(vq->weak_barriers);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);
/* Try __ndelay(x) here as Michael suggested
 *
 * Work:  __ndelay(200);   possibly makes it harder to reproduce
 * Broken:__ndelay(100);
 * Broken:__ndelay(20);
 * Broken:__ndelay(10);
 */
__ndelay(200);





 vq->num_added++;

 pr_debug("Added buffer head %i to %p\n", head, vq);
 END_USE(vq);
 :
}

I also tried to measure the consumed time for various barrier-related
instructions using ktime_get_ns(), which itself should account for most of
the measured time. __smp_mb() is slower than __smp_wmb() but faster than
__mb():

 Instruction     Range of used time in ns
 -----------------------------------------
 __smp_wmb()     [32, 1128032]
 __smp_mb()      [32, 1160096]
 __mb()          [32, 1162496]
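For reference, a minimal sketch of how such a measurement could be done
(illustrative only, not the code actually used; the module name and loop
count are made up):

#include <linux/module.h>
#include <linux/timekeeping.h>
#include <asm/barrier.h>

static int __init barrier_time_init(void)
{
	u64 t0, t1;
	int i;

	for (i = 0; i < 16; i++) {
		t0 = ktime_get_ns();
		__smp_mb();	/* swap in __smp_wmb() or __mb() to compare */
		t1 = ktime_get_ns();
		pr_info("barrier + ktime_get_ns() pair: %llu ns\n", t1 - t0);
	}

	return 0;
}

static void __exit barrier_time_exit(void)
{
}

module_init(barrier_time_init);
module_exit(barrier_time_exit);
MODULE_LICENSE("GPL");

As noted, the pair of ktime_get_ns() calls accounts for most of the measured
time, so the numbers above bound the barrier cost rather than isolate it.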



Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-18 Thread Gavin Shan

On 3/19/24 02:59, Will Deacon wrote:

On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote:

The issue is reported by Yihuang Yu, who ran a 'netperf' test on
NVidia's grace-grace and grace-hopper machines. The 'netperf'
client is started in the VM hosted by the grace-hopper machine,
while the 'netperf' server is running on the grace-grace machine.

The VM is started with virtio-net and vhost has been enabled.
We observe an error message spew from the VM and then a soft-lockup
report. The error message indicates the data associated with
the descriptor (index: 135) has been released, and the queue
is marked as broken. It eventually leads to the endless effort
to fetch a free buffer (skb) in drivers/net/virtio_net.c::start_xmit()
and a soft-lockup. The stale index 135 is fetched from the available
ring and published to the used ring by vhost, meaning we have
disordered writes to the available ring element and available index.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host\
  : \
   -netdev tap,id=vnet0,vhost=on\
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \

   [   19.993158] virtio_net virtio1: output.0:id 135 is not a head!

Fix the issue by replacing virtio_wmb(vq->weak_barriers) with the stronger
virtio_mb(false), equivalent to replacing the 'dmb' instruction with 'dsb' on
ARM64. It should work for other architectures, but performance loss is
expected.

Cc: sta...@vger.kernel.org
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/virtio/virtio_ring.c | 12 +---
  1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
  
-	/* Descriptors and available array need to be set before we expose the

-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);


Replacing a DMB with a DSB is _very_ unlikely to be the correct solution
here, especially when ordering accesses to coherent memory.

In practice, either the larger timing difference from the DSB or the fact
that you're going from a Store->Store barrier to a full barrier is what
makes things "work" for you. Have you tried, for example, a DMB SY
(e.g. via __smp_mb())?

We definitely shouldn't take changes like this without a proper
explanation of what is going on.



Thanks for your comments, Will.

Yes, DMB should work for us. However, it seems this instruction has issues on
NVidia's grace-hopper. It's hard for me to understand how DMB and DSB work
at the hardware level. I agree that replacing DMB with DSB isn't the solution
before we fully understand the root cause.

I tried the possible replacements below. __smp_mb() can avoid the issue, as
__mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't.

static inline int virtqueue_add_split(struct virtqueue *_vq, ...)
{
:
/* Put entry in available array (but don't update avail->idx until they
 * do sync). */
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

/* Descriptors and available array need to be set before we expose the
 * new available array entries. */
// Broken: virtio_wmb(vq->weak_barriers);
// Broken: __dma_mb();
// Work:   __mb();
// Work:   __smp_mb();
// Work:   __ndelay(100);
// Work:   __ndelay(10);
// Broken: __ndelay(9);

   vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);
vq->num_added++;

pr_debug("Added buffer head %i to %p\n", head, vq)

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-17 Thread Gavin Shan

On 3/18/24 02:50, Michael S. Tsirkin wrote:

On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote:


On 3/15/24 21:05, Michael S. Tsirkin wrote:

On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:

Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I
tried to reproduce it with my own driver where one thread writes to the shared
buffer and another thread reads from the buffer. I don't hit the out-of-order
issue so far.


Make sure the 2 areas you are accessing are in different cache lines.



Yes, I already put those 2 areas in separate cache lines.




My driver may not be correct somewhere, and I will update if I can reproduce
the issue with my driver in the future.


Then maybe your change is just making virtio slower and masks the bug
that is actually elsewhere?

You don't really need a driver. Here's a simple test: without barriers the
assertion will fail. With barriers it will not.
(Warning: didn't bother testing too much, could be buggy.)

---

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <assert.h>

#define FIRST values[0]
#define SECOND values[64]

volatile int values[100] = {};

void* writer_thread(void* arg) {
while (1) {
FIRST++;
// NEED smp_wmb here

 __asm__ volatile("dmb ishst" : : : "memory");

SECOND++;
}
}

void* reader_thread(void* arg) {
  while (1) {
int first = FIRST;
// NEED smp_rmb here

 __asm__ volatile("dmb ishld" : : : "memory");

int second = SECOND;
assert(first - second == 1 || first - second == 0);
  }
}

int main() {
  pthread_t writer, reader;

  pthread_create(&writer, NULL, writer_thread, NULL);
  pthread_create(&reader, NULL, reader_thread, NULL);

  pthread_join(writer, NULL);
  pthread_join(reader, NULL);

  return 0;
}
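(A hedged aside: the exact build flags weren't given in the thread; the test
above is plain C with pthreads and relies only on the inline "dmb" asm shown.)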



Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
the assert on both of them. After replacing 'dmb' with 'dsb', I can
hit assert on both of them too. I need to look at the code closely.

[root@virt-mtcollins-02 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 
0' failed.
Aborted (core dumped)

[root@nvidia-grace-hopper-05 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 
0' failed.
Aborted (core dumped)

Thanks,
Gavin



Actually this test is broken. No ordering is needed; it's a simple race.
The following works on x86 though (x86 does not need barriers
anyway).


#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <assert.h>

#if 0
#define x86_rmb()  asm volatile("lfence":::"memory")
#define x86_mb()  asm volatile("mfence":::"memory")
#define x86_smb()  asm volatile("sfence":::"memory")
#else
#define x86_rmb()  asm volatile("":::"memory")
#define x86_mb()  asm volatile("":::"memory")
#define x86_smb()  asm volatile("":::"memory")
#endif

#define FIRST values[0]
#define SECOND values[640]
#define FLAG values[1280]

volatile unsigned values[2000] = {};

void* writer_thread(void* arg) {
while (1) {
/* Now synchronize with reader */
while(FLAG);
FIRST++;
x86_smb();
SECOND++;
x86_smb();
FLAG = 1;
}
}

void* reader_thread(void* arg) {
 while (1) {
/* Now synchronize with writer */
while(!FLAG);
x86_rmb();
unsigned first = FIRST;
x86_rmb();
unsigned second = SECOND;
assert(first - second == 1 || first - second == 0);
FLAG = 0;

if (!(first %100))
printf("%d\n", first);
}
}

int main() {
 pthread_t writer, reader;

 pthread_create(&writer, NULL, writer_thread, NULL);
 pthread_create(&reader, NULL, reader_thread, NULL);

 pthread_join(writer, NULL);
 pthread_join(reader, NULL);

 return 0;
}
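For what it's worth, arm64 equivalents that could be dropped in place of the
x86_* macros above would look roughly like this (the arm64_* names are mine,
not from the original test):

/* arm64 variants mirroring smp_rmb()/smp_wmb()/smp_mb() */
#define arm64_rmb()  asm volatile("dmb ishld" ::: "memory")
#define arm64_wmb()  asm volatile("dmb ishst" ::: "memory")
#define arm64_mb()   asm volatile("dmb ish"   ::: "memory")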



I tried it on the host and a VM of NVidia's grace-hopper. Without the barriers,
I can hit the assert. With the barriers, it's working fine without hitting the
assert.

I also had some code to mimic the virtio vring last weekend, and it's just
working well. Back to our original issue, __smp_wmb() is issued by the guest
while __smp_rmb() is executed on the host. The VM and host are running at
different exception levels: EL2 vs EL1. I'm not sure whether that's the cause.
I need to modify my code so that __smp_wmb() and __smp_rmb() can be executed
from guest and host respectively.

[gshan@gshan code]$ cat test.h
#ifndef __TEST_H
#define __TEST_H

struct vring_desc {
	uint64_t	addr;
	uint32_t	len;
	uint16_t	flags;
	uint16_t	next;
} __attribute__((aligned(4)));

struct vring_avail {
	uint16_t	flags;
	uint16_t	idx;
	uint16_t	ring[];
} __attribute__((aligned(4)));

struct vring_used_elem {
	uint32_t	id;
	uint32_t	len;
} __attribute__((aligned(4)));

stru

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-15 Thread Gavin Shan



On 3/15/24 21:05, Michael S. Tsirkin wrote:

On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:

Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I
tried to reproduce it with my own driver where one thread writes to the shared
buffer and another thread reads from the buffer. I don't hit the out-of-order
issue so far.


Make sure the 2 areas you are accessing are in different cache lines.



Yes, I already put those 2 areas in separate cache lines.




My driver may not be correct somewhere, and I will update if I can reproduce
the issue with my driver in the future.


Then maybe your change is just making virtio slower and masks the bug
that is actually elsewhere?

You don't really need a driver. Here's a simple test: without barriers the
assertion will fail. With barriers it will not.
(Warning: didn't bother testing too much, could be buggy.)

---

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <assert.h>

#define FIRST values[0]
#define SECOND values[64]

volatile int values[100] = {};

void* writer_thread(void* arg) {
while (1) {
FIRST++;
// NEED smp_wmb here

__asm__ volatile("dmb ishst" : : : "memory");

SECOND++;
}
}

void* reader_thread(void* arg) {
 while (1) {
int first = FIRST;
// NEED smp_rmb here

__asm__ volatile("dmb ishld" : : : "memory");

int second = SECOND;
assert(first - second == 1 || first - second == 0);
 }
}

int main() {
 pthread_t writer, reader;

 pthread_create(&writer, NULL, writer_thread, NULL);
 pthread_create(&reader, NULL, reader_thread, NULL);

 pthread_join(writer, NULL);
 pthread_join(reader, NULL);

 return 0;
}



Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
the assert on both of them. After replacing 'dmb' with 'dsb', I can
hit assert on both of them too. I need to look at the code closely.

[root@virt-mtcollins-02 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 
0' failed.
Aborted (core dumped)

[root@nvidia-grace-hopper-05 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 
0' failed.
Aborted (core dumped)

Thanks,
Gavin




Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-15 Thread Gavin Shan



+ Will, Catalin and Matt from Nvidia

On 3/14/24 22:59, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 10:50:15PM +1000, Gavin Shan wrote:

On 3/14/24 21:50, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 08:15:22PM +1000, Gavin Shan wrote:

On 3/14/24 18:05, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote:

The issue is reported by Yihuang Yu, who ran a 'netperf' test on
NVidia's grace-grace and grace-hopper machines. The 'netperf'
client is started in the VM hosted by the grace-hopper machine,
while the 'netperf' server is running on the grace-grace machine.

The VM is started with virtio-net and vhost has been enabled.
We observe an error message spew from the VM and then a soft-lockup
report. The error message indicates the data associated with
the descriptor (index: 135) has been released, and the queue
is marked as broken. It eventually leads to the endless effort
to fetch a free buffer (skb) in drivers/net/virtio_net.c::start_xmit()
and a soft-lockup. The stale index 135 is fetched from the available
ring and published to the used ring by vhost, meaning we have
disordered writes to the available ring element and available index.

 /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
 -accel kvm -machine virt,gic-version=host\
: \
 -netdev tap,id=vnet0,vhost=on\
 -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \

 [   19.993158] virtio_net virtio1: output.0:id 135 is not a head!

Fix the issue by replacing virtio_wmb(vq->weak_barriers) with the stronger
virtio_mb(false), equivalent to replacing the 'dmb' instruction with 'dsb' on
ARM64. It should work for other architectures, but performance loss is
expected.

Cc: sta...@vger.kernel.org
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
drivers/virtio/virtio_ring.c | 12 +---
1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);



I don't get what is going on here. Any explanation why virtio_wmb is not
enough besides "it does not work"?



The change is replacing the instruction "dmb" with "dsb". "dsb" is a stronger
barrier than "dmb" because "dsb" ensures that all memory accesses raised before
this instruction are completed when the 'dsb' instruction completes. However,
"dmb" doesn't guarantee the order of completion of the memory accesses.

So 'vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow)'
can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'.


Completed as observed by which CPU?
We have 2 writes that we want observed by another CPU in order.
So if a CPU observes a new value of idx we want it to see
the new value in the ring.
This is the standard use of smp_wmb().
How are these 2 writes different?

What DMB does, it seems, is ensure that the effects
of 'vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow)'
are observed after the effects of
'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'.
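To make the pairing explicit, here is a minimal sketch of the pattern being
described (illustrative only, with made-up names; not the actual virtio or
vhost code):

#include <linux/types.h>
#include <linux/compiler.h>
#include <asm/barrier.h>

/* Toy model of the split ring publish/consume pair. */
struct demo_avail {
	u16 idx;	/* stands in for avail->idx    */
	u16 ring[256];	/* stands in for avail->ring[] */
};

/* Guest/producer side, mirroring the publish order in virtqueue_add_split(). */
static void demo_publish(struct demo_avail *a, u16 shadow_idx, u16 head)
{
	a->ring[shadow_idx & 255] = head;
	smp_wmb();			/* ring entry visible before idx */
	WRITE_ONCE(a->idx, shadow_idx + 1);
}

/* Host/consumer side, mirroring the consume order in vhost_get_vq_desc(). */
static int demo_consume(struct demo_avail *a, u16 last_avail_idx, u16 *head)
{
	u16 idx = READ_ONCE(a->idx);

	if (idx == last_avail_idx)
		return 0;		/* nothing new */
	smp_rmb();			/* read idx before ring entry */
	*head = a->ring[last_avail_idx & 255];
	return 1;
}

If the idx store could be observed before the ring store, the consumer would
read a stale head, which is exactly the "id 135 is not a head!" symptom in
the report.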




Completed as observed by the CPU where the vhost worker is running. I don't
think DMB does the work here. If I'm understanding correctly, DMB ensures the
order of these two writes from the local CPU's standpoint.


No this makes no sense at all. All memory accesses are in order from
local CPU standpoint.



It's true if the compiler doesn't reorder the accesses and a light-weight
barrier like 'dmb' or 'isb' is used. Otherwise, the accesses can still be
reordered on the local CPU, correct?


The written data can be stored in the local
CPU's cache, not flushed to DRAM and propagated to the cache of the far CPU
where the vhost worker is running. So DMB isn't ensuring the written data is
observed by the far CPU.

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-14 Thread Gavin Shan

On 3/14/24 21:50, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 08:15:22PM +1000, Gavin Shan wrote:

On 3/14/24 18:05, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote:

The issue is reported by Yihuang Yu, who ran a 'netperf' test on
NVidia's grace-grace and grace-hopper machines. The 'netperf'
client is started in the VM hosted by the grace-hopper machine,
while the 'netperf' server is running on the grace-grace machine.

The VM is started with virtio-net and vhost has been enabled.
We observe an error message spew from the VM and then a soft-lockup
report. The error message indicates the data associated with
the descriptor (index: 135) has been released, and the queue
is marked as broken. It eventually leads to the endless effort
to fetch a free buffer (skb) in drivers/net/virtio_net.c::start_xmit()
and a soft-lockup. The stale index 135 is fetched from the available
ring and published to the used ring by vhost, meaning we have
disordered writes to the available ring element and available index.

/home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
-accel kvm -machine virt,gic-version=host\
   : \
-netdev tap,id=vnet0,vhost=on\
-device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \

[   19.993158] virtio_net virtio1: output.0:id 135 is not a head!

Fix the issue by replacing virtio_wmb(vq->weak_barriers) with the stronger
virtio_mb(false), equivalent to replacing the 'dmb' instruction with 'dsb' on
ARM64. It should work for other architectures, but performance loss is
expected.

Cc: sta...@vger.kernel.org
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
   drivers/virtio/virtio_ring.c | 12 +---
   1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);



I don't get what is going on here. Any explanation why virtio_wmb is not
enough besides "it does not work"?



The change is replacing the instruction "dmb" with "dsb". "dsb" is a stronger
barrier than "dmb" because "dsb" ensures that all memory accesses raised before
this instruction are completed when the 'dsb' instruction completes. However,
"dmb" doesn't guarantee the order of completion of the memory accesses.

So 'vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow)'
can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'.


Completed as observed by which CPU?
We have 2 writes that we want observed by another CPU in order.
So if a CPU observes a new value of idx we want it to see
the new value in the ring.
This is the standard use of smp_wmb().
How are these 2 writes different?

What DMB does, it seems, is ensure that the effects
of 'vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow)'
are observed after the effects of
'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'.




Completed as observed by the CPU where the vhost worker is running. I don't
think DMB does the work here. If I'm understanding correctly, DMB ensures the
order of these two writes from the local CPU's standpoint. The written data
can be stored in the local CPU's cache, not flushed to DRAM and propagated to
the cache of the far CPU where the vhost worker is running. So DMB isn't
ensuring the written data is observed by the far CPU.

DSB ensures that the written data is observable from the far CPU immediately.
 




The stronger barrier 'dsb' ensures the completion order as we expected.

 virtio_wmb(true)     virtio_mb(false)
   virt_wmb             mb
   __smp_wmb            __mb
   dmb(ishst)           dsb(sy)


First, why would you want a non smp barrier when you are

Re: [PATCH] virtio_ring: Fix the stale index in available ring

2024-03-14 Thread Gavin Shan

On 3/14/24 18:05, Michael S. Tsirkin wrote:

On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote:

The issue is reported by Yihuang Yu, who ran a 'netperf' test on
NVidia's grace-grace and grace-hopper machines. The 'netperf'
client is started in the VM hosted by the grace-hopper machine,
while the 'netperf' server is running on the grace-grace machine.

The VM is started with virtio-net and vhost has been enabled.
We observe an error message spew from the VM and then a soft-lockup
report. The error message indicates the data associated with
the descriptor (index: 135) has been released, and the queue
is marked as broken. It eventually leads to the endless effort
to fetch a free buffer (skb) in drivers/net/virtio_net.c::start_xmit()
and a soft-lockup. The stale index 135 is fetched from the available
ring and published to the used ring by vhost, meaning we have
disordered writes to the available ring element and available index.

   /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
   -accel kvm -machine virt,gic-version=host\
  : \
   -netdev tap,id=vnet0,vhost=on\
   -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \

   [   19.993158] virtio_net virtio1: output.0:id 135 is not a head!

Fix the issue by replacing virtio_wmb(vq->weak_barriers) with the stronger
virtio_mb(false), equivalent to replacing the 'dmb' instruction with 'dsb' on
ARM64. It should work for other architectures, but performance loss is
expected.

Cc: sta...@vger.kernel.org
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
  drivers/virtio/virtio_ring.c | 12 +---
  1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
  
-	/* Descriptors and available array need to be set before we expose the

-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);



I don't get what is going on here. Any explanation why virtio_wmb is not
enough besides "it does not work"?



The change is replacing the instruction "dmb" with "dsb". "dsb" is a stronger
barrier than "dmb" because "dsb" ensures that all memory accesses raised before
this instruction are completed when the 'dsb' instruction completes. However,
"dmb" doesn't guarantee the order of completion of the memory accesses.

So 'vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow)'
can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'.
The stronger barrier 'dsb' ensures the completion order as we expected.

virtio_wmb(true)      virtio_mb(false)
  virt_wmb              mb
  __smp_wmb             __mb
  dmb(ishst)            dsb(sy)
  


Extracted from the ARMv9 specification:

The DMB instruction is a memory barrier instruction that ensures the relative
order of memory accesses before the barrier with memory accesses after the
barrier. The DMB instruction _does not_ ensure the completion of any of the
memory accesses for which it ensures relative order.

A DSB instruction is a memory barrier that ensures that memory accesses that
occur before the DSB instruction have __completed__ before the completion of
the DSB instruction. In doing this, it acts as a stronger barrier than a DMB
and all ordering that is created by a DMB with specific options is also 
generated
by a DSB with the same options.
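For reference, the two helpers in question expand roughly as below
(paraphrased and simplified; check include/linux/virtio_ring.h and
arch/arm64/include/asm/barrier.h for the exact definitions):

static inline void virtio_wmb(bool weak_barriers)
{
	if (weak_barriers)
		virt_wmb();	/* __smp_wmb() -> "dmb ishst" on arm64 */
	else
		dma_wmb();	/* "dmb oshst" on arm64 (not used on this path) */
}

static inline void virtio_mb(bool weak_barriers)
{
	if (weak_barriers)
		virt_mb();	/* __smp_mb() -> "dmb ish" on arm64 */
	else
		mb();		/* __mb() -> "dsb sy" on arm64 */
}

So the proposed change from virtio_wmb(vq->weak_barriers) to virtio_mb(false)
swaps the "dmb ishst" for a full "dsb sy", which is why it is a much stronger
(and more expensive) barrier than the ordering argument strictly requires.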


vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);
--
2.44.0




Thanks,
Gavin




[PATCH] virtio_ring: Fix the stale index in available ring

2024-03-14 Thread Gavin Shan
The issue is reported by Yihuang Yu, who ran a 'netperf' test on
NVidia's grace-grace and grace-hopper machines. The 'netperf'
client is started in the VM hosted by the grace-hopper machine,
while the 'netperf' server is running on the grace-grace machine.

The VM is started with virtio-net and vhost has been enabled.
We observe an error message spew from the VM and then a soft-lockup
report. The error message indicates the data associated with
the descriptor (index: 135) has been released, and the queue
is marked as broken. It eventually leads to the endless effort
to fetch a free buffer (skb) in drivers/net/virtio_net.c::start_xmit()
and a soft-lockup. The stale index 135 is fetched from the available
ring and published to the used ring by vhost, meaning we have
disordered writes to the available ring element and available index.

  /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
  -accel kvm -machine virt,gic-version=host\
 : \
  -netdev tap,id=vnet0,vhost=on\
  -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \

  [   19.993158] virtio_net virtio1: output.0:id 135 is not a head!

Fix the issue by replacing virtio_wmb(vq->weak_barriers) with the stronger
virtio_mb(false), equivalent to replacing the 'dmb' instruction with 'dsb' on
ARM64. It should work for other architectures, but performance loss is
expected.

Cc: sta...@vger.kernel.org
Reported-by: Yihuang Yu 
Signed-off-by: Gavin Shan 
---
 drivers/virtio/virtio_ring.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 49299b1f9ec7..7d852811c912 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
 
-   /* Descriptors and available array need to be set before we expose the
-* new available array entries. */
-   virtio_wmb(vq->weak_barriers);
+   /*
+* Descriptors and available array need to be set before we expose
+* the new available array entries. virtio_wmb() should be enough
+* to ensure the order theoretically. However, a stronger barrier
+* is needed by ARM64. Otherwise, the stale data can be observed
+* by the host (vhost). A stronger barrier should work for other
+* architectures, but performance loss is expected.
+*/
+   virtio_mb(false);
vq->split.avail_idx_shadow++;
vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
vq->split.avail_idx_shadow);
-- 
2.44.0




[PATCH v2 3/3] KVM: arm64: Don't retrieve memory slot again in page fault handler

2021-03-15 Thread Gavin Shan
We needn't retrieve the memory slot again in user_mem_abort() because
the corresponding memory slot has been passed from the caller. This
would save some CPU cycles. For example, the time used to write 1GB
memory, which is backed by 2MB hugetlb pages and write-protected, is
dropped by 6.8% from 928ms to 864ms.

Signed-off-by: Gavin Shan 
Reviewed-by: Keqian Zhu 
---
 arch/arm64/kvm/mmu.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 192e0df2fc8e..2491b40a294a 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -843,10 +843,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 * unmapped afterwards, the call to kvm_unmap_hva will take it away
 * from us again properly. This smp_rmb() interacts with the smp_wmb()
 * in kvm_mmu_notifier_invalidate_.
+*
+* Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
+* used to avoid unnecessary overhead introduced to locate the memory
+* slot because it's always fixed even @gfn is adjusted for huge pages.
 */
smp_rmb();
 
-   pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, );
+   pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+  write_fault, , NULL);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -912,7 +917,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
kvm_set_pfn_dirty(pfn);
-   mark_page_dirty(kvm, gfn);
+   mark_page_dirty_in_slot(kvm, memslot, gfn);
}
 
 out_unlock:
-- 
2.23.0



[PATCH v2 1/3] KVM: arm64: Hide kvm_mmu_wp_memory_region()

2021-03-15 Thread Gavin Shan
We needn't expose the function as it's only used by mmu.c since it
was introduced by commit c64735554c0a ("KVM: arm: Add initial dirty
page locking support").

Signed-off-by: Gavin Shan 
Reviewed-by: Keqian Zhu 
---
 arch/arm64/include/asm/kvm_host.h | 1 -
 arch/arm64/kvm/mmu.c  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..688f2df1957b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -632,7 +632,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
})
 
 void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8711894db8c2..28f3b3736dc8 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -555,7 +555,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, 
phys_addr_t addr, phys_addr_
  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
  * serializing operations for VM memory regions.
  */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-- 
2.23.0



[PATCH v2 2/3] KVM: arm64: Use find_vma_intersection()

2021-03-15 Thread Gavin Shan
find_vma_intersection() already exists to search for the intersecting
vma. Use the function where it's applicable, to simplify the
code.

Signed-off-by: Gavin Shan 
Reviewed-by: Keqian Zhu 
---
 arch/arm64/kvm/mmu.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 28f3b3736dc8..192e0df2fc8e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
 
-   if (!vma || vma->vm_start >= reg_end)
+   vma = find_vma_intersection(current->mm, hva, reg_end);
+   if (!vma)
break;
 
/*
@@ -1329,10 +1330,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
 
-   if (!vma || vma->vm_start >= reg_end)
+   vma = find_vma_intersection(current->mm, hva, reg_end);
+   if (!vma)
break;
 
/*
-- 
2.23.0



[PATCH v2 0/3] KVM: arm64: Minor page fault handler improvement

2021-03-15 Thread Gavin Shan
The series includes several minor improvements to the stage-2 page fault
handler: PATCH[1/2] clean up the code. PATCH[3] doesn't retrieve
the memory slot again in the page fault handler, to save a few CPU cycles.

Changelog
=
v2:
   * Rebased to 5.12.rc3 and include r-bs from Keqian  (Gavin)
   * Drop patch to fix IPA limit boundary issue(Keqian)
   * Comments on why we use __gfn_to_pfn_memslot() (Keqian)

Gavin Shan (3):
  KVM: arm64: Hide kvm_mmu_wp_memory_region()
  KVM: arm64: Use find_vma_intersection()
  KVM: arm64: Don't retrieve memory slot again in page fault handler

 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kvm/mmu.c  | 21 ++---
 2 files changed, 14 insertions(+), 8 deletions(-)

-- 
2.23.0



Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()

2021-03-15 Thread Gavin Shan

Hi Keqian,

On 3/15/21 8:42 PM, Gavin Shan wrote:

On 3/15/21 7:04 PM, Keqian Zhu wrote:

On 2021/3/15 12:18, Gavin Shan wrote:

find_vma_intersection() already exists to search for the intersecting
vma. Use the function where it's applicable, to simplify the
code.

Signed-off-by: Gavin Shan 
---
  arch/arm64/kvm/mmu.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 84e70f953de6..286b603ed0d3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
   * ++
   */
  do {
-    struct vm_area_struct *vma = find_vma(current->mm, hva);
+    struct vm_area_struct *vma;
  hva_t vm_start, vm_end;
-    if (!vma || vma->vm_start >= reg_end)
+    vma = find_vma_intersection(current->mm, hva, reg_end);

Nit: Keeping the same style may be better (assign vma when declaring it).
Otherwise it looks good to me.



Yeah, I agree. I will adjust the code in v2 and include your r-b.
Thanks for taking the time to review.



After rechecking the code, I think it'd be better to keep the current style
because there is a follow-on validation on @vma. Keeping them together
seems a good idea. I think it won't be a big deal to you. So I will
keep the current style, with your r-b, in v2.

vma = find_vma_intersection(current->mm, hva, reg_end);
if (!vma)
 break;
Thanks,
Gavin
 

+    if (!vma)
  break;
  /*
@@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
   * ++
   */
  do {
-    struct vm_area_struct *vma = find_vma(current->mm, hva);
+    struct vm_area_struct *vma;
  hva_t vm_start, vm_end;
-    if (!vma || vma->vm_start >= reg_end)
+    vma = find_vma_intersection(current->mm, hva, reg_end);
+    if (!vma)
  break;
  /*









Re: [PATCH 4/4] KVM: arm64: Don't retrieve memory slot again in page fault handler

2021-03-15 Thread Gavin Shan

Hi Keqian,

On 3/15/21 7:25 PM, Keqian Zhu wrote:

On 2021/3/15 12:18, Gavin Shan wrote:

We needn't retrieve the memory slot again in user_mem_abort() because
the corresponding memory slot has been passed from the caller. This

I think you are right, though fault_ipa will be adjusted when we try to use
block mapping, the fault_supports_stage2_huge_mapping() makes sure we're not
trying to map anything not covered by the memslot, so the adjusted fault_ipa
still belongs to the memslot.



Yeah, it's correct. Besides, the @logging_active is determined
based on the passed memory slot. It means user_mem_abort() can't
support a memory range which spans multiple memory slots.


would save some CPU cycles. For example, the time used to write 1GB
memory, which is backed by 2MB hugetlb pages and write-protected, is
dropped by 6.8% from 928ms to 864ms.

Signed-off-by: Gavin Shan 
---
  arch/arm64/kvm/mmu.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a5a8ade9fde4..4a4abcccfafb 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -846,7 +846,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 */
smp_rmb();
  
-	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, );

+   pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+  write_fault, , NULL);

It's better to update the code comments at same time.



I guess you need some comments here? If so, I would add something
like below in v2:

/*
 * gfn_to_pfn_prot() could be used here as well, but with unnecessary
 * overhead introduced to locate the memory slot, because the memory
 * slot is always fixed even when @gfn is adjusted for huge pages.
 */


if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -912,7 +913,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
kvm_set_pfn_dirty(pfn);
-   mark_page_dirty(kvm, gfn);
+   mark_page_dirty_in_slot(kvm, memslot, gfn);
}
  
  out_unlock:




Thanks,
Gavin




Re: [PATCH 3/4] KVM: arm64: Fix address check for memory slot

2021-03-15 Thread Gavin Shan

Hi Keqian,

On 3/15/21 6:33 PM, Keqian Zhu wrote:

FYI, this has been fixed by Marc in commit 262b003d059c.



Yeah, I didn't check 5.12.rc3 code where the issue has been
fixed. So please ignore this one and sorry for the noise.

Thanks,
Gavin
 

On 2021/3/15 12:18, Gavin Shan wrote:

The last (IPA) page can't be specified when a new memory slot is
added. The error -EFAULT is returned when the memory slot is added
with the following parameters for a VM which has a 40-bit IPA
limit. The host has a 4KB base page size. This is not correct because
the last (IPA) page is still usable.

struct kvm_userspace_memory_region {
   __u32 slot;   /* 1*/
   __u32 flags;  /* 0*/
   __u64 guest_phys_addr;/* 0xfff000 */
   __u64 memory_size;/* 0x1000   */
   __u64 userspace_addr;
};

Signed-off-by: Gavin Shan 
---
  arch/arm64/kvm/mmu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 286b603ed0d3..a5a8ade9fde4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1313,7 +1313,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 * Prevent userspace from creating a memory region outside of the IPA
 * space addressable by the KVM guest IPA space.
 */
-   if (memslot->base_gfn + memslot->npages >=
+   if (memslot->base_gfn + memslot->npages >
(kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
  







Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()

2021-03-15 Thread Gavin Shan

Hi Keqian,

On 3/15/21 7:04 PM, Keqian Zhu wrote:

On 2021/3/15 12:18, Gavin Shan wrote:

find_vma_intersection() already exists to search for the intersecting
vma. Use the function where it's applicable, to simplify the
code.

Signed-off-by: Gavin Shan 
---
  arch/arm64/kvm/mmu.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 84e70f953de6..286b603ed0d3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
  
-		if (!vma || vma->vm_start >= reg_end)

+   vma = find_vma_intersection(current->mm, hva, reg_end);

Nit: Keeping the same style may be better (assign vma when declaring it).
Otherwise it looks good to me.



Yeah, I agree. I will adjust the code in v2 and include your r-b.
Thanks for taking the time to review.

Thanks,
Gavin

 

+   if (!vma)
break;
  
  		/*

@@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
  
-		if (!vma || vma->vm_start >= reg_end)

+   vma = find_vma_intersection(current->mm, hva, reg_end);
+   if (!vma)
break;
  
  		/*








Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()

2021-03-15 Thread Gavin Shan

Hi Marc,

On 3/15/21 7:52 PM, Marc Zyngier wrote:

On Mon, 15 Mar 2021 04:18:42 +,
Gavin Shan  wrote:


find_vma_intersection() already exists to search for the intersecting
vma. Use the function where it's applicable, to simplify the
code.

Signed-off-by: Gavin Shan 
---
  arch/arm64/kvm/mmu.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 84e70f953de6..286b603ed0d3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
  
-		if (!vma || vma->vm_start >= reg_end)

+   vma = find_vma_intersection(current->mm, hva, reg_end);


For context, here's the definition of find_vma_intersection():


static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * 
mm, unsigned long start_addr, unsigned long end_addr)
{
struct vm_area_struct * vma = find_vma(mm,start_addr);

if (vma && end_addr <= vma->vm_start)
vma = NULL;
return vma;
}


It seems that there is a boundary issue in either the old code or the
new one in the case where (reg_end == vma->vm_start).

Which one is which?



The old and new code are interchangeable, meaning "reg_end == vma->vm_start"
is treated as no intersection in both cases. So if there were a boundary
issue, the old and new code would have the same issue.

According to the code, "reg_end == vma->vm_start" is invalid. So I don't see
a boundary issue. Hopefully, I haven't missed anything :)
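For what it's worth, a tiny userspace mock of the boundary case (hypothetical
addresses, purely illustrative):

#include <assert.h>
#include <stddef.h>

/* Toy model of the check; not kernel code. */
struct toy_vma { unsigned long vm_start, vm_end; };

static struct toy_vma *toy_intersection(struct toy_vma *vma,
					unsigned long start, unsigned long end)
{
	/* start is what find_vma() would use; here the vma is passed in.
	 * Mirrors find_vma_intersection(): no vma if end <= vm_start. */
	(void)start;
	if (vma && end <= vma->vm_start)
		return NULL;
	return vma;
}

int main(void)
{
	struct toy_vma v = { .vm_start = 0x2000, .vm_end = 0x3000 };

	/* reg_end == vma->vm_start: no intersection, matching the old
	 * "vma->vm_start >= reg_end" check. */
	assert(toy_intersection(&v, 0x1000, 0x2000) == NULL);

	/* reg_end past vm_start: the vma now intersects the region. */
	assert(toy_intersection(&v, 0x1000, 0x2001) == &v);

	return 0;
}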

Thanks,
Gavin



[PATCH 4/4] KVM: arm64: Don't retrieve memory slot again in page fault handler

2021-03-14 Thread Gavin Shan
We needn't retrieve the memory slot again in user_mem_abort() because
the corresponding memory slot has been passed from the caller. This
would save some CPU cycles. For example, the time used to write 1GB
memory, which is backed by 2MB hugetlb pages and write-protected, is
dropped by 6.8% from 928ms to 864ms.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a5a8ade9fde4..4a4abcccfafb 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -846,7 +846,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 */
smp_rmb();
 
-   pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, );
+   pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+  write_fault, , NULL);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -912,7 +913,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
kvm_set_pfn_dirty(pfn);
-   mark_page_dirty(kvm, gfn);
+   mark_page_dirty_in_slot(kvm, memslot, gfn);
}
 
 out_unlock:
-- 
2.23.0



[PATCH 3/4] KVM: arm64: Fix address check for memory slot

2021-03-14 Thread Gavin Shan
The last (IPA) page can't be specified when a new memory slot is
added. The error -EFAULT is returned when the memory slot is added
with the following parameters for a VM which has a 40-bit IPA
limit. The host has a 4KB base page size. This is not correct because
the last (IPA) page is still usable.

   struct kvm_userspace_memory_region {
  __u32 slot;   /* 1*/
  __u32 flags;  /* 0*/
  __u64 guest_phys_addr;/* 0xfff000 */
  __u64 memory_size;/* 0x1000   */
  __u64 userspace_addr;
   };

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 286b603ed0d3..a5a8ade9fde4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1313,7 +1313,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 * Prevent userspace from creating a memory region outside of the IPA
 * space addressable by the KVM guest IPA space.
 */
-   if (memslot->base_gfn + memslot->npages >=
+   if (memslot->base_gfn + memslot->npages >
(kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
 
-- 
2.23.0



[PATCH 2/4] KVM: arm64: Use find_vma_intersection()

2021-03-14 Thread Gavin Shan
find_vma_intersection() already exists to search for the intersecting
vma. Use the function where it's applicable, to simplify the
code.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/mmu.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 84e70f953de6..286b603ed0d3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
 
-   if (!vma || vma->vm_start >= reg_end)
+   vma = find_vma_intersection(current->mm, hva, reg_end);
+   if (!vma)
break;
 
/*
@@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 * ++
 */
do {
-   struct vm_area_struct *vma = find_vma(current->mm, hva);
+   struct vm_area_struct *vma;
hva_t vm_start, vm_end;
 
-   if (!vma || vma->vm_start >= reg_end)
+   vma = find_vma_intersection(current->mm, hva, reg_end);
+   if (!vma)
break;
 
/*
-- 
2.23.0



[PATCH 1/4] KVM: arm64: Hide kvm_mmu_wp_memory_region()

2021-03-14 Thread Gavin Shan
We needn't expose the function as it's only used by mmu.c.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h | 1 -
 arch/arm64/kvm/mmu.c  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..688f2df1957b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -632,7 +632,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
})
 
 void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 77cb2d28f2a4..84e70f953de6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -555,7 +555,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, 
phys_addr_t addr, phys_addr_
  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
  * serializing operations for VM memory regions.
  */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-- 
2.23.0



[PATCH 0/4] KVM: arm64: Minor page fault handler improvement

2021-03-14 Thread Gavin Shan
The series includes several minor improvements to the stage-2 page fault
handler: PATCH[1/2] clean up the code. PATCH[3] fixes the address
range check on adding a new memory slot. PATCH[4] doesn't retrieve the memory
slot again in the page fault handler, to save a few CPU cycles.

Gavin Shan (4):
  KVM: arm64: Hide kvm_mmu_wp_memory_region()
  KVM: arm64: Use find_vma_intersection()
  KVM: arm64: Fix address check for memory slot
  KVM: arm64: Don't retrieve memory slot again in page fault handler

 arch/arm64/include/asm/kvm_host.h |  1 -
 arch/arm64/kvm/mmu.c  | 19 +++
 2 files changed, 11 insertions(+), 9 deletions(-)

-- 
2.23.0



[PATCH v2 17/17] KVM: arm64: Add async PF document

2021-02-08 Thread Gavin Shan
This adds a document to explain the interface for asynchronous page
fault and how it works in general.

Signed-off-by: Gavin Shan 
---
 Documentation/virt/kvm/arm/apf.rst   | 143 +++
 Documentation/virt/kvm/arm/index.rst |   1 +
 2 files changed, 144 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/apf.rst

diff --git a/Documentation/virt/kvm/arm/apf.rst 
b/Documentation/virt/kvm/arm/apf.rst
new file mode 100644
index ..4f5c01b6699f
--- /dev/null
+++ b/Documentation/virt/kvm/arm/apf.rst
@@ -0,0 +1,143 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Asynchronous Page Fault Support for arm64
+=
+
+There are two stages of page faults when the KVM module is enabled as an accelerator
+to the guest. The guest is responsible for handling the stage-1 page faults,
+while the host handles the stage-2 page faults. During the period of handling
+the stage-2 page faults, the guest is suspended until the requested page is
+ready. It could take several milliseconds, even hundreds of milliseconds in
+extreme situations because I/O might be required to move the requested page
+from disk to DRAM. The guest does not do any work when it is suspended. The
+Asynchronous Page Fault feature is introduced to take advantage of this
+suspended period and to improve the overall performance.
+
+There are two paths used to fulfil the asynchronous page fault, called
+the control path and the data path. The control path allows the VMM or guest
+to configure the functionality, while the notifications are delivered in the
+data path. The notifications are classified into page-not-present and
+page-ready notifications.
+
+Data Path
+-
+
+There are two types of notifications delivered from host to guest in the
+data path: the page-not-present and page-ready notifications. They are
+delivered through an SDEI event and a (PPI) interrupt respectively. Besides,
+there is a shared buffer between host and guest to indicate the reason and a
+sequential token, which is used to identify the asynchronous page fault. The
+reason and token residing in the shared buffer are written by the host, and
+read and cleared by the guest. An asynchronous page fault is delivered and
+completed as below.
+
+(1) When an asynchronous page fault starts, a (workqueue) worker is created
+and queued to the vCPU's pending queue. The worker makes the requested
+page ready and resident in DRAM in the background. The shared buffer is
+updated with the reason and sequential token. After that, an SDEI event is
+sent to the guest as the page-not-present notification.
+
+(2) When the SDEI event is received on the guest, the current process is
+tagged with TIF_ASYNC_PF and associated with a wait queue. The process is
+ready to keep rescheduling itself on switching from kernel to user mode.
+After that, a reschedule IPI is sent to the current CPU and the received
+SDEI event is acknowledged. Note that the IPI is delivered when the
+acknowledgment of the SDEI event is received on the host.
+
+(3) On the host, the worker is dequeued from the vCPU's pending queue and
+enqueued to its completion queue when the requested page becomes ready.
+In the meantime, a KVM_REQ_ASYNC_PF request is sent to the vCPU if the
+worker is the first element enqueued to the completion queue.
+
+(4) With a pending KVM_REQ_ASYNC_PF request, the first worker in the
+completion queue is dequeued and destroyed. In the meantime, a (PPI)
+interrupt is sent to the guest with the updated reason and token in the
+shared buffer.
+
+(5) When the (PPI) interrupt is received on the guest, the affected process
+is located using the token and woken up after its TIF_ASYNC_PF tag is
+cleared. After that, the interrupt is acknowledged through the SMCCC
+interface. The workers in the completion queue are dequeued and destroyed
+if any exist, and another (PPI) interrupt is sent to the guest.
+
+Control Path
+
+
+The configurations are passed through the SMCCC or ioctl interface. The SDEI
+event and (PPI) interrupt are owned by the VMM, so the SDEI event and interrupt
+numbers are configured through ioctl commands on a per-vCPU basis. Besides,
+the functionality might be enabled and configured through the ioctl interface
+by the VMM during migration:
+
+   * KVM_ARM_ASYNC_PF_CMD_GET_VERSION
+
+ Returns the current version of the feature, supported by the host. It is
+ made up of major, minor and revision fields. Each field is one byte in
+ length.
+
+   * KVM_ARM_ASYNC_PF_CMD_GET_SDEI:
+
+ Retrieve the SDEI event number, used for the page-not-present notification,
+ so that it can be configured on the destination VM in the migration
+ scenario.
+
+   * KVM_ARM_ASYNC_PF_GET_IRQ:
+
+ Retrieve the IRQ (PPI) number, used for the page-ready notification, so that
+ it can be configured on the destination VM in the migration scenario.
+
+   * KVM_ARM_ASYNC_PF_CMD_GET_CONTROL
+
+ Retrieve the address of control block, so that it can

[PATCH v2 15/17] arm64: Reschedule process on aync PF

2021-02-08 Thread Gavin Shan
The page-not-present notification is delivered by an SDEI event. The
guest reschedules the current process to another one when the SDEI event
is received. It's not safe to do so in the SDEI event handler because
the SDEI event should be acknowledged as soon as possible.

So the rescheduling is postponed until the current process switches
from kernel to user mode. In order to trigger the switch, the SDEI
event handler sends a (reschedule) IPI to the current CPU and it's delivered
in time after the SDEI event is acknowledged.

A new thread flag (TIF_ASYNC_PF) is introduced to track the state of the
process to be rescheduled. When the flag is set, a wait-queue head is
associated with the process. The process keeps rescheduling itself until
the flag is cleared when the page-ready notification is received through
the (PPI) interrupt.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/processor.h   |  1 +
 arch/arm64/include/asm/thread_info.h |  4 +++-
 arch/arm64/kernel/signal.c   | 17 +
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index ca2cd75d3286..2176c88c77a7 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -154,6 +154,7 @@ struct thread_struct {
u64 sctlr_tcf0;
u64 gcr_user_excl;
 #endif
+   void*data;
 };
 
 static inline void arch_thread_struct_whitelist(unsigned long *offset,
diff --git a/arch/arm64/include/asm/thread_info.h 
b/arch/arm64/include/asm/thread_info.h
index 9f4e3b266f21..939beb3c7723 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -65,6 +65,7 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define TIF_UPROBE 4   /* uprobe breakpoint or singlestep */
 #define TIF_MTE_ASYNC_FAULT5   /* MTE Asynchronous Tag Check Fault */
 #define TIF_NOTIFY_SIGNAL  6   /* signal notifications exist */
+#define TIF_ASYNC_PF   7   /* Asynchronous page fault */
 #define TIF_SYSCALL_TRACE  8   /* syscall trace active */
 #define TIF_SYSCALL_AUDIT  9   /* syscall auditing */
 #define TIF_SYSCALL_TRACEPOINT 10  /* syscall tracepoint for ftrace */
@@ -95,11 +96,12 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define _TIF_SVE   (1 << TIF_SVE)
 #define _TIF_MTE_ASYNC_FAULT   (1 << TIF_MTE_ASYNC_FAULT)
 #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
+#define _TIF_ASYNC_PF  (1 << TIF_ASYNC_PF)
 
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
 _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \
-_TIF_NOTIFY_SIGNAL)
+_TIF_NOTIFY_SIGNAL | _TIF_ASYNC_PF)
 
 #define _TIF_SYSCALL_WORK  (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6237486ff6bb..2cd2d13aa905 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -915,6 +915,23 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 unsigned long thread_flags)
 {
do {
+   if (thread_flags & _TIF_ASYNC_PF) {
+   struct swait_queue_head *wq =
+   READ_ONCE(current->thread.data);
+   DECLARE_SWAITQUEUE(wait);
+
+   local_daif_restore(DAIF_PROCCTX_NOIRQ);
+
+   do {
+   prepare_to_swait_exclusive(wq, &wait,
+   TASK_UNINTERRUPTIBLE);
+   if (!test_thread_flag(TIF_ASYNC_PF))
+   break;
+
+   schedule();
+   } while (test_thread_flag(TIF_ASYNC_PF));
+   }
+
if (thread_flags & _TIF_NEED_RESCHED) {
/* Unmask Debug and SError for the next task */
local_daif_restore(DAIF_PROCCTX_NOIRQ);
-- 
2.23.0



[PATCH v2 16/17] arm64: Enable async PF

2021-02-08 Thread Gavin Shan
This enables asynchronous page fault from the guest side. The design
is highlighted as below:

   * The per-vCPU shared memory region, which is represented by
 "struct kvm_vcpu_pv_apf_data", is allocated. The reason and
 token associated with the received notifications of asynchronous
 page fault are delivered through it.

   * A per-vCPU table, which is represented by "struct kvm_apf_table",
 is allocated. The process, on which the page-not-present notification
 is received, is added into the table so that it can reschedule
 itself on switching from kernel to user mode. Afterwards, the
 process, identified by token, is removed from the table and put
 into runnable state when page-ready notification is received.

   * During CPU hotplug, the (private) SDEI event is expected to be
 enabled or disabled on the affected CPU by the SDEI client driver.
 The (PPI) interrupt is enabled or disabled on the affected CPU
 by this driver itself. When the system is going to reboot, the SDEI
 event is disabled and unregistered and the (PPI) interrupt is disabled.

   * The SDEI event and (PPI) interrupt number are retrieved from the host
 through the SMCCC interface. Besides, the version of the asynchronous
 page fault is validated when the feature is enabled on the guest.

   * The feature is disabled in the guest when the boot parameter "no-kvmapf"
 is specified.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kernel/Makefile |   1 +
 arch/arm64/kernel/kvm.c| 452 +
 2 files changed, 453 insertions(+)
 create mode 100644 arch/arm64/kernel/kvm.c

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 86364ab6f13f..c849ef61f043 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_ACPI)+= acpi.o
 obj-$(CONFIG_ACPI_NUMA)+= acpi_numa.o
 obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL)  += acpi_parking_protocol.o
 obj-$(CONFIG_PARAVIRT) += paravirt.o
+obj-$(CONFIG_KVM_GUEST)+= kvm.o
 obj-$(CONFIG_RANDOMIZE_BASE)   += kaslr.o
 obj-$(CONFIG_HIBERNATION)  += hibernate.o hibernate-asm.o
 obj-$(CONFIG_KEXEC_CORE)   += machine_kexec.o relocate_kernel.o
\
diff --git a/arch/arm64/kernel/kvm.c b/arch/arm64/kernel/kvm.c
new file mode 100644
index ..effe8dc7e921
--- /dev/null
+++ b/arch/arm64/kernel/kvm.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Asynchronous page fault support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct kvm_apf_task {
+   unsigned inttoken;
+   struct task_struct  *task;
+   struct swait_queue_head wq;
+};
+
+struct kvm_apf_table {
+   raw_spinlock_t  lock;
+   unsigned intcount;
+   struct kvm_apf_task tasks[0];
+};
+
+static bool async_pf_available = true;
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_data) 
__aligned(64);
+static struct kvm_apf_table __percpu *apf_tables;
+static unsigned int apf_tasks;
+static unsigned int apf_sdei_num;
+static unsigned int apf_ppi_num;
+static int apf_irq;
+
+static bool kvm_async_pf_add_task(struct task_struct *task,
+ unsigned int token)
+{
+   struct kvm_apf_table *table = this_cpu_ptr(apf_tables);
+   unsigned int i, index = apf_tasks;
+   bool ret = false;
+
+   raw_spin_lock(&table->lock);
+
+   if (WARN_ON(table->count >= apf_tasks))
+   goto unlock;
+
+   for (i = 0; i < apf_tasks; i++) {
+   if (!table->tasks[i].task) {
+   if (index == apf_tasks) {
+   ret = true;
+   index = i;
+   }
+   } else if (table->tasks[i].task == task) {
+   WARN_ON(table->tasks[i].token != token);
+   ret = false;
+   break;
+   }
+   }
+
+   if (!ret)
+   goto unlock;
+
+   task->thread.data = &table->tasks[index].wq;
+   set_tsk_thread_flag(task, TIF_ASYNC_PF);
+
+   table->count++;
+   table->tasks[index].task = task;
+   table->tasks[index].token = token;
+
+unlock:
+   raw_spin_unlock(&table->lock);
+   return ret;
+}
+
+static inline void kvm_async_pf_remove_one_task(struct kvm_apf_table *table,
+   unsigned int index)
+{
+   clear_tsk_thread_flag(table->tasks[index].task, TIF_ASYNC_PF);
+   WRITE_ONCE(table->tasks[index].task->thread.data, NULL);
+
+   table->count--;
+   table->tasks[index].task = NULL;
+   table-

[PATCH v2 14/17] arm64: Detect async PF para-virtualization feature

2021-02-08 Thread Gavin Shan
This implements kvm_para_available() to check if para-virtualization
features are available or not. Besides, kvm_para_has_feature() is
enhanced to detect the asynchronous page fault para-virtualization
feature. These two functions are going to be used by the guest kernel
to enable the asynchronous page fault.

This also adds a kernel option (CONFIG_KVM_GUEST), which is the umbrella
for the optimizations related to KVM para-virtualization.
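
A minimal sketch of how the guest is expected to consume these helpers (the
actual enable path lands in a later patch, so the call site below is
illustrative only):

    #include <linux/kvm_para.h>

    static bool __init apf_para_supported(void)
    {
            /* kvm_para_has_feature() is the generic wrapper around
             * kvm_arch_para_features() in include/linux/kvm_para.h.
             */
            return kvm_para_available() &&
                   kvm_para_has_feature(KVM_FEATURE_ASYNC_PF);
    }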

Signed-off-by: Gavin Shan 
---
 arch/arm64/Kconfig | 11 +++
 arch/arm64/include/asm/kvm_para.h  | 12 +++-
 arch/arm64/include/uapi/asm/kvm_para.h |  2 ++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f39568b28ec1..792ae09aa690 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1078,6 +1078,17 @@ config PARAVIRT_TIME_ACCOUNTING
 
  If in doubt, say N here.
 
+config KVM_GUEST
+   bool "KVM Guest Support"
+   depends on PARAVIRT
+   default y
+   help
+ This option enables various optimizations for running under the KVM
+ hypervisor. Overhead for the kernel when not running inside KVM should
+ be minimal.
+
+ In case of doubt, say Y
+
 config KEXEC
depends on PM_SLEEP_SMP
select KEXEC_CORE
diff --git a/arch/arm64/include/asm/kvm_para.h 
b/arch/arm64/include/asm/kvm_para.h
index 0ea481dd1c7a..8f39c60a6619 100644
--- a/arch/arm64/include/asm/kvm_para.h
+++ b/arch/arm64/include/asm/kvm_para.h
@@ -3,6 +3,8 @@
 #define _ASM_ARM_KVM_PARA_H
 
 #include 
+#include 
+#include 
 
 static inline bool kvm_check_and_clear_guest_paused(void)
 {
@@ -11,7 +13,12 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 
 static inline unsigned int kvm_arch_para_features(void)
 {
-   return 0;
+   unsigned int features = 0;
+
+   if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_ASYNC_PF))
+   features |= (1 << KVM_FEATURE_ASYNC_PF);
+
+   return features;
 }
 
 static inline unsigned int kvm_arch_para_hints(void)
@@ -21,6 +28,9 @@ static inline unsigned int kvm_arch_para_hints(void)
 
 static inline bool kvm_para_available(void)
 {
+   if (IS_ENABLED(CONFIG_KVM_GUEST))
+   return true;
+
return false;
 }
 
diff --git a/arch/arm64/include/uapi/asm/kvm_para.h 
b/arch/arm64/include/uapi/asm/kvm_para.h
index 162325e2638f..70bbc7d1ec75 100644
--- a/arch/arm64/include/uapi/asm/kvm_para.h
+++ b/arch/arm64/include/uapi/asm/kvm_para.h
@@ -4,6 +4,8 @@
 
 #include 
 
+#define KVM_FEATURE_ASYNC_PF   0
+
 /* Async PF */
 #define KVM_ASYNC_PF_ENABLED   (1 << 0)
 #define KVM_ASYNC_PF_SEND_ALWAYS   (1 << 1)
-- 
2.23.0



[PATCH v2 11/17] KVM: arm64: Support async PF hypercalls

2021-02-08 Thread Gavin Shan
This introduces (SMCCC) KVM vendor specific services to configure
the asynchronous page fault functionality. The following services
are introduced:

   * ARM_SMCCC_KVM_FUNC_ASYNC_PF_VERSION
 Returns the version, which can be used to identify ABI changes
 in the future.
   * ARM_SMCCC_KVM_FUNC_ASYNC_PF_SLOTS
 Return the maximal number of tokens that the current vCPU can have.
 It's used by the guest to allocate the required resources.
   * ARM_SMCCC_KVM_FUNC_ASYNC_PF_{SDEI, IRQ}
 Return the associated SDEI or (PPI) IRQ number, configured by
 vCPU ioctl command.
   * ARM_SMCCC_KVM_FUNC_ASYNC_PF_ENABLE
 Enable or disable asynchronous page fault on current vCPU.

The corresponding SDEI event and (PPI) IRQ are owned by the VMM, so they
are configured through the vCPU ioctl interface. That interface is
implemented when the asynchronous page fault capability is exported in
the subsequent patches.
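
For illustration, a guest-side query of the ABI version could look roughly
like the sketch below. The function ID macro and the convention of passing
the sub-function in the first argument are assumptions about this series'
arm-smccc.h additions rather than the exact definitions.

    #include <linux/arm-smccc.h>

    static u32 apf_query_version(void)
    {
            struct arm_smccc_res res;

            /* Hypothetical vendor hypervisor call with the sub-function ID */
            arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_ASYNC_PF_FUNC_ID,
                                 ARM_SMCCC_KVM_FUNC_ASYNC_PF_VERSION, &res);

            return (res.a0 == SMCCC_RET_NOT_SUPPORTED) ? 0 : (u32)res.a0;
    }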

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/async_pf.c | 119 ++
 include/linux/arm-smccc.h |   5 ++
 2 files changed, 124 insertions(+)

diff --git a/arch/arm64/kvm/async_pf.c b/arch/arm64/kvm/async_pf.c
index f73c406456e9..4734c5b26aa8 100644
--- a/arch/arm64/kvm/async_pf.c
+++ b/arch/arm64/kvm/async_pf.c
@@ -313,12 +313,115 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, token), 0);
 }
 
+static void kvm_arch_async_sdei_notifier(struct kvm_vcpu *vcpu,
+unsigned long num,
+unsigned int state)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
+
+   if (!apf)
+   return;
+
+   if (num != apf->sdei_event_num) {
+   kvm_err("%s: Invalid event number (%d-%d %lx-%llx)\n",
+   __func__, kvm->userspace_pid, vcpu->vcpu_idx,
+   num, apf->sdei_event_num);
+   return;
+   }
+
+   switch (state) {
+   case KVM_SDEI_NOTIFY_DELIVERED:
+   if (!apf->notpresent_pending)
+   break;
+
+   apf->notpresent_token = 0;
+   apf->notpresent_pending = false;
+   break;
+   case KVM_SDEI_NOTIFY_COMPLETED:
+   break;
+   default:
+   kvm_err("%s: Invalid state (%d-%d %lx-%d)\n",
+   __func__, kvm->userspace_pid, vcpu->vcpu_idx,
+   num, state);
+   }
+}
+
+static long kvm_arch_async_enable(struct kvm_vcpu *vcpu, u64 data)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;
+   gpa_t gpa = (data & ~0x3FUL);
+   bool enabled, enable;
+   int ret;
+
+   if (!apf || !irqchip_in_kernel(kvm))
+   return SMCCC_RET_NOT_SUPPORTED;
+
+   /* Bail if the state transition isn't allowed */
+   enabled = !!(apf->control_block & KVM_ASYNC_PF_ENABLED);
+   enable = !!(data & KVM_ASYNC_PF_ENABLED);
+   if (enable == enabled) {
+   kvm_debug("%s: Async PF has been %s on (%d-%d %llx-%llx)\n",
+ __func__, enabled ? "enabled" : "disabled",
+ kvm->userspace_pid, vcpu->vcpu_idx,
+ apf->control_block, data);
+   return SMCCC_RET_NOT_REQUIRED;
+   }
+
+   /* To disable the functionality */
+   if (!enable) {
+   kvm_clear_async_pf_completion_queue(vcpu);
+   apf->control_block = data;
+   return SMCCC_RET_SUCCESS;
+   }
+
+   /*
+* The SDEI event and IRQ number should have been given
+* prior to enablement.
+*/
+   if (!apf->sdei_event_num || !apf->irq) {
+   kvm_err("%s: Invalid SDEI event or IRQ (%d-%d %llx-%d)\n",
+   __func__, kvm->userspace_pid, vcpu->vcpu_idx,
+   apf->sdei_event_num, apf->irq);
+   return SMCCC_RET_INVALID_PARAMETER;
+   }
+
+   /* Register SDEI event notifier */
+   ret = kvm_sdei_register_notifier(kvm, apf->sdei_event_num,
+kvm_arch_async_sdei_notifier);
+   if (ret) {
+   kvm_err("%s: Error %d registering SDEI notifier (%d-%d %llx)\n",
+   __func__, ret, kvm->userspace_pid, vcpu->vcpu_idx,
+   apf->sdei_event_num);
+   return SMCCC_RET_NOT_SUPPORTED;
+   }
+
+   /* Initialize cache shared by host and guest */
+   ret = kvm_gfn_to_hva_cache_init(kvm, &apf->cache, gpa,
+   offsetofend(struct kvm_vcpu_pv_apf_data, token));
+   if (ret) {
+   kvm_err("%s: Error %d initializing cache (%d-%d

[PATCH v2 13/17] KVM: arm64: Export async PF capability

2021-02-08 Thread Gavin Shan
This exports the asynchronous page fault capability:

* Identify capability KVM_CAP_ASYNC_{PF, PF_INT}.

* Standardize SDEI event for asynchronous page fault.

* Enable kernel config CONFIG_KVM_ASYNC_{PF, PF_SLOT}.
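
A userspace sketch of probing the exported capabilities (KVM_CHECK_EXTENSION
is the standard KVM ioctl; only the capability names come from this patch):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static bool vm_has_async_pf(int vm_fd)
    {
            /* Both checks return a positive value when the capability exists */
            return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ASYNC_PF) > 0 &&
                   ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ASYNC_PF_INT) > 0;
    }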

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/uapi/asm/kvm_sdei.h | 1 +
 arch/arm64/kvm/Kconfig | 2 ++
 arch/arm64/kvm/arm.c   | 4 
 arch/arm64/kvm/sdei.c  | 5 +
 4 files changed, 12 insertions(+)

diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h 
b/arch/arm64/include/uapi/asm/kvm_sdei.h
index 232092de5e21..47d578abba1a 100644
--- a/arch/arm64/include/uapi/asm/kvm_sdei.h
+++ b/arch/arm64/include/uapi/asm/kvm_sdei.h
@@ -13,6 +13,7 @@
 #define KVM_SDEI_MAX_VCPUS 512
 #define KVM_SDEI_INVALID_NUM   0
 #define KVM_SDEI_DEFAULT_NUM   0x4040
+#define KVM_SDEI_ASYNC_PF_NUM  0x4041
 
 struct kvm_sdei_event_state {
uint64_tnum;
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 3964acf5451e..dfb3ed0de2ca 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -31,6 +31,8 @@ menuconfig KVM
select SRCU
select KVM_VFIO
select HAVE_KVM_EVENTFD
+   select KVM_ASYNC_PF
+   select KVM_ASYNC_PF_SLOT
select HAVE_KVM_IRQFD
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index be0e6c2db2a5..0940de3ebcff 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -269,6 +269,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_SDEI:
r = 1;
break;
+   case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_ASYNC_PF_INT:
+   r = IS_ENABLED(CONFIG_KVM_ASYNC_PF) ? 1 : 0;
+   break;
default:
r = 0;
}
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 4f5a582daa97..437303bfafba 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -19,6 +19,11 @@ static struct kvm_sdei_event_state defined_kse[] = {
  1,
  SDEI_EVENT_PRIORITY_CRITICAL
},
+   { KVM_SDEI_ASYNC_PF_NUM,
+ SDEI_EVENT_TYPE_PRIVATE,
+ 1,
+ SDEI_EVENT_PRIORITY_CRITICAL
+   },
 };
 
 static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm *kvm,
-- 
2.23.0



[PATCH v2 10/17] KVM: arm64: Support page-ready notification

2021-02-08 Thread Gavin Shan
The asynchronous page fault starts with a worker when the requested
page isn't present. The worker makes the requested page present
in the background and the worker, together with the associated
information, is queued to the completion queue after that. The
worker and the completion queue are checked as below.

   * A request (KVM_REQ_ASYNC_PF) is raised if the worker is the
 first one enqueued to the completion queue. With the request,
 the completion queue is checked and the worker is dequeued.
 A PPI is sent to guest as the page-ready notification and
 the guest should acknowledge the interrupt by SMCCC interface.

   * When the notification (PPI) is acknowledged by the guest, the
 completion queue is checked again and the next worker is dequeued
 if there is one. For this particular worker, another notification
 (PPI) is sent to the guest without raising the request. Once that
 notification (PPI) is acknowledged by the guest, the completion
 queue is checked again to process the next worker, which has been
 queued to it.

Similar to page-not-present notification, the shared memory region
is used to convey the reason and token associated with the page-ready
notification. The region is represented by "struct kvm_vcpu_pv_apf_data".

The feature isn't enabled by CONFIG_KVM_ASYNC_PF yet. Also, the control
path isn't implemented and will be done in the subsequent patches.
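
The gating described above could be modelled roughly as below; this is a
simplified sketch of the intent rather than the exact code in async_pf.c,
and it only considers the pageready_pending state added by this patch:

    /* Only one page-ready PPI is in flight at a time */
    bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
    {
            struct kvm_arch_async_pf_control *apf = vcpu->arch.apf;

            /* The next worker may be dequeued only after the guest has
             * acknowledged the previously injected interrupt.
             */
            return apf && !apf->pageready_pending;
    }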

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h  |  17 ++
 arch/arm64/include/uapi/asm/kvm_para.h |   1 +
 arch/arm64/kvm/arm.c   |  24 ++-
 arch/arm64/kvm/async_pf.c  | 207 +
 arch/arm64/kvm/hypercalls.c|   6 +
 include/linux/arm-smccc.h  |  10 ++
 6 files changed, 262 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 49cccefb22cf..6349920fd9ce 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -48,6 +48,7 @@
 #define KVM_REQ_RECORD_STEAL   KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4   KVM_ARCH_REQ(4)
 #define KVM_REQ_SDEI   KVM_ARCH_REQ(5)
+#define KVM_REQ_ASYNC_PF   KVM_ARCH_REQ(6)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 KVM_DIRTY_LOG_INITIALLY_SET)
@@ -292,10 +293,12 @@ struct kvm_arch_async_pf_control {
u64 control_block;
boolsend_user_only;
u64 sdei_event_num;
+   u32 irq;
 
u16 id;
boolnotpresent_pending;
u32 notpresent_token;
+   boolpageready_pending;
 };
 
 struct kvm_vcpu_arch {
@@ -767,6 +770,14 @@ bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
 u32 esr, gpa_t gpa, gfn_t gfn);
 bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 struct kvm_async_pf *work);
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+  struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+struct kvm_async_pf *work);
+long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu,
+long *r1, long *r2, long *r3);
 void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu);
 #else
 static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { }
@@ -782,6 +793,12 @@ static inline bool kvm_arch_setup_async_pf(struct kvm_vcpu 
*vcpu,
 {
return false;
 }
+
+static inline long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu,
+  long *r1, long *r2, long *r3)
+{
+   return SMCCC_RET_NOT_SUPPORTED;
+}
 #endif
 
 /* Guest/host FPSIMD coordination helpers */
diff --git a/arch/arm64/include/uapi/asm/kvm_para.h 
b/arch/arm64/include/uapi/asm/kvm_para.h
index 3fa04006714e..162325e2638f 100644
--- a/arch/arm64/include/uapi/asm/kvm_para.h
+++ b/arch/arm64/include/uapi/asm/kvm_para.h
@@ -9,6 +9,7 @@
 #define KVM_ASYNC_PF_SEND_ALWAYS   (1 << 1)
 
 #define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY   2
 
 struct kvm_vcpu_pv_apf_data {
__u32   reason;
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c98fbb4e914b..e34fca3fa0ff 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -484,9 +484,23 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  */
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
+   struct kvm_arch_async_pf_control *apf = v->arch.apf;
bool irq_lines = *vcpu_hcr(v)

[PATCH v2 12/17] KVM: arm64: Support async PF ioctl commands

2021-02-08 Thread Gavin Shan
This supports ioctl commands for configuration and migration (a userspace
usage sketch follows the list):

   KVM_ARM_ASYNC_PF_CMD_GET_VERSION
  Return implementation version
   KVM_ARM_ASYNC_PF_CMD_GET_SDEI
  Return SDEI event number used for page-not-present notification
   KVM_ARM_ASYNC_PF_CMD_GET_IRQ
  Return IRQ number used for page-ready notification
   KVM_ARM_ASYNC_PF_CMD_GET_CONTROL
  Get control block when VM is migrated
   KVM_ARM_ASYNC_PF_CMD_SET_SDEI
  Set SDEI event number when VM is started or migrated
   KVM_ARM_ASYNC_PF_CMD_SET_IRQ
   Set IRQ number when VM is started or migrated
   KVM_ARM_ASYNC_PF_CMD_SET_CONTROL
  Set control block when VM is migrated
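
As an illustration, a VMM could drive the per-vCPU commands during migration
roughly as below. The struct and command values come from this patch, while
the error handling and file descriptor plumbing are simplified assumptions.

    #include <sys/ioctl.h>
    #include <asm/kvm.h>        /* struct kvm_arm_async_pf_cmd */
    #include <linux/kvm.h>      /* KVM_ARM_ASYNC_PF_COMMAND */

    /* Source side: save the SDEI event number for the migration stream */
    static int apf_get_sdei(int vcpu_fd, __u64 *sdei)
    {
            struct kvm_arm_async_pf_cmd cmd = {
                    .cmd = KVM_ARM_ASYNC_PF_CMD_GET_SDEI,
            };
            int ret = ioctl(vcpu_fd, KVM_ARM_ASYNC_PF_COMMAND, &cmd);

            if (!ret)
                    *sdei = cmd.sdei;
            return ret;
    }

    /* Destination side: restore it before the vCPU starts running */
    static int apf_set_sdei(int vcpu_fd, __u64 sdei)
    {
            struct kvm_arm_async_pf_cmd cmd = {
                    .cmd  = KVM_ARM_ASYNC_PF_CMD_SET_SDEI,
                    .sdei = sdei,
            };

            return ioctl(vcpu_fd, KVM_ARM_ASYNC_PF_COMMAND, &cmd);
    }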

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h | 14 +++
 arch/arm64/include/uapi/asm/kvm.h | 19 +
 arch/arm64/kvm/arm.c  |  6 +++
 arch/arm64/kvm/async_pf.c | 64 +++
 include/uapi/linux/kvm.h  |  3 ++
 5 files changed, 106 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 6349920fd9ce..14b3d1505b15 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -778,6 +778,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 struct kvm_async_pf *work);
 long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu,
 long *r1, long *r2, long *r3);
+long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm, unsigned long arg);
+long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg);
 void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu);
 #else
 static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { }
@@ -799,6 +801,18 @@ static inline long kvm_arch_async_pf_hypercall(struct 
kvm_vcpu *vcpu,
 {
return SMCCC_RET_NOT_SUPPORTED;
 }
+
+static inline long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm,
+ unsigned long arg)
+{
+   return -EPERM;
+}
+
+static inline long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu,
+   unsigned long arg)
+{
+   return -EPERM;
+}
 #endif
 
 /* Guest/host FPSIMD coordination helpers */
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 15499751997d..a6124068bee6 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -403,6 +403,25 @@ struct kvm_vcpu_events {
 #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS
 #define KVM_PSCI_RET_DENIEDPSCI_RET_DENIED
 
+/* Asynchronous page fault */
+#define KVM_ARM_ASYNC_PF_CMD_GET_VERSION   0
+#define KVM_ARM_ASYNC_PF_CMD_GET_SDEI  1
+#define KVM_ARM_ASYNC_PF_CMD_GET_IRQ   2
+#define KVM_ARM_ASYNC_PF_CMD_GET_CONTROL   3
+#define KVM_ARM_ASYNC_PF_CMD_SET_SDEI  4
+#define KVM_ARM_ASYNC_PF_CMD_SET_IRQ   5
+#define KVM_ARM_ASYNC_PF_CMD_SET_CONTROL   6
+
+struct kvm_arm_async_pf_cmd {
+   __u32   cmd;
+   union {
+   __u32   version;
+   __u64   sdei;
+   __u32   irq;
+   __u64   control;
+   };
+};
+
 #endif
 
 #endif /* __ARM_KVM_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e34fca3fa0ff..be0e6c2db2a5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1287,6 +1287,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_ARM_SDEI_COMMAND: {
return kvm_sdei_vcpu_ioctl(vcpu, arg);
}
+   case KVM_ARM_ASYNC_PF_COMMAND: {
+   return kvm_arch_async_pf_vcpu_ioctl(vcpu, arg);
+   }
default:
r = -EINVAL;
}
@@ -1364,6 +1367,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_ARM_SDEI_COMMAND: {
return kvm_sdei_vm_ioctl(kvm, arg);
}
+   case KVM_ARM_ASYNC_PF_COMMAND: {
+   return kvm_arch_async_pf_vm_ioctl(kvm, arg);
+   }
default:
return -EINVAL;
}
diff --git a/arch/arm64/kvm/async_pf.c b/arch/arm64/kvm/async_pf.c
index 4734c5b26aa8..6f763edbe3a3 100644
--- a/arch/arm64/kvm/async_pf.c
+++ b/arch/arm64/kvm/async_pf.c
@@ -464,6 +464,70 @@ long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu,
return ret;
 }
 
+long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm, unsigned long arg)
+{
+   struct kvm_arm_async_pf_cmd cmd;
+   unsigned int version = 0x01; /* v1.0.0 */
+   void __user *argp = (void __user *)arg;
+
+   if (copy_from_user(&cmd, argp, sizeof(cmd)))
+   return -EFAULT;
+
+   if (cmd.cmd != KVM_ARM_ASYNC_PF_CMD_GET_VERSION)
+   return -EINVAL;
+
+   cmd.version = version;
+   if (copy_to_user(argp, &cmd, sizeof(cmd)))
+   return -EFAULT;
+
+   return 0;
+}
+
+long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg

[PATCH v2 08/17] KVM: arm64: Add paravirtualization header files

2021-02-08 Thread Gavin Shan
We need to put more stuff into the paravirtualization header files when
the asynchronous page fault is supported, and the generic header files
can't meet that goal. This duplicates the generic header files to serve
as our platform specific header files. It's preparatory work to support
the asynchronous page fault in subsequent patches:

   include/uapi/asm-generic/kvm_para.h
   include/asm-generic/kvm_para.h

   arch/arm64/include/uapi/asm/kvm_para.h
   arch/arm64/include/asm/kvm_para.h

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_para.h  | 27 ++
 arch/arm64/include/uapi/asm/Kbuild |  2 --
 arch/arm64/include/uapi/asm/kvm_para.h |  5 +
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_para.h
 create mode 100644 arch/arm64/include/uapi/asm/kvm_para.h

diff --git a/arch/arm64/include/asm/kvm_para.h 
b/arch/arm64/include/asm/kvm_para.h
new file mode 100644
index ..0ea481dd1c7a
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_para.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM_KVM_PARA_H
+#define _ASM_ARM_KVM_PARA_H
+
+#include 
+
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+   return false;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+   return 0;
+}
+
+static inline unsigned int kvm_arch_para_hints(void)
+{
+   return 0;
+}
+
+static inline bool kvm_para_available(void)
+{
+   return false;
+}
+
+#endif /* _ASM_ARM_KVM_PARA_H */
diff --git a/arch/arm64/include/uapi/asm/Kbuild 
b/arch/arm64/include/uapi/asm/Kbuild
index 602d137932dc..f66554cd5c45 100644
--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -1,3 +1 @@
 # SPDX-License-Identifier: GPL-2.0
-
-generic-y += kvm_para.h
diff --git a/arch/arm64/include/uapi/asm/kvm_para.h 
b/arch/arm64/include/uapi/asm/kvm_para.h
new file mode 100644
index ..cd212282b90c
--- /dev/null
+++ b/arch/arm64/include/uapi/asm/kvm_para.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_ARM_KVM_PARA_H
+#define _UAPI_ASM_ARM_KVM_PARA_H
+
+#endif /* _UAPI_ASM_ARM_KVM_PARA_H */
-- 
2.23.0



[PATCH v2 09/17] KVM: arm64: Support page-not-present notification

2021-02-08 Thread Gavin Shan
The requested page might not be resident in memory during the stage-2
page fault. For example, the requested page could be resident in a swap
device (file). In this case, disk I/O is issued in order to fetch the
requested page and it could take tens of milliseconds, even hundreds
of milliseconds in extreme situations. During this period, the guest's
vCPU is suspended until the requested page becomes ready. Actually,
something else could be scheduled on the guest's vCPU during this
period, so that the time slice isn't wasted from the guest's point of
view. This is the primary goal of the feature (Asynchronous Page
Fault).

This supports delivery of the page-not-present notification through an SDEI
event when the requested page isn't present. When the notification is
received on the guest's vCPU, something else (another process) can be
scheduled. The design is highlighted as below:

   * There is dedicated memory region shared by host and guest. It's
 represented by "struct kvm_vcpu_pv_apf_data". The field @reason
 indicates the reason why the SDEI event is triggered, while the
 unique @token is used by guest to associate the event with the
 suspended process.

   * One control block is associated with each guest's vCPU and it's
 represented by "struct kvm_arch_async_pf_control". It allows the
 guest to configure the functionality to indicate the situations
 where the host can deliver the page-not-present notification to
 kick off the asynchronous page fault. Besides, runtime states are
 also maintained in this struct.

   * Before the page-not-present notification is sent to the guest's
 vCPU, a worker is started and executed asynchronously on host,
 to fetch the requested page. "struct kvm{_,_arch}async_pf" is
 associated with the worker, to track the work.

The feature isn't enabled by CONFIG_KVM_ASYNC_PF yet. Also, the
page-ready notification delivery and control path isn't implemented
and will be done in the subsequent patches.
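
Conceptually, the stage-2 fault path kicks off an asynchronous page fault
instead of blocking, along the lines of the sketch below. The call site and
the surrounding variables are simplified assumptions; only the two helpers
are declared by this patch.

    /* Inside the (renamed) user memory abort handler, before falling back
     * to the synchronous path.
     */
    if (kvm_arch_async_not_present_allowed(vcpu) &&
        kvm_arch_setup_async_pf(vcpu, esr, gpa, gfn))
            return 1;       /* resume the guest; a worker faults the page in */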

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h  |  50 +
 arch/arm64/include/uapi/asm/kvm_para.h |  15 +++
 arch/arm64/kvm/Makefile|   1 +
 arch/arm64/kvm/arm.c   |   3 +
 arch/arm64/kvm/async_pf.c  | 145 +
 arch/arm64/kvm/mmu.c   |  32 +-
 6 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kvm/async_pf.c

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 00b30b7554e5..49cccefb22cf 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -273,6 +273,31 @@ struct vcpu_reset_state {
boolreset;
 };
 
+/* Should be a power of two number */
+#define ASYNC_PF_PER_VCPU  64
+
+/*
+ * The association of gfn and token. The token will be sent to guest as
+ * page fault address. Also, the guest could be in aarch32 mode. So its
+ * length should be 32-bits.
+ */
+struct kvm_arch_async_pf {
+   u32 token;
+   gfn_t   gfn;
+   u32 esr;
+};
+
+struct kvm_arch_async_pf_control {
+   struct gfn_to_hva_cache cache;
+   u64 control_block;
+   boolsend_user_only;
+   u64 sdei_event_num;
+
+   u16 id;
+   boolnotpresent_pending;
+   u32 notpresent_token;
+};
+
 struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt;
void *sve_state;
@@ -375,6 +400,7 @@ struct kvm_vcpu_arch {
} steal;
 
struct kvm_sdei_vcpu *sdei;
+   struct kvm_arch_async_pf_control *apf;
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -734,6 +760,30 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
   struct kvm_device_attr *attr);
 
+#ifdef CONFIG_KVM_ASYNC_PF
+void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu);
+bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu);
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+u32 esr, gpa_t gpa, gfn_t gfn);
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+struct kvm_async_pf *work);
+void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu);
+#else
+static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { }
+static inline void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu) { }
+
+static inline bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
+static inline bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+  u32 esr, gpa_t gpa, gfn_t gfn)
+{
+   return false;
+}
+#endif
+
 /* Guest/host FPSIM

[PATCH v2 06/17] KVM: arm64: Advertise KVM UID to guests via SMCCC

2021-02-08 Thread Gavin Shan
From: Will Deacon 

We can advertise ourselves to guests as KVM and provide a basic features
bitmap for discoverability of future hypervisor services.

Signed-off-by: Will Deacon 
Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/hypercalls.c | 27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index a54c4805f2a6..e02e29a12bbf 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -12,13 +12,13 @@
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
u32 func_id = smccc_get_function(vcpu);
-   long val = SMCCC_RET_NOT_SUPPORTED;
+   long val[4] = { SMCCC_RET_NOT_SUPPORTED };
u32 feature;
gpa_t gpa;
 
switch (func_id) {
case ARM_SMCCC_VERSION_FUNC_ID:
-   val = ARM_SMCCC_VERSION_1_1;
+   val[0] = ARM_SMCCC_VERSION_1_1;
break;
case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
feature = smccc_get_arg1(vcpu);
@@ -28,10 +28,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case SPECTRE_VULNERABLE:
break;
case SPECTRE_MITIGATED:
-   val = SMCCC_RET_SUCCESS;
+   val[0] = SMCCC_RET_SUCCESS;
break;
case SPECTRE_UNAFFECTED:
-   val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+   val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
break;
}
break;
@@ -54,22 +54,31 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
fallthrough;
case SPECTRE_UNAFFECTED:
-   val = SMCCC_RET_NOT_REQUIRED;
+   val[0] = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
-   val = SMCCC_RET_SUCCESS;
+   val[0] = SMCCC_RET_SUCCESS;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
-   val = kvm_hypercall_pv_features(vcpu);
+   val[0] = kvm_hypercall_pv_features(vcpu);
break;
case ARM_SMCCC_HV_PV_TIME_ST:
gpa = kvm_init_stolen_time(vcpu);
if (gpa != GPA_INVALID)
-   val = gpa;
+   val[0] = gpa;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+   val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+   val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+   val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+   val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+   val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
break;
case SDEI_1_0_FN_SDEI_VERSION:
case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
@@ -93,6 +102,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
return kvm_psci_call(vcpu);
}
 
-   smccc_set_retval(vcpu, val, 0, 0, 0);
+   smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
return 1;
 }
-- 
2.23.0



[PATCH v2 07/17] KVM: arm64: Export kvm_handle_user_mem_abort()

2021-02-08 Thread Gavin Shan
The main work is handled by user_mem_abort(). After asynchronous
page fault is supported, one page fault needs to be handled with
two calls to this function, meaning the page fault needs to be
replayed asynchronously in that case. This renames the function
to kvm_handle_user_mem_abort() and exports it. Besides, there are
more changes introduced in order to accommodate asynchronous page
fault:

   * Add arguments @esr and @prefault to user_mem_abort(). @esr
 is the cached value of ESR_EL2, used instead of fetching it from the
 current vCPU when the page fault is replayed in the scenario of
 asynchronous page fault. @prefault is used to indicate whether the
 page fault is a replayed one or not.

   * Define helper functions esr_dabt_*() in asm/esr.h to extract
 or check various fields of the passed ESR_EL2 value, because
 the helper functions defined in asm/kvm_emulate.h assume
 the ESR_EL2 value has been cached in the vCPU struct. That won't
 be true when handling the replayed page fault in the scenario of
 asynchronous page fault.

   * Some helper functions defined in asm/kvm_emulate.h are used
 by mmu.c only and don't seem to be needed by other source files
 in the near future. They are moved to mmu.c and renamed accordingly.

 kvm_vcpu_trap_is_exec_fault()
is_exec_fault()
 kvm_is_write_fault()
is_write_fault()
 kvm_vcpu_trap_get_fault_level()
Replaced by esr_dabt_get_fault_level()
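
A small sketch of how the new helpers allow classifying a cached ESR value
without a vCPU pointer; the function below is illustrative, not part of the
patch, and mirrors only the data-abort part of the replaced kvm_is_write_fault():

    static bool replayed_fault_is_write(u32 esr)
    {
            /* Stage-1 page table walks are always treated as writes */
            if (esr_dabt_is_s1ptw(esr))
                    return true;

            return esr_dabt_is_wnr(esr);
    }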

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/esr.h |  6 
 arch/arm64/include/asm/kvm_emulate.h | 27 ++---
 arch/arm64/include/asm/kvm_host.h|  4 +++
 arch/arm64/kvm/mmu.c | 43 ++--
 4 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 29f97eb3dad4..db46eb58c633 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -321,8 +321,14 @@
 ESR_ELx_CP15_32_ISS_DIR_READ)
 
 #ifndef __ASSEMBLY__
+#include 
 #include 
 
+#define esr_dabt_get_fault_type(esr)   (esr & ESR_ELx_FSC_TYPE)
+#define esr_dabt_get_fault_level(esr)  (FIELD_GET(ESR_ELx_FSC_LEVEL, esr))
+#define esr_dabt_is_wnr(esr)   (!!(FIELD_GET(ESR_ELx_WNR, esr)))
+#define esr_dabt_is_s1ptw(esr) (!!(FIELD_GET(ESR_ELx_S1PTW, esr)))
+
 static inline bool esr_is_data_abort(u32 esr)
 {
const u32 ec = ESR_ELx_EC(esr);
diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index 0ef213b715a5..119b953828a2 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -282,13 +282,13 @@ static __always_inline int kvm_vcpu_dabt_get_rd(const 
struct kvm_vcpu *vcpu)
 
 static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
 {
-   return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW);
+   return esr_dabt_is_s1ptw(kvm_vcpu_get_esr(vcpu));
 }
 
 /* Always check for S1PTW *before* using this. */
 static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
 {
-   return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR;
+   return esr_dabt_is_wnr(kvm_vcpu_get_esr(vcpu));
 }
 
 static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
@@ -317,11 +317,6 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct 
kvm_vcpu *vcpu)
return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
 }
 
-static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
-{
-   return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
-}
-
 static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
 {
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
@@ -329,12 +324,7 @@ static __always_inline u8 kvm_vcpu_trap_get_fault(const 
struct kvm_vcpu *vcpu)
 
 static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu 
*vcpu)
 {
-   return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE;
-}
-
-static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu 
*vcpu)
-{
-   return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL;
+   return esr_dabt_get_fault_type(kvm_vcpu_get_esr(vcpu));
 }
 
 static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
@@ -362,17 +352,6 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct 
kvm_vcpu *vcpu)
return ESR_ELx_SYS64_ISS_RT(esr);
 }
 
-static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
-   if (kvm_vcpu_abt_iss1tw(vcpu))
-   return true;
-
-   if (kvm_vcpu_trap_is_iabt(vcpu))
-   return false;
-
-   return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
 static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
 {
return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 01eda5c8

[PATCH v2 04/17] KVM: x86: Use generic async PF slot management

2021-02-08 Thread Gavin Shan
This uses the generic slot management mechanism for asynchronous
page fault by enabling CONFIG_KVM_ASYNC_PF_SLOT because the private
implementation is a complete duplicate of the generic one.

The changes introduced here are pretty mechanical and shouldn't
cause any logical changes.

Signed-off-by: Gavin Shan 
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/Kconfig|  1 +
 arch/x86/kvm/mmu/mmu.c  |  2 +-
 arch/x86/kvm/x86.c  | 86 +++--
 4 files changed, 8 insertions(+), 82 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..3488eeb79c79 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1714,7 +1714,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
   struct kvm_async_pf *work);
 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
-extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7ac592664c52..b0ad75087ab5 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -34,6 +34,7 @@ config KVM
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
+   select KVM_ASYNC_PF_SLOT
select USER_RETURN_NOTIFIER
select KVM_MMIO
select TASKSTATS
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..ca2e84d6743c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3678,7 +3678,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool 
prefault, gfn_t gfn,
 
if (!prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
-   if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+   if (kvm_async_pf_find_slot(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f3c9fe5c424e..b04d78a87abe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -290,13 +290,6 @@ static struct kmem_cache *kvm_alloc_emulator_cache(void)
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
-{
-   int i;
-   for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
-   vcpu->arch.apf.gfns[i] = ~0;
-}
-
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
unsigned slot;
@@ -812,7 +805,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long 
old_cr0, unsigned lon
 
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
-   kvm_async_pf_hash_reset(vcpu);
+   kvm_async_pf_reset_slot(vcpu);
}
 
if ((cr0 ^ old_cr0) & update_bits)
@@ -2905,7 +2898,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, 
u64 data)
 
if (!kvm_pv_async_pf_enabled(vcpu)) {
kvm_clear_async_pf_completion_queue(vcpu);
-   kvm_async_pf_hash_reset(vcpu);
+   kvm_async_pf_reset_slot(vcpu);
return 0;
}
 
@@ -9996,7 +9989,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 
-   kvm_async_pf_hash_reset(vcpu);
+   kvm_async_pf_reset_slot(vcpu);
kvm_pmu_init(vcpu);
 
vcpu->arch.pending_external_vector = -1;
@@ -10117,7 +10110,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool 
init_event)
kvmclock_reset(vcpu);
 
kvm_clear_async_pf_completion_queue(vcpu);
-   kvm_async_pf_hash_reset(vcpu);
+   kvm_async_pf_reset_slot(vcpu);
vcpu->arch.apf.halted = false;
 
if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
@@ -10932,73 +10925,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, 
struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
 }
 
-static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-{
-   BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
-
-   return hash_32(gfn & 0x, order_base_2(ASYNC_PF_PER_VCPU));
-}
-
-static inline u32 kvm_async_pf_next_probe(u32 key)
-{
-   return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
-}
-
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-   u32 key = kvm_async_pf_hash_fn(gfn);
-
-   while (vcpu->arch.apf.gfns[key] != ~0)
-   key = kvm_async_pf_next_probe(key);
-
-   vcpu->arch.apf.gfns[key] = gfn;
-}
-
-static u32 kvm_async_pf_gfn_slot(struct k

[PATCH v2 05/17] arm64: Probe for the presence of KVM hypervisor services during boot

2021-02-08 Thread Gavin Shan
From: Will Deacon 

Although the SMCCC specification provides some limited functionality for
describing the presence of hypervisor and firmware services, this is
generally applicable only to functions designated as "Arm Architecture
Service Functions" and no portable discovery mechanism is provided for
standard hypervisor services, despite having a designated range of
function identifiers reserved by the specification.

In an attempt to avoid the need for additional firmware changes every
time a new function is added, introduce a UID to identify the service
provider as being compatible with KVM. Once this has been established,
additional services can be discovered via a feature bitmap.
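
Once the bitmap has been populated at boot, a guest-side caller can gate a
feature on it as in the short sketch below (the ASYNC_PF bit is defined by a
later patch in this series; it is used here purely as an example):

    #include <asm/hypervisor.h>
    #include <linux/arm-smccc.h>

    static bool have_async_pf_service(void)
    {
            return kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_ASYNC_PF);
    }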

Signed-off-by: Will Deacon 
Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/hypervisor.h | 11 ++
 arch/arm64/kernel/setup.c   | 32 +
 include/linux/arm-smccc.h   | 25 ++
 3 files changed, 68 insertions(+)

diff --git a/arch/arm64/include/asm/hypervisor.h 
b/arch/arm64/include/asm/hypervisor.h
index f9cc1d021791..91e4bd890819 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -2,6 +2,17 @@
 #ifndef _ASM_ARM64_HYPERVISOR_H
 #define _ASM_ARM64_HYPERVISOR_H
 
+#include 
 #include 
 
+static inline bool kvm_arm_hyp_service_available(u32 func_id)
+{
+   extern DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS);
+
+   if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+   return -EINVAL;
+
+   return test_bit(func_id, __kvm_arm_hyp_services);
+}
+
 #endif
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c18aacde8bb0..8cbb99d80869 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -275,12 +276,42 @@ static int __init reserve_memblock_reserved_regions(void)
 arch_initcall(reserve_memblock_reserved_regions);
 
 u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
+DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) = { };
 
 u64 cpu_logical_map(unsigned int cpu)
 {
return __cpu_logical_map[cpu];
 }
 
+static void __init kvm_init_hyp_services(void)
+{
+   struct arm_smccc_res res;
+   int i;
+
+   arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+   if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+   res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+   res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+   res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+   return;
+
+   memset(&res, 0, sizeof(res));
+   arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+   for (i = 0; i < 32; ++i) {
+   if (res.a0 & (i))
+   set_bit(i + (32 * 0), __kvm_arm_hyp_services);
+   if (res.a1 & (i))
+   set_bit(i + (32 * 1), __kvm_arm_hyp_services);
+   if (res.a2 & (i))
+   set_bit(i + (32 * 2), __kvm_arm_hyp_services);
+   if (res.a3 & (i))
+   set_bit(i + (32 * 3), __kvm_arm_hyp_services);
+   }
+
+   pr_info("KVM hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 
0x%08lx)\n",
+   res.a3, res.a2, res.a1, res.a0);
+}
+
 void __init __no_sanitize_address setup_arch(char **cmdline_p)
 {
init_mm.start_code = (unsigned long) _stext;
@@ -353,6 +384,7 @@ void __init __no_sanitize_address setup_arch(char 
**cmdline_p)
else
psci_acpi_init();
 
+   kvm_init_hyp_services();
init_bootcpu_ops();
smp_init_cpus();
smp_build_mpidr_hash();
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index f860645f6512..7eb816241697 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -55,6 +55,8 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS 50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE   0
 #define ARM_SMCCC_QUIRK_QCOM_A61 /* Save/restore register a6 */
 
@@ -102,6 +104,29 @@
   ARM_SMCCC_OWNER_STANDARD_HYP,\
   0x21)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID  \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_32,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU
+
+/* 

[PATCH v2 01/17] KVM: async_pf: Move struct kvm_async_pf around

2021-02-08 Thread Gavin Shan
This moves the definitions of "struct kvm_async_pf" and the related
functions after "struct kvm_vcpu" so that the newly added inline function
can dereference "struct kvm_vcpu" properly. Otherwise, an unexpected
build error will be raised:

   error: dereferencing pointer to incomplete type ‘struct kvm_vcpu’
   return !list_empty_careful(&vcpu->async_pf.done);
   ^~

The separator between type and field is replaced by a tab for "struct
kvm_async_pf" since we're here. This is preparatory work for adding a
new inline function in the next patch. This shouldn't cause logical
changes.

Signed-off-by: Gavin Shan 
---
 include/linux/kvm_host.h | 43 
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..b6697ee1182e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -196,27 +196,6 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum 
kvm_bus bus_idx,
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 gpa_t addr);
 
-#ifdef CONFIG_KVM_ASYNC_PF
-struct kvm_async_pf {
-   struct work_struct work;
-   struct list_head link;
-   struct list_head queue;
-   struct kvm_vcpu *vcpu;
-   struct mm_struct *mm;
-   gpa_t cr2_or_gpa;
-   unsigned long addr;
-   struct kvm_arch_async_pf arch;
-   bool   wakeup_all;
-   bool notpresent_injected;
-};
-
-void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
-void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-   unsigned long hva, struct kvm_arch_async_pf *arch);
-int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
-#endif
-
 enum {
OUTSIDE_GUEST_MODE,
IN_GUEST_MODE,
@@ -323,6 +302,28 @@ struct kvm_vcpu {
struct kvm_dirty_ring dirty_ring;
 };
 
+#ifdef CONFIG_KVM_ASYNC_PF
+struct kvm_async_pf {
+   struct work_struct  work;
+   struct list_headlink;
+   struct list_headqueue;
+   struct kvm_vcpu *vcpu;
+   struct mm_struct*mm;
+   gpa_t   cr2_or_gpa;
+   unsigned long   addr;
+   struct kvm_arch_async_pfarch;
+   boolwakeup_all;
+   boolnotpresent_injected;
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+   unsigned long hva, struct kvm_arch_async_pf *arch);
+int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
+#endif
+
+
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
/*
-- 
2.23.0



[PATCH v2 03/17] KVM: async_pf: Make GFN slot management generic

2021-02-08 Thread Gavin Shan
It's not allowed to fire duplicate notifications for the same GFN on
the x86 platform, with the help of a hash table. This mechanism is going
to be used by arm64 as well, so this makes the code generic and shareable
by multiple platforms.

   * As this mechanism isn't needed by all platforms, a new kernel
 config option (CONFIG_KVM_ASYNC_PF_SLOT) is introduced so that it
 can be disabled at compile time.

   * The code is basically copied from x86 platform and the functions
 are renamed to reflect the fact: (a) the input parameters are
 vCPU and GFN. (b) The operations are resetting, searching, adding
 and removing.

   * Helper stubs are also added for !CONFIG_KVM_ASYNC_PF because we're
 going to use IS_ENABLED() instead of #ifdef on arm64 when the
 asynchronous page fault is supported.

This is preparatory work to use the newly introduced functions on x86
platform and arm64 in subsequent patches.
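
The expected usage pattern for the new helpers, mirroring the existing x86
flow, is sketched below; the surrounding fault-handler context is an
assumption rather than a copy of any hunk in this series:

    /* Before starting a worker, skip GFNs that already have one outstanding */
    if (kvm_async_pf_find_slot(vcpu, gfn))
            return true;    /* notification already fired for this GFN */

    if (kvm_setup_async_pf(vcpu, cr2_or_gpa, hva, &arch)) {
            kvm_async_pf_add_slot(vcpu, gfn);   /* remember it until completion */
            return true;
    }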

Signed-off-by: Gavin Shan 
---
 include/linux/kvm_host.h | 18 +
 virt/kvm/Kconfig |  3 ++
 virt/kvm/async_pf.c  | 79 
 3 files changed, 100 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 041d93f8f4b0..b52d71030f25 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -277,6 +277,9 @@ struct kvm_vcpu {
 
 #ifdef CONFIG_KVM_ASYNC_PF
struct {
+#ifdef CONFIG_KVM_ASYNC_PF_SLOT
+   gfn_t gfns[ASYNC_PF_PER_VCPU];
+#endif
u32 queued;
struct list_head queue;
struct list_head done;
@@ -321,12 +324,27 @@ static inline bool 
kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu)
	return !list_empty_careful(&vcpu->async_pf.done);
 }
 
+#ifdef CONFIG_KVM_ASYNC_PF_SLOT
+void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu);
+void kvm_async_pf_add_slot(struct kvm_vcpu *vcpu, gfn_t gfn);
+void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn);
+bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn);
+#endif
+
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
 bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
unsigned long hva, struct kvm_arch_async_pf *arch);
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #else
+static inline void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu) { }
+static inline void kvm_async_pf_add_slot(struct kvm_vcpu *vcpu, gfn_t gfn) { }
+static inline void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn) 
{ }
+static inline bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   return false;
+}
+
 static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 {
return false;
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..69a282aaa4df 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -23,6 +23,9 @@ config KVM_MMIO
 config KVM_ASYNC_PF
bool
 
+config KVM_ASYNC_PF_SLOT
+   bool
+
 # Toggle to switch between direct notification and batch job
 config KVM_ASYNC_PF_SYNC
bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 2cf864aafd0e..7bf22b20af45 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -19,6 +19,85 @@
 
 static struct kmem_cache *async_pf_cache;
 
+#ifdef CONFIG_KVM_ASYNC_PF_SLOT
+static inline u32 kvm_async_pf_hash(gfn_t gfn)
+{
+   BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
+   return hash_32(gfn & 0x, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_slot(u32 key)
+{
+   return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
+}
+
+static u32 kvm_async_pf_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   int i;
+   u32 key = kvm_async_pf_hash(gfn);
+
+   for (i = 0; i < ASYNC_PF_PER_VCPU &&
+   (vcpu->async_pf.gfns[key] != gfn &&
+   vcpu->async_pf.gfns[key] != ~0); i++)
+   key = kvm_async_pf_next_slot(key);
+
+   return key;
+}
+
+void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu)
+{
+   int i;
+
+   for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
+   vcpu->async_pf.gfns[i] = ~0;
+}
+
+bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   return vcpu->async_pf.gfns[kvm_async_pf_slot(vcpu, gfn)] == gfn;
+}
+
+void kvm_async_pf_add_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   u32 key = kvm_async_pf_hash(gfn);
+
+   while (vcpu->async_pf.gfns[key] != ~0)
+   key = kvm_async_pf_next_slot(key);
+
+   vcpu->async_pf.gfns[key] = gfn;
+}
+
+void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+   u32 i, j, k;
+
+   i = j = kvm_async_pf_slot(vcpu, gfn);
+
+   if (WARN_ON_ONCE(vcpu->async_pf.gfns[i] != gfn))
+   return;
+
+   while (true) {
+   vcpu->async_pf.gfns[i] = ~0;
+   

[PATCH v2 02/17] KVM: async_pf: Add helper function to check completion queue

2021-02-08 Thread Gavin Shan
This adds the inline function kvm_check_async_pf_completion_queue()
and a stub for !CONFIG_KVM_ASYNC_PF so that the source code won't
have to care about CONFIG_KVM_ASYNC_PF. The kernel option is used
only once in kvm_main.c and the #ifdef there can be removed. Besides,
the checks on the completion queue are all replaced by the newly
introduced helper, as list_empty() and list_empty_careful() are
interchangeable here.

The stub kvm_check_async_pf_completion() for !CONFIG_KVM_ASYNC_PF
is also introduced. It will be used by a subsequent patch.

Signed-off-by: Gavin Shan 
---
 arch/x86/kvm/x86.c   |  2 +-
 include/linux/kvm_host.h | 12 
 virt/kvm/async_pf.c  | 12 ++--
 virt/kvm/kvm_main.c  |  4 +---
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76bce832cade..f3c9fe5c424e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10794,7 +10794,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct 
kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 {
-   if (!list_empty_careful(&vcpu->async_pf.done))
+   if (kvm_check_async_pf_completion_queue(vcpu))
return true;
 
if (kvm_apic_has_events(vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b6697ee1182e..041d93f8f4b0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -316,11 +316,23 @@ struct kvm_async_pf {
boolnotpresent_injected;
 };
 
+static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+   return !list_empty_careful(&vcpu->async_pf.done);
+}
+
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
 void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
 bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
unsigned long hva, struct kvm_arch_async_pf *arch);
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
+#else
+static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
+static inline void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { }
 #endif
 
 
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index dd777688d14a..2cf864aafd0e 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -70,7 +70,7 @@ static void async_pf_execute(struct work_struct *work)
kvm_arch_async_page_present(vcpu, apf);
 
spin_lock(>async_pf.lock);
-   first = list_empty(>async_pf.done);
+   first = !kvm_check_async_pf_completion_queue(vcpu);
list_add_tail(>link, >async_pf.done);
apf->vcpu = NULL;
spin_unlock(>async_pf.lock);
@@ -122,7 +122,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu 
*vcpu)
spin_lock(>async_pf.lock);
}
 
-   while (!list_empty(>async_pf.done)) {
+   while (kvm_check_async_pf_completion_queue(vcpu)) {
struct kvm_async_pf *work =
list_first_entry(>async_pf.done,
 typeof(*work), link);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 {
struct kvm_async_pf *work;
 
-   while (!list_empty_careful(>async_pf.done) &&
- kvm_arch_can_dequeue_async_page_present(vcpu)) {
+   while (kvm_check_async_pf_completion_queue(vcpu) &&
+  kvm_arch_can_dequeue_async_page_present(vcpu)) {
spin_lock(>async_pf.lock);
work = list_first_entry(>async_pf.done, typeof(*work),
  link);
@@ -205,7 +205,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
struct kvm_async_pf *work;
bool first;
 
-   if (!list_empty_careful(>async_pf.done))
+   if (kvm_check_async_pf_completion_queue(vcpu))
return 0;
 
work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
@@ -216,7 +216,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
INIT_LIST_HEAD(>queue); /* for list_del to work */
 
spin_lock(>async_pf.lock);
-   first = list_empty(>async_pf.done);
+   first = !kvm_check_async_pf_completion_queue(vcpu);
list_add_tail(>link, >async_pf.done);
spin_unlock(>async_pf.lock);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8367d88ce39b..632b80b6e485 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2961,10 +2961,8 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
if (kvm_arch_dy_runnable(vcpu))
return true;
 
-#ifdef CONFIG_KVM_ASYNC_PF
-   if (!list_empty_careful(>async_pf.done))
+   if (kvm_check_async_pf_completion_queue(vcpu))
return true;
-#endif
 
return false;
 }
-- 
2.23.0



[PATCH v2 00/17] Support Asynchronous Page Fault

2021-02-08 Thread Gavin Shan
ieved from "info migrate":

 Param  -APF   +APF Output

store-global-state: on on
only-migratable:offoff
send-configuration: on on
send-section-footer:on on
decompress-error-check: on on
clear-bitmap-shift: 18 18
Migration status:   completed  completed
total time: 9576 ms10461 ms  +9.2%
downtime:   78 ms  44 ms -43.5%
setup:  62 ms  47ms  -24.1%
transferred ram:889007 kbytes  1206436 kbytes+35.7%
throughput: 765.53 mbps949.08 mbps   +24%
remaining ram:  0 kbytes   0 kbytes
total ram:  4325952 kbytes 4325952 kbytes
duplicate:  861559 pages   823954 pages
skipped:0 pages0 pages
normal: 219929 pages   299214 pages
normal bytes:   879716 kbytes  1196856 kbytes
dirty sync count:   2  2
page size:  4 kbytes   4 kbytes
multifd bytes:  0 kbytes   0 kbytes
pages-per-second:   33684  72400 +115%
postcopy request count: 12175-38%

The asynchronous page fault is beneficial to throughput and speed in
the scenario of post-copy live migration.

Changelog
=
v2:
   * Rebase to v5.11.rc6  (Gavin)
   * Split the patches(James)
   * Allocate "struct kvm_arch_async_control" dynamically and use
 it to check whether the feature has been enabled. The kernel
 option (CONFIG_KVM_ASYNC_PF) isn't used. (James)
   * Add document to explain the design   (James)
   * Make GFN hash table management generic   (James)
   * Add ioctl commands to support migration  (Gavin)

Gavin Shan (15):
  KVM: async_pf: Move struct kvm_async_pf around
  KVM: async_pf: Add helper function to check completion queue
  KVM: async_pf: Make GFN slot management generic
  KVM: x86: Use generic async PF slot management
  KVM: arm64: Export kvm_handle_user_mem_abort()
  KVM: arm64: Add paravirtualization header files
  KVM: arm64: Support page-not-present notification
  KVM: arm64: Support page-ready notification
  KVM: arm64: Support async PF hypercalls
  KVM: arm64: Support async PF ioctl commands
  KVM: arm64: Export async PF capability
  arm64: Detect async PF para-virtualization feature
  arm64: Reschedule process on async PF
  arm64: Enable async PF
  KVM: arm64: Add async PF document

Will Deacon (2):
  arm64: Probe for the presence of KVM hypervisor services during boot
  KVM: arm64: Advertise KVM UID to guests via SMCCC

 Documentation/virt/kvm/arm/apf.rst | 143 +++
 Documentation/virt/kvm/arm/index.rst   |   1 +
 arch/arm64/Kconfig |  11 +
 arch/arm64/include/asm/esr.h   |   6 +
 arch/arm64/include/asm/hypervisor.h|  11 +
 arch/arm64/include/asm/kvm_emulate.h   |  27 +-
 arch/arm64/include/asm/kvm_host.h  |  85 
 arch/arm64/include/asm/kvm_para.h  |  37 ++
 arch/arm64/include/asm/processor.h |   1 +
 arch/arm64/include/asm/thread_info.h   |   4 +-
 arch/arm64/include/uapi/asm/Kbuild |   2 -
 arch/arm64/include/uapi/asm/kvm.h  |  19 +
 arch/arm64/include/uapi/asm/kvm_para.h |  23 ++
 arch/arm64/include/uapi/asm/kvm_sdei.h |   1 +
 arch/arm64/kernel/Makefile |   1 +
 arch/arm64/kernel/kvm.c| 452 +
 arch/arm64/kernel/setup.c  |  32 ++
 arch/arm64/kernel/signal.c |  17 +
 arch/arm64/kvm/Kconfig |   2 +
 arch/arm64/kvm/Makefile|   1 +
 arch/arm64/kvm/arm.c   |  37 +-
 arch/arm64/kvm/async_pf.c  | 535 +
 arch/arm64/kvm/hypercalls.c|  33 +-
 arch/arm64/kvm/mmu.c   |  75 +++-
 arch/arm64/kvm/sdei.c  |   5 +
 arch/x86/include/asm/kvm_host.h|   1 -
 arch/x86/kvm/Kconfig   |   1 +
 arch/x86/kvm/mmu/mmu.c |   2 +-
 arch/x86/kvm/x86.c |  88 +---
 include/linux/arm-smccc.h  |  40 ++
 include/linux/kvm_host.h   |  73 +++-
 include/uapi/linux/kvm.h   |   3 +
 virt/kvm/Kconfig   |   3 +
 virt/kvm/async_pf.c|  91 -
 virt/kvm/kvm_main.c|   4 +-
 35 files changed, 1706 insertions(+), 161 deletions(-)
 create mode 100644 Documentation/virt/kvm/arm/apf.rst
 create mode 1006

[PATCH v2 15/21] KVM: arm64: Support SDEI event notifier

2021-02-08 Thread Gavin Shan
The owner of an SDEI event, such as the asynchronous page fault code,
needs to know the state of the injected SDEI event. This adds a notifier
mechanism so that the owner is informed when the event state is updated.
Note that the notifier (handler) must remain valid across migration.
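
As a rough illustration of the flow (not the kernel API, and with
simplified types), the owner's callback is stored as a plain integer in
the migratable event state and invoked on both transitions:

/*
 * Userspace sketch of the notifier flow: the event owner registers a
 * callback, which is invoked when the event is delivered to the vCPU
 * and again when the guest completes it.  Keeping the pointer as an
 * unsigned long mirrors the patch's state.notifier field, which is why
 * the handler must still resolve correctly after migration.
 */
#include <stdio.h>

enum { NOTIFY_DELIVERED, NOTIFY_COMPLETED };	/* models KVM_SDEI_NOTIFY_* */

typedef void (*event_notifier)(unsigned long num, unsigned int state);

struct event_state {
	unsigned long num;
	unsigned long notifier;		/* callback kept as an integer */
};

static void apf_notifier(unsigned long num, unsigned int state)
{
	printf("event %#lx %s\n", num,
	       state == NOTIFY_DELIVERED ? "delivered" : "completed");
}

static void notify(struct event_state *e, unsigned int state)
{
	event_notifier fn = (event_notifier)e->notifier;

	if (fn)
		fn(e->num, state);
}

int main(void)
{
	struct event_state e = {
		.num      = 0x400000,	/* illustrative event number */
		.notifier = (unsigned long)apf_notifier,
	};

	notify(&e, NOTIFY_DELIVERED);	/* on event delivery */
	notify(&e, NOTIFY_COMPLETED);	/* on COMPLETE/COMPLETE_AND_RESUME */
	return 0;
}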

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_sdei.h  | 12 +++
 arch/arm64/include/uapi/asm/kvm_sdei.h |  1 +
 arch/arm64/kvm/sdei.c  | 45 +-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
index 7f5f5ad689e6..19f2d9b91f85 100644
--- a/arch/arm64/include/asm/kvm_sdei.h
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -16,6 +16,16 @@
 #include 
 #include 
 
+struct kvm_vcpu;
+
+typedef void (*kvm_sdei_notifier)(struct kvm_vcpu *vcpu,
+ unsigned long num,
+ unsigned int state);
+enum {
+   KVM_SDEI_NOTIFY_DELIVERED,
+   KVM_SDEI_NOTIFY_COMPLETED,
+};
+
 struct kvm_sdei_event {
struct kvm_sdei_event_state state;
struct kvm  *kvm;
@@ -112,6 +122,8 @@ KVM_SDEI_FLAG_FUNC(enabled)
 void kvm_sdei_init_vm(struct kvm *kvm);
 void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu);
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu);
+int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num,
+  kvm_sdei_notifier notifier);
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu);
 void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu);
 void kvm_sdei_destroy_vm(struct kvm *kvm);
diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h 
b/arch/arm64/include/uapi/asm/kvm_sdei.h
index 9dbda2fb457e..20ad724f63c8 100644
--- a/arch/arm64/include/uapi/asm/kvm_sdei.h
+++ b/arch/arm64/include/uapi/asm/kvm_sdei.h
@@ -20,6 +20,7 @@ struct kvm_sdei_event_state {
uint8_t type;
uint8_t signaled;
uint8_t priority;
+   uint64_tnotifier;
 };
 
 struct kvm_sdei_kvm_event_state {
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 1e8e213c9d70..5f7a37dcaa77 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -314,9 +314,11 @@ static unsigned long kvm_sdei_hypercall_complete(struct 
kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
struct kvm_sdei_kvm_event *kske = NULL;
struct kvm_sdei_vcpu_event *ksve = NULL;
struct kvm_sdei_vcpu_regs *regs;
+   kvm_sdei_notifier notifier;
unsigned long ret = SDEI_SUCCESS;
int index;
 
@@ -349,6 +351,13 @@ static unsigned long kvm_sdei_hypercall_complete(struct 
kvm_vcpu *vcpu,
*vcpu_cpsr(vcpu) = regs->pstate;
*vcpu_pc(vcpu) = regs->pc;
 
+   /* Notifier */
+   kske = ksve->kske;
+   kse = kske->kse;
+   notifier = (kvm_sdei_notifier)(kse->state.notifier);
+   if (notifier)
+   notifier(vcpu, kse->state.num, KVM_SDEI_NOTIFY_COMPLETED);
+
/* Inject interrupt if needed */
if (resume)
kvm_inject_irq(vcpu);
@@ -358,7 +367,6 @@ static unsigned long kvm_sdei_hypercall_complete(struct 
kvm_vcpu *vcpu,
 * event state as it's not destroyed because of the reference
 * count.
 */
-   kske = ksve->kske;
ksve->state.refcount--;
kske->state.refcount--;
if (!ksve->state.refcount) {
@@ -746,6 +754,35 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
return 1;
 }
 
+int kvm_sdei_register_notifier(struct kvm *kvm,
+  unsigned long num,
+  kvm_sdei_notifier notifier)
+{
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   int ret = 0;
+
+   if (!ksdei) {
+   ret = -EPERM;
+   goto out;
+   }
+
+   spin_lock(>lock);
+
+   kse = kvm_sdei_find_event(kvm, num);
+   if (!kse) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   kse->state.notifier = (unsigned long)notifier;
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu)
 {
struct kvm *kvm = vcpu->kvm;
@@ -755,6 +792,7 @@ void kvm_sdei_deliver(struct kvm_vcpu *vcpu)
struct kvm_sdei_kvm_event *kske = NULL;
struct kvm_sdei_vcpu_event *ksve = NULL;
struct kvm_sdei_vcpu_regs *regs = NULL;
+   kvm_sdei_notifier notifier;
unsigned long pstate;
int index = 0;
 
@@ -826,6 +864,11 @@ void kvm_sdei_deliver(struct kvm_vcpu *vcpu)
*vcpu_cpsr(vcpu) = pstate;
*vcpu_pc(vcpu) = kske->state.entries[index];
 
+   /* Notifier */
+   notifier = (kvm_sdei_notifier)(kse->state.notifie

[PATCH v2 05/21] KVM: arm64: Support SDEI_EVENT_{ENABLE, DISABLE} hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_{ENABLE, DISABLE} hypercalls. After an
SDEI event is registered by the guest, it won't be delivered to the
guest until it's enabled. Conversely, the SDEI event won't be raised
to the guest or to a specific vCPU once it has been disabled there.
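
The checks behave roughly as in the following userspace model (return
codes and field names are illustrative, not the SDEI or KVM
definitions): the event must have no pending deliveries, must already
be registered on the index, and must not already be in the requested
state.

#include <stdbool.h>
#include <stdio.h>

enum { OK, ERR_INVALID, ERR_PENDING, ERR_DENIED };

struct kevent {
	unsigned long registered;	/* one bit per vCPU (private) or bit 0 (shared) */
	unsigned long enabled;
	unsigned int  refcount;		/* outstanding deliveries */
};

static int set_enabled(struct kevent *e, int index, bool enable)
{
	if (e->refcount)
		return ERR_PENDING;
	if (!(e->registered & (1UL << index)))
		return ERR_DENIED;
	if (enable == !!(e->enabled & (1UL << index)))
		return ERR_DENIED;		/* no state change requested */
	if (enable)
		e->enabled |= 1UL << index;
	else
		e->enabled &= ~(1UL << index);
	return OK;
}

int main(void)
{
	struct kevent e = { .registered = 1UL << 2 };	/* registered on vCPU 2 */

	printf("%d\n", set_enabled(&e, 2, true));	/* OK */
	printf("%d\n", set_enabled(&e, 2, true));	/* denied: already enabled */
	printf("%d\n", set_enabled(&e, 0, true));	/* denied: not registered */
	return 0;
}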

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 68 +++
 1 file changed, 68 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index d3ea3eee154b..b022ce0a202b 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -206,6 +206,70 @@ static unsigned long kvm_sdei_hypercall_register(struct 
kvm_vcpu *vcpu)
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_enable(struct kvm_vcpu *vcpu,
+  bool enable)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   unsigned long event_num = smccc_get_arg1(vcpu);
+   int index = 0;
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   if (!kvm_sdei_is_valid_event_num(event_num)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   /* Check if the KVM event exists */
+   spin_lock(>lock);
+   kske = kvm_sdei_find_kvm_event(kvm, event_num);
+   if (!kske) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto unlock;
+   }
+
+   /* Check if there is pending events */
+   if (kske->state.refcount) {
+   ret = SDEI_PENDING;
+   goto unlock;
+   }
+
+   /* Check if it has been registered */
+   kse = kske->kse;
+   index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ?
+   vcpu->vcpu_idx : 0;
+   if (!kvm_sdei_is_registered(kske, index)) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* Verify its enablement state */
+   if (enable == kvm_sdei_is_enabled(kske, index)) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* Update enablement state */
+   if (enable)
+   kvm_sdei_set_enabled(kske, index);
+   else
+   kvm_sdei_clear_enabled(kske, index);
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -220,7 +284,11 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_sdei_hypercall_register(vcpu);
break;
case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
+   ret = kvm_sdei_hypercall_enable(vcpu, true);
+   break;
case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
+   ret = kvm_sdei_hypercall_enable(vcpu, false);
+   break;
case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
-- 
2.23.0



[PATCH v2 03/21] KVM: arm64: Support SDEI_VERSION hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_VERSION hypercall by simply returning v1.0.0
when the functionality is supported on the VM and vCPU.
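
For reference, the returned value packs the version fields as defined
by the SDEI specification; the shifts below follow the kernel's SDEI
header (major version at bit 48, minor at bit 32), so decoding the
hypercall result looks roughly like this:

#include <stdio.h>

#define SDEI_VERSION_MAJOR_SHIFT	48
#define SDEI_VERSION_MINOR_SHIFT	32

int main(void)
{
	unsigned long ret = 1UL << SDEI_VERSION_MAJOR_SHIFT;	/* what KVM returns */

	printf("SDEI v%lu.%lu, vendor 0x%lx\n",
	       (ret >> SDEI_VERSION_MAJOR_SHIFT) & 0x7fff,	/* major */
	       (ret >> SDEI_VERSION_MINOR_SHIFT) & 0xffff,	/* minor */
	       ret & 0xffffffff);				/* vendor field */
	return 0;
}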

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index ab330b74a965..aa9485f076a9 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -70,6 +70,22 @@ static void kvm_sdei_remove_vcpu_events(struct kvm_vcpu 
*vcpu)
}
 }
 
+static unsigned long kvm_sdei_hypercall_version(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   unsigned long ret = SDEI_NOT_SUPPORTED;
+
+   if (!(ksdei && vsdei))
+   return ret;
+
+   /* v1.0.0 */
+   ret = (1UL << SDEI_VERSION_MAJOR_SHIFT);
+
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -78,6 +94,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 
switch (func) {
case SDEI_1_0_FN_SDEI_VERSION:
+   ret = kvm_sdei_hypercall_version(vcpu);
+   break;
case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
-- 
2.23.0



[PATCH v2 07/21] KVM: arm64: Support SDEI_EVENT_UNREGISTER hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_UNREGISTER hypercall. It's used by the
guest to unregister an SDEI event. The event won't be raised to the
guest or to a specific vCPU after it has been unregistered successfully.
Note that the event is also disabled automatically on the guest or the
specific vCPU as part of a successful unregistration.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index b4162efda470..a3ba69dc91cb 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -308,6 +308,65 @@ static unsigned long kvm_sdei_hypercall_context(struct 
kvm_vcpu *vcpu)
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_unregister(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   unsigned long event_num = smccc_get_arg1(vcpu);
+   int index = 0;
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   if (!kvm_sdei_is_valid_event_num(event_num)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   /* Check if the KVM event exists */
+   spin_lock(>lock);
+   kske = kvm_sdei_find_kvm_event(kvm, event_num);
+   if (!kske) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto unlock;
+   }
+
+   /* Check if there is pending events */
+   if (kske->state.refcount) {
+   ret = SDEI_PENDING;
+   goto unlock;
+   }
+
+   /* Check if it has been registered */
+   kse = kske->kse;
+   index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ?
+   vcpu->vcpu_idx : 0;
+   if (!kvm_sdei_is_registered(kske, index)) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* The event is disabled when it's unregistered */
+   kvm_sdei_clear_enabled(kske, index);
+   kvm_sdei_clear_registered(kske, index);
+   if (kvm_sdei_empty_registered(kske)) {
+   list_del(>link);
+   kfree(kske);
+   }
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -333,6 +392,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
+   ret = kvm_sdei_hypercall_unregister(vcpu);
+   break;
case SDEI_1_0_FN_SDEI_EVENT_STATUS:
case SDEI_1_0_FN_SDEI_EVENT_GET_INFO:
case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
-- 
2.23.0



[PATCH v2 14/21] KVM: arm64: Support SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME} hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME} hypercalls.
They are used by the guest to signal, from its handler, that the SDEI
event has been completed. The registers are changed according to the
SDEI specification as below:

   * x0 - x17, PC and PState are restored to what values we had in
 the interrupted context.

   * If it's SDEI_EVENT_COMPLETE_AND_RESUME hypercall, IRQ exception
 is injected.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_emulate.h |  1 +
 arch/arm64/include/asm/kvm_host.h|  1 +
 arch/arm64/kvm/hyp/exception.c   |  7 +++
 arch/arm64/kvm/inject_fault.c| 27 ++
 arch/arm64/kvm/sdei.c| 75 
 5 files changed, 111 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_emulate.h 
b/arch/arm64/include/asm/kvm_emulate.h
index f612c090f2e4..0ef213b715a5 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -37,6 +37,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
 void kvm_skip_instr32(struct kvm_vcpu *vcpu);
 
 void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_irq(struct kvm_vcpu *vcpu);
 void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 30e850257ef4..01eda5c84600 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -416,6 +416,7 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_EXCEPT_AA32_UND  (0 << 9)
 #define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9)
 #define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9)
+#define KVM_ARM64_EXCEPT_AA32_IRQ  (3 << 9)
 /* For AArch64: */
 #define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9)
 #define KVM_ARM64_EXCEPT_AA64_ELx_IRQ  (1 << 9)
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 73629094f903..c1e9bdb67b37 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -309,6 +309,9 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu)
case KVM_ARM64_EXCEPT_AA32_DABT:
enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16);
break;
+   case KVM_ARM64_EXCEPT_AA32_IRQ:
+   enter_exception32(vcpu, PSR_AA32_MODE_IRQ, 4);
+   break;
default:
/* Err... */
break;
@@ -319,6 +322,10 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu)
  KVM_ARM64_EXCEPT_AA64_EL1):
enter_exception64(vcpu, PSR_MODE_EL1h, 
except_type_sync);
break;
+   case (KVM_ARM64_EXCEPT_AA64_ELx_IRQ |
+ KVM_ARM64_EXCEPT_AA64_EL1):
+   enter_exception64(vcpu, PSR_MODE_EL1h, except_type_irq);
+   break;
default:
/*
 * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ}
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index b47df73e98d7..3a8c55867d2f 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -66,6 +66,13 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
 }
 
+static void inject_irq64(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 |
+KVM_ARM64_EXCEPT_AA64_ELx_IRQ |
+KVM_ARM64_PENDING_EXCEPTION);
+}
+
 #define DFSR_FSC_EXTABT_LPAE   0x10
 #define DFSR_FSC_EXTABT_nLPAE  0x08
 #define DFSR_LPAE  BIT(9)
@@ -77,6 +84,12 @@ static void inject_undef32(struct kvm_vcpu *vcpu)
 KVM_ARM64_PENDING_EXCEPTION);
 }
 
+static void inject_irq32(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IRQ |
+KVM_ARM64_PENDING_EXCEPTION);
+}
+
 /*
  * Modelled after TakeDataAbortException() and TakePrefetchAbortException
  * pseudocode.
@@ -160,6 +173,20 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
inject_undef64(vcpu);
 }
 
+/**
+ * kvm_inject_irq - inject an IRQ into the guest
+ *
+ * It is assumed that this code is called from the VCPU thread and that the
+ * VCPU therefore is not currently executing guest code.
+ */
+void kvm_inject_irq(struct kvm_vcpu *vcpu)
+{
+   if (vcpu_el1_is_32bit(vcpu))
+   inject_irq32(vcpu);
+   else
+   inject_irq64(vcpu);
+}
+
 void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
 {
vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index b5d6d1ed3858..1e8e213c9d70 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arc
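
The sdei.c hunk is truncated above; conceptually, completion restores
the context that was captured at delivery time and, for
COMPLETE_AND_RESUME, additionally leaves an IRQ pending for the guest.
A minimal userspace model with illustrative structures:

#include <stdbool.h>
#include <string.h>
#include <stdio.h>

struct saved_regs {
	unsigned long regs[18];		/* x0 - x17 of the interrupted context */
	unsigned long pc;
	unsigned long pstate;
};

struct vcpu_model {
	unsigned long regs[31];
	unsigned long pc;
	unsigned long pstate;
	bool irq_pending;
};

static void sdei_complete(struct vcpu_model *vcpu,
			  const struct saved_regs *saved, bool resume)
{
	int i;

	for (i = 0; i < 18; i++)	/* restore x0 - x17 */
		vcpu->regs[i] = saved->regs[i];
	vcpu->pc = saved->pc;
	vcpu->pstate = saved->pstate;

	if (resume)			/* COMPLETE_AND_RESUME */
		vcpu->irq_pending = true;
}

int main(void)
{
	struct saved_regs saved = { .pc = 0x80001000UL, .pstate = 0x3c5 };
	struct vcpu_model vcpu;

	memset(&vcpu, 0, sizeof(vcpu));
	sdei_complete(&vcpu, &saved, true);
	printf("pc=0x%lx pstate=0x%lx irq=%d\n",
	       vcpu.pc, vcpu.pstate, vcpu.irq_pending);
	return 0;
}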

[PATCH v2 17/21] KVM: arm64: Support SDEI ioctl commands on vCPU

2021-02-08 Thread Gavin Shan
This supports ioctl commands on the vCPU to manage the various objects.
They are primarily used by the VMM to accomplish live migration. The
ioctl commands introduced by this patch are listed below:

   * KVM_SDEI_CMD_GET_VEVENT_COUNT
 Retrieve number of SDEI events that pend for handling on the
 vCPU
   * KVM_SDEI_CMD_GET_VEVENT
 Retrieve the state of SDEI event, which has been delivered to
 the vCPU for handling
   * KVM_SDEI_CMD_SET_VEVENT
 Populate the SDEI event, which has been delivered to the vCPU
 for handling
   * KVM_SDEI_CMD_GET_VCPU_STATE
 Retrieve vCPU state related to SDEI handling
   * KVM_SDEI_CMD_SET_VCPU_STATE
 Populate vCPU state related to SDEI handling

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_sdei.h  |   1 +
 arch/arm64/include/uapi/asm/kvm_sdei.h |   7 +
 arch/arm64/kvm/arm.c   |   3 +
 arch/arm64/kvm/sdei.c  | 228 +
 4 files changed, 239 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
index 8f5ea947ed0e..a997989bab77 100644
--- a/arch/arm64/include/asm/kvm_sdei.h
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -126,6 +126,7 @@ int kvm_sdei_register_notifier(struct kvm *kvm, unsigned 
long num,
   kvm_sdei_notifier notifier);
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu);
 long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg);
+long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg);
 void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu);
 void kvm_sdei_destroy_vm(struct kvm *kvm);
 
diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h 
b/arch/arm64/include/uapi/asm/kvm_sdei.h
index 55de8baff841..3485843dd6df 100644
--- a/arch/arm64/include/uapi/asm/kvm_sdei.h
+++ b/arch/arm64/include/uapi/asm/kvm_sdei.h
@@ -59,6 +59,11 @@ struct kvm_sdei_vcpu_state {
 #define KVM_SDEI_CMD_GET_KEVENT_COUNT  2
 #define KVM_SDEI_CMD_GET_KEVENT3
 #define KVM_SDEI_CMD_SET_KEVENT4
+#define KVM_SDEI_CMD_GET_VEVENT_COUNT  5
+#define KVM_SDEI_CMD_GET_VEVENT6
+#define KVM_SDEI_CMD_SET_VEVENT7
+#define KVM_SDEI_CMD_GET_VCPU_STATE8
+#define KVM_SDEI_CMD_SET_VCPU_STATE9
 
 struct kvm_sdei_cmd {
uint32_tcmd;
@@ -68,6 +73,8 @@ struct kvm_sdei_cmd {
uint64_tnum;
struct kvm_sdei_event_state kse_state;
struct kvm_sdei_kvm_event_state kske_state;
+   struct kvm_sdei_vcpu_event_stateksve_state;
+   struct kvm_sdei_vcpu_state  ksv_state;
};
 };
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 96b41bf1d094..55ccd234b0ec 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1260,6 +1260,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
return kvm_arm_vcpu_finalize(vcpu, what);
}
+   case KVM_ARM_SDEI_COMMAND: {
+   return kvm_sdei_vcpu_ioctl(vcpu, arg);
+   }
default:
r = -EINVAL;
}
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index bdd76c3e5153..79315b77f24b 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -35,6 +35,25 @@ static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm 
*kvm,
return NULL;
 }
 
+static struct kvm_sdei_vcpu_event *kvm_sdei_find_vcpu_event(struct kvm_vcpu 
*vcpu,
+   unsigned long num)
+{
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_vcpu_event *ksve;
+
+   list_for_each_entry(ksve, >critical_events, link) {
+   if (ksve->state.num == num)
+   return ksve;
+   }
+
+   list_for_each_entry(ksve, >normal_events, link) {
+   if (ksve->state.num == num)
+   return ksve;
+   }
+
+   return NULL;
+}
+
 static void kvm_sdei_remove_events(struct kvm *kvm)
 {
struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
@@ -1102,6 +1121,215 @@ long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long 
arg)
return ret;
 }
 
+static long kvm_sdei_get_vevent_count(struct kvm_vcpu *vcpu, int *count)
+{
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_vcpu_event *ksve = NULL;
+   int total = 0;
+
+   list_for_each_entry(ksve, >critical_events, link) {
+   total++;
+   }
+
+   list_for_each_entry(ksve, >normal_events, link) {
+   total++;
+   }
+
+   *count = total;
+   return 0;
+}
+
+static struct kvm_sdei_vcpu_event *next_vcpu_event(struct kvm_vcpu *vcpu,
+  unsigned long num)
+{
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
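
The diff is truncated above. A rough sketch of how a VMM might drive
these commands on the migration source follows; KVM_ARM_SDEI_COMMAND,
struct kvm_sdei_cmd and the command numbers come from this series, the
serialization of the returned state is left out, and error handling is
trimmed. This is an illustration, not the VMM's actual code.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm_sdei.h>	/* uapi header introduced by this series */

static int save_vcpu_sdei_state(int vcpu_fd)
{
	struct kvm_sdei_cmd cmd = { .cmd = KVM_SDEI_CMD_GET_VCPU_STATE };

	if (ioctl(vcpu_fd, KVM_ARM_SDEI_COMMAND, &cmd) < 0)
		return -1;

	/*
	 * cmd.ksv_state now holds the vCPU's SDEI context.  The pending
	 * vCPU events would be walked with KVM_SDEI_CMD_GET_VEVENT_COUNT
	 * and KVM_SDEI_CMD_GET_VEVENT, and the destination replays both
	 * with the corresponding SET commands after restore.
	 */
	printf("saved SDEI vCPU state for fd %d\n", vcpu_fd);
	return 0;
}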

[PATCH v2 10/21] KVM: arm64: Support SDEI_EVENT_ROUTING_SET hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_ROUTING_SET hypercall. It's used by the
guest to set the routing mode and affinity for a registered KVM event.
It's only valid for shared events, and it's not allowed once the
corresponding event has been raised to the guest.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 64 +++
 1 file changed, 64 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 5dfa74b093f1..458695c2394f 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -489,6 +489,68 @@ static unsigned long kvm_sdei_hypercall_info(struct 
kvm_vcpu *vcpu)
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_route(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   unsigned long event_num = smccc_get_arg1(vcpu);
+   unsigned long route_mode = smccc_get_arg2(vcpu);
+   unsigned long route_affinity = smccc_get_arg3(vcpu);
+   int index = 0;
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   if (!kvm_sdei_is_valid_event_num(event_num)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   if (!(route_mode == SDEI_EVENT_REGISTER_RM_ANY ||
+ route_mode == SDEI_EVENT_REGISTER_RM_PE)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   /* Check if the KVM event has been registered */
+   spin_lock(>lock);
+   kske = kvm_sdei_find_kvm_event(kvm, event_num);
+   if (!kske) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto unlock;
+   }
+
+   /* Validate KVM event state */
+   kse = kske->kse;
+   if (kse->state.type != SDEI_EVENT_TYPE_SHARED) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto unlock;
+   }
+
+   if (!kvm_sdei_is_registered(kske, index) ||
+   kvm_sdei_is_enabled(kske, index) ||
+   kske->state.refcount) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* Update state */
+   kske->state.route_mode = route_mode;
+   kske->state.route_affinity = route_affinity;
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -523,6 +585,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_sdei_hypercall_info(vcpu);
break;
case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
+   ret = kvm_sdei_hypercall_route(vcpu);
+   break;
case SDEI_1_0_FN_SDEI_PE_MASK:
case SDEI_1_0_FN_SDEI_PE_UNMASK:
case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
-- 
2.23.0



[PATCH v2 06/21] KVM: arm64: Support SDEI_EVENT_CONTEXT hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_CONTEXT hypercall. It's used by the guest
to retrieve the original registers (R0 - R17) from within its SDEI event
handler, since those registers can be corrupted during SDEI event
delivery.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index b022ce0a202b..b4162efda470 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -270,6 +270,44 @@ static unsigned long kvm_sdei_hypercall_enable(struct 
kvm_vcpu *vcpu,
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_context(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_vcpu_regs *regs;
+   unsigned long index = smccc_get_arg1(vcpu);
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   if (index >= ARRAY_SIZE(vsdei->state.critical_regs.regs)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   /* Check if the pending event exists */
+   spin_lock(>lock);
+   if (!(vsdei->critical_event || vsdei->normal_event)) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* Fetch the requested register */
+   regs = vsdei->critical_event ? >state.critical_regs :
+  >state.normal_regs;
+   ret = regs->regs[index];
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -290,6 +328,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_sdei_hypercall_enable(vcpu, false);
break;
case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
+   ret = kvm_sdei_hypercall_context(vcpu);
+   break;
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
-- 
2.23.0



[PATCH v2 04/21] KVM: arm64: Support SDEI_EVENT_REGISTER hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_EVENT_REGISTER hypercall, which is used by the
guest to register SDEI events. An SDEI event won't be raised to the guest
or to a specific vCPU until it has been registered and enabled explicitly.

Only those events that have been exported by KVM can be registered.
After the event is registered successfully, the KVM SDEI event (object)
is created, or updated if it already exists, since the same KVM SDEI
event object is shared by multiple vCPUs when the event is private.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 122 ++
 1 file changed, 122 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index aa9485f076a9..d3ea3eee154b 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -21,6 +21,20 @@ static struct kvm_sdei_event_state defined_kse[] = {
},
 };
 
+static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm *kvm,
+ unsigned long num)
+{
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_event *kse;
+
+   list_for_each_entry(kse, >events, link) {
+   if (kse->state.num == num)
+   return kse;
+   }
+
+   return NULL;
+}
+
 static void kvm_sdei_remove_events(struct kvm *kvm)
 {
struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
@@ -32,6 +46,20 @@ static void kvm_sdei_remove_events(struct kvm *kvm)
}
 }
 
+static struct kvm_sdei_kvm_event *kvm_sdei_find_kvm_event(struct kvm *kvm,
+ unsigned long num)
+{
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_kvm_event *kske;
+
+   list_for_each_entry(kske, >kvm_events, link) {
+   if (kske->state.num == num)
+   return kske;
+   }
+
+   return NULL;
+}
+
 static void kvm_sdei_remove_kvm_events(struct kvm *kvm,
   unsigned int mask,
   bool force)
@@ -86,6 +114,98 @@ static unsigned long kvm_sdei_hypercall_version(struct 
kvm_vcpu *vcpu)
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_register(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   unsigned long event_num = smccc_get_arg1(vcpu);
+   unsigned long event_entry = smccc_get_arg2(vcpu);
+   unsigned long event_param = smccc_get_arg3(vcpu);
+   unsigned long route_mode = smccc_get_arg4(vcpu);
+   unsigned long route_affinity = smccc_get_arg5(vcpu);
+   int index = vcpu->vcpu_idx;
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   if (!kvm_sdei_is_valid_event_num(event_num)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   if (!(route_mode == SDEI_EVENT_REGISTER_RM_ANY ||
+ route_mode == SDEI_EVENT_REGISTER_RM_PE)) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto out;
+   }
+
+   /*
+* The KVM event could have been created if it's a private event.
+* We needn't create a KVM event in this case.
+*/
+   spin_lock(>lock);
+   kske = kvm_sdei_find_kvm_event(kvm, event_num);
+   if (kske) {
+   kse = kske->kse;
+   index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ?
+   vcpu->vcpu_idx : 0;
+
+   if (kvm_sdei_is_registered(kske, index)) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   kske->state.route_mode = route_mode;
+   kske->state.route_affinity = route_affinity;
+   kske->state.entries[index] = event_entry;
+   kske->state.params[index]  = event_param;
+   kvm_sdei_set_registered(kske, index);
+   goto unlock;
+   }
+
+   /* Check if the event number has been registered */
+   kse = kvm_sdei_find_event(kvm, event_num);
+   if (!kse) {
+   ret = SDEI_INVALID_PARAMETERS;
+   goto unlock;
+   }
+
+   /* Create KVM event */
+   kske = kzalloc(sizeof(*kske), GFP_KERNEL);
+   if (!kske) {
+   ret = SDEI_OUT_OF_RESOURCE;
+   goto unlock;
+   }
+
+   /* Initialize KVM event state */
+   index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ?
+   vcpu->vcpu_idx : 0;
+   kske->state.num= event_num;
+   kske->state.refcount   = 0;
+   kske->state.route_mode = route_mode;
+   kske->state.route_affini
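
The hunk is truncated above. One convention worth calling out, since it
recurs throughout the series, is the slot indexing: a private event
keeps per-vCPU state indexed by vcpu_idx, while a shared event keeps a
single slot at index 0. A small illustrative model (types and sizes are
made up for the example):

#include <stdio.h>

enum ev_type { EV_SHARED, EV_PRIVATE };

struct kevent_model {
	enum ev_type type;
	unsigned long entries[8];	/* handler entry point, per index */
	unsigned long params[8];	/* handler argument, per index */
};

static int slot_index(const struct kevent_model *e, int vcpu_idx)
{
	return e->type == EV_PRIVATE ? vcpu_idx : 0;
}

static void do_register(struct kevent_model *e, int vcpu_idx,
			unsigned long entry, unsigned long param)
{
	int index = slot_index(e, vcpu_idx);

	e->entries[index] = entry;
	e->params[index] = param;
}

int main(void)
{
	struct kevent_model priv = { .type = EV_PRIVATE };
	struct kevent_model shared = { .type = EV_SHARED };

	do_register(&priv, 3, 0x1000, 0xaa);	/* lands in slot 3 */
	do_register(&shared, 3, 0x2000, 0xbb);	/* lands in slot 0 */
	printf("private slot3=0x%lx shared slot0=0x%lx\n",
	       priv.entries[3], shared.entries[0]);
	return 0;
}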

[PATCH v2 02/21] KVM: arm64: Add SDEI virtualization infrastructure

2021-02-08 Thread Gavin Shan
Software Delegated Exception Interface (SDEI) provides a mechanism for
registering and servicing system events. Those system events are high
priority events, which must be serviced immediately. It's going to be
used by Asynchronous Page Fault (APF) to deliver notifications from KVM
to the guest. Note that SDEI is defined by the ARM DEN0054A specification.

This introduces the SDEI virtualization infrastructure where SDEI events
are registered and manipulated by the guest through hypercalls. The SDEI
event is delivered to one specific vCPU by KVM once it's raised. This
introduces the data structures representing the needed objects, which
are highlighted below. As those objects have to be transferred during
VM migration, the data structures are partially exported to user space.

   * kvm_sdei_event
 SDEI events exported from KVM so that the guest is able to register
 and manipulate them.
   * kvm_sdei_kvm_event
 SDEI event that has been registered by the guest.
   * kvm_sdei_vcpu_event
 SDEI event that has been delivered to the target vCPU.
   * kvm_sdei_kvm
 Place holder of exported and registered SDEI events.
   * kvm_sdei_vcpu
 Auxiliary object to save the preempted context during SDEI event
 delivery.

The error is returned for all SDEI hypercalls for now. They will be
supported by subsequent patches.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h  |   4 +
 arch/arm64/include/asm/kvm_sdei.h  | 118 +++
 arch/arm64/include/uapi/asm/kvm.h  |   1 +
 arch/arm64/include/uapi/asm/kvm_sdei.h |  56 +++
 arch/arm64/kvm/Makefile|   2 +-
 arch/arm64/kvm/arm.c   |   7 +
 arch/arm64/kvm/hypercalls.c|  18 +++
 arch/arm64/kvm/sdei.c  | 198 +
 8 files changed, 403 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/kvm_sdei.h
 create mode 100644 arch/arm64/include/uapi/asm/kvm_sdei.h
 create mode 100644 arch/arm64/kvm/sdei.c

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 8fcfab0c2567..b2d51c6d055c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -133,6 +133,8 @@ struct kvm_arch {
 
u8 pfr0_csv2;
u8 pfr0_csv3;
+
+   struct kvm_sdei_kvm *sdei;
 };
 
 struct kvm_vcpu_fault_info {
@@ -370,6 +372,8 @@ struct kvm_vcpu_arch {
u64 last_steal;
gpa_t base;
} steal;
+
+   struct kvm_sdei_vcpu *sdei;
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
new file mode 100644
index ..b0abc13a0256
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Definitions of various KVM SDEI events.
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan 
+ */
+
+#ifndef __ARM64_KVM_SDEI_H__
+#define __ARM64_KVM_SDEI_H__
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct kvm_sdei_event {
+   struct kvm_sdei_event_state state;
+   struct kvm  *kvm;
+   struct list_headlink;
+};
+
+struct kvm_sdei_kvm_event {
+   struct kvm_sdei_kvm_event_state state;
+   struct kvm_sdei_event   *kse;
+   struct kvm  *kvm;
+   struct list_headlink;
+};
+
+struct kvm_sdei_vcpu_event {
+   struct kvm_sdei_vcpu_event_statestate;
+   struct kvm_sdei_kvm_event   *kske;
+   struct kvm_vcpu *vcpu;
+   struct list_headlink;
+};
+
+struct kvm_sdei_kvm {
+   spinlock_t  lock;
+   struct list_headevents; /* kvm_sdei_event */
+   struct list_headkvm_events; /* kvm_sdei_kvm_event */
+};
+
+struct kvm_sdei_vcpu {
+   spinlock_t  lock;
+   struct kvm_sdei_vcpu_state  state;
+   struct kvm_sdei_vcpu_event  *critical_event;
+   struct kvm_sdei_vcpu_event  *normal_event;
+   struct list_headcritical_events;
+   struct list_headnormal_events;
+};
+
+/*
+ * According to SDEI specification (v1.0), the event number spans 32-bits
+ * and the lower 24-bits are used as the (real) event number. I don't
+ * think we can use that much SDEI numbers in one system. So we reserve
+ * two bits from the 24-bits real event number, to indicate its types:
+ * physical event and virtual event. One reserved bit is enough for now,
+ * but two bits are reserved for possible extension in future.
+ *
+ * The physical events are owned by underly firmware while the virtual
+ * events are used by VMM and KVM.
+ */
+#define KVM_SDEI_EV_NUM_TYPE_SHIFT 22
+#define KVM_SDEI_EV_NUM_TYPE_MASK
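
The mask definition is cut off above. Based on the comment and the cover
letter (KVM-owned events carry 0b01 in bits [23:22] of the 24-bit event
number), classifying an event number looks roughly like the following;
the mask and type values here are assumptions, not the series' macros:

#include <stdbool.h>
#include <stdio.h>

#define EV_NUM_TYPE_SHIFT	22
#define EV_NUM_TYPE_MASK	0x3UL	/* assumed: two reserved type bits */
#define EV_NUM_TYPE_KVM		0x1UL	/* assumed: 0b01 marks a KVM-owned event */

static bool is_kvm_owned_event(unsigned long num)
{
	/* bits [23:22] of the 24-bit event number encode the owner */
	return ((num >> EV_NUM_TYPE_SHIFT) & EV_NUM_TYPE_MASK) == EV_NUM_TYPE_KVM;
}

int main(void)
{
	printf("0x400000 -> %d, 0x000100 -> %d\n",
	       is_kvm_owned_event(0x400000), is_kvm_owned_event(0x000100));
	return 0;
}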

[PATCH v2 00/21] Support SDEI Virtualization

2021-02-08 Thread Gavin Shan
This series intends to virtualize Software Delegated Exception Interface
(SDEI), which is defined by ARM DEN0054A. It allows the hypervisor to
deliver NMI-like events to the guest and is needed by asynchronous page
fault to deliver page-not-present notifications from the hypervisor to
the guest. The code and the required qemu changes can be found at:

   https://github.com/gwshan/linux("sdei")
   https://github.com/gwshan/qemu.git ("apf")

The SDEI event is identified by a 32-bits number. Bits[31:24] are used
to indicate the SDEI event properties while bits[23:0] are identifying
the unique number. The implementation takes bits[23:22] to indicate the
owner of the SDEI event. For example, those SDEI events owned by KVM
should have these two bits set to 0b01. Besides, the implementation
supports SDEI events owned by KVM only.

The design is pretty straightforward and the implementation is just
following the SDEI specification. There are several data structures
introduced. Some of the objects have to be migrated by VMM. So their
definitions are split up so that VMM can include their states for
migration.

   struct kvm_sdei_kvm
  Associated with VM and used to track the KVM exposed SDEI events
  and those registered by guest.
   struct kvm_sdei_vcpu
  Associated with vCPU and used to track SDEI event delivery. The
  preempted context is saved prior to the delivery and restored
  after that.
   struct kvm_sdei_event
  SDEI events exposed by KVM so that guest can register and enable.
   struct kvm_sdei_kvm_event
  SDEI events that have been registered by guest.
   struct kvm_sdei_vcpu_event
  SDEI events that have been queued to specific vCPU for delivery.

The series is organized as below:

   PATCH[01]Introduces template for smccc_get_argx()
   PATCH[02]Introduces the data structures and infrastructure
   PATCH[03-14] Supports various SDEI related hypercalls
   PATCH[15]Supports SDEI event notification
   PATCH[16-17] Introduces ioctl command for migration
   PATCH[18-19] Supports SDEI event injection and cancellation
   PATCH[20]Exports SDEI capability
   PATCH[21]Adds self-test case for SDEI virtualization

Testing
===

There are two additional patches in the following repository: one creates
procfs files that allow injecting SDEI events, the other adds a driver for
the guest to use the SDEI event. Besides, additional qemu changes are
needed so that the guest can detect the SDEI service through the ACPI table.

https://github.com/gwshan/linux("sdei")
https://github.com/gwshan/qemu.git ("apf")

The SDEI event is received and handled in the guest after it's injected
through the procfs files on host.

Changelog
=
v2:
   * Rebased to 5.11.rc6
   * Dropped changes related to SDEI client driver(Gavin)
   * Removed support for passthrough SDEI events (Gavin)
   * Redesigned data structures   (Gavin)
   * Implementation is almost rewritten as the data structures
 are totally changed  (Gavin)
   * Added ioctl commands to support migration(Gavin)

Gavin Shan (21):
  KVM: arm64: Introduce template for inline functions
  KVM: arm64: Add SDEI virtualization infrastructure
  KVM: arm64: Support SDEI_VERSION hypercall
  KVM: arm64: Support SDEI_EVENT_REGISTER hypercall
  KVM: arm64: Support SDEI_EVENT_{ENABLE, DISABLE} hypercall
  KVM: arm64: Support SDEI_EVENT_CONTEXT hypercall
  KVM: arm64: Support SDEI_EVENT_UNREGISTER hypercall
  KVM: arm64: Support SDEI_EVENT_STATUS hypercall
  KVM: arm64: Support SDEI_EVENT_GET_INFO hypercall
  KVM: arm64: Support SDEI_EVENT_ROUTING_SET hypercall
  KVM: arm64: Support SDEI_PE_{MASK, UNMASK} hypercall
  KVM: arm64: Support SDEI_{PRIVATE, SHARED}_RESET hypercall
  KVM: arm64: Implement SDEI event delivery
  KVM: arm64: Support SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME}
hypercall
  KVM: arm64: Support SDEI event notifier
  KVM: arm64: Support SDEI ioctl commands on VM
  KVM: arm64: Support SDEI ioctl commands on vCPU
  KVM: arm64: Support SDEI event injection
  KVM: arm64: Support SDEI event cancellation
  KVM: arm64: Export SDEI capability
  KVM: selftests: Add SDEI test case

 arch/arm64/include/asm/kvm_emulate.h   |1 +
 arch/arm64/include/asm/kvm_host.h  |6 +
 arch/arm64/include/asm/kvm_sdei.h  |  136 ++
 arch/arm64/include/uapi/asm/kvm.h  |1 +
 arch/arm64/include/uapi/asm/kvm_sdei.h |   82 ++
 arch/arm64/kvm/Makefile|2 +-
 arch/arm64/kvm/arm.c   |   19 +
 arch/arm64/kvm/hyp/exception.c |7 +
 arch/arm64/kvm/hypercalls.c|   18 +
 arch/arm64/kvm/inject_fault.c  |   27 +
 arch/arm64/kvm/sdei.c  | 1519 
 include/kvm/arm_hypercalls.h   |   34 +-
 include/uapi/linux/kvm.h   |4

[PATCH v2 11/21] KVM: arm64: Support SDEI_PE_{MASK, UNMASK} hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_PE_{MASK, UNMASK} hypercalls. They are used by
the guest to stop a specific vCPU from receiving SDEI events (mask)
and to allow delivery again (unmask).

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 458695c2394f..3fb33258b494 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -551,6 +551,37 @@ static unsigned long kvm_sdei_hypercall_route(struct 
kvm_vcpu *vcpu)
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_mask(struct kvm_vcpu *vcpu,
+bool mask)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   spin_lock(>lock);
+
+   /* Check the state */
+   if (mask == vsdei->state.masked) {
+   ret = SDEI_DENIED;
+   goto unlock;
+   }
+
+   /* Update the state */
+   vsdei->state.masked = mask ? 1 : 0;
+
+unlock:
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -588,7 +619,11 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_sdei_hypercall_route(vcpu);
break;
case SDEI_1_0_FN_SDEI_PE_MASK:
+   ret = kvm_sdei_hypercall_mask(vcpu, true);
+   break;
case SDEI_1_0_FN_SDEI_PE_UNMASK:
+   ret = kvm_sdei_hypercall_mask(vcpu, false);
+   break;
case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
-- 
2.23.0



[PATCH v2 12/21] KVM: arm64: Support SDEI_{PRIVATE, SHARED}_RESET hypercall

2021-02-08 Thread Gavin Shan
This supports the SDEI_{PRIVATE, SHARED}_RESET hypercalls. They are used
by the guest to purge previously registered private or shared SDEI
events.

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/sdei.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 3fb33258b494..62efee2b67b8 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -582,6 +582,29 @@ static unsigned long kvm_sdei_hypercall_mask(struct 
kvm_vcpu *vcpu,
return ret;
 }
 
+static unsigned long kvm_sdei_hypercall_reset(struct kvm_vcpu *vcpu,
+ bool private)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   unsigned int mask = private ? (1 << SDEI_EVENT_TYPE_PRIVATE) :
+ (1 << SDEI_EVENT_TYPE_SHARED);
+   unsigned long ret = SDEI_SUCCESS;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = SDEI_NOT_SUPPORTED;
+   goto out;
+   }
+
+   spin_lock(>lock);
+   kvm_sdei_remove_kvm_events(kvm, mask, false);
+   spin_unlock(>lock);
+out:
+   return ret;
+}
+
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
 {
u32 func = smccc_get_function(vcpu);
@@ -626,8 +649,14 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
break;
case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
+   ret = SDEI_NOT_SUPPORTED;
+   break;
case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
+   ret = kvm_sdei_hypercall_reset(vcpu, true);
+   break;
case SDEI_1_0_FN_SDEI_SHARED_RESET:
+   ret = kvm_sdei_hypercall_reset(vcpu, false);
+   break;
default:
ret = SDEI_NOT_SUPPORTED;
}
-- 
2.23.0



[PATCH v2 20/21] KVM: arm64: Export SDEI capability

2021-02-08 Thread Gavin Shan
The SDEI functionality is now ready to be exported. This adds a
new capability (KVM_CAP_ARM_SDEI) and advertises it to user space.
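
User space can probe the capability with the standard KVM_CHECK_EXTENSION
mechanism; only KVM_CAP_ARM_SDEI itself comes from this series (so the
headers from this tree are needed). A minimal sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int has_sdei = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_SDEI);

	printf("KVM_CAP_ARM_SDEI: %s\n",
	       has_sdei > 0 ? "supported" : "not supported");
	return 0;
}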

Signed-off-by: Gavin Shan 
---
 arch/arm64/kvm/arm.c | 3 +++
 include/uapi/linux/kvm.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 55ccd234b0ec..f8b44a29e164 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -266,6 +266,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = system_has_full_ptr_auth();
break;
+   case KVM_CAP_ARM_SDEI:
+   r = 1;
+   break;
default:
r = 0;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b056b4ac884b..133128d45fcb 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
 #define KVM_CAP_SYS_HYPERV_CPUID 191
 #define KVM_CAP_DIRTY_LOG_RING 192
+#define KVM_CAP_ARM_SDEI 193
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.23.0



[PATCH v2 18/21] KVM: arm64: Support SDEI event injection

2021-02-08 Thread Gavin Shan
This supports SDEI event injection by implementing kvm_sdei_inject().
It's called either by the kernel directly or by the VMM through an
ioctl command to inject an SDEI event into a specific vCPU.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_sdei.h  |   2 +
 arch/arm64/include/uapi/asm/kvm_sdei.h |   1 +
 arch/arm64/kvm/sdei.c  | 108 +
 3 files changed, 111 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
index a997989bab77..51087fe971ba 100644
--- a/arch/arm64/include/asm/kvm_sdei.h
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -124,6 +124,8 @@ void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu);
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu);
 int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num,
   kvm_sdei_notifier notifier);
+int kvm_sdei_inject(struct kvm_vcpu *vcpu,
+   unsigned long num, bool immediate);
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu);
 long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg);
 long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg);
diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h 
b/arch/arm64/include/uapi/asm/kvm_sdei.h
index 3485843dd6df..232092de5e21 100644
--- a/arch/arm64/include/uapi/asm/kvm_sdei.h
+++ b/arch/arm64/include/uapi/asm/kvm_sdei.h
@@ -64,6 +64,7 @@ struct kvm_sdei_vcpu_state {
 #define KVM_SDEI_CMD_SET_VEVENT7
 #define KVM_SDEI_CMD_GET_VCPU_STATE8
 #define KVM_SDEI_CMD_SET_VCPU_STATE9
+#define KVM_SDEI_CMD_INJECT_EVENT  10
 
 struct kvm_sdei_cmd {
uint32_tcmd;
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 79315b77f24b..7c2789cd1421 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -802,6 +802,111 @@ int kvm_sdei_register_notifier(struct kvm *kvm,
return ret;
 }
 
+int kvm_sdei_inject(struct kvm_vcpu *vcpu,
+   unsigned long num,
+   bool immediate)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   struct kvm_sdei_vcpu_event *ksve = NULL;
+   int index, ret = 0;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei)) {
+   ret = -EPERM;
+   goto out;
+   }
+
+   if (!kvm_sdei_is_valid_event_num(num)) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   /* Check the kvm event */
+   spin_lock(>lock);
+   kske = kvm_sdei_find_kvm_event(kvm, num);
+   if (!kske) {
+   ret = -ENOENT;
+   goto unlock_kvm;
+   }
+
+   kse = kske->kse;
+   index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ?
+   vcpu->vcpu_idx : 0;
+   if (!(kvm_sdei_is_registered(kske, index) &&
+ kvm_sdei_is_enabled(kske, index))) {
+   ret = -EPERM;
+   goto unlock_kvm;
+   }
+
+   /* Check the vcpu state */
+   spin_lock(>lock);
+   if (vsdei->state.masked) {
+   ret = -EPERM;
+   goto unlock_vcpu;
+   }
+
+   /* Check if the event can be delivered immediately */
+   if (immediate) {
+   if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL &&
+   !list_empty(>critical_events)) {
+   ret = -ENOSPC;
+   goto unlock_vcpu;
+   }
+
+   if (kse->state.priority == SDEI_EVENT_PRIORITY_NORMAL &&
+   (!list_empty(>critical_events) ||
+!list_empty(>normal_events))) {
+   ret = -ENOSPC;
+   goto unlock_vcpu;
+   }
+   }
+
+   /* Check if the vcpu event exists */
+   ksve = kvm_sdei_find_vcpu_event(vcpu, num);
+   if (ksve) {
+   kske->state.refcount++;
+   ksve->state.refcount++;
+   kvm_make_request(KVM_REQ_SDEI, vcpu);
+   goto unlock_vcpu;
+   }
+
+   /* Allocate vcpu event */
+   ksve = kzalloc(sizeof(*ksve), GFP_KERNEL);
+   if (!ksve) {
+   ret = -ENOMEM;
+   goto unlock_vcpu;
+   }
+
+   /*
+* We should take lock to update KVM event state because its
+* reference count might be zero. In that case, the KVM event
+* could be destroyed.
+*/
+   kske->state.refcount++;
+   ksve->state.num  = num;
+   ksve->state.refcount = 1;
+   ksve->kske   = kske;
+   ksve->vcpu   = vcpu;
+
+   if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL)
+   list_add_tail(>link, >critica

[PATCH v2 19/21] KVM: arm64: Support SDEI event cancellation

2021-02-08 Thread Gavin Shan
An injected SDEI event is used to send a notification to the guest.
However, the event might no longer be needed after it has been injected.
This introduces an API to cancel an injected SDEI event as long as it
hasn't been fired to the guest yet.

This mechanism will be needed when we add support for asynchronous
page fault.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_sdei.h |  1 +
 arch/arm64/kvm/sdei.c | 49 +++
 2 files changed, 50 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
index 51087fe971ba..353744c7bad9 100644
--- a/arch/arm64/include/asm/kvm_sdei.h
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -126,6 +126,7 @@ int kvm_sdei_register_notifier(struct kvm *kvm, unsigned 
long num,
   kvm_sdei_notifier notifier);
 int kvm_sdei_inject(struct kvm_vcpu *vcpu,
unsigned long num, bool immediate);
+int kvm_sdei_cancel(struct kvm_vcpu *vcpu, unsigned long num);
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu);
 long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg);
 long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg);
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 7c2789cd1421..4f5a582daa97 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -907,6 +907,55 @@ int kvm_sdei_inject(struct kvm_vcpu *vcpu,
return ret;
 }
 
+int kvm_sdei_cancel(struct kvm_vcpu *vcpu, unsigned long num)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   struct kvm_sdei_vcpu_event *ksve = NULL;
+   int ret = 0;
+
+   if (!(ksdei && vsdei)) {
+   ret = -EPERM;
+   goto out;
+   }
+
+   /* Find the vCPU event */
+   spin_lock(>lock);
+   ksve = kvm_sdei_find_vcpu_event(vcpu, num);
+   if (!ksve) {
+   ret = -EINVAL;
+   goto unlock;
+   }
+
+   /* Event can't be cancelled if it has been delivered */
+   if (ksve->state.refcount <= 1 &&
+   (vsdei->critical_event == ksve ||
+vsdei->normal_event == ksve)) {
+   ret = -EINPROGRESS;
+   goto unlock;
+   }
+
+   /* Free the vCPU event if necessary */
+   kske = ksve->kske;
+   ksve->state.refcount--;
+   if (!ksve->state.refcount) {
+   list_del(>link);
+   kfree(ksve);
+   }
+
+unlock:
+   spin_unlock(>lock);
+   if (kske) {
+   spin_lock(>lock);
+   kske->state.refcount--;
+   spin_unlock(>lock);
+   }
+out:
+   return ret;
+}
+
 void kvm_sdei_deliver(struct kvm_vcpu *vcpu)
 {
struct kvm *kvm = vcpu->kvm;
-- 
2.23.0



[PATCH v2 13/21] KVM: arm64: Implement SDEI event delivery

2021-02-08 Thread Gavin Shan
This implements kvm_sdei_deliver() to support SDEI event delivery.
The function is called when the request (KVM_REQ_SDEI) is raised.
The following rules are applied according to the SDEI specification:

   * x0 - x17 are saved. All of them are cleared except the following
 registers:
 x0: number SDEI event to be delivered
 x1: parameter associated with the SDEI event
 x2: PC of the interrupted context
 x3: PState of the interrupted context

   * PC is set to the handler of the SDEI event, which was provided
 during its registration. PState is modified accordingly.

   * SDEI event with critical priority can preempt those with normal
 priority.

Signed-off-by: Gavin Shan 
---
 arch/arm64/include/asm/kvm_host.h |  1 +
 arch/arm64/include/asm/kvm_sdei.h |  1 +
 arch/arm64/kvm/arm.c  |  3 ++
 arch/arm64/kvm/sdei.c | 84 +++
 4 files changed, 89 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index b2d51c6d055c..30e850257ef4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,6 +47,7 @@
 #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
 #define KVM_REQ_RECORD_STEAL   KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4   KVM_ARCH_REQ(4)
+#define KVM_REQ_SDEI   KVM_ARCH_REQ(5)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 KVM_DIRTY_LOG_INITIALLY_SET)
diff --git a/arch/arm64/include/asm/kvm_sdei.h 
b/arch/arm64/include/asm/kvm_sdei.h
index b0abc13a0256..7f5f5ad689e6 100644
--- a/arch/arm64/include/asm/kvm_sdei.h
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -112,6 +112,7 @@ KVM_SDEI_FLAG_FUNC(enabled)
 void kvm_sdei_init_vm(struct kvm *kvm);
 void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu);
 int kvm_sdei_hypercall(struct kvm_vcpu *vcpu);
+void kvm_sdei_deliver(struct kvm_vcpu *vcpu);
 void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu);
 void kvm_sdei_destroy_vm(struct kvm *kvm);
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a7ae16df3df7..e243bd5ad730 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -668,6 +668,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
kvm_reset_vcpu(vcpu);
 
+   if (kvm_check_request(KVM_REQ_SDEI, vcpu))
+   kvm_sdei_deliver(vcpu);
+
/*
 * Clear IRQ_PENDING requests that were made to guarantee
 * that a VCPU sees new virtual interrupts.
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
index 62efee2b67b8..b5d6d1ed3858 100644
--- a/arch/arm64/kvm/sdei.c
+++ b/arch/arm64/kvm/sdei.c
@@ -671,6 +671,90 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
return 1;
 }
 
+void kvm_sdei_deliver(struct kvm_vcpu *vcpu)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+   struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+   struct kvm_sdei_event *kse = NULL;
+   struct kvm_sdei_kvm_event *kske = NULL;
+   struct kvm_sdei_vcpu_event *ksve = NULL;
+   struct kvm_sdei_vcpu_regs *regs = NULL;
+   unsigned long pstate;
+   int index = 0;
+
+   /* Sanity check */
+   if (!(ksdei && vsdei))
+   return;
+
+   /* The critical event can't be preempted */
+   spin_lock(&vsdei->lock);
+   if (vsdei->critical_event)
+   goto unlock;
+
+   /*
+* The normal event can be preempted by the critical event.
+* However, the normal event can't be preempted by another
+* normal event.
+*/
+   ksve = list_first_entry_or_null(&vsdei->critical_events,
+   struct kvm_sdei_vcpu_event, link);
+   if (!ksve && !vsdei->normal_event) {
+   ksve = list_first_entry_or_null(&vsdei->normal_events,
+   struct kvm_sdei_vcpu_event, link);
+   }
+
+   if (!ksve)
+   goto unlock;
+
+   kske = ksve->kske;
+   kse = kske->kse;
+   if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL) {
+   vsdei->critical_event = ksve;
+   vsdei->state.critical_num = ksve->state.num;
+   regs = &vsdei->state.critical_regs;
+   } else {
+   vsdei->normal_event = ksve;
+   vsdei->state.normal_num = ksve->state.num;
+   regs = &vsdei->state.normal_regs;
+   }
+
+   /* Save registers: x0 -> x17, PC, PState */
+   for (index = 0; index < ARRAY_SIZE(regs->regs); index++)
+   regs->regs[index] = vcpu_get_reg(vcpu, index);
+
+   regs->pc = *vcpu_pc(vcpu);
+   regs->pstate = *vcpu_cpsr(vcpu);
+
+   /*
+* Inject SDEI event: x0 -> x3, PC, PState. We needn't take lock
+

[PATCH v2 21/21] KVM: selftests: Add SDEI test case

2021-02-08 Thread Gavin Shan
This adds a SDEI test case to selftests where various hypercalls are
issued against the KVM private event (0x4020) and the test ensures
they complete without error. Note that two vCPUs are started by
default to run the same sequence. The test simulates what the SDEI
client driver does; the following hypercalls are issued in sequence
(see the sketch after the list):

   SDEI_1_0_FN_SDEI_VERSION            (probing SDEI capability)
   SDEI_1_0_FN_SDEI_PE_UNMASK          (CPU online)
   SDEI_1_0_FN_SDEI_PRIVATE_RESET      (restart SDEI)
   SDEI_1_0_FN_SDEI_SHARED_RESET
   SDEI_1_0_FN_SDEI_EVENT_GET_INFO     (register event)
   SDEI_1_0_FN_SDEI_EVENT_GET_INFO
   SDEI_1_0_FN_SDEI_EVENT_GET_INFO
   SDEI_1_0_FN_SDEI_EVENT_REGISTER
   SDEI_1_0_FN_SDEI_EVENT_ENABLE       (enable event)
   SDEI_1_0_FN_SDEI_EVENT_DISABLE      (disable event)
   SDEI_1_0_FN_SDEI_EVENT_UNREGISTER   (unregister event)
   SDEI_1_0_FN_SDEI_PE_MASK            (CPU offline)
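
As a reference for reading guest_code() below, here is a minimal outline
of the probe/register part of the sequence, written against the smccc()
and is_error() helpers added by this patch and the headers it includes.
The function probe_and_register() and its arguments are invented for this
sketch; the actual test drives the full sequence on both vCPUs.

/*
 * Outline only: probe the SDEI version, unmask the PE, then register
 * and enable a private event. Relies on smccc()/is_error() from the
 * test file and on linux/arm_sdei.h for the function IDs and flags.
 */
static void probe_and_register(uint64_t num, uint64_t entry, uint64_t arg)
{
	int64_t ret;

	/* Probe the SDEI version; the test expects a 1.0 implementation */
	ret = smccc(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0);
	GUEST_ASSERT(!is_error(ret));
	GUEST_ASSERT(SDEI_VERSION_MAJOR(ret) == 1);

	/* Unmask the PE so private events can target this vCPU */
	ret = smccc(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0);
	GUEST_ASSERT(!is_error(ret));

	/* Register the private event with its handler entry and argument */
	ret = smccc(SDEI_1_0_FN_SDEI_EVENT_REGISTER, num, entry, arg,
		    SDEI_EVENT_REGISTER_RM_ANY, 0);
	GUEST_ASSERT(!is_error(ret));

	/* Enable it so it can later be injected and delivered */
	ret = smccc(SDEI_1_0_FN_SDEI_EVENT_ENABLE, num, 0, 0, 0, 0);
	GUEST_ASSERT(!is_error(ret));
}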

Signed-off-by: Gavin Shan 
---
 tools/testing/selftests/kvm/Makefile   |   1 +
 tools/testing/selftests/kvm/aarch64/sdei.c | 172 +
 2 files changed, 173 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/sdei.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index fe41c6a0fa67..482faa88520b 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -74,6 +74,7 @@ TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
+TEST_GEN_PROGS_aarch64 += aarch64/sdei
 
 TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
diff --git a/tools/testing/selftests/kvm/aarch64/sdei.c 
b/tools/testing/selftests/kvm/aarch64/sdei.c
new file mode 100644
index ..1a4cdae84ad5
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/sdei.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM SDEI test
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan 
+ */
+#define _GNU_SOURCE
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "asm/kvm_sdei.h"
+#include "linux/arm_sdei.h"
+
+#define NR_VCPUS   2
+#define SDEI_GPA_BASE  (1 << 30)
+
+struct sdei_event {
+   uint32_tcpu;
+   uint64_tversion;
+   uint64_tnum;
+   uint64_ttype;
+   uint64_tpriority;
+   uint64_tsignaled;
+};
+
+static struct sdei_event sdei_events[NR_VCPUS];
+
+static int64_t smccc(uint32_t func, uint64_t arg0, uint64_t arg1,
+uint64_t arg2, uint64_t arg3, uint64_t arg4)
+{
+   int64_t ret;
+
+   asm volatile(
+   "movx0, %1\n"
+   "movx1, %2\n"
+   "movx2, %3\n"
+   "movx3, %4\n"
+   "movx4, %5\n"
+   "movx5, %6\n"
+   "hvc#0\n"
+   "mov%0, x0\n"
+   : "=r" (ret) : "r" (func), "r" (arg0), "r" (arg1),
+   "r" (arg2), "r" (arg3), "r" (arg4) :
+   "x0", "x1", "x2", "x3", "x4", "x5");
+
+   return ret;
+}
+
+static inline bool is_error(int64_t ret)
+{
+   if (ret == SDEI_NOT_SUPPORTED  ||
+   ret == SDEI_INVALID_PARAMETERS ||
+   ret == SDEI_DENIED ||
+   ret == SDEI_PENDING||
+   ret == SDEI_OUT_OF_RESOURCE)
+   return true;
+
+   return false;
+}
+
+static void guest_code(int cpu)
+{
+   struct sdei_event *event = &sdei_events[cpu];
+   int64_t ret;
+
+   /* CPU */
+   event->cpu = cpu;
+   event->num = KVM_SDEI_DEFAULT_NUM;
+   GUEST_ASSERT(cpu < NR_VCPUS);
+
+   /* Version */
+   ret = smccc(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0);
+   GUEST_ASSERT(!is_error(ret));
+   GUEST_ASSERT(SDEI_VERSION_MAJOR(ret) == 1);
+   GUEST_ASSERT(SDEI_VERSION_MINOR(ret) == 0);
+   event->version = ret;
+
+   /* CPU unmasking */
+   ret = smccc(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0);
+   GUEST_ASSERT(!is_error(ret));
+
+   /* Reset */
+   ret = smccc(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0);
+   GUEST_ASSERT(!is_error(ret));
+   ret = smccc(SDEI_1_0_FN_SDEI_SHARED_RESET, 0, 0, 0, 0, 0);
+   GUEST_ASSERT(!is_error(ret));
+
+   /* Event properties */
+   ret = smccc(SDEI_1_0_FN_SDEI_EVENT_GET_INFO,
+   event->num, SDEI_EVENT_INFO_EV_TYPE, 0, 0, 0);
+   GUEST_ASSERT(!is_error(ret));
+   event->type = ret;
+
+   ret = smccc(SDEI_1_0_FN_SDEI_EVENT_GET_INFO,
+   event->num, SDEI_EVENT_INF
