Re: [PATCH v2 0/4] vhost: Cleanup
On 4/30/24 04:50, Michael S. Tsirkin wrote: On Mon, Apr 29, 2024 at 08:13:56PM +1000, Gavin Shan wrote: This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't worry about memory barriers. Since we're here, other cleanups are also applied. [1] https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/ Patch 1 makes some sense, gave some comments. Rest I think we should just drop. Sure, v3 has been sent with PATCH[v2 2/3/4] dropped. Please take a look when you get a chance. v3: https://lore.kernel.org/virtualization/20240429232748.642356-1-gs...@redhat.com/T/#u Thanks, Gavin
[PATCH v3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
From: "Michael S. Tsirkin" All the callers of vhost_get_avail_idx() are concerned with the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is accessed. With it, the callers needn't to worry about the memory barrier. As a side benefit, we also validate the index on all paths now, which will hopefully help to catch future errors earlier. Note that current code is inconsistent in how the errors are handled. They are treated as an empty ring in some places, but as non-empty ring in other places. This patch doesn't attempt to change the existing behaviour. No functional change intended. Signed-off-by: Michael S. Tsirkin Reviewed-by: Gavin Shan Acked-by: Will Deacon --- v3: Improved commit log and comments as Michael suggested --- drivers/vhost/vhost.c | 105 +- 1 file changed, 42 insertions(+), 63 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..60d9592eff7b 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 idx; + int r; + + r = vhost_get_avail(vq, idx, >avail->idx); + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access available index at %p (%d)\n", + >avail->idx, r); + return r; + } + + /* Check it isn't doing very strange thing with available indexes */ + vq->avail_idx = vhost16_to_cpu(vq, idx); + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid available index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + /* We're done if there is nothing new */ + if (vq->avail_idx == 
vq->last_avail_idx) + return 0; + + /* +* We updated vq->avail_idx so we need a memory barrier between +* the index read above and the caller reading avail ring entries. +*/ + smp_rmb(); + return 1; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret; - /* If there's nothing new since last we looked, return -* invalid. -*/ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2790,35 +2795,21 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) {
Re: [PATCH v2 1/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On 4/30/24 04:44, Michael S. Tsirkin wrote: On Mon, Apr 29, 2024 at 08:13:57PM +1000, Gavin Shan wrote: From: "Michael S. Tsirkin" All the callers of vhost_get_avail_idx() are concerned to the memory *with* the memory barrier Thanks, will be corrected in v3. barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. accessed, not advanced. guest advances it. smp_rmb() is executed only when vp->last_avail_idx != vp->avail_idx. I used 'advanced' to indicate the condition. 'accessed' is also correct since the 'advanced' case included to 'accessed' case. With it, the callers needn't to worry about the memory barrier. No functional change intended. I'd add: As a side benefit, we also validate the index on all paths now, which will hopefully help catch future errors earlier. Note: current code is inconsistent in how it handles errors: some places treat it as an empty ring, others - non empty. This patch does not attempt to change the existing behaviour. Ok, I will integrate this to v3's commit log. Signed-off-by: Michael S. Tsirkin [gshan: repainted vhost_get_avail_idx()] ?repainted? It's just a indicator to say the changes aren't simply copied from [1]. Some follow-up changes are also applied. So it needs to be reviewed. I will drop this in v3. 
Reviewed-by: Gavin Shan Acked-by: Will Deacon --- drivers/vhost/vhost.c | 106 +- 1 file changed, 42 insertions(+), 64 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..7aa623117aab 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 idx; + int r; + + r = vhost_get_avail(vq, idx, >avail->idx); + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access available index at %p (%d)\n", + >avail->idx, r); + return r; + } + + /* Check it isn't doing very strange thing with available indexes */ + vq->avail_idx = vhost16_to_cpu(vq, idx); + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid available index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + /* We're done if there is nothing new */ + if (vq->avail_idx == vq->last_avail_idx) + return 0; + + /* +* We updated vq->avail_idx so we need a memory barrier between +* the index read above and the caller reading avail ring entries. +*/ + smp_rmb(); + return 1; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. 
*/ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret; - /* If there's nothing new since last we looked, return -* invalid. -*/ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next d
Re: [PATCH v2 2/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
On 4/30/24 04:45, Michael S. Tsirkin wrote: On Mon, Apr 29, 2024 at 08:13:58PM +1000, Gavin Shan wrote: The local variable @last_avail_idx is equivalent to vq->last_avail_idx. So the code can be simplified a bit by dropping the local variable @last_avail_idx. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7aa623117aab..b278c0333a66 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2524,7 +2524,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; @@ -2539,10 +2538,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - if (unlikely(vhost_get_avail_head(vq, _head, last_avail_idx))) { + if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - >avail->ring[last_avail_idx % vq->num]); + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); return -EFAULT; } I don't see the big advantage and the line is long now. The point is to avoid the local variable @last_avail_idx since it's equivalent to vq->last_avail_idx, as stated in the commit log. Besides, it paves the way for PATCH[v2 3/4] where the whole logic fetching the head and sanity check is moved to vhost_get_avail_head(), so that vhost_get_vq_desc() is simplified I will drop PATCH[2, 3, 4] as you suggested. Thanks, Gavin
Re: [PATCH 0/4] vhost: Cleanup
On 4/29/24 17:02, Michael S. Tsirkin wrote: On Tue, Apr 23, 2024 at 01:24:03PM +1000, Gavin Shan wrote: This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't worry about memory barriers. Since we're here, other cleanups are also applied. Gavin I suggested another approach. 1. Start with the patch I sent (vhost: order avail ring reads after index updates) just do a diff against latest. simplify error handling a bit. 2. Do any other cleanups on top. My apologies, Michael. I didn't see your patch until now [1] [1] https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/ v2 was sent with your changes integrated and other cleanups are applied on top of it. Please take a look when you get a chance. v2: https://lore.kernel.org/virtualization/20240429101400.617007-1-gs...@redhat.com/T/#t Thanks, Gavin
[PATCH v2 4/4] vhost: Reformat vhost_{get, put}_user()
Reformat the macros to use tab as the terminator for each line so that it looks clean. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 60 +-- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4ddb9ec2fe46..c1ed5e750521 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, return __vhost_get_user_slow(vq, addr, size, type); } -#define vhost_put_user(vq, x, ptr) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ - } else { \ - __typeof__(ptr) to = \ +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else {\ + __typeof__(ptr) to =\ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), VHOST_ADDR_USED); \ - if (to != NULL) \ - ret = __put_user(x, to); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ + sizeof(*ptr), \ + VHOST_ADDR_USED); \ + if (to != NULL) \ + ret = __put_user(x, to);\ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) @@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) >used->idx); } -#define vhost_get_user(vq, x, ptr, type) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ - } else { \ - __typeof__(ptr) from = \ - (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), \ - type); \ - if (from != NULL) \ - ret = __get_user(x, from); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ +#define vhost_get_user(vq, x, ptr, type) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else {\ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), type);\ + if (from != NULL) \ + ret = __get_user(x, from); \ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) #define vhost_get_avail(vq, x, ptr) \ -- 2.44.0
[PATCH v2 3/4] vhost: Improve vhost_get_avail_head()
Improve vhost_get_avail_head() so that the head or errno is returned. With it, the relevant sanity checks are squeezed to vhost_get_avail_head() and vhost_get_vq_desc() is further simplified. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 50 ++- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b278c0333a66..4ddb9ec2fe46 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1322,11 +1322,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) return 1; } -static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, - __virtio16 *head, int idx) +static inline int vhost_get_avail_head(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *head, - >avail->ring[idx & (vq->num - 1)]); + __virtio16 head; + int r; + + r = vhost_get_avail(vq, head, + >avail->ring[vq->last_avail_idx & (vq->num - 1)]); + if (unlikely(r)) { + vq_err(vq, "Failed to read head: index %u address %p\n", + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx & (vq->num - 1)]); + return r; + } + + r = vhost16_to_cpu(vq, head); + if (unlikely(r >= vq->num)) { + vq_err(vq, "Invalid head %d (%u)\n", r, vq->num); + return -EINVAL; + } + + return r; } static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, @@ -2523,9 +2539,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int *log_num) { struct vring_desc desc; - unsigned int i, head, found = 0; - __virtio16 ring_head; - int ret, access; + unsigned int i, found = 0; + int head, ret, access; if (vq->avail_idx == vq->last_avail_idx) { ret = vhost_get_avail_idx(vq); @@ -2536,23 +2551,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, return vq->num; } - /* Grab the next descriptor number they're advertising, and increment -* the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { - vq_err(vq, "Failed to read head: idx %d address %p\n", - vq->last_avail_idx, - >avail->ring[vq->last_avail_idx % vq->num]); - return -EFAULT; - } - - head = vhost16_to_cpu(vq, ring_head); - - /* If their number is silly, that's an error. */ - if (unlikely(head >= vq->num)) { - vq_err(vq, "Guest says index %u > %u is available", - head, vq->num); - return -EINVAL; - } + /* Grab the next descriptor number they're advertising */ + head = vhost_get_avail_head(vq); + if (unlikely(head < 0)) + return head; /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; -- 2.44.0
[PATCH v2 2/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
The local variable @last_avail_idx is equivalent to vq->last_avail_idx. So the code can be simplified a bit by dropping the local variable @last_avail_idx. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7aa623117aab..b278c0333a66 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2524,7 +2524,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; @@ -2539,10 +2538,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - if (unlikely(vhost_get_avail_head(vq, _head, last_avail_idx))) { + if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - >avail->ring[last_avail_idx % vq->num]); + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); return -EFAULT; } -- 2.44.0
[PATCH v2 1/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()
From: "Michael S. Tsirkin" All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. No functional change intended. Signed-off-by: Michael S. Tsirkin [gshan: repainted vhost_get_avail_idx()] Reviewed-by: Gavin Shan Acked-by: Will Deacon --- drivers/vhost/vhost.c | 106 +- 1 file changed, 42 insertions(+), 64 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..7aa623117aab 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,36 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 idx; + int r; + + r = vhost_get_avail(vq, idx, >avail->idx); + if (unlikely(r < 0)) { + vq_err(vq, "Failed to access available index at %p (%d)\n", + >avail->idx, r); + return r; + } + + /* Check it isn't doing very strange thing with available indexes */ + vq->avail_idx = vhost16_to_cpu(vq, idx); + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid available index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + /* We're done if there is nothing new */ + if (vq->avail_idx == vq->last_avail_idx) + return 0; + + /* +* We updated vq->avail_idx so we need a memory barrier between +* the index read above and the caller reading avail ring entries. 
+*/ + smp_rmb(); + return 1; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,38 +2524,17 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; - __virtio16 avail_idx; + u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret < 0)) + return ret; - /* If there's nothing new since last we looked, return -* invalid. -*/ - if (vq->avail_idx == last_avail_idx) + if (!ret) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2790,35 +2795,20 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) { - /* Since we have updated avail_idx, the following -* call to vhost_get_vq_desc() will read available -* ring entries. Make sure that read happens after -* the avail_idx read. -*/ - smp_rmb(); - return false; - } - - return true; + /* Tre
[PATCH v2 0/4] vhost: Cleanup
This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't to worry about memory barriers. Since we're here, other cleanups are also applied. [1] https://lore.kernel.org/virtualization/20240327155750-mutt-send-email-...@kernel.org/ PATCH[1] improves vhost_get_avail_idx() so that smp_rmb() is applied if needed. Besides, the sanity checks on the retrieved available queue index are also squeezed to vhost_get_avail_idx() PATCH[2] drops the local variable @last_avail_idx since it's equivalent to vq->last_avail_idx PATCH[3] improves vhost_get_avail_head(), similar to what we're doing for vhost_get_avail_idx(), so that the relevant sanity checks on the head are squeezed to vhost_get_avail_head() PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space as the terminator for each line Gavin Shan (3): vhost: Drop variable last_avail_idx in vhost_get_vq_desc() vhost: Improve vhost_get_avail_head() vhost: Reformat vhost_{get, put}_user() Michael S. Tsirkin (1): vhost: Improve vhost_get_avail_idx() with smp_rmb() drivers/vhost/vhost.c | 215 +++--- 1 file changed, 97 insertions(+), 118 deletions(-) Changelog = v2: * Improve vhost_get_avail_idx() as Michael suggested in [1] as above (Michael) * Correct @head's type from 'unsigned int' to 'int' (l...@intel.com) -- 2.44.0
Re: [PATCH 3/4] vhost: Improve vhost_get_avail_head()
On 4/26/24 06:42, kernel test robot wrote:> kernel test robot noticed the following build warnings: [auto build test WARNING on mst-vhost/linux-next] [also build test WARNING on linus/master v6.9-rc5 next-20240424] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Gavin-Shan/vhost-Drop-variable-last_avail_idx-in-vhost_get_vq_desc/20240423-112803 base: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next patch link: https://lore.kernel.org/r/20240423032407.262329-4-gshan%40redhat.com patch subject: [PATCH 3/4] vhost: Improve vhost_get_avail_head() config: i386-randconfig-141-20240426 (https://download.01.org/0day-ci/archive/20240426/202404260448.g7f06v7m-...@intel.com/config) compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202404260448.g7f06v7m-...@intel.com/ smatch warnings: drivers/vhost/vhost.c:2614 vhost_get_vq_desc() warn: unsigned 'head' is never less than zero. drivers/vhost/vhost.c:2614 vhost_get_vq_desc() warn: error code type promoted to positive: 'head' vim +/head +2614 drivers/vhost/vhost.c 2581 2582 /* This looks in the virtqueue and for the first available buffer, and converts 2583 * it to an iovec for convenient access. Since descriptors consist of some 2584 * number of output then some number of input descriptors, it's actually two 2585 * iovecs, but we pack them into one and note how many of each there were. 2586 * 2587 * This function returns the descriptor number found, or vq->num (which is 2588 * never a valid descriptor number) if none was found. 
A negative code is 2589 * returned on error. */ 2590 int vhost_get_vq_desc(struct vhost_virtqueue *vq, 2591 struct iovec iov[], unsigned int iov_size, 2592 unsigned int *out_num, unsigned int *in_num, 2593 struct vhost_log *log, unsigned int *log_num) 2594 { 2595 struct vring_desc desc; 2596 unsigned int i, head, found = 0; 2597 int ret, access; 2598 2599 if (vq->avail_idx == vq->last_avail_idx) { 2600 ret = vhost_get_avail_idx(vq); 2601 if (unlikely(ret)) 2602 return ret; 2603 2604 /* If there's nothing new since last we looked, return 2605 * invalid. 2606 */ 2607 if (vq->avail_idx == vq->last_avail_idx) 2608 return vq->num; 2609 } 2610 2611 /* Grab the next descriptor number they're advertising, and increment 2612 * the index we've seen. */ 2613 head = vhost_get_avail_head(vq); 2614 if (unlikely(head < 0)) 2615 return head; Thanks for the report. @head needs to be 'int' instead of 'unsigned int' so that it can hold the error number from vhost_get_avail_head(). I would give it more time to see if there are other review comments before I revise it to fix it up. Thanks, Gavin
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On 4/23/24 06:46, Michael S. Tsirkin wrote: On Mon, Apr 08, 2024 at 02:15:24PM +1000, Gavin Shan wrote: On 3/30/24 19:02, Gavin Shan wrote: On 3/28/24 19:31, Michael S. Tsirkin wrote: On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan Previous patches are ok. This one I feel needs more work - first more code such as sanity checking should go into this function, second there's actually a difference between comparing to last_avail_idx and just comparing to the previous value of avail_idx. I will pick patches 1-2 and post a cleanup on top so you can take a look, ok? Thanks, Michael. It's fine to me. A kindly ping. If it's ok to you, could you please merge PATCH[1-2]? Our downstream 9.4 need the fixes, especially for NVidia's grace-hopper and grace-grace platforms. For PATCH[3], I also can help with the improvement if you don't have time for it. Please let me know. 1-2 are upstream go ahead and post the cleanup. Michael, a cleanup series has been sent for review. https://lore.kernel.org/virtualization/20240423032407.262329-1-gs...@redhat.com/T/#t Thanks, Gavin
[PATCH 4/4] vhost: Reformat vhost_{get, put}_user()
Reformat the macros to use tab as the terminator for each line so that it looks clean. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 60 +-- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a3de9325175f..3be19877f9df 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1207,21 +1207,22 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, return __vhost_get_user_slow(vq, addr, size, type); } -#define vhost_put_user(vq, x, ptr) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ - } else { \ - __typeof__(ptr) to = \ +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else {\ + __typeof__(ptr) to =\ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), VHOST_ADDR_USED); \ - if (to != NULL) \ - ret = __put_user(x, to); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ + sizeof(*ptr), \ + VHOST_ADDR_USED); \ + if (to != NULL) \ + ret = __put_user(x, to);\ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) @@ -1252,22 +1253,21 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) >used->idx); } -#define vhost_get_user(vq, x, ptr, type) \ -({ \ - int ret; \ - if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ - } else { \ - __typeof__(ptr) from = \ - (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ - sizeof(*ptr), \ - type); \ - if (from != NULL) \ - ret = __get_user(x, from); \ - else \ - ret = -EFAULT; \ - } \ - ret; \ +#define vhost_get_user(vq, x, ptr, type) \ +({ \ + int ret;\ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else {\ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), type);\ + if (from != NULL) \ + ret = __get_user(x, from); \ + else\ + ret = -EFAULT; \ + } \ + ret;\ }) #define vhost_get_avail(vq, x, ptr) \ -- 2.44.0
[PATCH 3/4] vhost: Improve vhost_get_avail_head()
Improve vhost_get_avail_head() so that the head or errno is returned. With it, the relevant sanity checks are squeezed to vhost_get_avail_head() and vhost_get_vq_desc() is further simplified. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 43 +++ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b3adc0bc9e72..a3de9325175f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1320,11 +1320,27 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) return 0; } -static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, - __virtio16 *head, int idx) +static inline int vhost_get_avail_head(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *head, - >avail->ring[idx & (vq->num - 1)]); + __virtio16 head; + int r; + + r = vhost_get_avail(vq, head, + >avail->ring[vq->last_avail_idx & (vq->num - 1)]); + if (unlikely(r)) { + vq_err(vq, "Failed to read head: idx %u address %p\n", + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); + return r; + } + + r = vhost16_to_cpu(vq, head); + if (unlikely(r >= vq->num)) { + vq_err(vq, "Invalid head %d (%u)\n", r, vq->num); + return -EINVAL; + } + + return r; } static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, @@ -2522,7 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 ring_head; int ret, access; if (vq->avail_idx == vq->last_avail_idx) { @@ -2539,21 +2554,9 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { - vq_err(vq, "Failed to read head: idx %d address %p\n", - vq->last_avail_idx, - >avail->ring[vq->last_avail_idx % vq->num]); - return -EFAULT; - } - - head = vhost16_to_cpu(vq, ring_head); - - /* If their number is silly, that's an error. */ - if (unlikely(head >= vq->num)) { - vq_err(vq, "Guest says index %u > %u is available", - head, vq->num); - return -EINVAL; - } + head = vhost_get_avail_head(vq); + if (unlikely(head < 0)) + return head; /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; -- 2.44.0
[PATCH 2/4] vhost: Improve vhost_get_avail_idx() with smp_rmb()
All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. No functional change intended. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 91 --- 1 file changed, 34 insertions(+), 57 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ef7942103232..b3adc0bc9e72 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,34 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 avail_idx; + int r; + + r = vhost_get_avail(vq, avail_idx, >avail->idx); + if (unlikely(r)) { + vq_err(vq, "Failed to access avail idx at %p\n", + >avail->idx); + return r; + } + + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Ensure the available ring entry read happens +* before the avail_idx read when the avail_idx +* is advanced. +*/ + smp_rmb(); + } + + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { + vq_err(vq, "Invalid avail index change from %u to %u", + vq->last_avail_idx, vq->avail_idx); + return -EINVAL; + } + + return 0; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2498,35 +2522,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - __virtio16 avail_idx; __virtio16 ring_head; int ret, access; - /* Check it isn't doing very strange things with descriptor numbers. 
*/ if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); - return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - - if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { - vq_err(vq, "Guest moved avail index from %u to %u", - vq->last_avail_idx, vq->avail_idx); - return -EFAULT; - } + ret = vhost_get_avail_idx(vq); + if (unlikely(ret)) + return ret; /* If there's nothing new since last we looked, return * invalid. */ if (vq->avail_idx == vq->last_avail_idx) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2787,35 +2795,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; - int r; - if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) { - /* Since we have updated avail_idx, the following -* call to vhost_get_vq_desc() will read available -* ring entries. Make sure that read happens after -* the avail_idx read. -*/ - smp_rmb(); + if (unlikely(vhost_get_avail_idx(vq))) return false; - } - return true; + return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. */ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_N
[PATCH 1/4] vhost: Drop variable last_avail_idx in vhost_get_vq_desc()
The local variable @last_avail_idx is equivalent to vq->last_avail_idx. So the code can be simplified a bit by dropping the local variable @last_avail_idx. No functional change intended. Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8995730ce0bf..ef7942103232 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2498,14 +2498,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, { struct vring_desc desc; unsigned int i, head, found = 0; - u16 last_avail_idx; __virtio16 avail_idx; __virtio16 ring_head; int ret, access; /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail_idx = vq->last_avail_idx; - if (vq->avail_idx == vq->last_avail_idx) { if (unlikely(vhost_get_avail_idx(vq, _idx))) { vq_err(vq, "Failed to access avail idx at %p\n", @@ -2514,16 +2511,16 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, } vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { + if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved avail index from %u to %u", - last_avail_idx, vq->avail_idx); + vq->last_avail_idx, vq->avail_idx); return -EFAULT; } /* If there's nothing new since last we looked, return * invalid. */ - if (vq->avail_idx == last_avail_idx) + if (vq->avail_idx == vq->last_avail_idx) return vq->num; /* Only get avail ring entries after they have been @@ -2534,10 +2531,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. 
*/ - if (unlikely(vhost_get_avail_head(vq, _head, last_avail_idx))) { + if (unlikely(vhost_get_avail_head(vq, _head, vq->last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - >avail->ring[last_avail_idx % vq->num]); + vq->last_avail_idx, + >avail->ring[vq->last_avail_idx % vq->num]); return -EFAULT; } -- 2.44.0
[PATCH 0/4] vhost: Cleanup
This is suggested by Michael S. Tsirkin according to [1] and the goal is to apply smp_rmb() inside vhost_get_avail_idx() if needed. With it, the caller of the function needn't worry about memory barriers. Since we're here, other cleanups are also applied. [1] https://lore.kernel.org/virtualization/20240327075940-mutt-send-email-...@kernel.org/ PATCH[1] drops the local variable @last_avail_idx since it's equivalent to vq->last_avail_idx PATCH[2] improves vhost_get_avail_idx() so that smp_rmb() is applied if needed. Besides, the sanity checks on the retrieved available queue index are also squeezed into vhost_get_avail_idx() PATCH[3] improves vhost_get_avail_head(), similar to what we're doing for vhost_get_avail_idx(), so that the relevant sanity checks on the head are squeezed into vhost_get_avail_head() PATCH[4] Reformat vhost_{get, put}_user() by using tab instead of space as the terminator for each line Gavin Shan (4): vhost: Drop variable last_avail_idx in vhost_get_vq_desc() vhost: Improve vhost_get_avail_idx() with smp_rmb() vhost: Improve vhost_get_avail_head() vhost: Reformat vhost_{get, put}_user() drivers/vhost/vhost.c | 199 +++--- 1 file changed, 88 insertions(+), 111 deletions(-) -- 2.44.0
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
Hi Michael, On 3/30/24 19:02, Gavin Shan wrote: On 3/28/24 19:31, Michael S. Tsirkin wrote: On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: All the callers of vhost_get_avail_idx() are concerned with the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan Previous patches are ok. This one I feel needs more work - first more code such as sanity checking should go into this function, second there's actually a difference between comparing to last_avail_idx and just comparing to the previous value of avail_idx. I will pick patches 1-2 and post a cleanup on top so you can take a look, ok? Thanks, Michael. It's fine with me. A gentle ping. If it's ok with you, could you please merge PATCH[1-2]? Our downstream 9.4 needs the fixes, especially for NVidia's grace-hopper and grace-grace platforms. For PATCH[3], I can also help with the improvement if you don't have time for it. Please let me know. Thanks, Gavin
Re: [PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
On 3/28/24 19:31, Michael S. Tsirkin wrote: On Thu, Mar 28, 2024 at 10:21:49AM +1000, Gavin Shan wrote: All the callers of vhost_get_avail_idx() are concerned with the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan Previous patches are ok. This one I feel needs more work - first more code such as sanity checking should go into this function, second there's actually a difference between comparing to last_avail_idx and just comparing to the previous value of avail_idx. I will pick patches 1-2 and post a cleanup on top so you can take a look, ok? Thanks, Michael. It's fine with me. Thanks, Gavin
Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()
On 3/27/24 17:42, Jason Wang wrote: On Wed, Mar 27, 2024 at 3:35 PM Gavin Shan wrote: On 3/27/24 14:08, Gavin Shan wrote: On 3/27/24 12:44, Jason Wang wrote: On Wed, Mar 27, 2024 at 10:34 AM Jason Wang wrote: On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan wrote: A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). 
Fixes: 275bf960ac697 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..00445ab172b3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ We need to be more verbose here. For example, which load needs to be ordered with which load. The rmb in vhost_get_vq_desc() is used to order the load of avail idx and the load of head. It is paired with e.g virtio_wmb() in virtqueue_add_split(). vhost_vq_avail_empty() are mostly used as a hint in vhost_net_busy_poll() which is under the protection of the vq mutex. An exception is the tx_can_batch(), but in that case it doesn't even want to read the head. Ok, if it is needed only in that path, maybe we can move the barriers there. [cc Will Deacon] Jason, appreciate for your review and comments. I think PATCH[1/2] is the fix for the hypothesis, meaning PATCH[2/2] is the real fix. However, it would be nice to fix all of them in one shoot. I will try with PATCH[2/2] only to see if our issue will disappear or not. However, the issue still exists if PATCH[2/2] is missed. Jason, PATCH[2/2] is sufficient to fix our current issue. I tried with PATCH[2/2] only and unable to hit the issue. However, PATCH[1/2] may be needed by other scenarios. So it would be nice to fix them in one shoot. Yes, see below. Firstly, We were failing on the transmit queue and {tvq, rvq}->busyloop_timeout == false if I remember correctly. 
So the added smp_rmb() in vhost_vq_avail_empty() is only a concern to tx_can_batch(). A mutex isn't enough to ensure the order for the available index and available ring entry (head). For example, vhost_vq_avail_empty() called by tx_can_batch() can see next available index, but its corresponding available ring entry (head) may not be seen by vhost yet if smp_rmb() is missed. The next call to get_tx_bufs(), where the available ring entry (head) doesn't arrived yet, leading to stale available ring entry (head) being fetched. handle_tx_copy get_tx_bufs // smp_rmb() won't be executed when vq->avail_idx != vq->last_avail_idx tx_can_batch vhost_vq_avail_empty // vq->avail_idx is updated from vq->avail->idx The reason why I added smp_rmb() to vhost_vq_avail_empty() is because the function is a exposed API, even it's only used by drivers/vhost/net.c at present. It means the API has been broken internally. So it seems more appropriate to fix it up in vhost_vq_avail_empty() so that the API's users needn't worry about the memory access order. When tx_can_batch returns true it means there's still pending tx buffers. Since it might read indices so it still can bypass the smp_rmb() in the vhost_get_vq_desc(). I'd suggest adding those above to change log. With this, Acked-by
Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()
On 3/27/24 22:07, Michael S. Tsirkin wrote: On Wed, Mar 27, 2024 at 09:38:45AM +1000, Gavin Shan wrote: A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac697 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..00445ab172b3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ A slightly clearer comment: /* Since we have updated avail_idx, the following call to * vhost_get_vq_desc will read available ring entries. 
* Make sure that read happens after the avail_idx read. */ Pls repost with that, and I will apply. Also add suggested-by for will. Sure, the suggested comments have been included to v3. + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); As a follow-up patch, we should clean out code duplication that accumulated with 3 places reading avail idx in essentially the same way - this duplication is what causes the mess in the 1st place. Yes, nice idea. I've added PATCH[v3 3/3] to improve vhost_get_avail_idx() to handle the memory barrier since all the callers have the concern. v3: https://lore.kernel.org/virtualization/20240328002149.1141302-1-gs...@redhat.com/ Thanks, Gavin
[PATCH v3 3/3] vhost: Improve vhost_get_avail_idx() with smp_rmb()
All the callers of vhost_get_avail_idx() are concerned to the memory barrier, imposed by smp_rmb() to ensure the order of the available ring entry read and avail_idx read. Improve vhost_get_avail_idx() so that smp_rmb() is executed when the avail_idx is advanced. With it, the callers needn't to worry about the memory barrier. Suggested-by: Michael S. Tsirkin Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 75 +++ 1 file changed, 26 insertions(+), 49 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 32686c79c41d..e6882f4f6ce2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1290,10 +1290,28 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d) mutex_unlock(>vqs[i]->mutex); } -static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, - __virtio16 *idx) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq) { - return vhost_get_avail(vq, *idx, >avail->idx); + __virtio16 avail_idx; + int r; + + r = vhost_get_avail(vq, avail_idx, >avail->idx); + if (unlikely(r)) { + vq_err(vq, "Failed to access avail idx at %p\n", + >avail->idx); + return r; + } + + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Ensure the available ring entry read happens +* before the avail_idx read when the avail_idx +* is advanced. 
+*/ + smp_rmb(); + } + + return 0; } static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, @@ -2499,7 +2517,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, struct vring_desc desc; unsigned int i, head, found = 0; u16 last_avail_idx; - __virtio16 avail_idx; __virtio16 ring_head; int ret, access; @@ -2507,12 +2524,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, last_avail_idx = vq->last_avail_idx; if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail_idx(vq, _idx))) { - vq_err(vq, "Failed to access avail idx at %p\n", - >avail->idx); + if (unlikely(vhost_get_avail_idx(vq))) return -EFAULT; - } - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { vq_err(vq, "Guest moved used index from %u to %u", @@ -2525,11 +2538,6 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, */ if (vq->avail_idx == last_avail_idx) return vq->num; - - /* Only get avail ring entries after they have been -* exposed by guest. -*/ - smp_rmb(); } /* Grab the next descriptor number they're advertising, and increment @@ -2790,35 +2798,19 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); /* return true if we're sure that avaiable ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; - int r; - if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail_idx(vq, _idx); - if (unlikely(r)) - return false; - - vq->avail_idx = vhost16_to_cpu(vq, avail_idx); - if (vq->avail_idx != vq->last_avail_idx) { - /* Since we have updated avail_idx, the following -* call to vhost_get_vq_desc() will read available -* ring entries. Make sure that read happens after -* the avail_idx read. -*/ - smp_rmb(); + if (unlikely(vhost_get_avail_idx(vq))) return false; - } - return true; + return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); /* OK, now we need to know about added descriptors. 
*/ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __virtio16 avail_idx; int r; if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) @@ -2842,25 +2834,10 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); - r = vhost_get_avail_idx(vq, _idx); - if (r) { - vq_err(vq, "Failed to check avail idx at %p: %d\n", - >avail->idx, r); + if (unlikely(vhost_get_avail_idx(vq)))
[PATCH v3 2/3] vhost: Add smp_rmb() in vhost_enable_notify()
A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). When it returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang --- drivers/vhost/vhost.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 29df65b2ebf2..32686c79c41d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) >avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following +* call to vhost_get_vq_desc() will read available +* ring entries. Make sure that read happens after +* the avail_idx read. 
+*/ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); -- 2.44.0
[PATCH v3 1/3] vhost: Add smp_rmb() in vhost_vq_avail_empty()
A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch() returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang --- drivers/vhost/vhost.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..29df65b2ebf2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following +* call to vhost_get_vq_desc() will read available +* ring entries. 
Make sure that read happens after +* the avail_idx read. +*/ + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); -- 2.44.0
[PATCH v3 0/3] vhost: Fix stale available ring entries
The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. The wrong head (available ring entry) is seen by the guest when running 'netperf' on the guest and running 'netserver' on another NVidia's grace-grace machine. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=tap0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0 : guest# ifconfig eth0 | grep 'inet addr' inet addr:10.26.1.220 guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! There is missed smp_rmb() in vhost_{vq_avail_empty, enable_notify}() Without smp_rmb(), vq->avail_idx is advanced but the available ring entries aren't arriving to vhost side yet. So a stale available ring entry can be fetched in vhost_get_vq_desc(). Fix it by adding smp_rmb() in those two functions. Note that I need two patches so that they can be easily picked up by the stable kernel. With the changes, I'm unable to hit the issue again. Besides, the function vhost_get_avail_idx() is improved to tackle the memory barrier so that the callers needn't to worry about it. v2: https://lore.kernel.org/virtualization/46c6a9aa-821c-4013-afe7-61ec05fc9...@redhat.com v1: https://lore.kernel.org/virtualization/66e12633-b2d6-4b9a-9103-bb79770fc...@redhat.com Changelog = v3: Improved change log (Jason) Improved comments and added PATCH[v3 3/3] to execute smp_rmb() in vhost_get_avail_idx() (Michael) Gavin Shan (3): vhost: Add smp_rmb() in vhost_vq_avail_empty() vhost: Add smp_rmb() in vhost_enable_notify() vhost: Improve vhost_get_avail_idx() with smp_rmb() drivers/vhost/vhost.c | 51 --- 1 file changed, 24 insertions(+), 27 deletions(-) -- 2.44.0
Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()
On 3/27/24 14:08, Gavin Shan wrote: On 3/27/24 12:44, Jason Wang wrote: On Wed, Mar 27, 2024 at 10:34 AM Jason Wang wrote: On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan wrote: A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M \ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac697 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..00445ab172b3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need + * to ensure the available ring entries have been exposed + * by guest. + */ We need to be more verbose here. 
For example, which load needs to be ordered with which load. The rmb in vhost_get_vq_desc() is used to order the load of avail idx and the load of head. It is paired with e.g virtio_wmb() in virtqueue_add_split(). vhost_vq_avail_empty() are mostly used as a hint in vhost_net_busy_poll() which is under the protection of the vq mutex. An exception is the tx_can_batch(), but in that case it doesn't even want to read the head. Ok, if it is needed only in that path, maybe we can move the barriers there. [cc Will Deacon] Jason, appreciate for your review and comments. I think PATCH[1/2] is the fix for the hypothesis, meaning PATCH[2/2] is the real fix. However, it would be nice to fix all of them in one shoot. I will try with PATCH[2/2] only to see if our issue will disappear or not. However, the issue still exists if PATCH[2/2] is missed. Jason, PATCH[2/2] is sufficient to fix our current issue. I tried with PATCH[2/2] only and unable to hit the issue. However, PATCH[1/2] may be needed by other scenarios. So it would be nice to fix them in one shoot. Firstly, We were failing on the transmit queue and {tvq, rvq}->busyloop_timeout == false if I remember correctly. So the added smp_rmb() in vhost_vq_avail_empty() is only a concern to tx_can_batch(). A mutex isn't enough to ensure the order for the available index and available ring entry (head). For example, vhost_vq_avail_empty() called by tx_can_batch() can see next available index, but its corresponding available ring entry (head) may not be seen by vhost yet if smp_rmb() is missed. The next call to get_tx_bufs(), where the available ring entry (head) doesn't arrived yet, leading to stale available ring entry (head) being fetched. 
handle_tx_copy get_tx_bufs // smp_rmb() won't be executed when vq->avail_idx != vq->last_avail_idx tx_can_batch vhost_vq_avail_empty // vq->avail_idx is updated from vq->avail->idx The reason why I added smp_rmb() to vhost_vq_avail_empty() is because the function is a exposed API, even it's only used by drivers/vhost/net.c at present. It means the API has been broken internally. So it seems more appropriate to fix it up in vhost_vq_avail_empty() so that the API's users needn't worry about the memory access order. + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); Thanks, Gavin
Re: [PATCH v2 2/2] vhost: Add smp_rmb() in vhost_enable_notify()
On 3/27/24 12:41, Jason Wang wrote: On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan wrote: A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 00445ab172b3..58f9d6a435f0 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2847,9 +2847,18 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) >avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; So we only care about the case when vhost_enable_notify() returns true. 
In that case, I think you want to order with vhost_get_vq_desc(): last_avail_idx = vq->last_avail_idx; if (vq->avail_idx == vq->last_avail_idx) { /* false */ } vhost_get_avail_head(vq, _head, last_avail_idx) Assuming I understand the patch correctly. Acked-by: Jason Wang Jason, thanks for your review and comments. Your understanding is exactly what I understood. } EXPORT_SYMBOL_GPL(vhost_enable_notify); Thanks, Gavin
Re: [PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()
On 3/27/24 12:44, Jason Wang wrote: On Wed, Mar 27, 2024 at 10:34 AM Jason Wang wrote: On Wed, Mar 27, 2024 at 7:39 AM Gavin Shan wrote: A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac697 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..00445ab172b3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ We need to be more verbose here. For example, which load needs to be ordered with which load. 
The rmb in vhost_get_vq_desc() is used to order the load of avail idx and the load of head. It is paired with e.g virtio_wmb() in virtqueue_add_split(). vhost_vq_avail_empty() are mostly used as a hint in vhost_net_busy_poll() which is under the protection of the vq mutex. An exception is the tx_can_batch(), but in that case it doesn't even want to read the head. Ok, if it is needed only in that path, maybe we can move the barriers there. [cc Will Deacon] Jason, appreciate for your review and comments. I think PATCH[1/2] is the fix for the hypothesis, meaning PATCH[2/2] is the real fix. However, it would be nice to fix all of them in one shoot. I will try with PATCH[2/2] only to see if our issue will disappear or not. However, the issue still exists if PATCH[2/2] is missed. Firstly, We were failing on the transmit queue and {tvq, rvq}->busyloop_timeout == false if I remember correctly. So the added smp_rmb() in vhost_vq_avail_empty() is only a concern to tx_can_batch(). A mutex isn't enough to ensure the order for the available index and available ring entry (head). For example, vhost_vq_avail_empty() called by tx_can_batch() can see next available index, but its corresponding available ring entry (head) may not be seen by vhost yet if smp_rmb() is missed. The next call to get_tx_bufs(), where the available ring entry (head) doesn't arrived yet, leading to stale available ring entry (head) being fetched. handle_tx_copy get_tx_bufs // smp_rmb() won't be executed when vq->avail_idx != vq->last_avail_idx tx_can_batch vhost_vq_avail_empty // vq->avail_idx is updated from vq->avail->idx The reason why I added smp_rmb() to vhost_vq_avail_empty() is because the function is a exposed API, even it's only used by drivers/vhost/net.c at present. It means the API has been broken internally. So it seems more appropriate to fix it up in vhost_vq_avail_empty() so that the API's users needn't worry about the memory access order. 
+ smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/27/24 09:14, Gavin Shan wrote: On 3/27/24 01:46, Will Deacon wrote: On Tue, Mar 26, 2024 at 11:43:13AM +, Will Deacon wrote: Ok, long shot after eyeballing the vhost code, but does the diff below help at all? It looks like vhost_vq_avail_empty() can advance the value saved in 'vq->avail_idx' but without the read barrier, possibly confusing vhost_get_vq_desc() in polling mode. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..87bff710331a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2801,6 +2801,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) return false; vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + smp_rmb(); return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); Thanks, Will. I already noticed smp_rmb() has been missed in vhost_vq_avail_empty(). The issue still exists after smp_rmb() is added here. However, I'm inspired by your suggestion and recheck the code again. It seems another smp_rmb() has been missed in vhost_enable_notify(). With smp_rmb() added to vhost_vq_avail_empty() and vhost_enable_notify(), I'm unable to hit the issue. I will try for more times to make sure the issue is really resolved. After that, I will post formal patches for review. Thanks again, Will. The formal patches have been sent for review. https://lkml.org/lkml/2024/3/27/40 Thanks, Gavin
Re: [PATCH v2 0/2] vhost: Fix stale available ring entries
On 3/27/24 09:38, Gavin Shan wrote: The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. The wrong head (available ring entry) is seen by the guest when running 'netperf' on the guest and running 'netserver' on another NVidia's grace-grace machine. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=tap0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0 : guest# ifconfig eth0 | grep 'inet addr' inet addr:10.26.1.220 guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! There is missed smp_rmb() in vhost_vq_avail_empty() and vhost_enable_notify(). Without smp_rmb(), vq->avail_idx is increased but the available ring entries aren't arriving to vhost side yet. So a stale available ring entry can be fetched in vhost_get_vq_desc(). Fix it by adding smp_rmb() in those two functions. Note that I need two patches so that they can be easily picked up by the stable kernel. With the changes, I'm unable to hit the issue again. Gavin Shan (2): vhost: Add smp_rmb() in vhost_vq_avail_empty() vhost: Add smp_rmb() in vhost_enable_notify() drivers/vhost/vhost.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) Sorry, I was supposed to copy Will. Amending for it. Thanks, Gavin
[PATCH v2 2/2] vhost: Add smp_rmb() in vhost_enable_notify()
A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 00445ab172b3..58f9d6a435f0 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2847,9 +2847,18 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) >avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); -- 2.44.0
[PATCH v2 1/2] vhost: Add smp_rmb() in vhost_vq_avail_empty()
A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will Deacon . Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). Fixes: 275bf960ac697 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/vhost/vhost.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..00445ab172b3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,18 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, _idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Similar to what's done in vhost_get_vq_desc(), we need +* to ensure the available ring entries have been exposed +* by guest. +*/ + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); -- 2.44.0
[PATCH v2 0/2] vhost: Fix stale available ring entries
The issue was reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. The wrong head (available ring entry) is seen by the guest when running 'netperf' on the guest and running 'netserver' on another NVidia's grace-grace machine. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=tap0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=tap0,mac=52:54:00:f1:26:b0 : guest# ifconfig eth0 | grep 'inet addr' inet addr:10.26.1.220 guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! There is missed smp_rmb() in vhost_vq_avail_empty() and vhost_enable_notify(). Without smp_rmb(), vq->avail_idx is increased but the available ring entries aren't arriving to vhost side yet. So a stale available ring entry can be fetched in vhost_get_vq_desc(). Fix it by adding smp_rmb() in those two functions. Note that I need two patches so that they can be easily picked up by the stable kernel. With the changes, I'm unable to hit the issue again. Gavin Shan (2): vhost: Add smp_rmb() in vhost_vq_avail_empty() vhost: Add smp_rmb() in vhost_enable_notify() drivers/vhost/vhost.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) -- 2.44.0
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/27/24 01:46, Will Deacon wrote: On Tue, Mar 26, 2024 at 11:43:13AM +, Will Deacon wrote: Ok, long shot after eyeballing the vhost code, but does the diff below help at all? It looks like vhost_vq_avail_empty() can advance the value saved in 'vq->avail_idx' but without the read barrier, possibly confusing vhost_get_vq_desc() in polling mode. diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..87bff710331a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2801,6 +2801,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) return false; vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + smp_rmb(); return vq->avail_idx == vq->last_avail_idx; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); Thanks, Will. I already noticed smp_rmb() has been missed in vhost_vq_avail_empty(). The issue still exists after smp_rmb() is added here. However, I'm inspired by your suggestion and recheck the code again. It seems another smp_rmb() has been missed in vhost_enable_notify(). With smp_rmb() added to vhost_vq_avail_empty() and vhost_enable_notify(), I'm unable to hit the issue. I will try for more times to make sure the issue is really resolved. After that, I will post formal patches for review. Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/20/24 17:14, Michael S. Tsirkin wrote: On Wed, Mar 20, 2024 at 03:24:16PM +1000, Gavin Shan wrote: On 3/20/24 10:49, Michael S. Tsirkin wrote:> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6f7e5010a673..79456706d0bd 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -685,7 +685,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); - vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + u16 headwithflag = head | (q->split.avail_idx_shadow & ~(vq->split.vring.num - 1)); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, headwithflag); /* Descriptors and available array need to be set before we expose the * new available array entries. */ Ok, Michael. I continued with my debugging code. It still looks like a hardware bug on NVidia's grace-hopper. I really think NVidia needs to be involved for the discussion, as suggested by you. Firstly, I bind the vhost process and vCPU thread to CPU#71 and CPU#70. Note that I have only one vCPU in my configuration. Secondly, the debugging code is enhanced so that the available head for (last_avail_idx - 1) is read for twice and recorded. It means the available head for one specific available index is read for twice. I do see the available heads are different from the consecutive reads. More details are shared as below. From the guest side === virtio_net virtio0: output.0:id 86 is not a head! 
head to be released: 047 062 112 avail_idx: 000 49665 001 49666 <-- : 015 49664 avail_head: 000 062 001 047 <-- : 015 112 From the host side == avail_idx 000 49663 001 49666 <--- : avail_head 000 062 (062) 001 047 (047) <--- : 015 086 (112) // head 086 is returned from the first read, // but head 112 is returned from the second read vhost_get_vq_desc: Inconsistent head in two read (86 -> 112) for avail_idx 49664 Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/21/24 03:15, Keir Fraser wrote: On Wed, Mar 20, 2024 at 03:24:16PM +1000, Gavin Shan wrote: Before this patch was posted, I had debugging code to record last 16 transactions to the available and used queue from guest and host side. It did reveal the wrong head was fetched from the available queue. [ 11.785745] virtqueue_get_buf_ctx_split [ 11.786238] virtio_net virtio0: output.0:id 74 is not a head! [ 11.786655] head to be released: 036 077 [ 11.786952] [ 11.786952] avail_idx: [ 11.787234] 000 63985 <-- [ 11.787237] 001 63986 [ 11.787444] 002 63987 [ 11.787632] 003 63988 [ 11.787821] 004 63989 [ 11.788006] 005 63990 [ 11.788194] 006 63991 [ 11.788381] 007 63992 [ 11.788567] 008 63993 [ 11.788772] 009 63994 [ 11.788957] 010 63995 [ 11.789141] 011 63996 [ 11.789327] 012 63997 [ 11.789515] 013 63998 [ 11.789701] 014 63999 [ 11.789886] 015 64000 Does the error always occur at such a round idx value? Here, 64000 == 0xFA00. Maybe coincidence but it's improbable enough to be interesting. This debug code seems rather useful! Keir, Nope, it's just coincidence. We don't have such kind of pattern. 
Thanks, Gavin [ 11.790068] [ 11.790068] avail_head: [ 11.790529] 000 075 <-- [ 11.790718] 001 036 [ 11.790890] 002 077 [ 11.791061] 003 129 [ 11.791231] 004 072 [ 11.791400] 005 130 [ 11.791574] 006 015 [ 11.791748] 007 074 [ 11.791918] 008 130 [ 11.792094] 009 130 [ 11.792263] 010 074 [ 11.792437] 011 015 [ 11.792617] 012 072 [ 11.792788] 013 129 [ 11.792961] 014 077// The last two heads from guest to host: 077, 036 [ 11.793134] 015 036 [root@nvidia-grace-hopper-05 qemu.main]# cat /proc/vhost avail_idx 000 63998 001 64000 002 63954 <--- 003 63955 004 63956 005 63974 006 63981 007 63984 008 63986 009 63987 010 63988 011 63989 012 63992 013 63993 014 63995 015 63997 avail_head 000 074 001 015 002 072 003 129 004 074// The last two heads seen by vhost is: 074, 036 005 036 006 075 <--- 007 036 008 077 009 129 010 072 011 130 012 015 013 074 014 130 015 130 used_idx 000 64000 001 63882 <--- 002 63889 003 63891 004 63898 005 63936 006 63942 007 63946 008 63949 009 63953 010 63957 011 63981 012 63990 013 63992 014 63993 015 63999 used_head 000 072 001 129 002 074 // The last two heads published to guest is: 074, 036 003 036 004 075 <--- 005 036 006 077 007 129 008 072 009 130 010 015 011 074 012 130 013 130 014 074 015 015 Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/20/24 10:49, Michael S. Tsirkin wrote:> I think you are wasting the time with these tests. Even if it helps what does this tell us? Try setting a flag as I suggested elsewhere. Then check it in vhost. Or here's another idea - possibly easier. Copy the high bits from index into ring itself. Then vhost can check that head is synchronized with index. Warning: completely untested, not even compiled. But should give you the idea. If this works btw we should consider making this official in the spec. static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6f7e5010a673..79456706d0bd 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -685,7 +685,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); - vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + u16 headwithflag = head | (q->split.avail_idx_shadow & ~(vq->split.vring.num - 1)); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, headwithflag); /* Descriptors and available array need to be set before we expose the * new available array entries. */ diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..bd8f7c763caa 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1299,8 +1299,15 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, __virtio16 *head, int idx) { - return vhost_get_avail(vq, *head, + unsigned i = idx; + unsigned flag = i & ~(vq->num - 1); + unsigned val = vhost_get_avail(vq, *head, >avail->ring[idx & (vq->num - 1)]); + unsigned valflag = val & ~(vq->num - 1); + + WARN_ON(valflag != flag); + + return val & (vq->num - 1); } Thanks, Michael. 
The code is already self-explanatory. Since vq->num is 256, I just squeezed the last_avail_idx to the high byte. Unfortunately, I'm unable to hit the WARN_ON(). Does it mean the low byte is stale (or corrupted) while the high byte is still correct and valid? avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head | (avail << 8)); head = vhost16_to_cpu(vq, ring_head); WARN_ON((head >> 8) != (vq->last_avail_idx % vq->num)); head = head & 0xff; One question: Does QEMU has any chance writing data to the available queue when vhost is enabled? My previous understanding is no, the queue is totally owned by vhost instead of QEMU. Before this patch was posted, I had debugging code to record last 16 transactions to the available and used queue from guest and host side. It did reveal the wrong head was fetched from the available queue. [ 11.785745] virtqueue_get_buf_ctx_split [ 11.786238] virtio_net virtio0: output.0:id 74 is not a head! 
[ 11.786655] head to be released: 036 077 [ 11.786952] [ 11.786952] avail_idx: [ 11.787234] 000 63985 <-- [ 11.787237] 001 63986 [ 11.787444] 002 63987 [ 11.787632] 003 63988 [ 11.787821] 004 63989 [ 11.788006] 005 63990 [ 11.788194] 006 63991 [ 11.788381] 007 63992 [ 11.788567] 008 63993 [ 11.788772] 009 63994 [ 11.788957] 010 63995 [ 11.789141] 011 63996 [ 11.789327] 012 63997 [ 11.789515] 013 63998 [ 11.789701] 014 63999 [ 11.789886] 015 64000 [ 11.790068] [ 11.790068] avail_head: [ 11.790529] 000 075 <-- [ 11.790718] 001 036 [ 11.790890] 002 077 [ 11.791061] 003 129 [ 11.791231] 004 072 [ 11.791400] 005 130 [ 11.791574] 006 015 [ 11.791748] 007 074 [ 11.791918] 008 130 [ 11.792094] 009 130 [ 11.792263] 010 074 [ 11.792437] 011 015 [ 11.792617] 012 072 [ 11.792788] 013 129 [ 11.792961] 014 077// The last two heads from guest to host: 077, 036 [ 11.793134] 015 036 [root@nvidia-grace-hopper-05 qemu.main]# cat /proc/vhost avail_idx 000 63998 001 64000 002 63954 <--- 003 63955 004 63956 005 63974 006 63981 007 63984 008 63986 009 63987 010 63988 011 63989 012 63992 013 63993 014 63995 015 63997 avail_head 000 074 001 015 002 072 003 129 004 074// The last two heads seen by vhost is: 074, 036 005 036 006 075 <--- 007 036 008 077 009 129 010 072 011 130 012 015 013 074 014 130 015 130 used_idx 000 64000 001 63882 <--- 002 63889 003 63891 004 63898 005 63936 006 63942 007 63946 008 63949 009 63953 010 63957 011 63981 012 63990 013 63992 014 63993 015 63999 used_head 000 072 001 129 002 074 // The
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/20/24 04:22, Will Deacon wrote: On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote: On 3/19/24 02:59, Will Deacon wrote: drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. 
I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); It's pretty weird that __dma_mb() is "broken" but __smp_mb() "works". How confident are you in that result? Yes, __dma_mb() is even stronger than __smp_mb(). I retried the test, showing that both __dma_mb() and __smp_mb() work for us. I had too many tests yesterday and something may have been messed up. Instruction Hitting times in 10 tests - __smp_wmb() 8 __smp_mb() 0 __dma_wmb() 7 __dma_mb() 0 __mb() 0 __wmb() 0 It's strange that __smp_mb() works, but __smp_wmb() fails. It seems we need a read barrier here. I will try WRITE_ONCE() + __smp_wmb() as suggested by Michael in another reply. Will update the result soon. Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 17:09, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 04:49:50PM +1000, Gavin Shan wrote: On 3/19/24 16:43, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 04:38:49PM +1000, Gavin Shan wrote: On 3/19/24 16:09, Michael S. Tsirkin wrote: diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. 
I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); What if you stick __ndelay here? /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); /* Try __ndelay(x) here as Michael suggested * * Work: __ndelay(200);possiblly make it hard to reproduce * Broken:__ndelay(100); * Broken:__ndelay(20); * Broken:__ndelay(10); */ __ndelay(200); So we see that just changing the timing masks the race. What are you using on the host side? vhost or qemu? __ndelay(200) may make the issue harder to be reproduce as I understand. More delays here will give vhost relief, reducing the race. The issue is only reproducible when vhost is turned on. Otherwise, we aren't able to hit the issue. 
-netdev tap,id=vnet0,vhost=true,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 Given it's vhost, it's also possible that the is
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 17:04, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 04:54:15PM +1000, Gavin Shan wrote: On 3/19/24 16:10, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 02:09:34AM -0400, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote: On 3/19/24 02:59, Will Deacon wrote: [...] diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. 
It's hard for me to understand how DMB and DSB works from hardware level. I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); Did you try __smp_wmb ? And wmb? virtio_wmb(false) is equivalent to __smb_wmb(), which is broken. __wmb() works either. No issue found with it. Oh interesting. So how do smp_mb() and wmb() disassemble on this platform? Can you please check? I don't see they have been translated wrongly on Nvidia's grace-hopper: ===> virtio_wmb(vq->weak_barriers) 0x8000807b07c8 <+1168>: ldrbw0, [x20, #66] 0x8000807b07cc <+1172>: cbz w0, 0x8000807b089c 0x8000807b07d0 <+1176>: dmb ishst // same to __smp_wmb() : 0x8000807b089c <+1380>: dmb oshst // same to __dma_wmb() 0x8000807b08a0 <+1384>: b 0x8000807b07d4 ===> wmb() 0x8000807b07c8 <+1168>: dsb st // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); What if you stick __ndelay here? And keep virtio_wmb above? The result has been shared through a separate reply. vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); : } I also tried to measure the consumed time for various barrier-relative instructions using ktime_get_ns() which should have consumed most of the time. 
__smp_mb() is slower than __smp_wmb() but faster than __mb() Instruction Range of used time in ns -- __smp_wmb() [32 1128032] __smp_mb() [32 1160096] __mb() [32 1162496] Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 16:10, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 02:09:34AM -0400, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 02:59:23PM +1000, Gavin Shan wrote: On 3/19/24 02:59, Will Deacon wrote: [...] diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. 
I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); Did you try __smp_wmb ? And wmb? virtio_wmb(false) is equivalent to __smb_wmb(), which is broken. __wmb() works either. No issue found with it. // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); What if you stick __ndelay here? And keep virtio_wmb above? The result has been shared through a separate reply. vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); : } I also tried to measure the consumed time for various barrier-relative instructions using ktime_get_ns() which should have consumed most of the time. __smb_mb() is slower than __smp_wmb() but faster than __mb() Instruction Range of used time in ns -- __smp_wmb() [32 1128032] __smp_mb()[32 1160096] __mb()[32 1162496] Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 16:43, Michael S. Tsirkin wrote: On Tue, Mar 19, 2024 at 04:38:49PM +1000, Gavin Shan wrote: On 3/19/24 16:09, Michael S. Tsirkin wrote: diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. 
I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); What if you stick __ndelay here? /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); /* Try __ndelay(x) here as Michael suggested * * Work: __ndelay(200);possiblly make it hard to reproduce * Broken:__ndelay(100); * Broken:__ndelay(20); * Broken:__ndelay(10); */ __ndelay(200); So we see that just changing the timing masks the race. What are you using on the host side? vhost or qemu? __ndelay(200) may make the issue harder to be reproduce as I understand. More delays here will give vhost relief, reducing the race. The issue is only reproducible when vhost is turned on. Otherwise, we aren't able to hit the issue. 
-netdev tap,id=vnet0,vhost=true,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); : } I also tried to measure the consumed time fo
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 16:09, Michael S. Tsirkin wrote: diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. 
__ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); What if you stick __ndelay here? /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); /* Try __ndelay(x) here as Michael suggested * * Work: __ndelay(200);possiblly make it hard to reproduce * Broken:__ndelay(100); * Broken:__ndelay(20); * Broken:__ndelay(10); */ __ndelay(200); vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); : } I also tried to measure the consumed time for various barrier-relative instructions using ktime_get_ns() which should have consumed most of the time. __smb_mb() is slower than __smp_wmb() but faster than __mb() Instruction Range of used time in ns -- __smp_wmb() [32 1128032] __smp_mb()[32 1160096] __mb()[32 1162496] Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/19/24 02:59, Will Deacon wrote: On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote: The issue is reported by Yihuang Yu who have 'netperf' test on NVidia's grace-grace and grace-hopper machines. The 'netperf' client is started in the VM hosted by grace-hopper machine, while the 'netperf' server is running on grace-grace machine. The VM is started with virtio-net and vhost has been enabled. We observe a error message spew from VM and then soft-lockup report. The error message indicates the data associated with the descriptor (index: 135) has been released, and the queue is marked as broken. It eventually leads to the endless effort to fetch free buffer (skb) in drivers/net/virtio_net.c::start_xmit() and soft-lockup. The stale index 135 is fetched from the available ring and published to the used ring by vhost, meaning we have disordred write to the available ring element and available index. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host\ : \ -netdev tap,id=vnet0,vhost=on\ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \ [ 19.993158] virtio_net virtio1: output.0:id 135 is not a head! Fix the issue by replacing virtio_wmb(vq->weak_barriers) with stronger virtio_mb(false), equivalent to replaced 'dmb' by 'dsb' instruction on ARM64. It should work for other architectures, but performance loss is expected. 
Cc: sta...@vger.kernel.org Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); Replacing a DMB with a DSB is _very_ unlikely to be the correct solution here, especially when ordering accesses to coherent memory. In practice, either the larger timing different from the DSB or the fact that you're going from a Store->Store barrier to a full barrier is what makes things "work" for you. Have you tried, for example, a DMB SY (e.g. via __smb_mb()). We definitely shouldn't take changes like this without a proper explanation of what is going on. Thanks for your comments, Will. Yes, DMB should work for us. However, it seems this instruction has issues on NVidia's grace-hopper. It's hard for me to understand how DMB and DSB works from hardware level. 
I agree it's not the solution to replace DMB with DSB before we fully understand the root cause. I tried the possible replacement like below. __smp_mb() can avoid the issue like __mb() does. __ndelay(10) can avoid the issue, but __ndelay(9) doesn't. static inline int virtqueue_add_split(struct virtqueue *_vq, ...) { : /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ // Broken: virtio_wmb(vq->weak_barriers); // Broken: __dma_mb(); // Work: __mb(); // Work: __smp_mb(); // Work: __ndelay(100); // Work: __ndelay(10); // Broken: __ndelay(9); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq)
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/18/24 02:50, Michael S. Tsirkin wrote: On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote: On 3/15/24 21:05, Michael S. Tsirkin wrote: On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote: Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried to reproduce it with my own driver where one thread writes to the shared buffer and another thread reads from the buffer. I don't hit the out-of-order issue so far. Make sure the 2 areas you are accessing are in different cache lines. Yes, I already put those 2 areas to separate cache lines. My driver may be not correct somewhere and I will update if I can reproduce the issue with my driver in the future. Then maybe your change is just making virtio slower and masks the bug that is actually elsewhere? You don't really need a driver. Here's a simple test: without barriers assertion will fail. With barriers it will not. (Warning: didn't bother testing too much, could be buggy. --- #include #include #include #include #define FIRST values[0] #define SECOND values[64] volatile int values[100] = {}; void* writer_thread(void* arg) { while (1) { FIRST++; // NEED smp_wmb here __asm__ volatile("dmb ishst" : : : "memory"); SECOND++; } } void* reader_thread(void* arg) { while (1) { int first = FIRST; // NEED smp_rmb here __asm__ volatile("dmb ishld" : : : "memory"); int second = SECOND; assert(first - second == 1 || first - second == 0); } } int main() { pthread_t writer, reader; pthread_create(, NULL, writer_thread, NULL); pthread_create(, NULL, reader_thread, NULL); pthread_join(writer, NULL); pthread_join(reader, NULL); return 0; } Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit the assert on both of them. After replacing 'dmb' with 'dsb', I can hit assert on both of them too. I need to look at the code closely. [root@virt-mtcollins-02 test]# ./a a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed. 
Aborted (core dumped) [root@nvidia-grace-hopper-05 test]# ./a a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed. Aborted (core dumped) Thanks, Gavin Actually this test is broken. No need for ordering it's a simple race. The following works on x86 though (x86 does not need barriers though). #include #include #include #include #if 0 #define x86_rmb() asm volatile("lfence":::"memory") #define x86_mb() asm volatile("mfence":::"memory") #define x86_smb() asm volatile("sfence":::"memory") #else #define x86_rmb() asm volatile("":::"memory") #define x86_mb() asm volatile("":::"memory") #define x86_smb() asm volatile("":::"memory") #endif #define FIRST values[0] #define SECOND values[640] #define FLAG values[1280] volatile unsigned values[2000] = {}; void* writer_thread(void* arg) { while (1) { /* Now synchronize with reader */ while(FLAG); FIRST++; x86_smb(); SECOND++; x86_smb(); FLAG = 1; } } void* reader_thread(void* arg) { while (1) { /* Now synchronize with writer */ while(!FLAG); x86_rmb(); unsigned first = FIRST; x86_rmb(); unsigned second = SECOND; assert(first - second == 1 || first - second == 0); FLAG = 0; if (!(first %100)) printf("%d\n", first); } } int main() { pthread_t writer, reader; pthread_create(, NULL, writer_thread, NULL); pthread_create(, NULL, reader_thread, NULL); pthread_join(writer, NULL); pthread_join(reader, NULL); return 0; } I tried it on host and VM of NVidia's grace-hopper. Without the barriers, I can hit assert. With the barriers, it's working fine without hitting the assert. I also had some code to mimic virtio vring last weekend, and it's just working well. Back to our original issue, __smb_wmb() is issued by guest while __smb_rmb() is executed on host. The VM and host are running at different exception level: EL2 vs EL1. I'm not sure it's the cause. I need to modify my code so that __smb_wmb() and __smb_rmb() can be executed from guest and host. 
[gshan@gshan code]$ cat test.h #ifndef __TEST_H #define __TEST_H struct vring_desc { uint64_taddr; uint32_tlen; uint16_tflags; uint16_tnext; } __attribute__((aligned(4))); struct vring_avail { uint16_tflags; uint16_tidx; uint16_tring[]; } __attribute__((aligned(4))); struct vring_used_elem { uint32_tid; uint32_tlen; } __attribute__((aligned(4))); stru
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/15/24 21:05, Michael S. Tsirkin wrote: On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote: Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried to reproduce it with my own driver where one thread writes to the shared buffer and another thread reads from the buffer. I don't hit the out-of-order issue so far. Make sure the 2 areas you are accessing are in different cache lines. Yes, I already put those 2 areas to separate cache lines. My driver may be not correct somewhere and I will update if I can reproduce the issue with my driver in the future. Then maybe your change is just making virtio slower and masks the bug that is actually elsewhere? You don't really need a driver. Here's a simple test: without barriers assertion will fail. With barriers it will not. (Warning: didn't bother testing too much, could be buggy. --- #include #include #include #include #define FIRST values[0] #define SECOND values[64] volatile int values[100] = {}; void* writer_thread(void* arg) { while (1) { FIRST++; // NEED smp_wmb here __asm__ volatile("dmb ishst" : : : "memory"); SECOND++; } } void* reader_thread(void* arg) { while (1) { int first = FIRST; // NEED smp_rmb here __asm__ volatile("dmb ishld" : : : "memory"); int second = SECOND; assert(first - second == 1 || first - second == 0); } } int main() { pthread_t writer, reader; pthread_create(, NULL, writer_thread, NULL); pthread_create(, NULL, reader_thread, NULL); pthread_join(writer, NULL); pthread_join(reader, NULL); return 0; } Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit the assert on both of them. After replacing 'dmb' with 'dsb', I can hit assert on both of them too. I need to look at the code closely. [root@virt-mtcollins-02 test]# ./a a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed. 
Aborted (core dumped) [root@nvidia-grace-hopper-05 test]# ./a a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed. Aborted (core dumped) Thanks, Gavin
Re: [PATCH] virtio_ring: Fix the stale index in available ring
+ Will, Catalin and Matt from Nvidia On 3/14/24 22:59, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 10:50:15PM +1000, Gavin Shan wrote: On 3/14/24 21:50, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 08:15:22PM +1000, Gavin Shan wrote: On 3/14/24 18:05, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote: The issue is reported by Yihuang Yu who have 'netperf' test on NVidia's grace-grace and grace-hopper machines. The 'netperf' client is started in the VM hosted by grace-hopper machine, while the 'netperf' server is running on grace-grace machine. The VM is started with virtio-net and vhost has been enabled. We observe a error message spew from VM and then soft-lockup report. The error message indicates the data associated with the descriptor (index: 135) has been released, and the queue is marked as broken. It eventually leads to the endless effort to fetch free buffer (skb) in drivers/net/virtio_net.c::start_xmit() and soft-lockup. The stale index 135 is fetched from the available ring and published to the used ring by vhost, meaning we have disordred write to the available ring element and available index. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host\ : \ -netdev tap,id=vnet0,vhost=on\ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \ [ 19.993158] virtio_net virtio1: output.0:id 135 is not a head! Fix the issue by replacing virtio_wmb(vq->weak_barriers) with stronger virtio_mb(false), equivalent to replaced 'dmb' by 'dsb' instruction on ARM64. It should work for other architectures, but performance loss is expected. 
Cc: sta...@vger.kernel.org Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); I don't get what is going on here. Any explanation why virtio_wmb is not enough besides "it does not work"? The change is replacing instruction "dmb" with "dsb". "dsb" is stronger barrier than "dmb" because "dsb" ensures that all memory accesses raised before this instruction is completed when the 'dsb' instruction completes. However, "dmb" doesn't guarantee the order of completion of the memory accesses. So 'vq->split.vring.avail->idx = cpu_to_virtio(_vq->vdev, vq->split.avail_idx_shadow)' can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'. Completed as observed by which CPU? We have 2 writes that we want observed by another CPU in order. So if CPU observes a new value of idx we want it to see new value in ring. This is standard use of smp_wmb() How are these 2 writes different? 
What DMB does, is that is seems to ensure that effects of 'vq->split.vring.avail->idx = cpu_to_virtio(_vq->vdev, vq->split.avail_idx_shadow)' are observed after effects of 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'. Completed as observed by the CPU where vhost worker is running. I don't think DMB does the work here. If I'm understanding correctly, DMB ensures the order of these two writes from the local CPU's standpoint. No this makes no sense at all. All memory accesses are in order from local CPU standpoint. It's true if compiler doesn't reorder the accesses, and light-weight barrier like 'dmb' and 'isb' is used. Otherwise, the accesses still can be disordered on the local CPU, correct? The written data can be stored in local CPU's cache, not flushed to DRAM and propogated to the cache of the far CPU where vhost worker is running. So DMB isn't ensu
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/14/24 21:50, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 08:15:22PM +1000, Gavin Shan wrote: On 3/14/24 18:05, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote: The issue is reported by Yihuang Yu who have 'netperf' test on NVidia's grace-grace and grace-hopper machines. The 'netperf' client is started in the VM hosted by grace-hopper machine, while the 'netperf' server is running on grace-grace machine. The VM is started with virtio-net and vhost has been enabled. We observe a error message spew from VM and then soft-lockup report. The error message indicates the data associated with the descriptor (index: 135) has been released, and the queue is marked as broken. It eventually leads to the endless effort to fetch free buffer (skb) in drivers/net/virtio_net.c::start_xmit() and soft-lockup. The stale index 135 is fetched from the available ring and published to the used ring by vhost, meaning we have disordred write to the available ring element and available index. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host\ : \ -netdev tap,id=vnet0,vhost=on\ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \ [ 19.993158] virtio_net virtio1: output.0:id 135 is not a head! Fix the issue by replacing virtio_wmb(vq->weak_barriers) with stronger virtio_mb(false), equivalent to replaced 'dmb' by 'dsb' instruction on ARM64. It should work for other architectures, but performance loss is expected. 
Cc: sta...@vger.kernel.org Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); I don't get what is going on here. Any explanation why virtio_wmb is not enough besides "it does not work"? The change is replacing instruction "dmb" with "dsb". "dsb" is stronger barrier than "dmb" because "dsb" ensures that all memory accesses raised before this instruction is completed when the 'dsb' instruction completes. However, "dmb" doesn't guarantee the order of completion of the memory accesses. So 'vq->split.vring.avail->idx = cpu_to_virtio(_vq->vdev, vq->split.avail_idx_shadow)' can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'. Completed as observed by which CPU? We have 2 writes that we want observed by another CPU in order. So if CPU observes a new value of idx we want it to see new value in ring. This is standard use of smp_wmb() How are these 2 writes different? 
What DMB does, is that it seems to ensure that effects of 'vq->split.vring.avail->idx = cpu_to_virtio(_vq->vdev, vq->split.avail_idx_shadow)' are observed after effects of 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'. Completed as observed by the CPU where vhost worker is running. I don't think DMB does the work here. If I'm understanding correctly, DMB ensures the order of these two writes from the local CPU's standpoint. The written data can be stored in local CPU's cache, not flushed to DRAM and propagated to the cache of the far CPU where vhost worker is running. So DMB isn't ensuring the write data is observed from the far CPU. DSB ensures that the written data is observable from the far CPU immediately. The stronger barrier 'dsb' ensures the completion order as we expected. virtio_wmb(true) virt_mb(false) virt_wmb mb __smp_wmb __mb dmb(ishst) dsb(sy) First, why would you want a non smp barrier when you are
Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/14/24 18:05, Michael S. Tsirkin wrote: On Thu, Mar 14, 2024 at 05:49:23PM +1000, Gavin Shan wrote: The issue is reported by Yihuang Yu who have 'netperf' test on NVidia's grace-grace and grace-hopper machines. The 'netperf' client is started in the VM hosted by grace-hopper machine, while the 'netperf' server is running on grace-grace machine. The VM is started with virtio-net and vhost has been enabled. We observe a error message spew from VM and then soft-lockup report. The error message indicates the data associated with the descriptor (index: 135) has been released, and the queue is marked as broken. It eventually leads to the endless effort to fetch free buffer (skb) in drivers/net/virtio_net.c::start_xmit() and soft-lockup. The stale index 135 is fetched from the available ring and published to the used ring by vhost, meaning we have disordred write to the available ring element and available index. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host\ : \ -netdev tap,id=vnet0,vhost=on\ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \ [ 19.993158] virtio_net virtio1: output.0:id 135 is not a head! Fix the issue by replacing virtio_wmb(vq->weak_barriers) with stronger virtio_mb(false), equivalent to replaced 'dmb' by 'dsb' instruction on ARM64. It should work for other architectures, but performance loss is expected. 
Cc: sta...@vger.kernel.org Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. */ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensuere the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); I don't get what is going on here. Any explanation why virtio_wmb is not enough besides "it does not work"? The change is replacing instruction "dmb" with "dsb". "dsb" is stronger barrier than "dmb" because "dsb" ensures that all memory accesses raised before this instruction is completed when the 'dsb' instruction completes. However, "dmb" doesn't guarantee the order of completion of the memory accesses. So 'vq->split.vring.avail->idx = cpu_to_virtio(_vq->vdev, vq->split.avail_idx_shadow)' can be completed before 'vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head)'. The stronger barrier 'dsb' ensures the completion order as we expected. 
virtio_wmb(true) virt_mb(false) virt_wmb mb __smp_wmb __mb dmb(ishst) dsb(sy) Extracted from ARMv9 specification The DMB instruction is a memory barrier instruction that ensures the relative order of memory accesses before the barrier with memory accesses after the barrier. The DMB instruction _does not_ ensure the completion of any of the memory accesses for which it ensures relative order. A DSB instruction is a memory barrier that ensures that memory accesses that occur before the DSB instruction have __completed__ before the completion of the DSB instruction. In doing this, it acts as a stronger barrier than a DMB and all ordering that is created by a DMB with specific options is also generated by a DSB with the same options. vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); -- 2.44.0 Thanks, Gavin
[PATCH] virtio_ring: Fix the stale index in available ring
The issue is reported by Yihuang Yu who have 'netperf' test on NVidia's grace-grace and grace-hopper machines. The 'netperf' client is started in the VM hosted by grace-hopper machine, while the 'netperf' server is running on grace-grace machine. The VM is started with virtio-net and vhost has been enabled. We observe a error message spew from VM and then soft-lockup report. The error message indicates the data associated with the descriptor (index: 135) has been released, and the queue is marked as broken. It eventually leads to the endless effort to fetch free buffer (skb) in drivers/net/virtio_net.c::start_xmit() and soft-lockup. The stale index 135 is fetched from the available ring and published to the used ring by vhost, meaning we have disordred write to the available ring element and available index. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host\ : \ -netdev tap,id=vnet0,vhost=on\ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 \ [ 19.993158] virtio_net virtio1: output.0:id 135 is not a head! Fix the issue by replacing virtio_wmb(vq->weak_barriers) with stronger virtio_mb(false), equivalent to replaced 'dmb' by 'dsb' instruction on ARM64. It should work for other architectures, but performance loss is expected. Cc: sta...@vger.kernel.org Reported-by: Yihuang Yu Signed-off-by: Gavin Shan --- drivers/virtio/virtio_ring.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 49299b1f9ec7..7d852811c912 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -687,9 +687,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); - /* Descriptors and available array need to be set before we expose the -* new available array entries. 
*/ - virtio_wmb(vq->weak_barriers); + /* +* Descriptors and available array need to be set before we expose +* the new available array entries. virtio_wmb() should be enough +* to ensure the order theoretically. However, a stronger barrier +* is needed by ARM64. Otherwise, the stale data can be observed +* by the host (vhost). A stronger barrier should work for other +* architectures, but performance loss is expected. +*/ + virtio_mb(false); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); -- 2.44.0
[PATCH v2 3/3] KVM: arm64: Don't retrieve memory slot again in page fault handler
We needn't retrieve the memory slot again in user_mem_abort() because the corresponding memory slot has been passed from the caller. This would save some CPU cycles. For example, the time used to write 1GB memory, which is backed by 2MB hugetlb pages and write-protected, is dropped by 6.8% from 928ms to 864ms. Signed-off-by: Gavin Shan Reviewed-by: Keqian Zhu --- arch/arm64/kvm/mmu.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 192e0df2fc8e..2491b40a294a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -843,10 +843,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * unmapped afterwards, the call to kvm_unmap_hva will take it away * from us again properly. This smp_rmb() interacts with the smp_wmb() * in kvm_mmu_notifier_invalidate_. +* +* Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is +* used to avoid unnecessary overhead introduced to locate the memory +* slot because it's always fixed even @gfn is adjusted for huge pages. */ smp_rmb(); - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, ); + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + write_fault, , NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -912,7 +917,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } out_unlock: -- 2.23.0
[PATCH v2 1/3] KVM: arm64: Hide kvm_mmu_wp_memory_region()
We needn't expose the function as it's only used by mmu.c since it was introduced by commit c64735554c0a ("KVM: arm: Add initial dirty page locking support"). Signed-off-by: Gavin Shan Reviewed-by: Keqian Zhu --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/mmu.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d10e6527f7d..688f2df1957b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -632,7 +632,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) void force_vm_exit(const cpumask_t *mask); -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8711894db8c2..28f3b3736dc8 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -555,7 +555,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, * serializing operations for VM memory regions. */ -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); -- 2.23.0
[PATCH v2 2/3] KVM: arm64: Use find_vma_intersection()
find_vma_intersection() has been existing to search the intersected vma. This uses the function where it's applicable, to simplify the code. Signed-off-by: Gavin Shan Reviewed-by: Keqian Zhu --- arch/arm64/kvm/mmu.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 28f3b3736dc8..192e0df2fc8e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* @@ -1329,10 +1330,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* -- 2.23.0
[PATCH v2 0/3] KVM: arm64: Minor page fault handler improvement
The series includes several minor improvements to stage-2 page fault handler: PATCH[1/2] are cleaning up the code. PATCH[3] doesn't retrieve the memory slot again in the page fault handler to save a bit of CPU cycles. Changelog = v2: * Rebased to 5.12.rc3 and include r-bs from Keqian (Gavin) * Drop patch to fix IPA limit boundary issue(Keqian) * Comments on why we use __gfn_to_pfn_memslot() (Keqian) Gavin Shan (3): KVM: arm64: Hide kvm_mmu_wp_memory_region() KVM: arm64: Use find_vma_intersection() KVM: arm64: Don't retrieve memory slot again in page fault handler arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/mmu.c | 21 ++--- 2 files changed, 14 insertions(+), 8 deletions(-) -- 2.23.0
Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()
Hi Keqian, On 3/15/21 8:42 PM, Gavin Shan wrote: On 3/15/21 7:04 PM, Keqian Zhu wrote: On 2021/3/15 12:18, Gavin Shan wrote: find_vma_intersection() has been existing to search the intersected vma. This uses the function where it's applicable, to simplify the code. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 84e70f953de6..286b603ed0d3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); Nit: Keep a same style may be better(Assign vma when declare it). Other looks good to me. Yeah, I agree. I will adjust the code in v2 and included your r-b. Thanks for your time to review. After rechecking the code, I think it'd better to keep current style because there is a follow-on validation on @vma. Keeping them together seems a good idea. I think it wouldn't a big deal to you. So I will keep current style with your r-b in v2. vma = find_vma_intersection(current->mm, hva, reg_end); if (!vma) break; Thanks, Gavin + if (!vma) break; /* @@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /*
Re: [PATCH 4/4] KVM: arm64: Don't retrieve memory slot again in page fault handler
Hi Keqian, On 3/15/21 7:25 PM, Keqian Zhu wrote: On 2021/3/15 12:18, Gavin Shan wrote: We needn't retrieve the memory slot again in user_mem_abort() because the corresponding memory slot has been passed from the caller. This I think you are right, though fault_ipa will be adjusted when we try to use block mapping, the fault_supports_stage2_huge_mapping() makes sure we're not trying to map anything not covered by the memslot, so the adjusted fault_ipa still belongs to the memslot. Yeah, it's correct. Besides, the @logging_active is determined based on the passed memory slot. It means user_mem_abort() can't support memory range which spans multiple memory slot. would save some CPU cycles. For example, the time used to write 1GB memory, which is backed by 2MB hugetlb pages and write-protected, is dropped by 6.8% from 928ms to 864ms. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a5a8ade9fde4..4a4abcccfafb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -846,7 +846,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, ); + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + write_fault, , NULL); It's better to update the code comments at same time. I guess you need some comments here? If so, I would add something like below in v2: /* * gfn_to_pfn_prot() can be used either with unnecessary overhead * introduced to locate the memory slot because the memory slot is * always fixed even @gfn is adjusted for huge pages. 
*/ if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -912,7 +913,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } out_unlock: Thanks, Gavin
Re: [PATCH 3/4] KVM: arm64: Fix address check for memory slot
Hi Keqian, On 3/15/21 6:33 PM, Keqian Zhu wrote: FYI, this has been fixed by Marc in commit 262b003d059c. Yeah, I didn't check 5.12.rc3 code where the issue has been fixed. So please ignore this one and sorry for the noise. Thanks, Gavin On 2021/3/15 12:18, Gavin Shan wrote: The last (IPA) page can't be specified when a new memory slot is added. The error -EFAULT is returned when the memory slot is added with the following parameters for the VM, which has 40-bits IPA limit. The host has 4KB base page size. It's not correct because the last (IPA) page is still usable. struct kvm_userspace_memory_region { __u32 slot; /* 1*/ __u32 flags; /* 0*/ __u64 guest_phys_addr;/* 0xfff000 */ __u64 memory_size;/* 0x1000 */ __u64 userspace_addr; }; Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 286b603ed0d3..a5a8ade9fde4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1313,7 +1313,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= + if (memslot->base_gfn + memslot->npages > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT;
Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()
Hi Keqian, On 3/15/21 7:04 PM, Keqian Zhu wrote: On 2021/3/15 12:18, Gavin Shan wrote: find_vma_intersection() has been existing to search the intersected vma. This uses the function where it's applicable, to simplify the code. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 84e70f953de6..286b603ed0d3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); Nit: Keep a same style may be better(Assign vma when declare it). Other looks good to me. Yeah, I agree. I will adjust the code in v2 and included your r-b. Thanks for your time to review. Thanks, Gavin + if (!vma) break; /* @@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /*
Re: [PATCH 2/4] KVM: arm64: Use find_vma_intersection()
Hi Marc, On 3/15/21 7:52 PM, Marc Zyngier wrote: On Mon, 15 Mar 2021 04:18:42 +, Gavin Shan wrote: find_vma_intersection() has been existing to search the intersected vma. This uses the function where it's applicable, to simplify the code. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 84e70f953de6..286b603ed0d3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); For context, here's the definition of find_vma_intersection(): static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) { struct vm_area_struct * vma = find_vma(mm,start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } It seems that there is a boundary issue in either the old code or the new one in the case where (reg_end == vma->start). Which one is which? The old and new code is interchangeable, meaning "reg_end == vma->start" is invalid in both cases. So if there is a boundary issue, the old and new code should have same issue. According to the code, "reg_end == vma->start" is invalid. So I don't see there is a boundary issue. Hopefully, I don't miss anything :) Thanks, Gavin
[PATCH 4/4] KVM: arm64: Don't retrieve memory slot again in page fault handler
We needn't retrieve the memory slot again in user_mem_abort() because the corresponding memory slot has been passed from the caller. This would save some CPU cycles. For example, the time used to write 1GB memory, which is backed by 2MB hugetlb pages and write-protected, is dropped by 6.8% from 928ms to 864ms. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a5a8ade9fde4..4a4abcccfafb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -846,7 +846,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, ); + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + write_fault, , NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -912,7 +913,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } out_unlock: -- 2.23.0
[PATCH 3/4] KVM: arm64: Fix address check for memory slot
The last (IPA) page can't be specified when a new memory slot is added. The error -EFAULT is returned when the memory slot is added with the following parameters for the VM, which has 40-bits IPA limit. The host has 4KB base page size. It's not correct because the last (IPA) page is still usable. struct kvm_userspace_memory_region { __u32 slot; /* 1*/ __u32 flags; /* 0*/ __u64 guest_phys_addr;/* 0xfff000 */ __u64 memory_size;/* 0x1000 */ __u64 userspace_addr; }; Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 286b603ed0d3..a5a8ade9fde4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1313,7 +1313,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * Prevent userspace from creating a memory region outside of the IPA * space addressable by the KVM guest IPA space. */ - if (memslot->base_gfn + memslot->npages >= + if (memslot->base_gfn + memslot->npages > (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; -- 2.23.0
[PATCH 2/4] KVM: arm64: Use find_vma_intersection()
find_vma_intersection() has been existing to search the intersected vma. This uses the function where it's applicable, to simplify the code. Signed-off-by: Gavin Shan --- arch/arm64/kvm/mmu.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 84e70f953de6..286b603ed0d3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -421,10 +421,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* @@ -1330,10 +1331,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * ++ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* -- 2.23.0
[PATCH 1/4] KVM: arm64: Hide kvm_mmu_wp_memory_region()
We needn't expose the function as it's only used by mmu.c. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/mmu.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d10e6527f7d..688f2df1957b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -632,7 +632,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) void force_vm_exit(const cpumask_t *mask); -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 77cb2d28f2a4..84e70f953de6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -555,7 +555,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, * serializing operations for VM memory regions. */ -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); -- 2.23.0
[PATCH 0/4] KVM: arm64: Minor page fault handler improvement
The series includes several minor improvements to stage-2 page fault handler: PATCH[1/2] are cleaning up the code. PATCH[3] fixes the address range check on adding new memory slot. PATCH[4] doesn't retrieve the memory slot again in the page fault handler to save a bit of CPU cycles. Gavin Shan (4): KVM: arm64: Hide kvm_mmu_wp_memory_region() KVM: arm64: Use find_vma_intersection() KVM: arm64: Fix address check for memory slot KVM: arm64: Don't retrieve memory slot again in page fault handler arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/mmu.c | 19 +++ 2 files changed, 11 insertions(+), 9 deletions(-) -- 2.23.0
[PATCH v2 17/17] KVM: arm64: Add async PF document
This adds document to explain the interface for asynchronous page fault and how it works in general. Signed-off-by: Gavin Shan --- Documentation/virt/kvm/arm/apf.rst | 143 +++ Documentation/virt/kvm/arm/index.rst | 1 + 2 files changed, 144 insertions(+) create mode 100644 Documentation/virt/kvm/arm/apf.rst diff --git a/Documentation/virt/kvm/arm/apf.rst b/Documentation/virt/kvm/arm/apf.rst new file mode 100644 index ..4f5c01b6699f --- /dev/null +++ b/Documentation/virt/kvm/arm/apf.rst @@ -0,0 +1,143 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Asynchronous Page Fault Support for arm64 += + +There are two stages of page faults when KVM module is enabled as accelerator +to the guest. The guest is responsible for handling the stage-1 page faults, +while the host handles the stage-2 page faults. During the period of handling +the stage-2 page faults, the guest is suspended until the requested page is +ready. It could take several milliseconds, even hundreds of milliseconds in +extreme situations because I/O might be required to move the requested page +from disk to DRAM. The guest does not do any work when it is suspended. The +feature (Asynchronous Page Fault) is introduced to take advantage of the +suspending period and to improve the overall performance. + +There are two paths in order to fulfil the asynchronous page fault, called +as control path and data path. The control path allows the VMM or guest to +configure the functionality, while the notifications are delivered in data +path. The notifications are classified into page-not-present and page-ready +notifications. + +Data Path +- + +There are two types of notifications delivered from host to guest in the +data path: page-not-present and page-ready notification. They are delivered +through SDEI event and (PPI) interrupt separately. Besides, there is a shared +buffer between host and guest to indicate the reason and sequential token, +which is used to identify the asynchronous page fault. 
The reason and token +resident in the shared buffer is written by host, read and cleared by guest. +An asynchronous page fault is delivered and completed as below. + +(1) When an asynchronous page fault starts, a (workqueue) worker is created +and queued to the vCPU's pending queue. The worker makes the requested +page ready and resident to DRAM in the background. The shared buffer is +updated with reason and sequential token. After that, SDEI event is sent +to guest as page-not-present notification. + +(2) When the SDEI event is received on guest, the current process is tagged +with TIF_ASYNC_PF and associated with a wait queue. The process is ready +to keep rescheduling itself on switching from kernel to user mode. After +that, a reschedule IPI is sent to current CPU and the received SDEI event +is acknowledged. Note that the IPI is delivered when the acknowledgment +on the SDEI event is received on host. + +(3) On the host, the worker is dequeued from the vCPU's pending queue and +enqueued to its completion queue when the requested page becomes ready. +In the mean while, KVM_REQ_ASYNC_PF request is sent the vCPU if the +worker is the first element enqueued to the completion queue. + +(4) With pending KVM_REQ_ASYNC_PF request, the first worker in the completion +queue is dequeued and destroyed. In the mean while, a (PPI) interrupt is +sent to guest with updated reason and token in the shared buffer. + +(5) When the (PPI) interrupt is received on guest, the affected process is +located using the token and waken up after its TIF_ASYNC_PF tag is cleared. +After that, the interrupt is acknowledged through SMCCC interface. The +workers in the completion queue is dequeued and destroyed if any workers +exist, and another (PPI) interrupt is sent to the guest. + +Control Path + + +The configurations are passed through SMCCC or ioctl interface. 
The SDEI +event and (PPI) interrupt are owned by VMM, so the SDEI event and interrupt +numbers are configured through ioctl command on per-vCPU basis. Besides, +the functionality might be enabled and configured through ioctl interface +by VMM during migration: + + * KVM_ARM_ASYNC_PF_CMD_GET_VERSION + + Returns the current version of the feature, supported by the host. It is + made up of major, minor and revision fields. Each field is one byte in + length. + + * KVM_ARM_ASYNC_PF_CMD_GET_SDEI: + + Retrieve the SDEI event number, used for page-not-present notification, + so that it can be configured on destination VM in the scenario of + migration. + + * KVM_ARM_ASYNC_PF_GET_IRQ: + + Retrieve the IRQ (PPI) number, used for page-ready notification, so that + it can be configured on destination VM in the scenario of migration. + + * KVM_ARM_ASYNC_PF_CMD_GET_CONTROL + + Retrieve the address of control block, so that it can
[PATCH v2 15/17] arm64: Reschedule process on aync PF
The page-not-present notification is delivered by SDEI event. The guest reschedules current process to another one when the SDEI event is received. It's not safe to do so in the SDEI event handler because the SDEI event should be acknowledged as soon as possible. So the rescheduling is postponed until the current process switches from kernel to user mode. In order to trigger the switch, the SDEI event handler sends (reschedule) IPI to current CPU and it's delivered in time after the SDEI event is acknowledged. A new thread flag (TIF_ASYNC_PF) is introduced in order to track the state for the process, to be rescheduled. With the flag is set, there is a head of wait-queue is associated with the process. The process keeps rescheduling itself until the flag is cleared when page-ready notification is received through (PPI) interrupt. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/thread_info.h | 4 +++- arch/arm64/kernel/signal.c | 17 + 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..2176c88c77a7 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -154,6 +154,7 @@ struct thread_struct { u64 sctlr_tcf0; u64 gcr_user_excl; #endif + void*data; }; static inline void arch_thread_struct_whitelist(unsigned long *offset, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9f4e3b266f21..939beb3c7723 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -65,6 +65,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT5 /* MTE Asynchronous Tag Check Fault */ #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_ASYNC_PF 7 /* Asynchronous page fault */ #define TIF_SYSCALL_TRACE 8 /* syscall trace 
active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -95,11 +96,12 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) +#define _TIF_ASYNC_PF (1 << TIF_ASYNC_PF) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ -_TIF_NOTIFY_SIGNAL) +_TIF_NOTIFY_SIGNAL | _TIF_ASYNC_PF) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6237486ff6bb..2cd2d13aa905 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -915,6 +915,23 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { + if (thread_flags & _TIF_ASYNC_PF) { + struct swait_queue_head *wq = + READ_ONCE(current->thread.data); + DECLARE_SWAITQUEUE(wait); + + local_daif_restore(DAIF_PROCCTX_NOIRQ); + + do { + prepare_to_swait_exclusive(wq, + , TASK_UNINTERRUPTIBLE); + if (!test_thread_flag(TIF_ASYNC_PF)) + break; + + schedule(); + } while (test_thread_flag(TIF_ASYNC_PF)); + } + if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); -- 2.23.0
[PATCH v2 16/17] arm64: Enable async PF
This enables asynchronous page fault from the guest side. The design is highlighted as below: * The per-vCPU shared memory region, which is represented by "struct kvm_vcpu_pv_apf_data", is allocated. The reason and token associated with the received notifications of asynchronous page fault are delivered through it. * A per-vCPU table, which is represented by "struct kvm_apf_table", is allocated. The process, on which the page-not-present notification is received, is added into the table so that it can reschedule itself on switching from kernel to user mode. Afterwards, the process, identified by token, is removed from the table and put into runnable state when the page-ready notification is received. * During CPU hotplug, the (private) SDEI event is expected to be enabled or disabled on the affected CPU by the SDEI client driver. The (PPI) interrupt is enabled or disabled on the affected CPU by ourselves. When the system is going to reboot, the SDEI event is disabled and unregistered and the (PPI) interrupt is disabled. * The SDEI event and (PPI) interrupt number are retrieved from the host through the SMCCC interface. Besides, the version of the asynchronous page fault is validated when the feature is enabled on the guest. * The feature is disabled on the guest when boot parameter "no-kvmapf" is specified. 
Signed-off-by: Gavin Shan --- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/kvm.c| 452 + 2 files changed, 453 insertions(+) create mode 100644 arch/arm64/kernel/kvm.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 86364ab6f13f..c849ef61f043 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_ACPI)+= acpi.o obj-$(CONFIG_ACPI_NUMA)+= acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-$(CONFIG_KVM_GUEST)+= kvm.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ diff --git a/arch/arm64/kernel/kvm.c b/arch/arm64/kernel/kvm.c new file mode 100644 index ..effe8dc7e921 --- /dev/null +++ b/arch/arm64/kernel/kvm.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous page fault support. + * + * Copyright (C) 2021 Red Hat, Inc. 
+ * + * Author(s): Gavin Shan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct kvm_apf_task { + unsigned inttoken; + struct task_struct *task; + struct swait_queue_head wq; +}; + +struct kvm_apf_table { + raw_spinlock_t lock; + unsigned intcount; + struct kvm_apf_task tasks[0]; +}; + +static bool async_pf_available = true; +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64); +static struct kvm_apf_table __percpu *apf_tables; +static unsigned int apf_tasks; +static unsigned int apf_sdei_num; +static unsigned int apf_ppi_num; +static int apf_irq; + +static bool kvm_async_pf_add_task(struct task_struct *task, + unsigned int token) +{ + struct kvm_apf_table *table = this_cpu_ptr(apf_tables); + unsigned int i, index = apf_tasks; + bool ret = false; + + raw_spin_lock(>lock); + + if (WARN_ON(table->count >= apf_tasks)) + goto unlock; + + for (i = 0; i < apf_tasks; i++) { + if (!table->tasks[i].task) { + if (index == apf_tasks) { + ret = true; + index = i; + } + } else if (table->tasks[i].task == task) { + WARN_ON(table->tasks[i].token != token); + ret = false; + break; + } + } + + if (!ret) + goto unlock; + + task->thread.data = >tasks[index].wq; + set_tsk_thread_flag(task, TIF_ASYNC_PF); + + table->count++; + table->tasks[index].task = task; + table->tasks[index].token = token; + +unlock: + raw_spin_unlock(>lock); + return ret; +} + +static inline void kvm_async_pf_remove_one_task(struct kvm_apf_table *table, + unsigned int index) +{ + clear_tsk_thread_flag(table->tasks[index].task, TIF_ASYNC_PF); + WRITE_ONCE(table->tasks[index].task->thread.data, NULL); + + table->count--; + table->tasks[index].task = NULL; + table-
[PATCH v2 14/17] arm64: Detect async PF para-virtualization feature
This implements kvm_para_available() to check if para-virtualization features are available or not. Besides, kvm_para_has_feature() is enhanced to detect the asynchronous page fault para-virtualization feature. These two functions are going to be used by guest kernel to enable the asynchronous page fault. This also adds kernel option (CONFIG_KVM_GUEST), which is the umbrella for the optimizations related to KVM para-virtualization. Signed-off-by: Gavin Shan --- arch/arm64/Kconfig | 11 +++ arch/arm64/include/asm/kvm_para.h | 12 +++- arch/arm64/include/uapi/asm/kvm_para.h | 2 ++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..792ae09aa690 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1078,6 +1078,17 @@ config PARAVIRT_TIME_ACCOUNTING If in doubt, say N here. +config KVM_GUEST + bool "KVM Guest Support" + depends on PARAVIRT + default y + help + This option enables various optimizations for running under the KVM + hypervisor. Overhead for the kernel when not running inside KVM should + be minimal. 
+ + In case of doubt, say Y + config KEXEC depends on PM_SLEEP_SMP select KEXEC_CORE diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h index 0ea481dd1c7a..8f39c60a6619 100644 --- a/arch/arm64/include/asm/kvm_para.h +++ b/arch/arm64/include/asm/kvm_para.h @@ -3,6 +3,8 @@ #define _ASM_ARM_KVM_PARA_H #include +#include +#include static inline bool kvm_check_and_clear_guest_paused(void) { @@ -11,7 +13,12 @@ static inline bool kvm_check_and_clear_guest_paused(void) static inline unsigned int kvm_arch_para_features(void) { - return 0; + unsigned int features = 0; + + if (kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_ASYNC_PF)) + features |= (1 << KVM_FEATURE_ASYNC_PF); + + return features; } static inline unsigned int kvm_arch_para_hints(void) @@ -21,6 +28,9 @@ static inline unsigned int kvm_arch_para_hints(void) static inline bool kvm_para_available(void) { + if (IS_ENABLED(CONFIG_KVM_GUEST)) + return true; + return false; } diff --git a/arch/arm64/include/uapi/asm/kvm_para.h b/arch/arm64/include/uapi/asm/kvm_para.h index 162325e2638f..70bbc7d1ec75 100644 --- a/arch/arm64/include/uapi/asm/kvm_para.h +++ b/arch/arm64/include/uapi/asm/kvm_para.h @@ -4,6 +4,8 @@ #include +#define KVM_FEATURE_ASYNC_PF 0 + /* Async PF */ #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) -- 2.23.0
[PATCH v2 11/17] KVM: arm64: Support async PF hypercalls
This introduces (SMCCC) KVM vendor specific services to configure the asynchronous page fault functionality. The following services are introduced: * ARM_SMCCC_KVM_FUNC_ASYNC_PF_VERSION Returns the version, which can be used to identify ABI changes in the future. * ARM_SMCCC_KVM_FUNC_ASYNC_PF_SLOTS Return maximal number of tokens that current vCPU can have. It's used by guest to allocate the required resources. * ARM_SMCCC_KVM_FUNC_ASYNC_PF_{SDEI, IRQ} Return the associated SDEI or (PPI) IRQ number, configured by vCPU ioctl command. * ARM_SMCCC_KVM_FUNC_ASYNC_PF_ENABLE Enable or disable asynchronous page fault on current vCPU. The corresponding SDEI event and (PPI) IRQ are owned by VMM. So they are configured by vCPU ioctl interface and it will be implemented when the asynchronous page fault capability is exported in the subsequent patches. Signed-off-by: Gavin Shan --- arch/arm64/kvm/async_pf.c | 119 ++ include/linux/arm-smccc.h | 5 ++ 2 files changed, 124 insertions(+) diff --git a/arch/arm64/kvm/async_pf.c b/arch/arm64/kvm/async_pf.c index f73c406456e9..4734c5b26aa8 100644 --- a/arch/arm64/kvm/async_pf.c +++ b/arch/arm64/kvm/async_pf.c @@ -313,12 +313,115 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, write_cache(vcpu, offsetof(struct kvm_vcpu_pv_apf_data, token), 0); } +static void kvm_arch_async_sdei_notifier(struct kvm_vcpu *vcpu, +unsigned long num, +unsigned int state) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch_async_pf_control *apf = vcpu->arch.apf; + + if (!apf) + return; + + if (num != apf->sdei_event_num) { + kvm_err("%s: Invalid event number (%d-%d %lx-%llx)\n", + __func__, kvm->userspace_pid, vcpu->vcpu_idx, + num, apf->sdei_event_num); + return; + } + + switch (state) { + case KVM_SDEI_NOTIFY_DELIVERED: + if (!apf->notpresent_pending) + break; + + apf->notpresent_token = 0; + apf->notpresent_pending = false; + break; + case KVM_SDEI_NOTIFY_COMPLETED: + break; + default: + kvm_err("%s: Invalid state (%d-%d %lx-%d)\n", + __func__, 
kvm->userspace_pid, vcpu->vcpu_idx, + num, state); + } +} + +static long kvm_arch_async_enable(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch_async_pf_control *apf = vcpu->arch.apf; + gpa_t gpa = (data & ~0x3FUL); + bool enabled, enable; + int ret; + + if (!apf || !irqchip_in_kernel(kvm)) + return SMCCC_RET_NOT_SUPPORTED; + + /* Bail if the state transition isn't allowed */ + enabled = !!(apf->control_block & KVM_ASYNC_PF_ENABLED); + enable = !!(data & KVM_ASYNC_PF_ENABLED); + if (enable == enabled) { + kvm_debug("%s: Async PF has been %s on (%d-%d %llx-%llx)\n", + __func__, enabled ? "enabled" : "disabled", + kvm->userspace_pid, vcpu->vcpu_idx, + apf->control_block, data); + return SMCCC_RET_NOT_REQUIRED; + } + + /* To disable the functinality */ + if (!enable) { + kvm_clear_async_pf_completion_queue(vcpu); + apf->control_block = data; + return SMCCC_RET_SUCCESS; + } + + /* +* The SDEI event and IRQ number should have been given +* prior to enablement. +*/ + if (!apf->sdei_event_num || !apf->irq) { + kvm_err("%s: Invalid SDEI event or IRQ (%d-%d %llx-%d)\n", + __func__, kvm->userspace_pid, vcpu->vcpu_idx, + apf->sdei_event_num, apf->irq); + return SMCCC_RET_INVALID_PARAMETER; + } + + /* Register SDEI event notifier */ + ret = kvm_sdei_register_notifier(kvm, apf->sdei_event_num, +kvm_arch_async_sdei_notifier); + if (ret) { + kvm_err("%s: Error %d registering SDEI notifier (%d-%d %llx)\n", + __func__, ret, kvm->userspace_pid, vcpu->vcpu_idx, + apf->sdei_event_num); + return SMCCC_RET_NOT_SUPPORTED; + } + + /* Initialize cache shared by host and guest */ + ret = kvm_gfn_to_hva_cache_init(kvm, >cache, gpa, + offsetofend(struct kvm_vcpu_pv_apf_data, token)); + if (ret) { + kvm_err("%s: Error %d initializing cache (%d-%d
[PATCH v2 13/17] KVM: arm64: Export async PF capability
This exports the asynchronous page fault capability: * Identify capability KVM_CAP_ASYNC_{PF, PF_INT}. * Standardize SDEI event for asynchronous page fault. * Enable kernel config CONFIG_KVM_ASYNC_{PF, PF_SLOT}. Signed-off-by: Gavin Shan --- arch/arm64/include/uapi/asm/kvm_sdei.h | 1 + arch/arm64/kvm/Kconfig | 2 ++ arch/arm64/kvm/arm.c | 4 arch/arm64/kvm/sdei.c | 5 + 4 files changed, 12 insertions(+) diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h b/arch/arm64/include/uapi/asm/kvm_sdei.h index 232092de5e21..47d578abba1a 100644 --- a/arch/arm64/include/uapi/asm/kvm_sdei.h +++ b/arch/arm64/include/uapi/asm/kvm_sdei.h @@ -13,6 +13,7 @@ #define KVM_SDEI_MAX_VCPUS 512 #define KVM_SDEI_INVALID_NUM 0 #define KVM_SDEI_DEFAULT_NUM 0x4040 +#define KVM_SDEI_ASYNC_PF_NUM 0x4041 struct kvm_sdei_event_state { uint64_tnum; diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 3964acf5451e..dfb3ed0de2ca 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -31,6 +31,8 @@ menuconfig KVM select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD + select KVM_ASYNC_PF + select KVM_ASYNC_PF_SLOT select HAVE_KVM_IRQFD select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index be0e6c2db2a5..0940de3ebcff 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -269,6 +269,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SDEI: r = 1; break; + case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: + r = IS_ENABLED(CONFIG_KVM_ASYNC_PF) ? 
1 : 0; + break; default: r = 0; } diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 4f5a582daa97..437303bfafba 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -19,6 +19,11 @@ static struct kvm_sdei_event_state defined_kse[] = { 1, SDEI_EVENT_PRIORITY_CRITICAL }, + { KVM_SDEI_ASYNC_PF_NUM, + SDEI_EVENT_TYPE_PRIVATE, + 1, + SDEI_EVENT_PRIORITY_CRITICAL + }, }; static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm *kvm, -- 2.23.0
[PATCH v2 10/17] KVM: arm64: Support page-ready notification
The asynchronous page fault starts with a worker when the requested page isn't present. The worker makes the requested page present in the background and the worker, together with the associated information, is queued to the completion queue after that. The worker and the completion queue are checked as below. * A request (KVM_REQ_ASYNC_PF) is raised if the worker is the first one enqueued to the completion queue. With the request, the completion queue is checked and the worker is dequeued. A PPI is sent to guest as the page-ready notification and the guest should acknowledge the interrupt by SMCCC interface. * When the notification (PPI) is acknowledged by guest, the completion queue is checked again and next worker is dequeued if we have one. For this particular worker, another notification (PPI) is sent to the guest without raising the request. Once the notification (PPI) is acknowledged by the guest, the completion queue is checked to process next worker, which has been queued to it. Similar to page-not-present notification, the shared memory region is used to convey the reason and token associated with the page-ready notification. The region is represented by "struct kvm_vcpu_pv_apf_data". The feature isn't enabled by CONFIG_KVM_ASYNC_PF yet. Also, the control path isn't implemented and will be done in the subsequent patches. 
Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 17 ++ arch/arm64/include/uapi/asm/kvm_para.h | 1 + arch/arm64/kvm/arm.c | 24 ++- arch/arm64/kvm/async_pf.c | 207 + arch/arm64/kvm/hypercalls.c| 6 + include/linux/arm-smccc.h | 10 ++ 6 files changed, 262 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 49cccefb22cf..6349920fd9ce 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -48,6 +48,7 @@ #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_SDEI KVM_ARCH_REQ(5) +#define KVM_REQ_ASYNC_PF KVM_ARCH_REQ(6) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -292,10 +293,12 @@ struct kvm_arch_async_pf_control { u64 control_block; boolsend_user_only; u64 sdei_event_num; + u32 irq; u16 id; boolnotpresent_pending; u32 notpresent_token; + boolpageready_pending; }; struct kvm_vcpu_arch { @@ -767,6 +770,14 @@ bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, u32 esr, gpa_t gpa, gfn_t gfn); bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work); +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, +struct kvm_async_pf *work); +long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu, +long *r1, long *r2, long *r3); void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu); #else static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { } @@ -782,6 +793,12 @@ static inline bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, { return false; } + +static inline long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu, + long *r1, long *r2, long *r3) +{ + return 
SMCCC_RET_NOT_SUPPORTED; +} #endif /* Guest/host FPSIMD coordination helpers */ diff --git a/arch/arm64/include/uapi/asm/kvm_para.h b/arch/arm64/include/uapi/asm/kvm_para.h index 3fa04006714e..162325e2638f 100644 --- a/arch/arm64/include/uapi/asm/kvm_para.h +++ b/arch/arm64/include/uapi/asm/kvm_para.h @@ -9,6 +9,7 @@ #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) #define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { __u32 reason; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c98fbb4e914b..e34fca3fa0ff 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -484,9 +484,23 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { + struct kvm_arch_async_pf_control *apf = v->arch.apf; bool irq_lines = *vcpu_hcr(v)
[PATCH v2 12/17] KVM: arm64: Support async PF ioctl commands
This supports ioctl commands for configuration and migration: KVM_ARM_ASYNC_PF_CMD_GET_VERSION Return implementation version KVM_ARM_ASYNC_PF_CMD_GET_SDEI Return SDEI event number used for page-not-present notification KVM_ARM_ASYNC_PF_CMD_GET_IRQ Return IRQ number used for page-ready notification KVM_ARM_ASYNC_PF_CMD_GET_CONTROL Get control block when VM is migrated KVM_ARM_ASYNC_PF_CMD_SET_SDEI Set SDEI event number when VM is started or migrated KVM_ARM_ASYNC_PF_CMD_SET_IRQ Set IRQ number during when VM is started or migrated KVM_ARM_ASYNC_PF_CMD_SET_CONTROL Set control block when VM is migrated Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 14 +++ arch/arm64/include/uapi/asm/kvm.h | 19 + arch/arm64/kvm/arm.c | 6 +++ arch/arm64/kvm/async_pf.c | 64 +++ include/uapi/linux/kvm.h | 3 ++ 5 files changed, 106 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6349920fd9ce..14b3d1505b15 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -778,6 +778,8 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu, long *r1, long *r2, long *r3); +long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm, unsigned long arg); +long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg); void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu); #else static inline void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu) { } @@ -799,6 +801,18 @@ static inline long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu, { return SMCCC_RET_NOT_SUPPORTED; } + +static inline long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm, + unsigned long arg) +{ + return -EPERM; +} + +static inline long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu, + unsigned long arg) +{ + return -EPERM; +} #endif /* Guest/host FPSIMD coordination helpers */ diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h index 15499751997d..a6124068bee6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -403,6 +403,25 @@ struct kvm_vcpu_events { #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS #define KVM_PSCI_RET_DENIEDPSCI_RET_DENIED +/* Asynchronous page fault */ +#define KVM_ARM_ASYNC_PF_CMD_GET_VERSION 0 +#define KVM_ARM_ASYNC_PF_CMD_GET_SDEI 1 +#define KVM_ARM_ASYNC_PF_CMD_GET_IRQ 2 +#define KVM_ARM_ASYNC_PF_CMD_GET_CONTROL 3 +#define KVM_ARM_ASYNC_PF_CMD_SET_SDEI 4 +#define KVM_ARM_ASYNC_PF_CMD_SET_IRQ 5 +#define KVM_ARM_ASYNC_PF_CMD_SET_CONTROL 6 + +struct kvm_arm_async_pf_cmd { + __u32 cmd; + union { + __u32 version; + __u64 sdei; + __u32 irq; + __u64 control; + }; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e34fca3fa0ff..be0e6c2db2a5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1287,6 +1287,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_ARM_SDEI_COMMAND: { return kvm_sdei_vcpu_ioctl(vcpu, arg); } + case KVM_ARM_ASYNC_PF_COMMAND: { + return kvm_arch_async_pf_vcpu_ioctl(vcpu, arg); + } default: r = -EINVAL; } @@ -1364,6 +1367,9 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_ARM_SDEI_COMMAND: { return kvm_sdei_vm_ioctl(kvm, arg); } + case KVM_ARM_ASYNC_PF_COMMAND: { + return kvm_arch_async_pf_vm_ioctl(kvm, arg); + } default: return -EINVAL; } diff --git a/arch/arm64/kvm/async_pf.c b/arch/arm64/kvm/async_pf.c index 4734c5b26aa8..6f763edbe3a3 100644 --- a/arch/arm64/kvm/async_pf.c +++ b/arch/arm64/kvm/async_pf.c @@ -464,6 +464,70 @@ long kvm_arch_async_pf_hypercall(struct kvm_vcpu *vcpu, return ret; } +long kvm_arch_async_pf_vm_ioctl(struct kvm *kvm, unsigned long arg) +{ + struct kvm_arm_async_pf_cmd cmd; + unsigned int version = 0x01; /* v1.0.0 */ + void __user *argp = (void __user *)arg; + + if (copy_from_user(, argp, sizeof(cmd))) + return -EFAULT; + + if (cmd.cmd != KVM_ARM_ASYNC_PF_CMD_GET_VERSION) + 
return -EINVAL; + + cmd.version = version; + if (copy_to_user(argp, , sizeof(cmd))) + return -EFAULT; + + return 0; +} + +long kvm_arch_async_pf_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg
[PATCH v2 08/17] KVM: arm64: Add paravirtualization header files
We need put more stuff in the paravirtualization header files when the asynchronous page fault is supported. The generic header files can't meet the goal. This duplicate the generic header files to be our platform specific header files. It's the preparatory work to support the asynchronous page fault in subsequent patches: include/uapi/asm-generic/kvm_para.h include/asm-generic/kvm_para.h arch/arm64/include/uapi/asm/kvm_para.h arch/arm64/include/asm/kvm_para.h Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_para.h | 27 ++ arch/arm64/include/uapi/asm/Kbuild | 2 -- arch/arm64/include/uapi/asm/kvm_para.h | 5 + 3 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/kvm_para.h create mode 100644 arch/arm64/include/uapi/asm/kvm_para.h diff --git a/arch/arm64/include/asm/kvm_para.h b/arch/arm64/include/asm/kvm_para.h new file mode 100644 index ..0ea481dd1c7a --- /dev/null +++ b/arch/arm64/include/asm/kvm_para.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ARM_KVM_PARA_H +#define _ASM_ARM_KVM_PARA_H + +#include + +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + +static inline unsigned int kvm_arch_para_features(void) +{ + return 0; +} + +static inline unsigned int kvm_arch_para_hints(void) +{ + return 0; +} + +static inline bool kvm_para_available(void) +{ + return false; +} + +#endif /* _ASM_ARM_KVM_PARA_H */ diff --git a/arch/arm64/include/uapi/asm/Kbuild b/arch/arm64/include/uapi/asm/Kbuild index 602d137932dc..f66554cd5c45 100644 --- a/arch/arm64/include/uapi/asm/Kbuild +++ b/arch/arm64/include/uapi/asm/Kbuild @@ -1,3 +1 @@ # SPDX-License-Identifier: GPL-2.0 - -generic-y += kvm_para.h diff --git a/arch/arm64/include/uapi/asm/kvm_para.h b/arch/arm64/include/uapi/asm/kvm_para.h new file mode 100644 index ..cd212282b90c --- /dev/null +++ b/arch/arm64/include/uapi/asm/kvm_para.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef 
_UAPI_ASM_ARM_KVM_PARA_H +#define _UAPI_ASM_ARM_KVM_PARA_H + +#endif /* _UAPI_ASM_ARM_KVM_PARA_H */ -- 2.23.0
[PATCH v2 09/17] KVM: arm64: Support page-not-present notification
The requested page might not be resident in memory during the stage-2 page fault. For example, the requested page could be resident in a swap device (file). In this case, disk I/O is issued in order to fetch the requested page and it could take tens of milliseconds, even hundreds of milliseconds in extreme situations. During the period, the guest's vCPU is suspended until the requested page becomes ready. Actually, something else on the guest's vCPU could be rescheduled during the period, so that the time slice isn't wasted from the guest vCPU's point of view. This is the primary goal of the feature (Asynchronous Page Fault). This supports delivery of the page-not-present notification through an SDEI event when the requested page isn't present. When the notification is received on the guest's vCPU, something else (another process) can be scheduled. The design is highlighted as below: * There is a dedicated memory region shared by host and guest. It's represented by "struct kvm_vcpu_pv_apf_data". The field @reason indicates the reason why the SDEI event is triggered, while the unique @token is used by the guest to associate the event with the suspended process. * One control block is associated with each guest's vCPU and it's represented by "struct kvm_arch_async_pf_control". It allows the guest to configure the functionality to indicate the situations where the host can deliver the page-not-present notification to kick off asynchronous page fault. Besides, runtime states are also maintained in this struct. * Before the page-not-present notification is sent to the guest's vCPU, a worker is started and executed asynchronously on the host, to fetch the requested page. "struct kvm{_,_arch}async_pf" is associated with the worker, to track the work. The feature isn't enabled by CONFIG_KVM_ASYNC_PF yet. Also, the page-ready notification delivery and control path isn't implemented and will be done in the subsequent patches. 
Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 50 + arch/arm64/include/uapi/asm/kvm_para.h | 15 +++ arch/arm64/kvm/Makefile| 1 + arch/arm64/kvm/arm.c | 3 + arch/arm64/kvm/async_pf.c | 145 + arch/arm64/kvm/mmu.c | 32 +- 6 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/async_pf.c diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 00b30b7554e5..49cccefb22cf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -273,6 +273,31 @@ struct vcpu_reset_state { boolreset; }; +/* Should be a power of two number */ +#define ASYNC_PF_PER_VCPU 64 + +/* + * The association of gfn and token. The token will be sent to guest as + * page fault address. Also, the guest could be in aarch32 mode. So its + * length should be 32-bits. + */ +struct kvm_arch_async_pf { + u32 token; + gfn_t gfn; + u32 esr; +}; + +struct kvm_arch_async_pf_control { + struct gfn_to_hva_cache cache; + u64 control_block; + boolsend_user_only; + u64 sdei_event_num; + + u16 id; + boolnotpresent_pending; + u32 notpresent_token; +}; + struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; void *sve_state; @@ -375,6 +400,7 @@ struct kvm_vcpu_arch { } steal; struct kvm_sdei_vcpu *sdei; + struct kvm_arch_async_pf_control *apf; }; /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -734,6 +760,30 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +#ifdef CONFIG_KVM_ASYNC_PF +void kvm_arch_async_pf_create_vcpu(struct kvm_vcpu *vcpu); +bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu); +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, +u32 esr, gpa_t gpa, gfn_t gfn); +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +struct kvm_async_pf *work); +void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu); +#else +static inline void kvm_arch_async_pf_create_vcpu(struct 
kvm_vcpu *vcpu) { } +static inline void kvm_arch_async_pf_destroy_vcpu(struct kvm_vcpu *vcpu) { } + +static inline bool kvm_arch_async_not_present_allowed(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + u32 esr, gpa_t gpa, gfn_t gfn) +{ + return false; +} +#endif + /* Guest/host FPSIM
[PATCH v2 06/17] KVM: arm64: Advertise KVM UID to guests via SMCCC
From: Will Deacon We can advertise ourselves to guests as KVM and provide a basic features bitmap for discoverability of future hypervisor services. Signed-off-by: Will Deacon Signed-off-by: Gavin Shan --- arch/arm64/kvm/hypercalls.c | 27 ++- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index a54c4805f2a6..e02e29a12bbf 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -12,13 +12,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; + long val[4] = { SMCCC_RET_NOT_SUPPORTED }; u32 feature; gpa_t gpa; switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; + val[0] = ARM_SMCCC_VERSION_1_1; break; case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: feature = smccc_get_arg1(vcpu); @@ -28,10 +28,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; @@ -54,22 +54,31 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; fallthrough; case SPECTRE_UNAFFECTED: - val = SMCCC_RET_NOT_REQUIRED; + val[0] = SMCCC_RET_NOT_REQUIRED; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = kvm_hypercall_pv_features(vcpu); + val[0] = kvm_hypercall_pv_features(vcpu); break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); if (gpa != GPA_INVALID) - val = gpa; + val[0] = gpa; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + 
break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); break; case SDEI_1_0_FN_SDEI_VERSION: case SDEI_1_0_FN_SDEI_EVENT_REGISTER: @@ -93,6 +102,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) return kvm_psci_call(vcpu); } - smccc_set_retval(vcpu, val, 0, 0, 0); + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } -- 2.23.0
[PATCH v2 07/17] KVM: arm64: Export kvm_handle_user_mem_abort()
The main work is handled by user_mem_abort(). After asynchronous page fault is supported, one page fault needs to be handled with two calls to this function. It means the page fault needs to be replayed asynchronously in that case. This renames the function to kvm_handle_user_mem_abort() and exports it. Besides, there are more changes introduced in order to accommodate asynchronous page fault: * Add arguments @esr and @prefault to user_mem_abort(). @esr is the cached value of ESR_EL2 instead of fetching from the current vCPU when the page fault is replayed in the scenario of asynchronous page fault. @prefault is used to indicate whether the page fault is a replayed one or not. * Define helper functions esr_dabt_*() in asm/esr.h to extract or check various fields of the passed ESR_EL2 value because those helper functions defined in asm/kvm_emulate.h assume the ESR_EL2 value has been cached in the vCPU struct. It won't be true on handling the replayed page fault in the scenario of asynchronous page fault. * Some helper functions defined in asm/kvm_emulate.h are used by mmu.c only and seem not to be used by other source files in the near future. They are moved to mmu.c and renamed accordingly. 
kvm_vcpu_trap_is_exec_fault() is_exec_fault() kvm_is_write_fault() is_write_fault() kvm_vcpu_trap_get_fault_level() Replaced by esr_dabt_get_fault_level() Signed-off-by: Gavin Shan --- arch/arm64/include/asm/esr.h | 6 arch/arm64/include/asm/kvm_emulate.h | 27 ++--- arch/arm64/include/asm/kvm_host.h| 4 +++ arch/arm64/kvm/mmu.c | 43 ++-- 4 files changed, 48 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 29f97eb3dad4..db46eb58c633 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -321,8 +321,14 @@ ESR_ELx_CP15_32_ISS_DIR_READ) #ifndef __ASSEMBLY__ +#include #include +#define esr_dabt_get_fault_type(esr) (esr & ESR_ELx_FSC_TYPE) +#define esr_dabt_get_fault_level(esr) (FIELD_GET(ESR_ELx_FSC_LEVEL, esr)) +#define esr_dabt_is_wnr(esr) (!!(FIELD_GET(ESR_ELx_WNR, esr))) +#define esr_dabt_is_s1ptw(esr) (!!(FIELD_GET(ESR_ELx_S1PTW, esr))) + static inline bool esr_is_data_abort(u32 esr) { const u32 ec = ESR_ELx_EC(esr); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0ef213b715a5..119b953828a2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -282,13 +282,13 @@ static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu) static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu) { - return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW); + return esr_dabt_is_s1ptw(kvm_vcpu_get_esr(vcpu)); } /* Always check for S1PTW *before* using this. 
*/ static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR; + return esr_dabt_is_wnr(kvm_vcpu_get_esr(vcpu)); } static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu) @@ -317,11 +317,6 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu) return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW; } -static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); -} - static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) { return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; @@ -329,12 +324,7 @@ static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE; -} - -static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; + return esr_dabt_get_fault_type(kvm_vcpu_get_esr(vcpu)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) @@ -362,17 +352,6 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) return ESR_ELx_SYS64_ISS_RT(esr); } -static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_abt_iss1tw(vcpu)) - return true; - - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 01eda5c8
[PATCH v2 04/17] KVM: x86: Use generic async PF slot management
This uses the generic slot management mechanism for asynchronous page fault by enabling CONFIG_KVM_ASYNC_PF_SLOT because the private implementation is totally duplicate to the generic one. The changes introduced by this is pretty mechanical and shouldn't cause any logical changes. Signed-off-by: Gavin Shan --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/Kconfig| 1 + arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 86 +++-- 4 files changed, 8 insertions(+), 82 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3d6616f6f6ef..3488eeb79c79 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1714,7 +1714,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); -extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7ac592664c52..b0ad75087ab5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -34,6 +34,7 @@ config KVM select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_ASYNC_PF + select KVM_ASYNC_PF_SLOT select USER_RETURN_NOTIFIER select KVM_MMIO select TASKSTATS diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6d16481aa29d..ca2e84d6743c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3678,7 +3678,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { + if (kvm_async_pf_find_slot(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return true; diff --git 
a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f3c9fe5c424e..b04d78a87abe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -290,13 +290,6 @@ static struct kmem_cache *kvm_alloc_emulator_cache(void) static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) -{ - int i; - for (i = 0; i < ASYNC_PF_PER_VCPU; i++) - vcpu->arch.apf.gfns[i] = ~0; -} - static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -812,7 +805,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); + kvm_async_pf_reset_slot(vcpu); } if ((cr0 ^ old_cr0) & update_bits) @@ -2905,7 +2898,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); + kvm_async_pf_reset_slot(vcpu); return 0; } @@ -9996,7 +9989,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); + kvm_async_pf_reset_slot(vcpu); kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -10117,7 +10110,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvmclock_reset(vcpu); kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); + kvm_async_pf_reset_slot(vcpu); vcpu->arch.apf.halted = false; if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { @@ -10932,73 +10925,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } -static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -{ - BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); - - return hash_32(gfn & 0x, order_base_2(ASYNC_PF_PER_VCPU)); -} - -static inline u32 kvm_async_pf_next_probe(u32 key) -{ - return (key + 1) & 
(ASYNC_PF_PER_VCPU - 1); -} - -static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 key = kvm_async_pf_hash_fn(gfn); - - while (vcpu->arch.apf.gfns[key] != ~0) - key = kvm_async_pf_next_probe(key); - - vcpu->arch.apf.gfns[key] = gfn; -} - -static u32 kvm_async_pf_gfn_slot(struct k
[PATCH v2 05/17] arm64: Probe for the presence of KVM hypervisor services during boot
From: Will Deacon Although the SMCCC specification provides some limited functionality for describing the presence of hypervisor and firmware services, this is generally applicable only to functions designated as "Arm Architecture Service Functions" and no portable discovery mechanism is provided for standard hypervisor services, despite having a designated range of function identifiers reserved by the specification. In an attempt to avoid the need for additional firmware changes every time a new function is added, introduce a UID to identify the service provider as being compatible with KVM. Once this has been established, additional services can be discovered via a feature bitmap. Signed-off-by: Will Deacon Signed-off-by: Gavin Shan --- arch/arm64/include/asm/hypervisor.h | 11 ++ arch/arm64/kernel/setup.c | 32 + include/linux/arm-smccc.h | 25 ++ 3 files changed, 68 insertions(+) diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index f9cc1d021791..91e4bd890819 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -2,6 +2,17 @@ #ifndef _ASM_ARM64_HYPERVISOR_H #define _ASM_ARM64_HYPERVISOR_H +#include #include +static inline bool kvm_arm_hyp_service_available(u32 func_id) +{ + extern DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS); + + if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS) + return -EINVAL; + + return test_bit(func_id, __kvm_arm_hyp_services); +} + #endif diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c18aacde8bb0..8cbb99d80869 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -275,12 +276,42 @@ static int __init reserve_memblock_reserved_regions(void) arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... 
NR_CPUS-1] = INVALID_HWID }; +DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) = { }; u64 cpu_logical_map(unsigned int cpu) { return __cpu_logical_map[cpu]; } +static void __init kvm_init_hyp_services(void) +{ + struct arm_smccc_res res; + int i; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, ); + if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 || + res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 || + res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 || + res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3) + return; + + memset(, 0, sizeof(res)); + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, ); + for (i = 0; i < 32; ++i) { + if (res.a0 & (i)) + set_bit(i + (32 * 0), __kvm_arm_hyp_services); + if (res.a1 & (i)) + set_bit(i + (32 * 1), __kvm_arm_hyp_services); + if (res.a2 & (i)) + set_bit(i + (32 * 2), __kvm_arm_hyp_services); + if (res.a3 & (i)) + set_bit(i + (32 * 3), __kvm_arm_hyp_services); + } + + pr_info("KVM hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", + res.a3, res.a2, res.a1, res.a0); +} + void __init __no_sanitize_address setup_arch(char **cmdline_p) { init_mm.start_code = (unsigned long) _stext; @@ -353,6 +384,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + kvm_init_hyp_services(); init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index f860645f6512..7eb816241697 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -55,6 +55,8 @@ #define ARM_SMCCC_OWNER_TRUSTED_OS 50 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63 +#define ARM_SMCCC_FUNC_QUERY_CALL_UID 0xff01 + #define ARM_SMCCC_QUIRK_NONE 0 #define ARM_SMCCC_QUIRK_QCOM_A61 /* Save/restore register a6 */ @@ -102,6 +104,29 @@ ARM_SMCCC_OWNER_STANDARD_HYP,\ 0x21) +#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32,\ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + 
ARM_SMCCC_FUNC_QUERY_CALL_UID) + +/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U +#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU + +/*
[PATCH v2 01/17] KVM: async_pf: Move struct kvm_async_pf around
This moves the definitions of "struct kvm_async_pf" and the related functions after "struct kvm_vcpu" so that newly added inline function can dereference "struct kvm_vcpu" properly. Otherwise, the unexpected build error will be raised: error: dereferencing pointer to incomplete type ‘struct kvm_vcpu’ return !list_empty_careful(>async_pf.done); ^~ The sepator between type and field is replaced by tab for "struct kvm_async_pf" since we're here. This is preparatory work for adding new inline function for next patch. This shouldn't cause logical changes. Signed-off-by: Gavin Shan --- include/linux/kvm_host.h | 43 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f3b1013fb22c..b6697ee1182e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -196,27 +196,6 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); -#ifdef CONFIG_KVM_ASYNC_PF -struct kvm_async_pf { - struct work_struct work; - struct list_head link; - struct list_head queue; - struct kvm_vcpu *vcpu; - struct mm_struct *mm; - gpa_t cr2_or_gpa; - unsigned long addr; - struct kvm_arch_async_pf arch; - bool wakeup_all; - bool notpresent_injected; -}; - -void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); -void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - unsigned long hva, struct kvm_arch_async_pf *arch); -int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); -#endif - enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, @@ -323,6 +302,28 @@ struct kvm_vcpu { struct kvm_dirty_ring dirty_ring; }; +#ifdef CONFIG_KVM_ASYNC_PF +struct kvm_async_pf { + struct work_struct work; + struct list_headlink; + struct list_headqueue; + struct kvm_vcpu *vcpu; + struct mm_struct*mm; + gpa_t cr2_or_gpa; + unsigned long addr; + struct kvm_arch_async_pfarch; 
+ boolwakeup_all; + boolnotpresent_injected; +}; + +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch); +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); +#endif + + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* -- 2.23.0
[PATCH v2 03/17] KVM: async_pf: Make GFN slot management generic
It's not allowed to fire duplicate notification for same GFN on x86 platform, with help of a hash table. This mechanism is going to be used by arm64 and this makes the code generic and shareable by multiple platforms. * As this mechanism isn't needed by all platforms, a new kernel config option (CONFIG_ASYNC_PF_SLOT) is introduced so that it can be disabled at compiling time. * The code is basically copied from x86 platform and the functions are renamed to reflect the fact: (a) the input parameters are vCPU and GFN. (b) The operations are resetting, searching, adding and removing. * Helper stub is also added on !CONFIG_KVM_ASYNC_PF because we're going to use IS_ENABLED() instead of #ifdef on arm64 when the asynchronous page fault is supported. This is preparatory work to use the newly introduced functions on x86 platform and arm64 in subsequent patches. Signed-off-by: Gavin Shan --- include/linux/kvm_host.h | 18 + virt/kvm/Kconfig | 3 ++ virt/kvm/async_pf.c | 79 3 files changed, 100 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 041d93f8f4b0..b52d71030f25 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -277,6 +277,9 @@ struct kvm_vcpu { #ifdef CONFIG_KVM_ASYNC_PF struct { +#ifdef CONFIG_KVM_ASYNC_PF_SLOT + gfn_t gfns[ASYNC_PF_PER_VCPU]; +#endif u32 queued; struct list_head queue; struct list_head done; @@ -321,12 +324,27 @@ static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu) return !list_empty_careful(>async_pf.done); } +#ifdef CONFIG_KVM_ASYNC_PF_SLOT +void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu); +void kvm_async_pf_add_slot(struct kvm_vcpu *vcpu, gfn_t gfn); +void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn); +bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn); +#endif + void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #else +static inline void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu) { } +static inline void kvm_async_pf_add_slot(struct kvm_vcpu *vcpu, gfn_t gfn) { } +static inline void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn) { } +static inline bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return false; +} + static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu) { return false; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 1c37ccd5d402..69a282aaa4df 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -23,6 +23,9 @@ config KVM_MMIO config KVM_ASYNC_PF bool +config KVM_ASYNC_PF_SLOT + bool + # Toggle to switch between direct notification and batch job config KVM_ASYNC_PF_SYNC bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 2cf864aafd0e..7bf22b20af45 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -19,6 +19,85 @@ static struct kmem_cache *async_pf_cache; +#ifdef CONFIG_KVM_ASYNC_PF_SLOT +static inline u32 kvm_async_pf_hash(gfn_t gfn) +{ + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + + return hash_32(gfn & 0x, order_base_2(ASYNC_PF_PER_VCPU)); +} + +static inline u32 kvm_async_pf_next_slot(u32 key) +{ + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); +} + +static u32 kvm_async_pf_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int i; + u32 key = kvm_async_pf_hash(gfn); + + for (i = 0; i < ASYNC_PF_PER_VCPU && + (vcpu->async_pf.gfns[key] != gfn && + vcpu->async_pf.gfns[key] != ~0); i++) + key = kvm_async_pf_next_slot(key); + + return key; +} + +void kvm_async_pf_reset_slot(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) + vcpu->async_pf.gfns[i] = ~0; +} + +bool kvm_async_pf_find_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return vcpu->async_pf.gfns[kvm_async_pf_slot(vcpu, gfn)] == gfn; +} + +void kvm_async_pf_add_slot(struct 
kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 key = kvm_async_pf_hash(gfn); + + while (vcpu->async_pf.gfns[key] != ~0) + key = kvm_async_pf_next_slot(key); + + vcpu->async_pf.gfns[key] = gfn; +} + +void kvm_async_pf_remove_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 i, j, k; + + i = j = kvm_async_pf_slot(vcpu, gfn); + + if (WARN_ON_ONCE(vcpu->async_pf.gfns[i] != gfn)) + return; + + while (true) { + vcpu->async_pf.gfns[i] = ~0; +
[PATCH v2 02/17] KVM: async_pf: Add helper function to check completion queue
This adds inline function kvm_check_async_pf_completion_queue() and stub on !CONFIG_KVM_ASYNC_PF so that the source code won't have to care about CONFIG_KVM_ASYNC_PF. The kernel option is used for once in kvm_main.c and it can be removed then. Besides, the checks on the completion queue are all replaced by the newly introduced helper as list_empty() and list_empty_careful() are interchangeable. The stub kvm_check_async_pf_completion() on !CONFIG_KVM_ASYNC_PF is also introduced. It will be used by subsequent patch. Signed-off-by: Gavin Shan --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 12 virt/kvm/async_pf.c | 12 ++-- virt/kvm/kvm_main.c | 4 +--- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76bce832cade..f3c9fe5c424e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10794,7 +10794,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { - if (!list_empty_careful(>async_pf.done)) + if (kvm_check_async_pf_completion_queue(vcpu)) return true; if (kvm_apic_has_events(vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b6697ee1182e..041d93f8f4b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -316,11 +316,23 @@ struct kvm_async_pf { boolnotpresent_injected; }; +static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + return !list_empty_careful(>async_pf.done); +} + void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); +#else +static inline bool kvm_check_async_pf_completion_queue(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline void kvm_check_async_pf_completion(struct kvm_vcpu 
*vcpu) { } #endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index dd777688d14a..2cf864aafd0e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -70,7 +70,7 @@ static void async_pf_execute(struct work_struct *work) kvm_arch_async_page_present(vcpu, apf); spin_lock(>async_pf.lock); - first = list_empty(>async_pf.done); + first = !kvm_check_async_pf_completion_queue(vcpu); list_add_tail(>link, >async_pf.done); apf->vcpu = NULL; spin_unlock(>async_pf.lock); @@ -122,7 +122,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) spin_lock(>async_pf.lock); } - while (!list_empty(>async_pf.done)) { + while (kvm_check_async_pf_completion_queue(vcpu)) { struct kvm_async_pf *work = list_first_entry(>async_pf.done, typeof(*work), link); @@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; - while (!list_empty_careful(>async_pf.done) && - kvm_arch_can_dequeue_async_page_present(vcpu)) { + while (kvm_check_async_pf_completion_queue(vcpu) && + kvm_arch_can_dequeue_async_page_present(vcpu)) { spin_lock(>async_pf.lock); work = list_first_entry(>async_pf.done, typeof(*work), link); @@ -205,7 +205,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) struct kvm_async_pf *work; bool first; - if (!list_empty_careful(>async_pf.done)) + if (kvm_check_async_pf_completion_queue(vcpu)) return 0; work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); @@ -216,7 +216,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(>queue); /* for list_del to work */ spin_lock(>async_pf.lock); - first = list_empty(>async_pf.done); + first = !kvm_check_async_pf_completion_queue(vcpu); list_add_tail(>link, >async_pf.done); spin_unlock(>async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8367d88ce39b..632b80b6e485 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2961,10 +2961,8 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) if (kvm_arch_dy_runnable(vcpu)) 
return true; -#ifdef CONFIG_KVM_ASYNC_PF - if (!list_empty_careful(>async_pf.done)) + if (kvm_check_async_pf_completion_queue(vcpu)) return true; -#endif return false; } -- 2.23.0
[PATCH v2 00/17] Support Asynchronous Page Fault
ieved from "info migrate": Param -APF +APF Output store-global-state: on on only-migratable:offoff send-configuration: on on send-section-footer:on on decompress-error-check: on on clear-bitmap-shift: 18 18 Migration status: completed completed total time: 9576 ms10461 ms +9.2% downtime: 78 ms 44 ms -43.5% setup: 62 ms 47ms -24.1% transferred ram:889007 kbytes 1206436 kbytes+35.7% throughput: 765.53 mbps949.08 mbps +24% remaining ram: 0 kbytes 0 kbytes total ram: 4325952 kbytes 4325952 kbytes duplicate: 861559 pages 823954 pages skipped:0 pages0 pages normal: 219929 pages 299214 pages normal bytes: 879716 kbytes 1196856 kbytes dirty sync count: 2 2 page size: 4 kbytes 4 kbytes multifd bytes: 0 kbytes 0 kbytes pages-per-second: 33684 72400 +115% postcopy request count: 12175-38% The asynchronous page fault is beneficial to throughput and speed in the scenario of post-copy live migration. Chnagelog = v2: * Rebase to v5.11.rc6 (Gavin) * Split the patches(James) * Allocate "struct kvm_arch_async_control" dymaicall and use it to check if the feature has been enabled. The kernel option (CONFIG_KVM_ASYNC_PF) isn't used. 
(James) * Add document to explain the design (James) * Make GFN hash table management generic (James) * Add ioctl commands to support migration (Gavin) Gavin Shan (15): KVM: async_pf: Move struct kvm_async_pf around KVM: async_pf: Add helper function to check completion queue KVM: async_pf: Make GFN slot management generic KVM: x86: Use generic async PF slot management KVM: arm64: Export kvm_handle_user_mem_abort() KVM: arm64: Add paravirtualization header files KVM: arm64: Support page-not-present notification KVM: arm64: Support page-ready notification KVM: arm64: Support async PF hypercalls KVM: arm64: Support async PF ioctl commands KVM: arm64: Export async PF capability arm64: Detect async PF para-virtualization feature arm64: Reschedule process on aync PF arm64: Enable async PF KVM: arm64: Add async PF document Will Deacon (2): arm64: Probe for the presence of KVM hypervisor services during boot KVM: arm64: Advertise KVM UID to guests via SMCCC Documentation/virt/kvm/arm/apf.rst | 143 +++ Documentation/virt/kvm/arm/index.rst | 1 + arch/arm64/Kconfig | 11 + arch/arm64/include/asm/esr.h | 6 + arch/arm64/include/asm/hypervisor.h| 11 + arch/arm64/include/asm/kvm_emulate.h | 27 +- arch/arm64/include/asm/kvm_host.h | 85 arch/arm64/include/asm/kvm_para.h | 37 ++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/thread_info.h | 4 +- arch/arm64/include/uapi/asm/Kbuild | 2 - arch/arm64/include/uapi/asm/kvm.h | 19 + arch/arm64/include/uapi/asm/kvm_para.h | 23 ++ arch/arm64/include/uapi/asm/kvm_sdei.h | 1 + arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/kvm.c| 452 + arch/arm64/kernel/setup.c | 32 ++ arch/arm64/kernel/signal.c | 17 + arch/arm64/kvm/Kconfig | 2 + arch/arm64/kvm/Makefile| 1 + arch/arm64/kvm/arm.c | 37 +- arch/arm64/kvm/async_pf.c | 535 + arch/arm64/kvm/hypercalls.c| 33 +- arch/arm64/kvm/mmu.c | 75 +++- arch/arm64/kvm/sdei.c | 5 + arch/x86/include/asm/kvm_host.h| 1 - arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/mmu/mmu.c | 2 +- 
arch/x86/kvm/x86.c | 88 +--- include/linux/arm-smccc.h | 40 ++ include/linux/kvm_host.h | 73 +++- include/uapi/linux/kvm.h | 3 + virt/kvm/Kconfig | 3 + virt/kvm/async_pf.c| 91 - virt/kvm/kvm_main.c| 4 +- 35 files changed, 1706 insertions(+), 161 deletions(-) create mode 100644 Documentation/virt/kvm/arm/apf.rst create mode 1006
[PATCH v2 15/21] KVM: arm64: Support SDEI event notifier
The owner of the SDEI event, like asynchronous page fault, need know the state of injected SDEI event. This supports SDEI event state updating by introducing notifier mechanism. It's notable the notifier (handler) should be capable of migration. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_sdei.h | 12 +++ arch/arm64/include/uapi/asm/kvm_sdei.h | 1 + arch/arm64/kvm/sdei.c | 45 +- 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h index 7f5f5ad689e6..19f2d9b91f85 100644 --- a/arch/arm64/include/asm/kvm_sdei.h +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -16,6 +16,16 @@ #include #include +struct kvm_vcpu; + +typedef void (*kvm_sdei_notifier)(struct kvm_vcpu *vcpu, + unsigned long num, + unsigned int state); +enum { + KVM_SDEI_NOTIFY_DELIVERED, + KVM_SDEI_NOTIFY_COMPLETED, +}; + struct kvm_sdei_event { struct kvm_sdei_event_state state; struct kvm *kvm; @@ -112,6 +122,8 @@ KVM_SDEI_FLAG_FUNC(enabled) void kvm_sdei_init_vm(struct kvm *kvm); void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu); int kvm_sdei_hypercall(struct kvm_vcpu *vcpu); +int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num, + kvm_sdei_notifier notifier); void kvm_sdei_deliver(struct kvm_vcpu *vcpu); void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu); void kvm_sdei_destroy_vm(struct kvm *kvm); diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h b/arch/arm64/include/uapi/asm/kvm_sdei.h index 9dbda2fb457e..20ad724f63c8 100644 --- a/arch/arm64/include/uapi/asm/kvm_sdei.h +++ b/arch/arm64/include/uapi/asm/kvm_sdei.h @@ -20,6 +20,7 @@ struct kvm_sdei_event_state { uint8_t type; uint8_t signaled; uint8_t priority; + uint64_tnotifier; }; struct kvm_sdei_kvm_event_state { diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 1e8e213c9d70..5f7a37dcaa77 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -314,9 +314,11 @@ static unsigned long kvm_sdei_hypercall_complete(struct kvm_vcpu 
*vcpu, struct kvm *kvm = vcpu->kvm; struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; struct kvm_sdei_kvm_event *kske = NULL; struct kvm_sdei_vcpu_event *ksve = NULL; struct kvm_sdei_vcpu_regs *regs; + kvm_sdei_notifier notifier; unsigned long ret = SDEI_SUCCESS; int index; @@ -349,6 +351,13 @@ static unsigned long kvm_sdei_hypercall_complete(struct kvm_vcpu *vcpu, *vcpu_cpsr(vcpu) = regs->pstate; *vcpu_pc(vcpu) = regs->pc; + /* Notifier */ + kske = ksve->kske; + kse = kske->kse; + notifier = (kvm_sdei_notifier)(kse->state.notifier); + if (notifier) + notifier(vcpu, kse->state.num, KVM_SDEI_NOTIFY_COMPLETED); + /* Inject interrupt if needed */ if (resume) kvm_inject_irq(vcpu); @@ -358,7 +367,6 @@ static unsigned long kvm_sdei_hypercall_complete(struct kvm_vcpu *vcpu, * event state as it's not destroyed because of the reference * count. */ - kske = ksve->kske; ksve->state.refcount--; kske->state.refcount--; if (!ksve->state.refcount) { @@ -746,6 +754,35 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) return 1; } +int kvm_sdei_register_notifier(struct kvm *kvm, + unsigned long num, + kvm_sdei_notifier notifier) +{ + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_event *kse = NULL; + int ret = 0; + + if (!ksdei) { + ret = -EPERM; + goto out; + } + + spin_lock(>lock); + + kse = kvm_sdei_find_event(kvm, num); + if (!kse) { + ret = -EINVAL; + goto unlock; + } + + kse->state.notifier = (unsigned long)notifier; + +unlock: + spin_unlock(>lock); +out: + return ret; +} + void kvm_sdei_deliver(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -755,6 +792,7 @@ void kvm_sdei_deliver(struct kvm_vcpu *vcpu) struct kvm_sdei_kvm_event *kske = NULL; struct kvm_sdei_vcpu_event *ksve = NULL; struct kvm_sdei_vcpu_regs *regs = NULL; + kvm_sdei_notifier notifier; unsigned long pstate; int index = 0; @@ -826,6 +864,11 @@ void kvm_sdei_deliver(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) = 
pstate; *vcpu_pc(vcpu) = kske->state.entries[index]; + /* Notifier */ + notifier = (kvm_sdei_notifier)(kse->state.notifie
[PATCH v2 05/21] KVM: arm64: Support SDEI_EVENT_{ENABLE, DISABLE} hypercall
This supports SDEI_EVENT_{ENABLE, DISABLE} hypercall. After SDEI event is registered by guest, it won't be delivered to the guest until it's enabled. On the other hand, the SDEI event won't be raised to the guest or specific vCPU if it's has been disabled on the guest or specific vCPU. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 68 +++ 1 file changed, 68 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index d3ea3eee154b..b022ce0a202b 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -206,6 +206,70 @@ static unsigned long kvm_sdei_hypercall_register(struct kvm_vcpu *vcpu) return ret; } +static unsigned long kvm_sdei_hypercall_enable(struct kvm_vcpu *vcpu, + bool enable) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event *kske = NULL; + unsigned long event_num = smccc_get_arg1(vcpu); + int index = 0; + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + if (!kvm_sdei_is_valid_event_num(event_num)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + /* Check if the KVM event exists */ + spin_lock(>lock); + kske = kvm_sdei_find_kvm_event(kvm, event_num); + if (!kske) { + ret = SDEI_INVALID_PARAMETERS; + goto unlock; + } + + /* Check if there is pending events */ + if (kske->state.refcount) { + ret = SDEI_PENDING; + goto unlock; + } + + /* Check if it has been registered */ + kse = kske->kse; + index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ? 
+ vcpu->vcpu_idx : 0; + if (!kvm_sdei_is_registered(kske, index)) { + ret = SDEI_DENIED; + goto unlock; + } + + /* Verify its enablement state */ + if (enable == kvm_sdei_is_enabled(kske, index)) { + ret = SDEI_DENIED; + goto unlock; + } + + /* Update enablement state */ + if (enable) + kvm_sdei_set_enabled(kske, index); + else + kvm_sdei_clear_enabled(kske, index); + +unlock: + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -220,7 +284,11 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) ret = kvm_sdei_hypercall_register(vcpu); break; case SDEI_1_0_FN_SDEI_EVENT_ENABLE: + ret = kvm_sdei_hypercall_enable(vcpu, true); + break; case SDEI_1_0_FN_SDEI_EVENT_DISABLE: + ret = kvm_sdei_hypercall_enable(vcpu, false); + break; case SDEI_1_0_FN_SDEI_EVENT_CONTEXT: case SDEI_1_0_FN_SDEI_EVENT_COMPLETE: case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME: -- 2.23.0
[PATCH v2 03/21] KVM: arm64: Support SDEI_VERSION hypercall
This supports SDEI_VERSION hypercall by returning v1.0.0 simply when the functionality is supported on the VM and vCPU. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index ab330b74a965..aa9485f076a9 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -70,6 +70,22 @@ static void kvm_sdei_remove_vcpu_events(struct kvm_vcpu *vcpu) } } +static unsigned long kvm_sdei_hypercall_version(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + unsigned long ret = SDEI_NOT_SUPPORTED; + + if (!(ksdei && vsdei)) + return ret; + + /* v1.0.0 */ + ret = (1UL << SDEI_VERSION_MAJOR_SHIFT); + + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -78,6 +94,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) switch (func) { case SDEI_1_0_FN_SDEI_VERSION: + ret = kvm_sdei_hypercall_version(vcpu); + break; case SDEI_1_0_FN_SDEI_EVENT_REGISTER: case SDEI_1_0_FN_SDEI_EVENT_ENABLE: case SDEI_1_0_FN_SDEI_EVENT_DISABLE: -- 2.23.0
[PATCH v2 07/21] KVM: arm64: Support SDEI_EVENT_UNREGISTER hypercall
This supports SDEI_EVENT_UNREGISTER hypercall. It's used by the guest to unregister SDEI event. The SDEI event won't be raised to the guest or specific vCPU after it's unregistered successfully. It's notable the SDEI event is disabled automatically on the guest or specific vCPU once it's unregistered successfully. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 61 +++ 1 file changed, 61 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index b4162efda470..a3ba69dc91cb 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -308,6 +308,65 @@ static unsigned long kvm_sdei_hypercall_context(struct kvm_vcpu *vcpu) return ret; } +static unsigned long kvm_sdei_hypercall_unregister(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event *kske = NULL; + unsigned long event_num = smccc_get_arg1(vcpu); + int index = 0; + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + if (!kvm_sdei_is_valid_event_num(event_num)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + /* Check if the KVM event exists */ + spin_lock(>lock); + kske = kvm_sdei_find_kvm_event(kvm, event_num); + if (!kske) { + ret = SDEI_INVALID_PARAMETERS; + goto unlock; + } + + /* Check if there is pending events */ + if (kske->state.refcount) { + ret = SDEI_PENDING; + goto unlock; + } + + /* Check if it has been registered */ + kse = kske->kse; + index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ? 
+ vcpu->vcpu_idx : 0; + if (!kvm_sdei_is_registered(kske, index)) { + ret = SDEI_DENIED; + goto unlock; + } + + /* The event is disabled when it's unregistered */ + kvm_sdei_clear_enabled(kske, index); + kvm_sdei_clear_registered(kske, index); + if (kvm_sdei_empty_registered(kske)) { + list_del(>link); + kfree(kske); + } + +unlock: + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -333,6 +392,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) case SDEI_1_0_FN_SDEI_EVENT_COMPLETE: case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME: case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER: + ret = kvm_sdei_hypercall_unregister(vcpu); + break; case SDEI_1_0_FN_SDEI_EVENT_STATUS: case SDEI_1_0_FN_SDEI_EVENT_GET_INFO: case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET: -- 2.23.0
[PATCH v2 14/21] KVM: arm64: Support SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME} hypercall
This supports SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME} hypercall. They are used by the guest to notify the completion of the SDEI event in the handler. The registers are changed according to the SDEI specification as below: * x0 - x17, PC and PState are restored to what values we had in the interrupted context. * If it's SDEI_EVENT_COMPLETE_AND_RESUME hypercall, IRQ exception is injected. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_emulate.h | 1 + arch/arm64/include/asm/kvm_host.h| 1 + arch/arm64/kvm/hyp/exception.c | 7 +++ arch/arm64/kvm/inject_fault.c| 27 ++ arch/arm64/kvm/sdei.c| 75 5 files changed, 111 insertions(+) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f612c090f2e4..0ef213b715a5 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -37,6 +37,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu); void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_irq(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 30e850257ef4..01eda5c84600 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -416,6 +416,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) #define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) #define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) +#define KVM_ARM64_EXCEPT_AA32_IRQ (3 << 9) /* For AArch64: */ #define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) #define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 73629094f903..c1e9bdb67b37 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -309,6 +309,9 @@ void 
kvm_inject_exception(struct kvm_vcpu *vcpu) case KVM_ARM64_EXCEPT_AA32_DABT: enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); break; + case KVM_ARM64_EXCEPT_AA32_IRQ: + enter_exception32(vcpu, PSR_AA32_MODE_IRQ, 4); + break; default: /* Err... */ break; @@ -319,6 +322,10 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu) KVM_ARM64_EXCEPT_AA64_EL1): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; + case (KVM_ARM64_EXCEPT_AA64_ELx_IRQ | + KVM_ARM64_EXCEPT_AA64_EL1): + enter_exception64(vcpu, PSR_MODE_EL1h, except_type_irq); + break; default: /* * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ} diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index b47df73e98d7..3a8c55867d2f 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -66,6 +66,13 @@ static void inject_undef64(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, esr, ESR_EL1); } +static void inject_irq64(struct kvm_vcpu *vcpu) +{ + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | +KVM_ARM64_EXCEPT_AA64_ELx_IRQ | +KVM_ARM64_PENDING_EXCEPTION); +} + #define DFSR_FSC_EXTABT_LPAE 0x10 #define DFSR_FSC_EXTABT_nLPAE 0x08 #define DFSR_LPAE BIT(9) @@ -77,6 +84,12 @@ static void inject_undef32(struct kvm_vcpu *vcpu) KVM_ARM64_PENDING_EXCEPTION); } +static void inject_irq32(struct kvm_vcpu *vcpu) +{ + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IRQ | +KVM_ARM64_PENDING_EXCEPTION); +} + /* * Modelled after TakeDataAbortException() and TakePrefetchAbortException * pseudocode. @@ -160,6 +173,20 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) inject_undef64(vcpu); } +/** + * kvm_inject_irq - inject an IRQ into the guest + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. 
+ */ +void kvm_inject_irq(struct kvm_vcpu *vcpu) +{ + if (vcpu_el1_is_32bit(vcpu)) + inject_irq32(vcpu); + else + inject_irq64(vcpu); +} + void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr) { vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK); diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index b5d6d1ed3858..1e8e213c9d70 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arc
[PATCH v2 17/21] KVM: arm64: Support SDEI ioctl commands on vCPU
This supports ioctl commands on vCPU to manage the various object. It's primarily used by VMM to accomplish live migration. The ioctl commands introduced by this are highlighted as below: * KVM_SDEI_CMD_GET_VEVENT_COUNT Retrieve number of SDEI events that pend for handling on the vCPU * KVM_SDEI_CMD_GET_VEVENT Retrieve the state of SDEI event, which has been delivered to the vCPU for handling * KVM_SDEI_CMD_SET_VEVENT Populate the SDEI event, which has been delivered to the vCPU for handling * KVM_SDEI_CMD_GET_VCPU_STATE Retrieve vCPU state related to SDEI handling * KVM_SDEI_CMD_SET_VCPU_STATE Populate vCPU state related to SDEI handling Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_sdei.h | 1 + arch/arm64/include/uapi/asm/kvm_sdei.h | 7 + arch/arm64/kvm/arm.c | 3 + arch/arm64/kvm/sdei.c | 228 + 4 files changed, 239 insertions(+) diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h index 8f5ea947ed0e..a997989bab77 100644 --- a/arch/arm64/include/asm/kvm_sdei.h +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -126,6 +126,7 @@ int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num, kvm_sdei_notifier notifier); void kvm_sdei_deliver(struct kvm_vcpu *vcpu); long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg); +long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg); void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu); void kvm_sdei_destroy_vm(struct kvm *kvm); diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h b/arch/arm64/include/uapi/asm/kvm_sdei.h index 55de8baff841..3485843dd6df 100644 --- a/arch/arm64/include/uapi/asm/kvm_sdei.h +++ b/arch/arm64/include/uapi/asm/kvm_sdei.h @@ -59,6 +59,11 @@ struct kvm_sdei_vcpu_state { #define KVM_SDEI_CMD_GET_KEVENT_COUNT 2 #define KVM_SDEI_CMD_GET_KEVENT3 #define KVM_SDEI_CMD_SET_KEVENT4 +#define KVM_SDEI_CMD_GET_VEVENT_COUNT 5 +#define KVM_SDEI_CMD_GET_VEVENT6 +#define KVM_SDEI_CMD_SET_VEVENT7 +#define KVM_SDEI_CMD_GET_VCPU_STATE8 +#define 
KVM_SDEI_CMD_SET_VCPU_STATE9 struct kvm_sdei_cmd { uint32_tcmd; @@ -68,6 +73,8 @@ struct kvm_sdei_cmd { uint64_tnum; struct kvm_sdei_event_state kse_state; struct kvm_sdei_kvm_event_state kske_state; + struct kvm_sdei_vcpu_event_stateksve_state; + struct kvm_sdei_vcpu_state ksv_state; }; }; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 96b41bf1d094..55ccd234b0ec 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1260,6 +1260,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return kvm_arm_vcpu_finalize(vcpu, what); } + case KVM_ARM_SDEI_COMMAND: { + return kvm_sdei_vcpu_ioctl(vcpu, arg); + } default: r = -EINVAL; } diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index bdd76c3e5153..79315b77f24b 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -35,6 +35,25 @@ static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm *kvm, return NULL; } +static struct kvm_sdei_vcpu_event *kvm_sdei_find_vcpu_event(struct kvm_vcpu *vcpu, + unsigned long num) +{ + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_vcpu_event *ksve; + + list_for_each_entry(ksve, >critical_events, link) { + if (ksve->state.num == num) + return ksve; + } + + list_for_each_entry(ksve, >normal_events, link) { + if (ksve->state.num == num) + return ksve; + } + + return NULL; +} + static void kvm_sdei_remove_events(struct kvm *kvm) { struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; @@ -1102,6 +1121,215 @@ long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg) return ret; } +static long kvm_sdei_get_vevent_count(struct kvm_vcpu *vcpu, int *count) +{ + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_vcpu_event *ksve = NULL; + int total = 0; + + list_for_each_entry(ksve, >critical_events, link) { + total++; + } + + list_for_each_entry(ksve, >normal_events, link) { + total++; + } + + *count = total; + return 0; +} + +static struct kvm_sdei_vcpu_event *next_vcpu_event(struct kvm_vcpu *vcpu, + unsigned long num) +{ + struct 
kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
[PATCH v2 10/21] KVM: arm64: Support SDEI_EVENT_ROUTING_SET hypercall
This supports SDEI_EVENT_ROUTING_SET hypercall. It's used by the guest to set route mode and affinity for the registered KVM event. It's only valid for the shared events. It's not allowed to do so when the corresponding event has been raised to the guest. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 64 +++ 1 file changed, 64 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 5dfa74b093f1..458695c2394f 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -489,6 +489,68 @@ static unsigned long kvm_sdei_hypercall_info(struct kvm_vcpu *vcpu) return ret; } +static unsigned long kvm_sdei_hypercall_route(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event *kske = NULL; + unsigned long event_num = smccc_get_arg1(vcpu); + unsigned long route_mode = smccc_get_arg2(vcpu); + unsigned long route_affinity = smccc_get_arg3(vcpu); + int index = 0; + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + if (!kvm_sdei_is_valid_event_num(event_num)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + if (!(route_mode == SDEI_EVENT_REGISTER_RM_ANY || + route_mode == SDEI_EVENT_REGISTER_RM_PE)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + /* Check if the KVM event has been registered */ + spin_lock(>lock); + kske = kvm_sdei_find_kvm_event(kvm, event_num); + if (!kske) { + ret = SDEI_INVALID_PARAMETERS; + goto unlock; + } + + /* Validate KVM event state */ + kse = kske->kse; + if (kse->state.type != SDEI_EVENT_TYPE_SHARED) { + ret = SDEI_INVALID_PARAMETERS; + goto unlock; + } + + if (!kvm_sdei_is_registered(kske, index) || + kvm_sdei_is_enabled(kske, index) || + kske->state.refcount) { + ret = SDEI_DENIED; + goto unlock; + } + + /* Update state */ + kske->state.route_mode = route_mode; 
+ kske->state.route_affinity = route_affinity; + +unlock: + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -523,6 +585,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) ret = kvm_sdei_hypercall_info(vcpu); break; case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET: + ret = kvm_sdei_hypercall_route(vcpu); + break; case SDEI_1_0_FN_SDEI_PE_MASK: case SDEI_1_0_FN_SDEI_PE_UNMASK: case SDEI_1_0_FN_SDEI_INTERRUPT_BIND: -- 2.23.0
[PATCH v2 06/21] KVM: arm64: Support SDEI_EVENT_CONTEXT hypercall
This supports SDEI_EVENT_CONTEXT hypercall. It's used by the guest to retrieved the original registers (R0 - R17) in its SDEI event handler. Those registers can be corrupted during the SDEI event delivery. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 40 1 file changed, 40 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index b022ce0a202b..b4162efda470 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -270,6 +270,44 @@ static unsigned long kvm_sdei_hypercall_enable(struct kvm_vcpu *vcpu, return ret; } +static unsigned long kvm_sdei_hypercall_context(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_vcpu_regs *regs; + unsigned long index = smccc_get_arg1(vcpu); + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + if (index > ARRAY_SIZE(vsdei->state.critical_regs.regs)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + /* Check if the pending event exists */ + spin_lock(>lock); + if (!(vsdei->critical_event || vsdei->normal_event)) { + ret = SDEI_DENIED; + goto unlock; + } + + /* Fetch the requested register */ + regs = vsdei->critical_event ? >state.critical_regs : + >state.normal_regs; + ret = regs->regs[index]; + +unlock: + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -290,6 +328,8 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) ret = kvm_sdei_hypercall_enable(vcpu, false); break; case SDEI_1_0_FN_SDEI_EVENT_CONTEXT: + ret = kvm_sdei_hypercall_context(vcpu); + break; case SDEI_1_0_FN_SDEI_EVENT_COMPLETE: case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME: case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER: -- 2.23.0
[PATCH v2 04/21] KVM: arm64: Support SDEI_EVENT_REGISTER hypercall
This supports SDEI_EVENT_REGISTER hypercall, which is used by guest to register SDEI events. The SDEI event won't be raised to the guest or specific vCPU until it's registered and enabled explicitly. Only those events that have been exported by KVM can be registered. After the event is registered successfully, the KVM SDEI event (object) is created or updated because the same KVM SDEI event is shared by multiple vCPUs if it's a private event. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 122 ++ 1 file changed, 122 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index aa9485f076a9..d3ea3eee154b 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -21,6 +21,20 @@ static struct kvm_sdei_event_state defined_kse[] = { }, }; +static struct kvm_sdei_event *kvm_sdei_find_event(struct kvm *kvm, + unsigned long num) +{ + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_event *kse; + + list_for_each_entry(kse, >events, link) { + if (kse->state.num == num) + return kse; + } + + return NULL; +} + static void kvm_sdei_remove_events(struct kvm *kvm) { struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; @@ -32,6 +46,20 @@ static void kvm_sdei_remove_events(struct kvm *kvm) } } +static struct kvm_sdei_kvm_event *kvm_sdei_find_kvm_event(struct kvm *kvm, + unsigned long num) +{ + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_kvm_event *kske; + + list_for_each_entry(kske, >kvm_events, link) { + if (kske->state.num == num) + return kske; + } + + return NULL; +} + static void kvm_sdei_remove_kvm_events(struct kvm *kvm, unsigned int mask, bool force) @@ -86,6 +114,98 @@ static unsigned long kvm_sdei_hypercall_version(struct kvm_vcpu *vcpu) return ret; } +static unsigned long kvm_sdei_hypercall_register(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event 
*kske = NULL; + unsigned long event_num = smccc_get_arg1(vcpu); + unsigned long event_entry = smccc_get_arg2(vcpu); + unsigned long event_param = smccc_get_arg3(vcpu); + unsigned long route_mode = smccc_get_arg4(vcpu); + unsigned long route_affinity = smccc_get_arg5(vcpu); + int index = vcpu->vcpu_idx; + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + if (!kvm_sdei_is_valid_event_num(event_num)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + if (!(route_mode == SDEI_EVENT_REGISTER_RM_ANY || + route_mode == SDEI_EVENT_REGISTER_RM_PE)) { + ret = SDEI_INVALID_PARAMETERS; + goto out; + } + + /* +* The KVM event could have been created if it's a private event. +* We needn't create a KVM event in this case. +*/ + spin_lock(>lock); + kske = kvm_sdei_find_kvm_event(kvm, event_num); + if (kske) { + kse = kske->kse; + index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ? + vcpu->vcpu_idx : 0; + + if (kvm_sdei_is_registered(kske, index)) { + ret = SDEI_DENIED; + goto unlock; + } + + kske->state.route_mode = route_mode; + kske->state.route_affinity = route_affinity; + kske->state.entries[index] = event_entry; + kske->state.params[index] = event_param; + kvm_sdei_set_registered(kske, index); + goto unlock; + } + + /* Check if the event number has been registered */ + kse = kvm_sdei_find_event(kvm, event_num); + if (!kse) { + ret = SDEI_INVALID_PARAMETERS; + goto unlock; + } + + /* Create KVM event */ + kske = kzalloc(sizeof(*kske), GFP_KERNEL); + if (!kske) { + ret = SDEI_OUT_OF_RESOURCE; + goto unlock; + } + + /* Initialize KVM event state */ + index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ? + vcpu->vcpu_idx : 0; + kske->state.num= event_num; + kske->state.refcount = 0; + kske->state.route_mode = route_affinity; + kske->state.route_affini
[PATCH v2 02/21] KVM: arm64: Add SDEI virtualization infrastructure
Software Delegated Exception Interface (SDEI) provides a mechanism for registering and servicing system events. Those system events are high priority events, which must be serviced immediately. It's going to be used by Asynchronous Page Fault (APF) to deliver notification from KVM to guest. It's noted that SDEI is defined by ARM DEN0054A specification. This introduces SDEI virtualization infrastructure where the SDEI events are registered and manipulated by the guest through hypercall. The SDEI event is delivered to one specific vCPU by KVM once it's raised. This introduces data structures to represent the needed objects to implement the feature, which is highlighted as below. As those objects could be migrated between VMs, these data structures are partially exported to user space. * kvm_sdei_event SDEI events are exported from KVM so that guest is able to register and manipulate. * kvm_sdei_kvm_event SDEI event that has been registered by guest. * kvm_sdei_kvm_vcpu SDEI event that has been delivered to the target vCPU. * kvm_sdei_kvm Place holder of exported and registered SDEI events. * kvm_sdei_vcpu Auxiliary object to save the preempted context during SDEI event delivery. The error is returned for all SDEI hypercalls for now. They will be supported by subsequent patches. 
Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 4 + arch/arm64/include/asm/kvm_sdei.h | 118 +++ arch/arm64/include/uapi/asm/kvm.h | 1 + arch/arm64/include/uapi/asm/kvm_sdei.h | 56 +++ arch/arm64/kvm/Makefile| 2 +- arch/arm64/kvm/arm.c | 7 + arch/arm64/kvm/hypercalls.c| 18 +++ arch/arm64/kvm/sdei.c | 198 + 8 files changed, 403 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kvm_sdei.h create mode 100644 arch/arm64/include/uapi/asm/kvm_sdei.h create mode 100644 arch/arm64/kvm/sdei.c diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8fcfab0c2567..b2d51c6d055c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -133,6 +133,8 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; + + struct kvm_sdei_kvm *sdei; }; struct kvm_vcpu_fault_info { @@ -370,6 +372,8 @@ struct kvm_vcpu_arch { u64 last_steal; gpa_t base; } steal; + + struct kvm_sdei_vcpu *sdei; }; /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h new file mode 100644 index ..b0abc13a0256 --- /dev/null +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Definitions of various KVM SDEI events. + * + * Copyright (C) 2021 Red Hat, Inc. 
+ * + * Author(s): Gavin Shan + */ + +#ifndef __ARM64_KVM_SDEI_H__ +#define __ARM64_KVM_SDEI_H__ + +#include +#include +#include +#include +#include + +struct kvm_sdei_event { + struct kvm_sdei_event_state state; + struct kvm *kvm; + struct list_headlink; +}; + +struct kvm_sdei_kvm_event { + struct kvm_sdei_kvm_event_state state; + struct kvm_sdei_event *kse; + struct kvm *kvm; + struct list_headlink; +}; + +struct kvm_sdei_vcpu_event { + struct kvm_sdei_vcpu_event_statestate; + struct kvm_sdei_kvm_event *kske; + struct kvm_vcpu *vcpu; + struct list_headlink; +}; + +struct kvm_sdei_kvm { + spinlock_t lock; + struct list_headevents; /* kvm_sdei_event */ + struct list_headkvm_events; /* kvm_sdei_kvm_event */ +}; + +struct kvm_sdei_vcpu { + spinlock_t lock; + struct kvm_sdei_vcpu_state state; + struct kvm_sdei_vcpu_event *critical_event; + struct kvm_sdei_vcpu_event *normal_event; + struct list_headcritical_events; + struct list_headnormal_events; +}; + +/* + * According to SDEI specification (v1.0), the event number spans 32-bits + * and the lower 24-bits are used as the (real) event number. I don't + * think we can use that much SDEI numbers in one system. So we reserve + * two bits from the 24-bits real event number, to indicate its types: + * physical event and virtual event. One reserved bit is enough for now, + * but two bits are reserved for possible extension in future. + * + * The physical events are owned by underly firmware while the virtual + * events are used by VMM and KVM. + */ +#define KVM_SDEI_EV_NUM_TYPE_SHIFT 22 +#define KVM_SDEI_EV_NUM_TYPE_MASK
[PATCH v2 00/21] Support SDEI Virtualization
This series intends to virtualize Software Delegated Exception Interface (SDEI), which is defined by DEN0054A. It allows the hypervisor to deliver NMI-alike event to guest and it's needed by asynchronous page fault to deliver page-not-present notification from hypervisor to guest. The code and the required qemu changes can be found from: https://github.com/gwshan/linux("sdei") https://github.com/gwshan/qemu.git ("apf") The SDEI event is identified by a 32-bits number. Bits[31:24] are used to indicate the SDEI event properties while bits[23:0] are identifying the unique number. The implementation takes bits[23:22] to indicate the owner of the SDEI event. For example, those SDEI events owned by KVM should have these two bits set to 0b01. Besides, the implementation supports SDEI events owned by KVM only. The design is pretty straightforward and the implementation is just following the SDEI specification. There are several data structures introduced. Some of the objects have to be migrated by VMM. So their definitions are split up so that VMM can include their states for migration. struct kvm_sdei_kvm Associated with VM and used to track the KVM exposed SDEI events and those registered by guest. struct kvm_sdei_vcpu Associated with vCPU and used to track SDEI event delivery. The preempted context is saved prior to the delivery and restored after that. struct kvm_sdei_event SDEI events exposed by KVM so that guest can register and enable. struct kvm_sdei_kvm_event SDEI events that have been registered by guest. struct kvm_sdei_vcpu_event SDEI events that have been queued to specific vCPU for delivery. 
The series is organized as below: PATCH[01]Introduces template for smccc_get_argx() PATCH[02]Introduces the data structures and infrastructure PATCH[03-14] Supports various SDEI related hypercalls PATCH[15]Supports SDEI event notification PATCH[16-17] Introduces ioctl command for migration PATCH[18-19] Supports SDEI event injection and cancellation PATCH[20]Exports SDEI capability PATCH[21]Adds self-test case for SDEI virtualization Testing === There are two additional patches in the following repository to create procfs files allowing inject SDEI event and driver for the guest to use the SDEI event. Besides, the additional qemu changes are needed so that guest can detect the SDEI service through ACPI table. https://github.com/gwshan/linux("sdei") https://github.com/gwshan/qemu.git ("apf") The SDEI event is received and handled in the guest after it's injected through the procfs files on host. Changelog = v2: * Rebased to 5.11.rc6 * Dropped changes related to SDEI client driver(Gavin) * Removed support for passthrough SDEI events(Gavin) * Redesigned data structures (Gavin) * Implementation is almost rewritten as the data structures are totally changed (Gavin) * Added ioctl commands to support migration(Gavin) Gavin Shan (21): KVM: arm64: Introduce template for inline functions KVM: arm64: Add SDEI virtualization infrastructure KVM: arm64: Support SDEI_VERSION hypercall KVM: arm64: Support SDEI_EVENT_REGISTER hypercall KVM: arm64: Support SDEI_EVENT_{ENABLE, DISABLE} hypercall KVM: arm64: Support SDEI_EVENT_CONTEXT hypercall KVM: arm64: Support SDEI_EVENT_UNREGISTER hypercall KVM: arm64: Support SDEI_EVENT_STATUS hypercall KVM: arm64: Support SDEI_EVENT_GET_INFO hypercall KVM: arm64: Support SDEI_EVENT_ROUTING_SET hypercall KVM: arm64: Support SDEI_PE_{MASK, UNMASK} hypercall KVM: arm64: Support SDEI_{PRIVATE, SHARED}_RESET hypercall KVM: arm64: Implement SDEI event delivery KVM: arm64: Support SDEI_EVENT_{COMPLETE, COMPLETE_AND_RESUME} hypercall KVM: arm64: Support 
SDEI event notifier KVM: arm64: Support SDEI ioctl commands on VM KVM: arm64: Support SDEI ioctl commands on vCPU KVM: arm64: Support SDEI event injection KVM: arm64: Support SDEI event cancellation KVM: arm64: Export SDEI capability KVM: selftests: Add SDEI test case arch/arm64/include/asm/kvm_emulate.h |1 + arch/arm64/include/asm/kvm_host.h |6 + arch/arm64/include/asm/kvm_sdei.h | 136 ++ arch/arm64/include/uapi/asm/kvm.h |1 + arch/arm64/include/uapi/asm/kvm_sdei.h | 82 ++ arch/arm64/kvm/Makefile|2 +- arch/arm64/kvm/arm.c | 19 + arch/arm64/kvm/hyp/exception.c |7 + arch/arm64/kvm/hypercalls.c| 18 + arch/arm64/kvm/inject_fault.c | 27 + arch/arm64/kvm/sdei.c | 1519 include/kvm/arm_hypercalls.h | 34 +- include/uapi/linux/kvm.h |4
[PATCH v2 11/21] KVM: arm64: Support SDEI_PE_{MASK, UNMASK} hypercall
This supports SDEI_PE_{MASK, UNMASK} hypercall. They are used by the guest to stop the specific vCPU from receiving SDEI events. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 458695c2394f..3fb33258b494 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -551,6 +551,37 @@ static unsigned long kvm_sdei_hypercall_route(struct kvm_vcpu *vcpu) return ret; } +static unsigned long kvm_sdei_hypercall_mask(struct kvm_vcpu *vcpu, +bool mask) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + spin_lock(>lock); + + /* Check the state */ + if (mask == vsdei->state.masked) { + ret = SDEI_DENIED; + goto unlock; + } + + /* Update the state */ + vsdei->state.masked = mask ? 1 : 0; + +unlock: + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -588,7 +619,11 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) ret = kvm_sdei_hypercall_route(vcpu); break; case SDEI_1_0_FN_SDEI_PE_MASK: + ret = kvm_sdei_hypercall_mask(vcpu, true); + break; case SDEI_1_0_FN_SDEI_PE_UNMASK: + ret = kvm_sdei_hypercall_mask(vcpu, false); + break; case SDEI_1_0_FN_SDEI_INTERRUPT_BIND: case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE: case SDEI_1_0_FN_SDEI_PRIVATE_RESET: -- 2.23.0
[PATCH v2 12/21] KVM: arm64: Support SDEI_{PRIVATE, SHARED}_RESET hypercall
This supports SDEI_{PRIVATE, SHARED}_RESET. They are used by the guest to purge the private or shared SDEI events, which are registered previously. Signed-off-by: Gavin Shan --- arch/arm64/kvm/sdei.c | 29 + 1 file changed, 29 insertions(+) diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 3fb33258b494..62efee2b67b8 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -582,6 +582,29 @@ static unsigned long kvm_sdei_hypercall_mask(struct kvm_vcpu *vcpu, return ret; } +static unsigned long kvm_sdei_hypercall_reset(struct kvm_vcpu *vcpu, + bool private) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + unsigned int mask = private ? (1 << SDEI_EVENT_TYPE_PRIVATE) : + (1 << SDEI_EVENT_TYPE_SHARED); + unsigned long ret = SDEI_SUCCESS; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = SDEI_NOT_SUPPORTED; + goto out; + } + + spin_lock(>lock); + kvm_sdei_remove_kvm_events(kvm, mask, false); + spin_unlock(>lock); +out: + return ret; +} + int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) { u32 func = smccc_get_function(vcpu); @@ -626,8 +649,14 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) break; case SDEI_1_0_FN_SDEI_INTERRUPT_BIND: case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE: + ret = SDEI_NOT_SUPPORTED; + break; case SDEI_1_0_FN_SDEI_PRIVATE_RESET: + ret = kvm_sdei_hypercall_reset(vcpu, true); + break; case SDEI_1_0_FN_SDEI_SHARED_RESET: + ret = kvm_sdei_hypercall_reset(vcpu, false); + break; default: ret = SDEI_NOT_SUPPORTED; } -- 2.23.0
[PATCH v2 20/21] KVM: arm64: Export SDEI capability
The SDEI functionality is ready to be exported so far. This adds new capability (KVM_CAP_ARM_SDEI) and exports it. Signed-off-by: Gavin Shan --- arch/arm64/kvm/arm.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 55ccd234b0ec..f8b44a29e164 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -266,6 +266,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PTRAUTH_GENERIC: r = system_has_full_ptr_auth(); break; + case KVM_CAP_ARM_SDEI: + r = 1; + break; default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b056b4ac884b..133128d45fcb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_ARM_SDEI 193 #ifdef KVM_CAP_IRQ_ROUTING -- 2.23.0
[PATCH v2 18/21] KVM: arm64: Support SDEI event injection
This supports SDEI event injection by implementing kvm_sdei_inject(). It's called by kernel directly or VMM through ioctl command to inject SDEI event to the specific vCPU. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_sdei.h | 2 + arch/arm64/include/uapi/asm/kvm_sdei.h | 1 + arch/arm64/kvm/sdei.c | 108 + 3 files changed, 111 insertions(+) diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h index a997989bab77..51087fe971ba 100644 --- a/arch/arm64/include/asm/kvm_sdei.h +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -124,6 +124,8 @@ void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu); int kvm_sdei_hypercall(struct kvm_vcpu *vcpu); int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num, kvm_sdei_notifier notifier); +int kvm_sdei_inject(struct kvm_vcpu *vcpu, + unsigned long num, bool immediate); void kvm_sdei_deliver(struct kvm_vcpu *vcpu); long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg); long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg); diff --git a/arch/arm64/include/uapi/asm/kvm_sdei.h b/arch/arm64/include/uapi/asm/kvm_sdei.h index 3485843dd6df..232092de5e21 100644 --- a/arch/arm64/include/uapi/asm/kvm_sdei.h +++ b/arch/arm64/include/uapi/asm/kvm_sdei.h @@ -64,6 +64,7 @@ struct kvm_sdei_vcpu_state { #define KVM_SDEI_CMD_SET_VEVENT7 #define KVM_SDEI_CMD_GET_VCPU_STATE8 #define KVM_SDEI_CMD_SET_VCPU_STATE9 +#define KVM_SDEI_CMD_INJECT_EVENT 10 struct kvm_sdei_cmd { uint32_tcmd; diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 79315b77f24b..7c2789cd1421 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -802,6 +802,111 @@ int kvm_sdei_register_notifier(struct kvm *kvm, return ret; } +int kvm_sdei_inject(struct kvm_vcpu *vcpu, + unsigned long num, + bool immediate) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event *kske = 
NULL; + struct kvm_sdei_vcpu_event *ksve = NULL; + int index, ret = 0; + + /* Sanity check */ + if (!(ksdei && vsdei)) { + ret = -EPERM; + goto out; + } + + if (!kvm_sdei_is_valid_event_num(num)) { + ret = -EINVAL; + goto out; + } + + /* Check the kvm event */ + spin_lock(>lock); + kske = kvm_sdei_find_kvm_event(kvm, num); + if (!kske) { + ret = -ENOENT; + goto unlock_kvm; + } + + kse = kske->kse; + index = (kse->state.type == SDEI_EVENT_TYPE_PRIVATE) ? + vcpu->vcpu_idx : 0; + if (!(kvm_sdei_is_registered(kske, index) && + kvm_sdei_is_enabled(kske, index))) { + ret = -EPERM; + goto unlock_kvm; + } + + /* Check the vcpu state */ + spin_lock(>lock); + if (vsdei->state.masked) { + ret = -EPERM; + goto unlock_vcpu; + } + + /* Check if the event can be delivered immediately */ + if (immediate) { + if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL && + !list_empty(>critical_events)) { + ret = -ENOSPC; + goto unlock_vcpu; + } + + if (kse->state.priority == SDEI_EVENT_PRIORITY_NORMAL && + (!list_empty(>critical_events) || +!list_empty(>normal_events))) { + ret = -ENOSPC; + goto unlock_vcpu; + } + } + + /* Check if the vcpu event exists */ + ksve = kvm_sdei_find_vcpu_event(vcpu, num); + if (ksve) { + kske->state.refcount++; + ksve->state.refcount++; + kvm_make_request(KVM_REQ_SDEI, vcpu); + goto unlock_vcpu; + } + + /* Allocate vcpu event */ + ksve = kzalloc(sizeof(*ksve), GFP_KERNEL); + if (!ksve) { + ret = -ENOMEM; + goto unlock_vcpu; + } + + /* +* We should take lock to update KVM event state because its +* reference count might be zero. In that case, the KVM event +* could be destroyed. +*/ + kske->state.refcount++; + ksve->state.num = num; + ksve->state.refcount = 1; + ksve->kske = kske; + ksve->vcpu = vcpu; + + if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL) + list_add_tail(>link, >critica
[PATCH v2 19/21] KVM: arm64: Support SDEI event cancellation
The injected SDEI event is to send notification to guest. The SDEI event might not be needed after it's injected. This introduces API to support cancellation on the injected SDEI event if it's not fired to the guest yet. This mechanism will be needed when we're going to support asynchronous page fault. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_sdei.h | 1 + arch/arm64/kvm/sdei.c | 49 +++ 2 files changed, 50 insertions(+) diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h index 51087fe971ba..353744c7bad9 100644 --- a/arch/arm64/include/asm/kvm_sdei.h +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -126,6 +126,7 @@ int kvm_sdei_register_notifier(struct kvm *kvm, unsigned long num, kvm_sdei_notifier notifier); int kvm_sdei_inject(struct kvm_vcpu *vcpu, unsigned long num, bool immediate); +int kvm_sdei_cancel(struct kvm_vcpu *vcpu, unsigned long num); void kvm_sdei_deliver(struct kvm_vcpu *vcpu); long kvm_sdei_vm_ioctl(struct kvm *kvm, unsigned long arg); long kvm_sdei_vcpu_ioctl(struct kvm_vcpu *vcpu, unsigned long arg); diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 7c2789cd1421..4f5a582daa97 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -907,6 +907,55 @@ int kvm_sdei_inject(struct kvm_vcpu *vcpu, return ret; } +int kvm_sdei_cancel(struct kvm_vcpu *vcpu, unsigned long num) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_kvm_event *kske = NULL; + struct kvm_sdei_vcpu_event *ksve = NULL; + int ret = 0; + + if (!(ksdei && vsdei)) { + ret = -EPERM; + goto out; + } + + /* Find the vCPU event */ + spin_lock(>lock); + ksve = kvm_sdei_find_vcpu_event(vcpu, num); + if (!ksve) { + ret = -EINVAL; + goto unlock; + } + + /* Event can't be cancelled if it has been delivered */ + if (ksve->state.refcount <= 1 && + (vsdei->critical_event == ksve || +vsdei->normal_event == ksve)) { + ret = -EINPROGRESS; + 
goto unlock; + } + + /* Free the vCPU event if necessary */ + kske = ksve->kske; + ksve->state.refcount--; + if (!ksve->state.refcount) { + list_del(>link); + kfree(ksve); + } + +unlock: + spin_unlock(>lock); + if (kske) { + spin_lock(>lock); + kske->state.refcount--; + spin_unlock(>lock); + } +out: + return ret; +} + void kvm_sdei_deliver(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; -- 2.23.0
[PATCH v2 13/21] KVM: arm64: Implement SDEI event delivery
This implement kvm_sdei_deliver() to support SDEI event delivery. The function is called when the request (KVM_REQ_SDEI) is raised. The following rules are taken according to the SDEI specification: * x0 - x17 are saved. All of them are cleared except the following registered: x0: number SDEI event to be delivered x1: parameter associated with the SDEI event x2: PC of the interrupted context x3: PState of the interrupted context * PC is set to the handler of the SDEI event, which was provided during its registration. PState is modified accordingly. * SDEI event with critical priority can preempt those with normal priority. Signed-off-by: Gavin Shan --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_sdei.h | 1 + arch/arm64/kvm/arm.c | 3 ++ arch/arm64/kvm/sdei.c | 84 +++ 4 files changed, 89 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b2d51c6d055c..30e850257ef4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -47,6 +47,7 @@ #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) +#define KVM_REQ_SDEI KVM_ARCH_REQ(5) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h index b0abc13a0256..7f5f5ad689e6 100644 --- a/arch/arm64/include/asm/kvm_sdei.h +++ b/arch/arm64/include/asm/kvm_sdei.h @@ -112,6 +112,7 @@ KVM_SDEI_FLAG_FUNC(enabled) void kvm_sdei_init_vm(struct kvm *kvm); void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu); int kvm_sdei_hypercall(struct kvm_vcpu *vcpu); +void kvm_sdei_deliver(struct kvm_vcpu *vcpu); void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu); void kvm_sdei_destroy_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a7ae16df3df7..e243bd5ad730 100644 --- a/arch/arm64/kvm/arm.c +++ 
b/arch/arm64/kvm/arm.c @@ -668,6 +668,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) kvm_reset_vcpu(vcpu); + if (kvm_check_request(KVM_REQ_SDEI, vcpu)) + kvm_sdei_deliver(vcpu); + /* * Clear IRQ_PENDING requests that were made to guarantee * that a VCPU sees new virtual interrupts. diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c index 62efee2b67b8..b5d6d1ed3858 100644 --- a/arch/arm64/kvm/sdei.c +++ b/arch/arm64/kvm/sdei.c @@ -671,6 +671,90 @@ int kvm_sdei_hypercall(struct kvm_vcpu *vcpu) return 1; } +void kvm_sdei_deliver(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_sdei_kvm *ksdei = kvm->arch.sdei; + struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei; + struct kvm_sdei_event *kse = NULL; + struct kvm_sdei_kvm_event *kske = NULL; + struct kvm_sdei_vcpu_event *ksve = NULL; + struct kvm_sdei_vcpu_regs *regs = NULL; + unsigned long pstate; + int index = 0; + + /* Sanity check */ + if (!(ksdei && vsdei)) + return; + + /* The critical event can't be preempted */ + spin_lock(>lock); + if (vsdei->critical_event) + goto unlock; + + /* +* The normal event can be preempted by the critical event. +* However, the normal event can't be preempted by another +* normal event. 
+*/ + ksve = list_first_entry_or_null(>critical_events, + struct kvm_sdei_vcpu_event, link); + if (!ksve && !vsdei->normal_event) { + ksve = list_first_entry_or_null(>normal_events, + struct kvm_sdei_vcpu_event, link); + } + + if (!ksve) + goto unlock; + + kske = ksve->kske; + kse = kske->kse; + if (kse->state.priority == SDEI_EVENT_PRIORITY_CRITICAL) { + vsdei->critical_event = ksve; + vsdei->state.critical_num = ksve->state.num; + regs = >state.critical_regs; + } else { + vsdei->normal_event = ksve; + vsdei->state.normal_num = ksve->state.num; + regs = >state.normal_regs; + } + + /* Save registers: x0 -> x17, PC, PState */ + for (index = 0; index < ARRAY_SIZE(regs->regs); index++) + regs->regs[index] = vcpu_get_reg(vcpu, index); + + regs->pc = *vcpu_pc(vcpu); + regs->pstate = *vcpu_cpsr(vcpu); + + /* +* Inject SDEI event: x0 -> x3, PC, PState. We needn't take lock +
[PATCH v2 21/21] KVM: selftests: Add SDEI test case
This adds SDEI test case into selftests where the various hypercalls are issued to kvm private event (0x4020) and then ensure that's completed without error. Note that two vCPUs are started up by default to run same consequence. Actually, it's simulating what SDEI client driver does and the following hypercalls are issued in sequence: SDEI_1_0_FN_SDEI_VERSION(probing SDEI capability) SDEI_1_0_FN_SDEI_PE_UNMASK (CPU online) SDEI_1_0_FN_SDEI_PRIVATE_RESET (restart SDEI) SDEI_1_0_FN_SDEI_SHARED_RESET SDEI_1_0_FN_SDEI_EVENT_GET_INFO (register event) SDEI_1_0_FN_SDEI_EVENT_GET_INFO SDEI_1_0_FN_SDEI_EVENT_GET_INFO SDEI_1_0_FN_SDEI_EVENT_REGISTER SDEI_1_0_FN_SDEI_EVENT_ENABLE (enable event) SDEI_1_0_FN_SDEI_EVENT_DISABLE (disable event) SDEI_1_0_FN_SDEI_EVENT_UNREGISTER (unregister event) SDEI_1_0_FN_SDEI_PE_MASK(CPU offline) Signed-off-by: Gavin Shan --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/aarch64/sdei.c | 172 + 2 files changed, 173 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/sdei.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index fe41c6a0fa67..482faa88520b 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -74,6 +74,7 @@ TEST_GEN_PROGS_aarch64 += dirty_log_perf_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += aarch64/sdei TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets diff --git a/tools/testing/selftests/kvm/aarch64/sdei.c b/tools/testing/selftests/kvm/aarch64/sdei.c new file mode 100644 index ..1a4cdae84ad5 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/sdei.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM SDEI test + * + * Copyright (C) 2021 Red Hat, Inc. 
+ * + * Author(s): Gavin Shan + */ +#define _GNU_SOURCE +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "asm/kvm_sdei.h" +#include "linux/arm_sdei.h" + +#define NR_VCPUS 2 +#define SDEI_GPA_BASE (1 << 30) + +struct sdei_event { + uint32_tcpu; + uint64_tversion; + uint64_tnum; + uint64_ttype; + uint64_tpriority; + uint64_tsignaled; +}; + +static struct sdei_event sdei_events[NR_VCPUS]; + +static int64_t smccc(uint32_t func, uint64_t arg0, uint64_t arg1, +uint64_t arg2, uint64_t arg3, uint64_t arg4) +{ + int64_t ret; + + asm volatile( + "movx0, %1\n" + "movx1, %2\n" + "movx2, %3\n" + "movx3, %4\n" + "movx4, %5\n" + "movx5, %6\n" + "hvc#0\n" + "mov%0, x0\n" + : "=r" (ret) : "r" (func), "r" (arg0), "r" (arg1), + "r" (arg2), "r" (arg3), "r" (arg4) : + "x0", "x1", "x2", "x3", "x4", "x5"); + + return ret; +} + +static inline bool is_error(int64_t ret) +{ + if (ret == SDEI_NOT_SUPPORTED || + ret == SDEI_INVALID_PARAMETERS || + ret == SDEI_DENIED || + ret == SDEI_PENDING|| + ret == SDEI_OUT_OF_RESOURCE) + return true; + + return false; +} + +static void guest_code(int cpu) +{ + struct sdei_event *event = _events[cpu]; + int64_t ret; + + /* CPU */ + event->cpu = cpu; + event->num = KVM_SDEI_DEFAULT_NUM; + GUEST_ASSERT(cpu < NR_VCPUS); + + /* Version */ + ret = smccc(SDEI_1_0_FN_SDEI_VERSION, 0, 0, 0, 0, 0); + GUEST_ASSERT(!is_error(ret)); + GUEST_ASSERT(SDEI_VERSION_MAJOR(ret) == 1); + GUEST_ASSERT(SDEI_VERSION_MINOR(ret) == 0); + event->version = ret; + + /* CPU unmasking */ + ret = smccc(SDEI_1_0_FN_SDEI_PE_UNMASK, 0, 0, 0, 0, 0); + GUEST_ASSERT(!is_error(ret)); + + /* Reset */ + ret = smccc(SDEI_1_0_FN_SDEI_PRIVATE_RESET, 0, 0, 0, 0, 0); + GUEST_ASSERT(!is_error(ret)); + ret = smccc(SDEI_1_0_FN_SDEI_SHARED_RESET, 0, 0, 0, 0, 0); + GUEST_ASSERT(!is_error(ret)); + + /* Event properties */ + ret = smccc(SDEI_1_0_FN_SDEI_EVENT_GET_INFO, +event->num, SDEI_EVENT_INFO_EV_TYPE, 0, 0, 0); + GUEST_ASSERT(!is_error(ret)); + event->type = 
ret; + + ret = smccc(SDEI_1_0_FN_SDEI_EVENT_GET_INFO, + event->num, SDEI_EVENT_INF