[PATCH 7/7] vfio: iommu_type1: Drop parameter "pgsize" of update_user_bitmap

2020-12-09 Thread Keqian Zhu
We always use the smallest supported page size of vfio_iommu as
pgsize. Drop parameter "pgsize" of update_user_bitmap.
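
For context, __ffs() on the IOMMU page-size bitmap yields the shift of the
smallest supported page size, which is what every remaining caller derives
locally. A rough illustration (the bitmap value is only an example):

	/* e.g. pgsize_bitmap = 4K | 2M | 1G = 0x40201000 */
	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);	/* = 12 */
	size_t pgsize = (size_t)1 << pgshift;			/* = 4K */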

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2d7a5cd9b916..edb0a6468e8d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -989,10 +989,9 @@ static void vfio_update_pgsize_bitmap(struct vfio_iommu 
*iommu)
 }
 
 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
- struct vfio_dma *dma, dma_addr_t base_iova,
- size_t pgsize)
+ struct vfio_dma *dma, dma_addr_t base_iova)
 {
-   unsigned long pgshift = __ffs(pgsize);
+   unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
unsigned long nbits = dma->size >> pgshift;
unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
unsigned long copy_offset = bit_offset / BITS_PER_LONG;
@@ -1057,7 +1056,7 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, 
struct vfio_iommu *iommu,
if (dma->iova > iova + size - 1)
break;
 
-   ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
+   ret = update_user_bitmap(bitmap, iommu, dma, iova);
if (ret)
return ret;
 
@@ -1203,7 +1202,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 
if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
ret = update_user_bitmap(bitmap->data, iommu, dma,
-unmap->iova, pgsize);
+unmap->iova);
if (ret)
break;
}
-- 
2.23.0



[PATCH 6/7] vfio: iommu_type1: Drop parameter "pgsize" of vfio_iova_dirty_bitmap.

2020-12-09 Thread Keqian Zhu
We always use the smallest supported page size of vfio_iommu as
pgsize. Remove parameter "pgsize" of vfio_iova_dirty_bitmap.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32ab889c8193..2d7a5cd9b916 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1026,11 +1026,12 @@ static int update_user_bitmap(u64 __user *bitmap, 
struct vfio_iommu *iommu,
 }
 
 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
- dma_addr_t iova, size_t size, size_t pgsize)
+ dma_addr_t iova, size_t size)
 {
struct vfio_dma *dma;
struct rb_node *n;
-   unsigned long pgshift = __ffs(pgsize);
+   unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+   size_t pgsize = (size_t)1 << pgshift;
int ret;
 
/*
@@ -2861,8 +2862,7 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu 
*iommu,
if (iommu->dirty_page_tracking)
ret = vfio_iova_dirty_bitmap(range.bitmap.data,
 iommu, range.iova,
-range.size,
-range.bitmap.pgsize);
+range.size);
else
ret = -EINVAL;
 out_unlock:
-- 
2.23.0



[PATCH 5/7] vfio: iommu_type1: Drop parameter "pgsize" of vfio_dma_bitmap_alloc_all

2020-12-09 Thread Keqian Zhu
We always use the smallest supported page size of vfio_iommu as
pgsize. Remove parameter "pgsize" of vfio_dma_bitmap_alloc_all.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 00684597b098..32ab889c8193 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -236,9 +236,10 @@ static void vfio_dma_populate_bitmap(struct vfio_dma *dma, 
size_t pgsize)
}
 }
 
-static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu)
 {
struct rb_node *n;
+   size_t pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
 
	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
@@ -2798,12 +2799,9 @@ static int vfio_iommu_type1_dirty_pages(struct 
vfio_iommu *iommu,
return -EINVAL;
 
if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
-   size_t pgsize;
-
		mutex_lock(&iommu->lock);
-   pgsize = 1 << __ffs(iommu->pgsize_bitmap);
if (!iommu->dirty_page_tracking) {
-   ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
+   ret = vfio_dma_bitmap_alloc_all(iommu);
if (!ret)
iommu->dirty_page_tracking = true;
}
-- 
2.23.0



[PATCH 1/7] vfio: iommu_type1: Clear added dirty bit when unwind pin

2020-12-09 Thread Keqian Zhu
Currently we do not clear the dirty bits we have just added to the bitmap
when we unwind a pin, so if pinning fails halfway through, unnecessary
dirty bits are left set in the bitmap. By clearing the added dirty bits on
unwind, userspace sees fewer dirty pages and saves the time needed to
handle them.

Note that we must distinguish the bits added by this pin operation from the
bits that were already set before it, so introduce bitmap_added to record
this.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 67e827638995..f129d24a6ec3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -637,7 +637,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group;
int i, j, ret;
+   unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
unsigned long remote_vaddr;
+   unsigned long bitmap_offset;
+   unsigned long *bitmap_added;
+   dma_addr_t iova;
struct vfio_dma *dma;
bool do_accounting;
 
@@ -650,6 +654,12 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
	mutex_lock(&iommu->lock);
 
+   bitmap_added = bitmap_zalloc(npage, GFP_KERNEL);
+   if (!bitmap_added) {
+   ret = -ENOMEM;
+   goto pin_done;
+   }
+
/* Fail if notifier list is empty */
if (!iommu->notifier.head) {
ret = -EINVAL;
@@ -664,7 +674,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 
for (i = 0; i < npage; i++) {
-   dma_addr_t iova;
struct vfio_pfn *vpfn;
 
iova = user_pfn[i] << PAGE_SHIFT;
@@ -699,14 +708,10 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
}
 
if (iommu->dirty_page_tracking) {
-   unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
-
-   /*
-* Bitmap populated with the smallest supported page
-* size
-*/
-   bitmap_set(dma->bitmap,
-  (iova - dma->iova) >> pgshift, 1);
+   /* Populated with the smallest supported page size */
+   bitmap_offset = (iova - dma->iova) >> pgshift;
+   if (!test_and_set_bit(bitmap_offset, dma->bitmap))
+   set_bit(i, bitmap_added);
}
}
ret = i;
@@ -722,14 +727,20 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 pin_unwind:
phys_pfn[i] = 0;
for (j = 0; j < i; j++) {
-   dma_addr_t iova;
-
iova = user_pfn[j] << PAGE_SHIFT;
dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
vfio_unpin_page_external(dma, iova, do_accounting);
phys_pfn[j] = 0;
+
+   if (test_bit(j, bitmap_added)) {
+   bitmap_offset = (iova - dma->iova) >> pgshift;
+   clear_bit(bitmap_offset, dma->bitmap);
+   }
}
 pin_done:
+   if (bitmap_added)
+   bitmap_free(bitmap_added);
+
	mutex_unlock(&iommu->lock);
return ret;
 }
-- 
2.23.0



[PATCH 2/7] vfio: iommu_type1: Initially set the pinned_page_dirty_scope

2020-12-09 Thread Keqian Zhu
Currently there are three ways to promote the pinned_page_dirty_scope
status of a vfio_iommu:

1. Through the pin interface.
2. Detaching a group without dirty tracking.
3. Attaching a group with dirty tracking.

For point 3, the only situation in which the pinned status can change is
when the vfio_iommu has just been created.

Since we can safely set the pinned status when creating a new vfio_iommu,
do so; point 3 can then be removed to reduce operations.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index f129d24a6ec3..c52bcefba96b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2064,12 +2064,8 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
 * Non-iommu backed group cannot dirty memory directly,
 * it can only use interfaces that provide dirty
 * tracking.
-* The iommu scope can only be promoted with the
-* addition of a dirty tracking group.
 */
group->pinned_page_dirty_scope = true;
-   if (!iommu->pinned_page_dirty_scope)
-   update_pinned_page_dirty_scope(iommu);
	mutex_unlock(&iommu->lock);
 
return 0;
@@ -2457,6 +2453,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
	INIT_LIST_HEAD(&iommu->iova_list);
	iommu->dma_list = RB_ROOT;
	iommu->dma_avail = dma_entry_limit;
+	iommu->pinned_page_dirty_scope = true;
	mutex_init(&iommu->lock);
	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
 
-- 
2.23.0



[PATCH 4/7] vfio: iommu_type1: Fix missing dirty page when promote pinned_scope

2020-12-09 Thread Keqian Zhu
When we pin or detach a group which is not dirty-tracking capable, we try
to promote the pinned_scope of the vfio_iommu.

If we succeed, vfio subsequently reports only the pinned scope as dirty to
userspace, but memory written before the pin or detach is missed.

The solution is to populate every dma range as dirty before promoting the
pinned_scope of the vfio_iommu.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bd9a94590ebc..00684597b098 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1633,6 +1633,20 @@ static struct vfio_group 
*vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
return group;
 }
 
+static void vfio_populate_bitmap_all(struct vfio_iommu *iommu)
+{
+   struct rb_node *n;
+   unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+   struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+   unsigned long nbits = dma->size >> pgshift;
+
+   if (dma->iommu_mapped)
+   bitmap_set(dma->bitmap, 0, nbits);
+   }
+}
+
 static void promote_pinned_page_dirty_scope(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
@@ -1657,6 +1671,10 @@ static void promote_pinned_page_dirty_scope(struct 
vfio_iommu *iommu)
}
 
iommu->pinned_page_dirty_scope = true;
+
+   /* Set all bitmap to avoid missing dirty page */
+   if (iommu->dirty_page_tracking)
+   vfio_populate_bitmap_all(iommu);
 }
 
 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
-- 
2.23.0



[PATCH 3/7] vfio: iommu_type1: Make an explicit "promote" semantic

2020-12-09 Thread Keqian Zhu
When we want to promote the pinned_page_dirty_scope of a vfio_iommu, we
must call the "update" function to visit all vfio_groups, but when we want
to downgrade it, we can simply clear the flag directly.

Given the above, we can give that function an explicit "promote" semantic.
As a bonus, if the vfio_iommu has already been promoted, the function can
return early.

Signed-off-by: Keqian Zhu 
---
 drivers/vfio/vfio_iommu_type1.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c52bcefba96b..bd9a94590ebc 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -148,7 +148,7 @@ static int put_pfn(unsigned long pfn, int prot);
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
   struct iommu_group *iommu_group);
 
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
+static void promote_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -719,7 +719,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
group = vfio_iommu_find_iommu_group(iommu, iommu_group);
if (!group->pinned_page_dirty_scope) {
group->pinned_page_dirty_scope = true;
-   update_pinned_page_dirty_scope(iommu);
+   promote_pinned_page_dirty_scope(iommu);
}
 
goto pin_done;
@@ -1633,27 +1633,26 @@ static struct vfio_group 
*vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
return group;
 }
 
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
+static void promote_pinned_page_dirty_scope(struct vfio_iommu *iommu)
 {
struct vfio_domain *domain;
struct vfio_group *group;
 
+   if (iommu->pinned_page_dirty_scope)
+   return;
+
	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
-   if (!group->pinned_page_dirty_scope) {
-   iommu->pinned_page_dirty_scope = false;
+   if (!group->pinned_page_dirty_scope)
return;
-   }
}
}
 
if (iommu->external_domain) {
domain = iommu->external_domain;
		list_for_each_entry(group, &domain->group_list, next) {
-   if (!group->pinned_page_dirty_scope) {
-   iommu->pinned_page_dirty_scope = false;
+   if (!group->pinned_page_dirty_scope)
return;
-   }
}
}
 
@@ -2348,7 +2347,7 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
-   bool update_dirty_scope = false;
+   bool promote_dirty_scope = false;
LIST_HEAD(iova_copy);
 
	mutex_lock(&iommu->lock);
@@ -2356,7 +2355,7 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
if (iommu->external_domain) {
group = find_iommu_group(iommu->external_domain, iommu_group);
if (group) {
-   update_dirty_scope = !group->pinned_page_dirty_scope;
+   promote_dirty_scope = !group->pinned_page_dirty_scope;
			list_del(&group->next);
kfree(group);
 
@@ -2386,7 +2385,7 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
continue;
 
vfio_iommu_detach_group(domain, group);
-   update_dirty_scope = !group->pinned_page_dirty_scope;
+   promote_dirty_scope = !group->pinned_page_dirty_scope;
		list_del(&group->next);
kfree(group);
/*
@@ -2422,8 +2421,8 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
 * Removal of a group without dirty tracking may allow the iommu scope
 * to be promoted.
 */
-   if (update_dirty_scope)
-   update_pinned_page_dirty_scope(iommu);
+   if (promote_dirty_scope)
+   promote_pinned_page_dirty_scope(iommu);
	mutex_unlock(&iommu->lock);
 }
 
-- 
2.23.0



[PATCH 0/7] vfio: iommu_type1: Some fixes and optimization

2020-12-09 Thread Keqian Zhu
Hi folks,

This patch series aims to fix up and optimize some code related to vfio
dirty log tracking.

patch   1: Optimize dirty log when unwind pin pages.
patch 2-3: Optimize promoting pinned_page_dirty_scope.
patch   4: Fix up dirty log missing when promote pinned_page_dirty_scope.
patch 5-7: Drop superfluous parameter "pgsize" of some functions.

Hopefully these improve the robustness of vfio dirty log tracking.

Thanks,
Keqian

Keqian Zhu (7):
  vfio: iommu_type1: Clear added dirty bit when unwind pin
  vfio: iommu_type1: Initially set the pinned_page_dirty_scope
  vfio: iommu_type1: Make an explicit "promote" semantic
  vfio: iommu_type1: Fix missing dirty page when promote pinned_scope
  vfio: iommu_type1: Drop parameter "pgsize" of
vfio_dma_bitmap_alloc_all
  vfio: iommu_type1: Drop parameter "pgsize" of vfio_iova_dirty_bitmap.
  vfio: iommu_type1: Drop parameter "pgsize" of update_user_bitmap

 drivers/vfio/vfio_iommu_type1.c | 108 +++-
 1 file changed, 65 insertions(+), 43 deletions(-)

-- 
2.23.0



Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Peter Maydell
On Wed, 9 Dec 2020 at 20:13, Richard Henderson wrote:
>
> On 12/9/20 12:39 PM, Catalin Marinas wrote:
> >> I would have thought that the best way is to use TCO, so that we don't 
> >> have to
> >> have dual mappings (and however many MB of extra page tables that might 
> >> imply).
> >
> > The problem appears when the VMM wants to use MTE itself (e.g. linked
> > against an MTE-aware glibc), toggling TCO is no longer generic enough,
> > especially when it comes to device emulation.
>
> But we do know exactly when we're manipulating guest memory -- we have special
> routines for that.

Well, yes and no. It's not like every access to guest memory
is through a specific set of "load from guest"/"store from guest"
functions, and in some situations there's a "get a pointer to
guest RAM, keep using it over a long-ish sequence of QEMU code,
then be done with it" pattern. It's because it's not that trivial
to isolate when something is accessing guest RAM that I don't want
to just have it be mapped PROT_MTE into QEMU. I think we'd end
up spending a lot of time hunting down "whoops, turns out this
is accessing guest RAM and sometimes it trips over the tags in
a hard-to-debug way" bugs. I'd much rather the kernel just
provided us with an API for what we want, which is (1) the guest
RAM as just RAM with no tag checking and separately (2) some
mechanism yet-to-be-designed which lets us bulk copy a page's
worth of tags for migration.
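
Purely as an illustrative strawman of (2), with every name below invented
(nothing like this existed at the time of this thread), such a bulk
tag-copy interface might take a guest address range and a userspace buffer:

	/* Hypothetical sketch only: names and layout are made up. */
	struct vmm_copy_mte_tags {
		__u64 guest_ipa;	/* page-aligned guest address */
		__u64 length;		/* multiple of the page size */
		void __user *tags;	/* one tag byte per 16-byte granule */
		__u64 flags;		/* direction: read or write tags */
	};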

thanks
-- PMM


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Richard Henderson
On 12/9/20 12:39 PM, Catalin Marinas wrote:
>> I would have thought that the best way is to use TCO, so that we don't have 
>> to
>> have dual mappings (and however many MB of extra page tables that might 
>> imply).
> 
> The problem appears when the VMM wants to use MTE itself (e.g. linked
> against an MTE-aware glibc), toggling TCO is no longer generic enough,
> especially when it comes to device emulation.

But we do know exactly when we're manipulating guest memory -- we have special
routines for that.  So the special routines gain a toggle of TCO around the
exact guest memory manipulation, not a blanket disable of MTE across large
swaths of QEMU.


r~


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Catalin Marinas
On Wed, Dec 09, 2020 at 12:27:59PM -0600, Richard Henderson wrote:
> On 12/9/20 9:27 AM, Catalin Marinas wrote:
> > On Wed, Dec 09, 2020 at 01:25:18PM +, Marc Zyngier wrote:
> >> Would this syscall operate on the guest address space? Or on the VMM's
> >> own mapping?
> ...
> > Whatever is easier for the VMM, I don't think it matters as long as the
> > host kernel can get the actual physical address (and linear map
> > correspondent). Maybe simpler if it's the VMM address space as the
> > kernel can check the access permissions in case you want to hide the
> > guest memory from the VMM for other reasons (migration is also off the
> > table).
> 
> Indeed, such a syscall is no longer specific to vmm's and may be used for any
> bulk move of tags that userland might want.

For CRIU, I think the current ptrace interface would do. With VMMs, the
same remote VM model doesn't apply (the "remote" VM is actually the
guest memory). I'd keep this under a KVM ioctl() number rather than a
new, specific syscall.

> > Without syscalls, an option would be for the VMM to create two mappings:
> > one with PROT_MTE for migration and the other without for normal DMA
> > etc. That's achievable using memfd_create() or shm_open() and two mmap()
> > calls, only one having PROT_MTE. The VMM address space should be
> > sufficiently large to map two guest IPAs.
> 
> I would have thought that the best way is to use TCO, so that we don't have to
> have dual mappings (and however many MB of extra page tables that might 
> imply).

The problem appears when the VMM wants to use MTE itself (e.g. linked
against an MTE-aware glibc), toggling TCO is no longer generic enough,
especially when it comes to device emulation.
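
For reference, the two-mapping option quoted above could be set up roughly
as follows. This is only a sketch: error handling is omitted, guest_size is
a placeholder, and PROT_MTE is assumed to be the arm64 uapi value (0x20) if
the libc headers do not provide it.

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef PROT_MTE
	#define PROT_MTE 0x20			/* arm64 uapi value */
	#endif

	static void *map_guest_ram(size_t guest_size, void **ram_mte)
	{
		int fd = memfd_create("guest-ram", 0);

		ftruncate(fd, guest_size);

		/* Untagged mapping: normal accesses, DMA, device emulation. */
		void *ram = mmap(NULL, guest_size, PROT_READ | PROT_WRITE,
				 MAP_SHARED, fd, 0);

		/* Tagged alias of the same pages, used only when tags matter. */
		*ram_mte = mmap(NULL, guest_size,
				PROT_READ | PROT_WRITE | PROT_MTE,
				MAP_SHARED, fd, 0);
		return ram;
	}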

-- 
Catalin


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Richard Henderson
On 12/9/20 9:27 AM, Catalin Marinas wrote:
> On Wed, Dec 09, 2020 at 01:25:18PM +, Marc Zyngier wrote:
>> Would this syscall operate on the guest address space? Or on the VMM's
>> own mapping?
...
> Whatever is easier for the VMM, I don't think it matters as long as the
> host kernel can get the actual physical address (and linear map
> correspondent). Maybe simpler if it's the VMM address space as the
> kernel can check the access permissions in case you want to hide the
> guest memory from the VMM for other reasons (migration is also off the
> table).

Indeed, such a syscall is no longer specific to vmm's and may be used for any
bulk move of tags that userland might want.

> Without syscalls, an option would be for the VMM to create two mappings:
> one with PROT_MTE for migration and the other without for normal DMA
> etc. That's achievable using memfd_create() or shm_open() and two mmap()
> calls, only one having PROT_MTE. The VMM address space should be
> sufficiently large to map two guest IPAs.

I would have thought that the best way is to use TCO, so that we don't have to
have dual mappings (and however many MB of extra page tables that might imply).


r~


[PATCH 9/9] KVM: arm64: Remove hyp_symbol_addr

2020-12-09 Thread David Brazdil
Hyp code used the hyp_symbol_addr helper to force PC-relative addressing
because absolute addressing results in kernel VAs due to the way hyp
code is linked. This is not true anymore, so remove the helper and
update all of its users.

Acked-by: Ard Biesheuvel 
Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_asm.h | 20 
 arch/arm64/kvm/hyp/include/hyp/switch.h  |  4 ++--
 arch/arm64/kvm/hyp/nvhe/hyp-smp.c|  4 ++--
 arch/arm64/kvm/hyp/nvhe/psci-relay.c | 24 
 arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c |  2 +-
 5 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 7ccf770c53d9..22d933e9b59e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -199,26 +199,6 @@ extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
-/*
- * Obtain the PC-relative address of a kernel symbol
- * s: symbol
- *
- * The goal of this macro is to return a symbol's address based on a
- * PC-relative computation, as opposed to a loading the VA from a
- * constant pool or something similar. This works well for HYP, as an
- * absolute VA is guaranteed to be wrong. Only use this if trying to
- * obtain the address of a symbol (i.e. not something you obtained by
- * following a pointer).
- */
-#define hyp_symbol_addr(s) \
-   ({  \
-   typeof(s) *addr;\
-   asm("adrp   %0, %1\n"   \
-   "add%0, %0, :lo12:%1\n" \
-   : "=r" (addr) : "S" ());  \
-   addr;   \
-   })
-
 #define __KVM_EXTABLE(from, to)
\
"   .pushsection__kvm_ex_table, \"a\"\n"\
"   .align  3\n"\
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 84473574c2e7..54f4860cd87c 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -505,8 +505,8 @@ static inline void __kvm_unexpected_el2_exception(void)
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
 
-   entry = hyp_symbol_addr(__start___kvm_ex_table);
-   end = hyp_symbol_addr(__stop___kvm_ex_table);
+   entry = &__start___kvm_ex_table;
+   end = &__stop___kvm_ex_table;
 
while (entry < end) {
		addr = (unsigned long)&entry->insn + entry->insn;
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
index cbab0c6246e2..2048725517f8 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -33,8 +33,8 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
hyp_panic();
 
-   cpu_base_array = (unsigned long 
*)hyp_symbol_addr(kvm_arm_hyp_percpu_base);
+   cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
-   elf_base = (unsigned long)hyp_symbol_addr(__per_cpu_start);
+   elf_base = (unsigned long)&__per_cpu_start;
return this_cpu_base - elf_base;
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c 
b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 08dc9de69314..746fb7079581 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -151,8 +151,8 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context 
*host_ctxt)
if (cpu_id == INVALID_CPU_ID)
return PSCI_RET_INVALID_PARAMS;
 
-   boot_args = per_cpu_ptr(hyp_symbol_addr(cpu_on_args), cpu_id);
-   init_params = per_cpu_ptr(hyp_symbol_addr(kvm_init_params), cpu_id);
+   boot_args = per_cpu_ptr(&cpu_on_args, cpu_id);
+   init_params = per_cpu_ptr(&kvm_init_params, cpu_id);
 
/* Check if the target CPU is already being booted. */
if (!try_acquire_boot_args(boot_args))
@@ -163,7 +163,7 @@ static int psci_cpu_on(u64 func_id, struct kvm_cpu_context 
*host_ctxt)
wmb();
 
ret = psci_call(func_id, mpidr,
-   __hyp_pa(hyp_symbol_addr(kvm_hyp_cpu_entry)),
+   __hyp_pa(&kvm_hyp_cpu_entry),
__hyp_pa(init_params));
 
/* If successful, the lock will be released by the target CPU. */
@@ -182,8 +182,8 @@ static int psci_cpu_suspend(u64 func_id, struct 
kvm_cpu_context *host_ctxt)
struct psci_boot_args *boot_args;
struct kvm_nvhe_init_params *init_params;
 
-   boot_args = this_cpu_ptr(hyp_symbol_addr(suspend_args));
-  

[PATCH 8/9] KVM: arm64: Remove patching of fn pointers in hyp

2020-12-09 Thread David Brazdil
Storing a function pointer in hyp now generates relocation information
used at early boot to convert the address to hyp VA. The existing
alternative-based conversion mechanism is therefore obsolete. Remove it
and simplify its users.

Acked-by: Ard Biesheuvel 
Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h   | 18 --
 arch/arm64/kernel/image-vars.h |  1 -
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 ---
 arch/arm64/kvm/va_layout.c |  6 --
 4 files changed, 4 insertions(+), 32 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index adadc468cc71..90873851f677 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -135,24 +135,6 @@ static __always_inline unsigned long 
__kern_hyp_va(unsigned long v)
 
 #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v
 
-static __always_inline unsigned long __kimg_hyp_va(unsigned long v)
-{
-   unsigned long offset;
-
-   asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
-   "movk %0, #0, lsl #16\n"
-   "movk %0, #0, lsl #32\n"
-   "movk %0, #0, lsl #48\n",
-   kvm_update_kimg_phys_offset)
-: "=r" (offset));
-
-   return __kern_hyp_va((v - offset) | PAGE_OFFSET);
-}
-
-#define kimg_fn_hyp_va(v)  ((typeof(*v))(__kimg_hyp_va((unsigned 
long)(v
-
-#define kimg_fn_ptr(x) (typeof(x) **)(x)
-
 /*
  * We currently support using a VM-specified IPA size. For backward
  * compatibility, the default IPA size is fixed to 40bits.
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 39289d75118d..3242502f45fa 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -64,7 +64,6 @@ __efistub__ctype  = _ctype;
 /* Alternative callbacks for init-time patching of nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
-KVM_NVHE_ALIAS(kvm_update_kimg_phys_offset);
 KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
 
 /* Global kernel state accessed by nVHE hyp code. */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index bde658d51404..0cf4b750a090 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -108,9 +108,9 @@ static void handle___vgic_v3_restore_aprs(struct 
kvm_cpu_context *host_ctxt)
 
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
-#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = kimg_fn_ptr(handle_##x)
+#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
 
-static const hcall_t *host_hcall[] = {
+static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
@@ -130,7 +130,6 @@ static const hcall_t *host_hcall[] = {
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
 {
DECLARE_REG(unsigned long, id, host_ctxt, 0);
-   const hcall_t *kfn;
hcall_t hfn;
 
id -= KVM_HOST_SMCCC_ID(0);
@@ -138,13 +137,11 @@ static void handle_host_hcall(struct kvm_cpu_context 
*host_ctxt)
if (unlikely(id >= ARRAY_SIZE(host_hcall)))
goto inval;
 
-   kfn = host_hcall[id];
-   if (unlikely(!kfn))
+   hfn = host_hcall[id];
+   if (unlikely(!hfn))
goto inval;
 
cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS;
-
-   hfn = kimg_fn_hyp_va(kfn);
hfn(host_ctxt);
 
return;
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index fb2ca02b7270..e0021ba960b5 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -284,12 +284,6 @@ static void generate_mov_q(u64 val, __le32 *origptr, 
__le32 *updptr, int nr_inst
*updptr++ = cpu_to_le32(insn);
 }
 
-void kvm_update_kimg_phys_offset(struct alt_instr *alt,
-__le32 *origptr, __le32 *updptr, int nr_inst)
-{
-   generate_mov_q(kimage_voffset + PHYS_OFFSET, origptr, updptr, nr_inst);
-}
-
 void kvm_get_kimage_voffset(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst)
 {
-- 
2.29.2.576.ga3fc446d84-goog



[PATCH 7/9] KVM: arm64: Fix constant-pool users in hyp

2020-12-09 Thread David Brazdil
Hyp code uses absolute addressing to obtain a kimg VA of a small number
of kernel symbols. Since the kernel now converts constant pool addresses
to hyp VAs, this trick does not work anymore.

Change the helpers to convert from hyp VA back to kimg VA or PA, as
needed and rework the callers accordingly.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/kvm_mmu.h   | 42 --
 arch/arm64/kvm/hyp/nvhe/host.S | 29 +++--
 arch/arm64/kvm/hyp/nvhe/hyp-init.S |  2 --
 3 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 6bbb44011c84..adadc468cc71 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -73,49 +73,39 @@ alternative_cb_end
 .endm
 
 /*
- * Convert a kernel image address to a PA
- * reg: kernel address to be converted in place
+ * Convert a hypervisor VA to a PA
+ * reg: hypervisor address to be converted in place
  * tmp: temporary register
- *
- * The actual code generation takes place in kvm_get_kimage_voffset, and
- * the instructions below are only there to reserve the space and
- * perform the register allocation (kvm_get_kimage_voffset uses the
- * specific registers encoded in the instructions).
  */
-.macro kimg_pa reg, tmp
-alternative_cb kvm_get_kimage_voffset
-   movz\tmp, #0
-   movk\tmp, #0, lsl #16
-   movk\tmp, #0, lsl #32
-   movk\tmp, #0, lsl #48
-alternative_cb_end
-
-   /* reg = __pa(reg) */
-   sub \reg, \reg, \tmp
+.macro hyp_pa reg, tmp
+   ldr_l   \tmp, hyp_physvirt_offset
+   add \reg, \reg, \tmp
 .endm
 
 /*
- * Convert a kernel image address to a hyp VA
- * reg: kernel address to be converted in place
+ * Convert a hypervisor VA to a kernel image address
+ * reg: hypervisor address to be converted in place
  * tmp: temporary register
  *
  * The actual code generation takes place in kvm_get_kimage_voffset, and
  * the instructions below are only there to reserve the space and
- * perform the register allocation (kvm_update_kimg_phys_offset uses the
+ * perform the register allocation (kvm_get_kimage_voffset uses the
  * specific registers encoded in the instructions).
  */
-.macro kimg_hyp_va reg, tmp
-alternative_cb kvm_update_kimg_phys_offset
+.macro hyp_kimg_va reg, tmp
+   /* Convert hyp VA -> PA. */
+   hyp_pa  \reg, \tmp
+
+   /* Load kimage_voffset. */
+alternative_cb kvm_get_kimage_voffset
movz\tmp, #0
movk\tmp, #0, lsl #16
movk\tmp, #0, lsl #32
movk\tmp, #0, lsl #48
 alternative_cb_end
 
-   sub \reg, \reg, \tmp
-   mov_q   \tmp, PAGE_OFFSET
-   orr \reg, \reg, \tmp
-   kern_hyp_va \reg
+   /* Convert PA -> kimg VA. */
+   add \reg, \reg, \tmp
 .endm
 
 #else
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index a820dfdc9c25..6585a7cbbc56 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -74,27 +74,28 @@ SYM_FUNC_END(__host_enter)
  * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 
par);
  */
 SYM_FUNC_START(__hyp_do_panic)
-   /* Load the format arguments into x1-7 */
-   mov x6, x3
-   get_vcpu_ptr x7, x3
-
-   mrs x3, esr_el2
-   mrs x4, far_el2
-   mrs x5, hpfar_el2
-
/* Prepare and exit to the host's panic funciton. */
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
  PSR_MODE_EL1h)
msr spsr_el2, lr
ldr lr, =panic
+   hyp_kimg_va lr, x6
msr elr_el2, lr
 
-   /*
-* Set the panic format string and enter the host, conditionally
-* restoring the host context.
-*/
+   /* Set the panic format string. Use the, now free, LR as scratch. */
+   ldr lr, =__hyp_panic_string
+   hyp_kimg_va lr, x6
+
+   /* Load the format arguments into x1-7. */
+   mov x6, x3
+   get_vcpu_ptr x7, x3
+   mrs x3, esr_el2
+   mrs x4, far_el2
+   mrs x5, hpfar_el2
+
+   /* Enter the host, conditionally restoring the host context. */
cmp x0, xzr
-   ldr x0, =__hyp_panic_string
+   mov x0, lr
b.eq__host_enter_without_restoring
b   __host_enter_for_panic
 SYM_FUNC_END(__hyp_do_panic)
@@ -124,7 +125,7 @@ SYM_FUNC_END(__hyp_do_panic)
 * Preserve x0-x4, which may contain stub parameters.
 */
ldr x5, =__kvm_handle_stub_hvc
-   kimg_pa x5, x6
+   hyp_pa  x5, x6
br  x5
 .L__vect_end\@:
 .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S 
b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 68fd64f2313e..99b408fe09ee 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -139,7 +139,6 @@ alternative_else_nop_endif

[PATCH 5/9] KVM: arm64: Generate hyp relocation data

2020-12-09 Thread David Brazdil
Add a post-processing step to compilation of KVM nVHE hyp code which
calls a custom host tool (gen-hyprel) on the partially linked object
file (hyp sections' names prefixed).

The tool lists all R_AARCH64_ABS64 data relocations targeting hyp
sections and generates an assembly file that will form a new section
.hyp.reloc in the kernel binary. The new section contains an array of
32-bit offsets to the positions targeted by these relocations.

Since the addresses of those positions will not be determined until the
linking of `vmlinux`, each 32-bit entry carries a R_AARCH64_PREL32
relocation with addend <section symbol> + <offset within the section>. The
linker of `vmlinux` will therefore fill the slot accordingly.

This relocation data will be used at runtime to convert the kernel VAs
at those positions to hyp VAs.

Signed-off-by: David Brazdil 
---
 arch/arm64/kernel/vmlinux.lds.S  |  11 +
 arch/arm64/kvm/hyp/nvhe/Makefile |  28 ++-
 arch/arm64/kvm/hyp/nvhe/gen-hyprel.c | 326 +++
 3 files changed, 362 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/gen-hyprel.c

diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index f294f2048955..93cef9607c0e 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -43,10 +43,19 @@ jiffies = jiffies_64;
HYP_SECTION_NAME(.data..percpu) : { \
*(HYP_SECTION_NAME(.data..percpu))  \
}
+
+#define HYPERVISOR_RELOC_SECTION   \
+   .hyp.reloc : ALIGN(4) { \
+   __hyp_reloc_begin = .;  \
+   *(.hyp.reloc)   \
+   __hyp_reloc_end = .;\
+   }
+
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
 #define HYPERVISOR_DATA_SECTIONS
 #define HYPERVISOR_PERCPU_SECTION
+#define HYPERVISOR_RELOC_SECTION
 #endif
 
 #define HYPERVISOR_TEXT\
@@ -219,6 +228,8 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
HYPERVISOR_PERCPU_SECTION
 
+   HYPERVISOR_RELOC_SECTION
+
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
}
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 1f1e351c5fe2..268be1376f74 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,6 +6,8 @@
 asflags-y := -D__KVM_NVHE_HYPERVISOR__
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__
 
+hostprogs := gen-hyprel
+
 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
 hyp-main.o hyp-smp.o psci-relay.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
@@ -19,7 +21,7 @@ obj-y += ../vgic-v3-sr.o ../aarch32.o 
../vgic-v2-cpuif-proxy.o ../entry.o \
 
 hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
 obj-y := kvm_nvhe.o
-extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S 
hyp-reloc.o
 
 # 1) Compile all source files to `.nvhe.o` object files. The file extension
 #avoids file name clashes for files shared with VHE.
@@ -42,11 +44,31 @@ LDFLAGS_kvm_nvhe.tmp.o := -r -T
 $(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
$(call if_changed,ld)
 
-# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# 4) Generate list of hyp code/data positions that need to be relocated at
+#runtime. Because the hypervisor is part of the kernel binary, relocations
+#produce a kernel VA. We enumerate relocations targeting hyp at build time
+#and convert the kernel VAs at those positions to hyp VAs.
+$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel
+   $(call if_changed,hyprel)
+
+# 5) Compile hyp-reloc.S and link it into the existing partially linked object.
+#The object file now contains a section with pointers to hyp positions that
+#will contain kernel VAs at runtime. These pointers have relocations on 
them
+#so that they get updated as the hyp object is linked into `vmlinux`.
+LDFLAGS_kvm_nvhe.rel.o := -r
+$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE
+   $(call if_changed,ld)
+
+# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
 #Prefixes names of ELF symbols with '__kvm_nvhe_'.
-$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
$(call if_changed,hypcopy)
 
+# The HYPREL command calls `gen-hyprel` to generate an assembly file with
+# a list of relocations targeting hyp code/data.
+quiet_cmd_hyprel = HYPREL  $@
+  cmd_hyprel = $(obj)/gen-hyprel $< > $@
+
 # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
 # to avoid clashes with VHE code/data.
 quiet_cmd_hypcopy = HYPCOPY $@
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c 
b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
new 
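
Going by the description above and the section-start symbols introduced in
patch 4/9, each entry emitted into hyp-reloc.S is a 32-bit slot carrying a
PREL32 relocation back to the targeted position, roughly along these lines
(section names and offsets are illustrative only):

	.section .hyp.reloc, "a"
	.align 2
	/* one entry per R_AARCH64_ABS64 relocation targeting a hyp section */
	.long __hyp_section_.hyp.text + 0x1f8 - .
	.long __hyp_section_.hyp.rodata + 0x40 - .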

[PATCH 4/9] KVM: arm64: Add symbol at the beginning of each hyp section

2020-12-09 Thread David Brazdil
Generating hyp relocations will require referencing positions at a given
offset from the beginning of hyp sections. Since the final layout will
not be determined until the linking of `vmlinux`, modify the hyp linker
script to insert a symbol at the first byte of each hyp section to use
as an anchor. The linker of `vmlinux` will place the symbols together
with the sections.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/hyp_image.h | 29 +++--
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S  |  4 ++--
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/hyp_image.h 
b/arch/arm64/include/asm/hyp_image.h
index daa1a1da539e..65e8008da932 100644
--- a/arch/arm64/include/asm/hyp_image.h
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -7,6 +7,9 @@
 #ifndef __ARM64_HYP_IMAGE_H__
 #define __ARM64_HYP_IMAGE_H__
 
+#define HYP_CONCAT(a, b)   __HYP_CONCAT(a, b)
+#define __HYP_CONCAT(a, b) a ## b
+
 /*
  * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
  * to separate it from the kernel proper.
@@ -21,9 +24,31 @@
  */
 #define HYP_SECTION_NAME(NAME) .hyp##NAME
 
+/* Symbol defined at the beginning of each hyp section. */
+#define HYP_SECTION_SYMBOL_NAME(NAME) \
+   HYP_CONCAT(__hyp_section_, HYP_SECTION_NAME(NAME))
+
+/*
+ * Helper to generate linker script statements starting a hyp section.
+ *
+ * A symbol with a well-known name is defined at the first byte. This
+ * is used as a base for hyp relocations (see gen-hyprel.c). It must
+ * be defined inside the section so the linker of `vmlinux` cannot
+ * separate it from the section data.
+ */
+#define BEGIN_HYP_SECTION(NAME)\
+   HYP_SECTION_NAME(NAME) : {  \
+   HYP_SECTION_SYMBOL_NAME(NAME) = .;
+
+/* Helper to generate linker script statements ending a hyp section. */
+#define END_HYP_SECTION\
+   }
+
 /* Defines an ELF hyp section from input section @NAME and its subsections. */
-#define HYP_SECTION(NAME) \
-   HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
+#define HYP_SECTION(NAME)  \
+   BEGIN_HYP_SECTION(NAME) \
+   *(NAME NAME##.*)\
+   END_HYP_SECTION
 
 /*
  * Defines a linker script alias of a kernel-proper symbol referenced by
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S 
b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index cfdc59b4329b..cd119d82d8e3 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -22,7 +22,7 @@ SECTIONS {
 * alignment for when linking into vmlinux.
 */
. = ALIGN(PAGE_SIZE);
-   HYP_SECTION_NAME(.data..percpu) : {
+   BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
-   }
+   END_HYP_SECTION
 }
-- 
2.29.2.576.ga3fc446d84-goog



[PATCH 3/9] KVM: arm64: Set up .hyp.rodata ELF section

2020-12-09 Thread David Brazdil
We will need to recognize pointers in .rodata specific to hyp, so
establish a .hyp.rodata ELF section. Merge it with the existing
.hyp.data..ro_after_init as they are treated the same at runtime.

Signed-off-by: David Brazdil 
---
 arch/arm64/include/asm/sections.h | 2 +-
 arch/arm64/kernel/vmlinux.lds.S   | 7 ---
 arch/arm64/kvm/arm.c  | 7 +++
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +++-
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/sections.h 
b/arch/arm64/include/asm/sections.h
index 8ff579361731..a6f3557d1ab2 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -11,7 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[];
 extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
 extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
-extern char __hyp_data_ro_after_init_start[], __hyp_data_ro_after_init_end[];
+extern char __hyp_rodata_start[], __hyp_rodata_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 43af13968dfd..f294f2048955 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -31,10 +31,11 @@ jiffies = jiffies_64;
__stop___kvm_ex_table = .;
 
 #define HYPERVISOR_DATA_SECTIONS   \
-   HYP_SECTION_NAME(.data..ro_after_init) : {  \
-   __hyp_data_ro_after_init_start = .; \
+   HYP_SECTION_NAME(.rodata) : {   \
+   __hyp_rodata_start = .; \
*(HYP_SECTION_NAME(.data..ro_after_init))   \
-   __hyp_data_ro_after_init_end = .;   \
+   *(HYP_SECTION_NAME(.rodata))\
+   __hyp_rodata_end = .;   \
}
 
 #define HYPERVISOR_PERCPU_SECTION  \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 6e637d2b4cfb..c244e57f9cd9 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1745,11 +1745,10 @@ static int init_hyp_mode(void)
goto out_err;
}
 
-   err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_ro_after_init_start),
- kvm_ksym_ref(__hyp_data_ro_after_init_end),
- PAGE_HYP_RO);
+   err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
+ kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {
-   kvm_err("Cannot map .hyp.data..ro_after_init section\n");
+   kvm_err("Cannot map .hyp.rodata section\n");
goto out_err;
}
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S 
b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 70ac48ccede7..cfdc59b4329b 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -14,6 +14,9 @@
 SECTIONS {
HYP_SECTION(.idmap.text)
HYP_SECTION(.text)
+   HYP_SECTION(.data..ro_after_init)
+   HYP_SECTION(.rodata)
+
/*
 * .hyp..data..percpu needs to be page aligned to maintain the same
 * alignment for when linking into vmlinux.
@@ -22,5 +25,4 @@ SECTIONS {
HYP_SECTION_NAME(.data..percpu) : {
PERCPU_INPUT(L1_CACHE_BYTES)
}
-   HYP_SECTION(.data..ro_after_init)
 }
-- 
2.29.2.576.ga3fc446d84-goog



[PATCH 0/9] KVM: arm64: Relocate absolute hyp VAs

2020-12-09 Thread David Brazdil
nVHE hyp code is linked into the same kernel binary but executes under
different memory mappings. If the compiler of hyp code chooses absolute
addressing for accessing a symbol, the kernel linker will relocate that
address to a kernel image virtual address, causing a runtime exception.

So far the strategy has been to force PC-relative addressing by wrapping
all symbol references with the hyp_symbol_addr macro. This is error
prone and developer unfriendly.

The series adds a new build-time step for nVHE hyp object file where
positions targeted by R_AARCH64_ABS64 relocations are enumerated and
the information stored in a separate ELF section in the kernel image.
At runtime, the kernel first relocates all absolute addresses to their
actual virtual offset (eg. for KASLR), and then addresses listed in this
section are converted to hyp VAs.

The RFC of this series did not have a build-time step and instead relied
on filtering dynamic relocations at runtime. That approach does not work
if the kernel is built with !CONFIG_RELOCATABLE, hence an always-present
set of relocation positions was added.

The series is based on the current kvmarm/next (commit 3a514592b6) and
structured as follows:
  * patch 1 is Jamie's fix of .hyp.data..percpu alignment; already in
master, not yet in kvmarm/next; included to avoid merge conflicts
  * patches 2-3 make sure that all sections referred to by hyp code are
handled by the hyp linker script and prefixed with .hyp so they can
be identified by the build-time tool
  * patches 4-6 contain the actual changes to identify and relocate VAs
  * patches 7-8 fix existing code that assumes kernel VAs
  * patch 9 removes the (now redundant) hyp_symbol_addr

The series is also available at:
  https://android-kvm.googlesource.com/linux topic/hyp-reloc_v1

-David

David Brazdil (8):
  KVM: arm64: Rename .idmap.text in hyp linker script
  KVM: arm64: Set up .hyp.rodata ELF section
  KVM: arm64: Add symbol at the beginning of each hyp section
  KVM: arm64: Generate hyp relocation data
  KVM: arm64: Apply hyp relocations at runtime
  KVM: arm64: Fix constant-pool users in hyp
  KVM: arm64: Remove patching of fn pointers in hyp
  KVM: arm64: Remove hyp_symbol_addr

Jamie Iles (1):
  KVM: arm64: Correctly align nVHE percpu data

 arch/arm64/configs/defconfig |   1 +
 arch/arm64/include/asm/hyp_image.h   |  29 +-
 arch/arm64/include/asm/kvm_asm.h |  20 --
 arch/arm64/include/asm/kvm_mmu.h |  61 ++---
 arch/arm64/include/asm/sections.h|   3 +-
 arch/arm64/kernel/image-vars.h   |   1 -
 arch/arm64/kernel/smp.c  |   4 +-
 arch/arm64/kernel/vmlinux.lds.S  |  18 +-
 arch/arm64/kvm/arm.c |   7 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h  |   4 +-
 arch/arm64/kvm/hyp/nvhe/Makefile |  28 +-
 arch/arm64/kvm/hyp/nvhe/gen-hyprel.c | 326 +++
 arch/arm64/kvm/hyp/nvhe/host.S   |  29 +-
 arch/arm64/kvm/hyp/nvhe/hyp-init.S   |   4 +-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c   |  11 +-
 arch/arm64/kvm/hyp/nvhe/hyp-smp.c|   4 +-
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S|  14 +-
 arch/arm64/kvm/hyp/nvhe/psci-relay.c |  24 +-
 arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c |   2 +-
 arch/arm64/kvm/va_layout.c   |  34 ++-
 20 files changed, 495 insertions(+), 129 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/gen-hyprel.c

--
2.29.2.576.ga3fc446d84-goog


[PATCH 6/9] KVM: arm64: Apply hyp relocations at runtime

2020-12-09 Thread David Brazdil
KVM nVHE code runs under a different VA mapping than the kernel, hence
so far it avoided using absolute addressing because the VA in a constant
pool is relocated by the linker to a kernel VA (see hyp_symbol_addr).

Now the kernel has access to a list of positions that contain a kimg VA
but will be accessed only in hyp execution context. These are generated
by the gen-hyprel build-time tool and stored in .hyp.reloc.

Add early boot pass over the entries and convert the kimg VAs to hyp VAs.
Note that this requires for .hyp* ELF sections to be mapped read-write
at that point.

Signed-off-by: David Brazdil 
---
 arch/arm64/configs/defconfig  |  1 +
 arch/arm64/include/asm/kvm_mmu.h  |  1 +
 arch/arm64/include/asm/sections.h |  1 +
 arch/arm64/kernel/smp.c   |  4 +++-
 arch/arm64/kvm/va_layout.c| 28 
 5 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 5cfe3cf6f2ac..73fc9f2f2661 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1092,3 +1092,4 @@ CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_PREEMPT is not set
 # CONFIG_FTRACE is not set
 CONFIG_MEMTEST=y
+# CONFIG_ARM64_VHE is not set
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e52d82aeadca..6bbb44011c84 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -129,6 +129,7 @@ alternative_cb_end
 void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
 void kvm_compute_layout(void);
+void kvm_apply_hyp_relocations(void);
 
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
diff --git a/arch/arm64/include/asm/sections.h 
b/arch/arm64/include/asm/sections.h
index a6f3557d1ab2..2f36b16a5b5d 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -12,6 +12,7 @@ extern char __hibernate_exit_text_start[], 
__hibernate_exit_text_end[];
 extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
 extern char __hyp_rodata_start[], __hyp_rodata_end[];
+extern char __hyp_reloc_begin[], __hyp_reloc_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 18e9727d3f64..47142395bc91 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -434,8 +434,10 @@ static void __init hyp_mode_check(void)
   "CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
-   if (IS_ENABLED(CONFIG_KVM))
+   if (IS_ENABLED(CONFIG_KVM)) {
kvm_compute_layout();
+   kvm_apply_hyp_relocations();
+   }
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index d8cc51bd60bf..fb2ca02b7270 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -82,6 +82,34 @@ __init void kvm_compute_layout(void)
init_hyp_physvirt_offset();
 }
 
+/*
+ * The .hyp.reloc ELF section contains a list of kimg positions that
+ * contains kimg VAs but will be accessed only in hyp execution context.
+ * Convert them to hyp VAs. See gen-hyprel.c for more details.
+ */
+__init void kvm_apply_hyp_relocations(void)
+{
+   int32_t *rel;
+   int32_t *begin = (int32_t*)__hyp_reloc_begin;
+   int32_t *end = (int32_t*)__hyp_reloc_end;
+
+   for (rel = begin; rel < end; ++rel) {
+   uintptr_t *ptr, kimg_va;
+
+   /*
+* Each entry contains a 32-bit relative offset from itself
+* to a kimg VA position.
+*/
+   ptr = (uintptr_t*)lm_alias((char*)rel + *rel);
+
+   /* Read the kimg VA value at the relocation address. */
+   kimg_va = *ptr;
+
+   /* Convert to hyp VA and store back to the relocation address. 
*/
+   *ptr = __early_kern_hyp_va((uintptr_t)lm_alias(kimg_va));
+   }
+}
+
 static u32 compute_instruction(int n, u32 rd, u32 rn)
 {
u32 insn = AARCH64_BREAK_FAULT;
-- 
2.29.2.576.ga3fc446d84-goog



[PATCH 1/9] KVM: arm64: Correctly align nVHE percpu data

2020-12-09 Thread David Brazdil
From: Jamie Iles 

The nVHE percpu data is partially linked but the nVHE linker script did
not align the percpu section.  The PERCPU_INPUT macro would then align
the data to a page boundary:

  #define PERCPU_INPUT(cacheline)   \
__per_cpu_start = .;\
*(.data..percpu..first) \
. = ALIGN(PAGE_SIZE);   \
*(.data..percpu..page_aligned)  \
. = ALIGN(cacheline);   \
*(.data..percpu..read_mostly)   \
. = ALIGN(cacheline);   \
*(.data..percpu)\
*(.data..percpu..shared_aligned)\
PERCPU_DECRYPTED_SECTION\
__per_cpu_end = .;

but then when the final vmlinux linking happens the hypervisor percpu
data is included after page alignment and so the offsets potentially
don't match.  On my build I saw that the .hyp.data..percpu section was
at address 0x20 and then the percpu data would begin at 0x1000 (because
of the page alignment in PERCPU_INPUT), but when linked into vmlinux,
everything would be shifted down by 0x20 bytes.

This manifests as one of the CPUs getting lost when running
kvm-unit-tests or starting any VM and subsequent soft lockup on a Cortex
A72 device.

Fixes: 30c953911c43 ("kvm: arm64: Set up hyp percpu data for nVHE")
Signed-off-by: Jamie Iles 
Signed-off-by: Marc Zyngier 
Acked-by: David Brazdil 
Cc: David Brazdil 
Cc: Marc Zyngier 
Cc: Will Deacon 
Link: https://lore.kernel.org/r/20201113150406.14314-1-ja...@nuviainc.com
---
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S 
b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 5d76ff2ba63e..1206d0d754d5 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -13,6 +13,11 @@
 
 SECTIONS {
HYP_SECTION(.text)
+   /*
+* .hyp..data..percpu needs to be page aligned to maintain the same
+* alignment for when linking into vmlinux.
+*/
+   . = ALIGN(PAGE_SIZE);
HYP_SECTION_NAME(.data..percpu) : {
PERCPU_INPUT(L1_CACHE_BYTES)
}
-- 
2.29.2.576.ga3fc446d84-goog



[PATCH 2/9] KVM: arm64: Rename .idmap.text in hyp linker script

2020-12-09 Thread David Brazdil
So far hyp-init.S created a .hyp.idmap.text section directly, without
relying on the hyp linker script to prefix its name. Change it to create
.idmap.text and add a HYP_SECTION entry to hyp.lds.S. This way all .hyp*
sections go through the linker script and can be instrumented there.

Signed-off-by: David Brazdil 
---
 arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 +-
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S 
b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 31b060a44045..68fd64f2313e 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -18,7 +18,7 @@
 #include 
 
.text
-   .pushsection.hyp.idmap.text, "ax"
+   .pushsection.idmap.text, "ax"
 
.align  11
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S 
b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 1206d0d754d5..70ac48ccede7 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -12,6 +12,7 @@
 #include 
 
 SECTIONS {
+   HYP_SECTION(.idmap.text)
HYP_SECTION(.text)
/*
 * .hyp..data..percpu needs to be page aligned to maintain the same
-- 
2.29.2.576.ga3fc446d84-goog

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC PATCH 5/6] kvm: arm64: Fix constant-pool users in hyp

2020-12-09 Thread David Brazdil
Hey, realized I never replied to this...

On Tue, Nov 24, 2020 at 03:08:20PM +0100, Ard Biesheuvel wrote:
> On Thu, 19 Nov 2020 at 17:26, David Brazdil  wrote:
> >
> > Hyp code used to use absolute addressing via a constant pool to obtain
> > the kernel VA of 3 symbols - panic, __hyp_panic_string and
> > __kvm_handle_stub_hvc. This used to work because the kernel would
> > relocate the addresses in the constant pool to kernel VA at boot and hyp
> > would simply load them from there.
> >
> > Now that relocations are fixed up to point to hyp VAs, this does not
> > work any longer. Rework the helpers to convert hyp VA to kernel VA / PA
> > as needed.
> >
> 
> Ok, so the reason for the problem is that the literal exists inside
> the HYP text, and all literals are fixed up using the HYP mapping,
> even if they don't point to something that is mapped at HYP. Would it
> make sense to simply disregard literals that point outside of the HYP
> VA mapping?

That would be possible - I'm about to post a v1 with build-time generation of
these relocs, and kernel symbols are easily recognizable there as they're
undefined in that ELF object. But I do worry that it would confuse people a lot.
And if somebody referenced a kernel symbol in C (maybe not a use case, but...),
they would get different results depending on which addressing mode the
compiler picked.

> 
> > Signed-off-by: David Brazdil 
> > ---
> >  arch/arm64/include/asm/kvm_mmu.h | 29 +++--
> >  arch/arm64/kvm/hyp/nvhe/host.S   | 29 +++--
> >  2 files changed, 34 insertions(+), 24 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> > index 8cb8974ec9cc..0676ff2105bb 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -72,9 +72,14 @@ alternative_cb kvm_update_va_mask
> >  alternative_cb_end
> >  .endm
> >
> > +.macro hyp_pa reg, tmp
> > +   ldr_l   \tmp, hyp_physvirt_offset
> > +   add \reg, \reg, \tmp
> > +.endm
> > +
> >  /*
> > - * Convert a kernel image address to a PA
> > - * reg: kernel address to be converted in place
> > + * Convert a hypervisor VA to a kernel image address
> > + * reg: hypervisor address to be converted in place
> >   * tmp: temporary register
> >   *
> >   * The actual code generation takes place in kvm_get_kimage_voffset, and
> > @@ -82,18 +87,22 @@ alternative_cb_end
> >   * perform the register allocation (kvm_get_kimage_voffset uses the
> >   * specific registers encoded in the instructions).
> >   */
> > -.macro kimg_pa reg, tmp
> > +.macro hyp_kimg reg, tmp
> > +   /* Convert hyp VA -> PA. */
> > +   hyp_pa  \reg, \tmp
> > +
> > +   /* Load kimage_voffset. */
> >  alternative_cb kvm_get_kimage_voffset
> > -   movz\tmp, #0
> > -   movk\tmp, #0, lsl #16
> > -   movk\tmp, #0, lsl #32
> > -   movk\tmp, #0, lsl #48
> > +   movz\tmp, #0
> > +   movk\tmp, #0, lsl #16
> > +   movk\tmp, #0, lsl #32
> > +   movk\tmp, #0, lsl #48
> >  alternative_cb_end
> >
> > -   /* reg = __pa(reg) */
> > -   sub \reg, \reg, \tmp
> > +   /* Convert PA -> kimg VA. */
> > +   add \reg, \reg, \tmp
> >  .endm
> > -
> > +
> >  #else
> >
> >  #include 
> > diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
> > index 596dd5ae8e77..bcb80d525d8c 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/host.S
> > +++ b/arch/arm64/kvm/hyp/nvhe/host.S
> > @@ -74,27 +74,28 @@ SYM_FUNC_END(__host_enter)
> >   * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
> >   */
> >  SYM_FUNC_START(__hyp_do_panic)
> > -   /* Load the format arguments into x1-7 */
> > -   mov x6, x3
> > -   get_vcpu_ptr x7, x3
> > -
> > -   mrs x3, esr_el2
> > -   mrs x4, far_el2
> > -   mrs x5, hpfar_el2
> > -
> > /* Prepare and exit to the host's panic funciton. */
> > mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
> >   PSR_MODE_EL1h)
> > msr spsr_el2, lr
> > ldr lr, =panic
> > +   hyp_kimg lr, x6
> > msr elr_el2, lr
> >
> > -   /*
> > -* Set the panic format string and enter the host, conditionally
> > -* restoring the host context.
> > -*/
> > +   /* Set the panic format string. Use the, now free, LR as scratch. */
> > +   ldr lr, =__hyp_panic_string
> > +   hyp_kimg lr, x6
> > +
> > +   /* Load the format arguments into x1-7. */
> > +   mov x6, x3
> > +   get_vcpu_ptr x7, x3
> > +   mrs x3, esr_el2
> > +   mrs x4, far_el2
> > +   mrs x5, hpfar_el2
> > +
> > +   /* Enter the host, conditionally restoring the host context. */
> > cmp x0, xzr
> > -   ldr x0, =__hyp_panic_string
> > +   mov x0, lr
> > b.eq__host_enter_without_restoring
> >  

Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Catalin Marinas
On Wed, Dec 09, 2020 at 01:25:18PM +, Marc Zyngier wrote:
> On 2020-12-09 12:44, Catalin Marinas wrote:
> > On Tue, Dec 08, 2020 at 06:21:12PM +, Marc Zyngier wrote:
> > > On 2020-12-08 17:21, Catalin Marinas wrote:
> > > > On Mon, Dec 07, 2020 at 07:03:13PM +, Marc Zyngier wrote:
> > > > > I wonder whether we will have to have something kernel side to
> > > > > dump/reload tags in a way that matches the patterns used by live
> > > > > migration.
> > > >
> > We have something related - ptrace dumps/restores the tags. Can the same
> > > > concept be expanded to a KVM ioctl?
> > > 
> > > Yes, although I wonder whether we should integrate this deeply into
> > > the dirty-log mechanism: it would be really interesting to dump the
> > > tags at the point where the page is flagged as clean from a dirty-log
> > > point of view. As the page is dirtied, discard the saved tags.
> > 
> > From the VMM perspective, the tags can be treated just like additional
> > (meta)data in a page. We'd only need the tags when copying over. It can
> > race with the VM dirtying the page (writing tags would dirty it) but I
> > don't think the current migration code cares about this. If dirtied, it
> > copies it again.
> > 
> > The only downside I see is an extra syscall per page both on the origin
> > VMM and the destination one to dump/restore the tags. Is this a
> > performance issue?
> 
> I'm not sure. Migrating VMs already has a massive overhead, so an extra
> syscall per page isn't terrifying. But that's the point where I admit
> not knowing enough about what the VMM expects, nor whether that matches
> what happens on other architectures that deal with per-page metadata.
> 
> Would this syscall operate on the guest address space? Or on the VMM's
> own mapping?

Whatever is easier for the VMM, I don't think it matters as long as the
host kernel can get the actual physical address (and linear map
correspondent). Maybe simpler if it's the VMM address space as the
kernel can check the access permissions in case you want to hide the
guest memory from the VMM for other reasons (migration is also off the
table).

Without syscalls, an option would be for the VMM to create two mappings:
one with PROT_MTE for migration and the other without for normal DMA
etc. That's achievable using memfd_create() or shm_open() and two mmap()
calls, only one having PROT_MTE. The VMM address space should be
sufficiently large to map two guest IPAs.
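
A minimal userspace sketch of that dual-mapping idea (illustrative only:
error handling is omitted, the 2MB size is arbitrary, and PROT_MTE is
defined locally in case the libc headers predate it; the value matches
the arm64 uapi header):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20	/* arm64 mmap() protection flag for tagged memory */
#endif

int main(void)
{
	/* Illustrative sketch only: two views of the same guest memory. */
	size_t size = 2UL << 20;		/* 2MB of "guest" RAM */
	int fd = memfd_create("guest-mem", 0);	/* shm_open() would work too */

	ftruncate(fd, size);

	/* Untagged view: the normal guest RAM mapping used for DMA etc. */
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);

	/* Tagged view: only this mapping lets the VMM access the MTE tags,
	 * e.g. when dumping/restoring them during migration. */
	void *ram_mte = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
			     MAP_SHARED, fd, 0);

	/* ... use ram for normal accesses, ram_mte for tag transfer ... */

	munmap(ram_mte, size);
	munmap(ram, size);
	close(fd);
	return 0;
}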

-- 
Catalin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


RE: [PATCH v13 05/15] iommu/smmuv3: Get prepared for nested stage support

2020-12-09 Thread Shameerali Kolothum Thodi
Hi Eric,

> -Original Message-
> From: Eric Auger [mailto:eric.au...@redhat.com]
> Sent: 18 November 2020 11:22
> To: eric.auger@gmail.com; eric.au...@redhat.com;
> io...@lists.linux-foundation.org; linux-ker...@vger.kernel.org;
> k...@vger.kernel.org; kvmarm@lists.cs.columbia.edu; w...@kernel.org;
> j...@8bytes.org; m...@kernel.org; robin.mur...@arm.com;
> alex.william...@redhat.com
> Cc: jean-phili...@linaro.org; zhangfei@linaro.org;
> zhangfei@gmail.com; vivek.gau...@arm.com; Shameerali Kolothum
> Thodi ;
> jacob.jun@linux.intel.com; yi.l@intel.com; t...@semihalf.com;
> nicoleots...@gmail.com; yuzenghui 
> Subject: [PATCH v13 05/15] iommu/smmuv3: Get prepared for nested stage
> support
> 
> When nested stage translation is setup, both s1_cfg and
> s2_cfg are set.
> 
> We introduce a new smmu domain abort field that will be set
> upon guest stage1 configuration passing.
> 
> arm_smmu_write_strtab_ent() is modified to write both stage
> fields in the STE and deal with the abort field.
> 
> In nested mode, only stage 2 is "finalized" as the host does
> not own/configure the stage 1 context descriptor; guest does.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> v10 -> v11:
> - Fix an issue reported by Shameer when switching from with vSMMU
>   to without vSMMU. Despite the spec does not seem to mention it
>   seems to be needed to reset the 2 high 64b when switching from
>   S1+S2 cfg to S1 only. Especially dst[3] needs to be reset (S2TTB).
>   On some implementations, if the S2TTB is not reset, this causes
>   a C_BAD_STE error
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 64
> +
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  2 +
>  2 files changed, 56 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 18ac5af1b284..412ea1bafa50 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -1181,8 +1181,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>* three cases at the moment:
>*
>* 1. Invalid (all zero) -> bypass/fault (init)
> -  * 2. Bypass/fault -> translation/bypass (attach)
> -  * 3. Translation/bypass -> bypass/fault (detach)
> +  * 2. Bypass/fault -> single stage translation/bypass (attach)
> +  * 3. Single or nested stage Translation/bypass -> bypass/fault (detach)
> +  * 4. S2 -> S1 + S2 (attach_pasid_table)
> +  * 5. S1 + S2 -> S2 (detach_pasid_table)
>*
>* Given that we can't update the STE atomically and the SMMU
>* doesn't read the thing in a defined order, that leaves us
> @@ -1193,7 +1195,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>* 3. Update Config, sync
>*/
>   u64 val = le64_to_cpu(dst[0]);
> - bool ste_live = false;
> + bool s1_live = false, s2_live = false, ste_live;
> + bool abort, nested = false, translate = false;
>   struct arm_smmu_device *smmu = NULL;
>   struct arm_smmu_s1_cfg *s1_cfg;
>   struct arm_smmu_s2_cfg *s2_cfg;
> @@ -1233,6 +1236,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>   default:
>   break;
>   }
> + nested = s1_cfg->set && s2_cfg->set;

This is a problem when the guest is booted with iommu.passthrough = 1, as we
set s1_cfg.set = false for IOMMU_PASID_CONFIG_BYPASS.

That results in hitting BUG_ON(ste_live && !nested).

Can we instead set nested = true a bit earlier in the code, where we set
s2_cfg->set = true for the ARM_SMMU_DOMAIN_NESTED case?

Please take a look.

Thanks,
Shameer

> + translate = s1_cfg->set || s2_cfg->set;
>   }
> 
>   if (val & STRTAB_STE_0_V) {
> @@ -1240,23 +1245,36 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
>   case STRTAB_STE_0_CFG_BYPASS:
>   break;
>   case STRTAB_STE_0_CFG_S1_TRANS:
> + s1_live = true;
> + break;
>   case STRTAB_STE_0_CFG_S2_TRANS:
> - ste_live = true;
> + s2_live = true;
> + break;
> + case STRTAB_STE_0_CFG_NESTED:
> + s1_live = true;
> + s2_live = true;
>   break;
>   case STRTAB_STE_0_CFG_ABORT:
> - BUG_ON(!disable_bypass);
>   break;
>   default:
>   BUG(); /* STE corruption */
>   }
>   }
> 
> + ste_live = s1_live || s2_live;
> +
>   /* Nuke the existing STE_0 value, as we're going to rewrite it */
>   val = STRTAB_STE_0_V;
> 
>   /* Bypass/fault */
> - if (!smmu_domain || !(s1_cfg->set || s2_cfg->set)) {
> - 

Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Marc Zyngier

On 2020-12-09 12:44, Catalin Marinas wrote:
> On Tue, Dec 08, 2020 at 06:21:12PM +, Marc Zyngier wrote:
> > On 2020-12-08 17:21, Catalin Marinas wrote:
> > > On Mon, Dec 07, 2020 at 07:03:13PM +, Marc Zyngier wrote:
> > > > I wonder whether we will have to have something kernel side to
> > > > dump/reload tags in a way that matches the patterns used by live
> > > > migration.
> > >
> > > We have something related - ptrace dumps/restores the tags. Can the same
> > > concept be expanded to a KVM ioctl?
> >
> > Yes, although I wonder whether we should integrate this deeply into
> > the dirty-log mechanism: it would be really interesting to dump the
> > tags at the point where the page is flagged as clean from a dirty-log
> > point of view. As the page is dirtied, discard the saved tags.
>
> From the VMM perspective, the tags can be treated just like additional
> (meta)data in a page. We'd only need the tags when copying over. It can
> race with the VM dirtying the page (writing tags would dirty it) but I
> don't think the current migration code cares about this. If dirtied, it
> copies it again.
>
> The only downside I see is an extra syscall per page both on the origin
> VMM and the destination one to dump/restore the tags. Is this a
> performance issue?

I'm not sure. Migrating VMs already has a massive overhead, so an extra
syscall per page isn't terrifying. But that's the point where I admit
not knowing enough about what the VMM expects, nor whether that matches
what happens on other architectures that deal with per-page metadata.

Would this syscall operate on the guest address space? Or on the VMM's
own mapping?

M.
--
Jazz is not dead. It just smells funny...
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v5 0/2] MTE support for KVM guest

2020-12-09 Thread Catalin Marinas
On Tue, Dec 08, 2020 at 06:21:12PM +, Marc Zyngier wrote:
> On 2020-12-08 17:21, Catalin Marinas wrote:
> > On Mon, Dec 07, 2020 at 07:03:13PM +, Marc Zyngier wrote:
> > > I wonder whether we will have to have something kernel side to
> > > dump/reload tags in a way that matches the patterns used by live
> > > migration.
> > 
> > We have something related - ptrace dumps/restores the tags. Can the same
> > concept be expanded to a KVM ioctl?
> 
> Yes, although I wonder whether we should integrate this deeply into
> the dirty-log mechanism: it would be really interesting to dump the
> tags at the point where the page is flagged as clean from a dirty-log
> point of view. As the page is dirtied, discard the saved tags.

From the VMM perspective, the tags can be treated just like additional
(meta)data in a page. We'd only need the tags when copying over. It can
race with the VM dirtying the page (writing tags would dirty it) but I
don't think the current migration code cares about this. If dirtied, it
copies it again.

The only downside I see is an extra syscall per page both on the origin
VMM and the destination one to dump/restore the tags. Is this a
performance issue?

-- 
Catalin
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [kvm-unit-tests PATCH 10/10] arm64: gic: Use IPI test checking for the LPI tests

2020-12-09 Thread Alexandru Elisei
Hi Eric,

On 12/3/20 2:59 PM, Auger Eric wrote:
> Hi Alexandru,
> On 11/25/20 4:51 PM, Alexandru Elisei wrote:
>> The LPI code validates a result similarly to the IPI tests, by checking if
>> the target CPU received the interrupt with the expected interrupt number.
>> However, the LPI tests invent their own way of checking the test results by
>> creating a global struct (lpi_stats), using a separate interrupt handler
>> (lpi_handler) and test function (check_lpi_stats).
>>
>> There are several areas that can be improved in the LPI code, which are
>> already covered by the IPI tests:
>>
>> - check_lpi_stats() doesn't take into account that the target CPU can
>>   receive the correct interrupt multiple times.
>> - check_lpi_stats() doesn't take into the account the scenarios where all
>>   online CPUs can receive the interrupt, but the target CPU is the last CPU
>>   that touches lpi_stats.observed.
>> - Insufficient or missing memory synchronization.
>>
>> Instead of duplicating code, let's convert the LPI tests to use
>> check_acked() and the same interrupt handler as the IPI tests, which has
>> been renamed to irq_handler() to avoid any confusion.
>>
>> check_lpi_stats() has been replaced with check_acked() which, together with
>> using irq_handler(), instantly gives us more correctness checks and proper
>> memory synchronization between threads. lpi_stats.expected has been
>> replaced by the CPU mask and the expected interrupt number arguments to
>> check_acked(), with no change in semantics.
>>
>> lpi_handler() aborted the test if the interrupt number was not an LPI. This
>> was changed in favor of allowing the test to continue, as it will fail in
>> check_acked(), but possibly print information useful for debugging. If the
>> test receives spurious interrupts, those are reported via report_info() at
>> the end of the test for consistency with the IPI tests, which don't treat
>> spurious interrupts as critical errors.
>>
>> In the spirit of code reuse, secondary_lpi_tests() has been replaced with
>> ipi_recv() because the two are now identical; ipi_recv() has been renamed
>> to irq_recv(), similarly to irq_handler(), to avoid confusion.
>>
>> CC: Eric Auger 
>> Signed-off-by: Alexandru Elisei 
>> ---
>> [..]
>> @@ -767,13 +700,27 @@ static void test_its_trigger(void)
>>  
>>  report_prefix_push("int");
>>  
>> -lpi_stats_expect(3, 8195);
>> +stats_reset();
>> +/*
>> + * its_send_int() is missing the synchronization from the GICv3 IPI
>> + * trigger functions.
>> + */
>> +wmb();
> so don't you want to add it in __its_send_int instead?

The memory synchronization in the IPI sender functions makes perfect sense:
that's how IPIs are used - one CPU kicks the target, the target reads from a
shared memory location.

I don't think receiving an interrupt from a device is how one would usually
expect to do inter-processor communication. However, I did more digging about
this ability to trigger interrupts from made-up devices, and it seems to me
that this was introduced for testing purposes (please correct me if I'm wrong).
With this in mind, I guess it wouldn't be awkward to have the wmb() in
its_send_int(), because we are using it just like we would an IPI. And it also
reduces the boilerplate code.
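
As a generic, self-contained illustration of the ordering requirement being
discussed (C11 atomics stand in for the test's wmb(); the names below are
placeholders, not the kvm-unit-tests API):

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative sketch only: publish the expected state before the event
 * that makes the receiver look at it, and pair it with an acquire read
 * on the receiving side. In the tests, wmb() plays the release role. */
static int expected_irq;
static atomic_bool armed;

/* "Sender": set up what the handler should observe, then trigger. */
static void arm_check(int irqnum)
{
	expected_irq = irqnum;
	/* Release: the store above must be visible before the trigger. */
	atomic_store_explicit(&armed, true, memory_order_release);
	/* ... trigger the interrupt (IPI or ITS INT command) here ... */
}

/* "Handler": only trust expected_irq after the acquire load pairs up. */
static bool check_irq(int irqnum)
{
	if (!atomic_load_explicit(&armed, memory_order_acquire))
		return false;	/* nothing armed yet: treat as spurious */
	return irqnum == expected_irq;
}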

I'll make the change in the next iteration.

Thanks,
Alex
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC][PATCH 0/4] arm64:kvm: teach guest sched that VCPUs can be preempted

2020-12-09 Thread Marc Zyngier

Hi all,

On 2020-12-08 20:02, Joel Fernandes wrote:

> On Fri, Sep 11, 2020 at 4:58 AM Sergey Senozhatsky
>  wrote:
> >
> > My apologies for the slow reply.
> >
> > On (20/08/17 13:25), Marc Zyngier wrote:
> > >
> > > It really isn't the same thing at all. You are exposing PV spinlocks,
> > > while Sergey exposes preemption to vcpus.
> > >
> >
> > Correct, we see vcpu preemption as a "fundamental" feature, with
> > consequences that affect scheduling, which is a core feature :)
> >
> > Marc, is there anything in particular that you dislike about this RFC
> > patch set? Joel has some ideas, which we may discuss offline if that
> > works for you.
>
> Hi Marc, Sergey, Just checking what is the latest on this series?

I was planning to give it a go, but obviously got sidetracked. :-(

> About the idea me and Sergey discussed, at a high level we discussed
> being able to share information similar to "Is the vCPU preempted?"
> using a more arch-independent infrastructure. I do not believe this
> needs to be arch-specific. Maybe the specific mechanism about how to
> share a page of information needs to be arch-specific, but the actual
> information shared need not be.

We already have some information sharing in the form of steal time
accounting, and I believe this "vcpu preempted" falls in the same
bucket. It looks like we could implement the feature as an extension
of the steal-time accounting, as the two concepts are linked
(one describes the accumulation of non-running time, the other is
instantaneous).

> This could open the door to sharing
> more such information in an arch-independent way (for example, if the
> scheduler needs to know other information such as the capacity of the
> CPU that the vCPU is on).

Quentin and I have discussed potential ways of improving guest scheduling
on terminally broken systems (otherwise known as big-little), in the
form of a capacity request from the guest to the host. I'm not really
keen on the host exposing its own capacity, as that doesn't tell the
host what the guest actually needs.
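
To make the shape of that steal-time extension concrete, here is a purely
illustrative sketch; the structure layout, the field names and the preempted
flag are assumptions for the sake of discussion, not an existing KVM/arm64 ABI:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative sketch only: extend the per-vCPU stolen-time record that
 * host and guest already share with an instantaneous "preempted" flag.
 * Field names and layout are assumptions, not an existing ABI.
 */
struct pv_sched_info {
	uint64_t stolen_time;	/* existing concept: accumulated non-running time */
	uint32_t preempted;	/* assumed: set by the host while the vCPU is scheduled out */
	uint32_t pad;
};

/* Guest-side helper, analogous in spirit to x86's vcpu_is_preempted(). */
static inline bool vcpu_is_preempted(const volatile struct pv_sched_info *si)
{
	return si->preempted != 0;
}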

Thanks,

M.
--
Jazz is not dead. It just smells funny...
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm