[PATCH 11/16] firmware/smccc: Call arch-specific hook on discovering KVM services

2021-07-15 Thread Marc Zyngier
arm64 will soon require its own callback to initialise services
that are only available on this architecture. Introduce a hook
that can be overloaded by the architecture.
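
For illustration only (not part of the patch), the weak-symbol
pattern this relies on: a __weak definition is the default, and any
architecture that provides a strong definition with the same
signature silently replaces it at link time.

    /* generic code: empty weak default */
    void __weak kvm_arm_init_hyp_services(void) {}

    void __init kvm_init_hyp_services(void)
    {
            /* ... discover the hypervisor services ... */
            kvm_arm_init_hyp_services();    /* arch hook, if any */
    }

    /* arch code: strong definition, overrides the weak one */
    void kvm_arm_init_hyp_services(void)
    {
            /* arm64-specific service setup goes here */
    }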

Signed-off-by: Marc Zyngier 
---
 arch/arm/include/asm/hypervisor.h   | 1 +
 arch/arm64/include/asm/hypervisor.h | 1 +
 drivers/firmware/smccc/kvm_guest.c  | 4 
 3 files changed, 6 insertions(+)

diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h
index bd61502b9715..8133c8c81a35 100644
--- a/arch/arm/include/asm/hypervisor.h
+++ b/arch/arm/include/asm/hypervisor.h
@@ -6,5 +6,6 @@
 
 void kvm_init_hyp_services(void);
 bool kvm_arm_hyp_service_available(u32 func_id);
+void kvm_arm_init_hyp_services(void);
 
 #endif
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index 0ae427f352c8..8e77f411903f 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -6,5 +6,6 @@
 
 void kvm_init_hyp_services(void);
 bool kvm_arm_hyp_service_available(u32 func_id);
+void kvm_arm_init_hyp_services(void);
 
 #endif
diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c
index 2d3e866decaa..56169e73252a 100644
--- a/drivers/firmware/smccc/kvm_guest.c
+++ b/drivers/firmware/smccc/kvm_guest.c
@@ -9,6 +9,8 @@
 
 #include 
 
+void __weak kvm_arm_init_hyp_services(void) {}
+
 static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { };
 
 void __init kvm_init_hyp_services(void)
@@ -38,6 +40,8 @@ void __init kvm_init_hyp_services(void)
 
pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 
0x%08lx)\n",
 res.a3, res.a2, res.a1, res.a0);
+
+   kvm_arm_init_hyp_services();
 }
 
 bool kvm_arm_hyp_service_available(u32 func_id)
-- 
2.30.2



[PATCH 14/16] arm64: Enroll into KVM's MMIO guard if required

2021-07-15 Thread Marc Zyngier
Should a guest desire to enroll into the MMIO guard, allow it to
do so with a command-line option.
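
For example (everything except the ioremap_guard token is made up
for illustration), such a guest would boot with:

    console=ttyAMA0 root=/dev/vda1 ioremap_guard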

Signed-off-by: Marc Zyngier 
---
 .../admin-guide/kernel-parameters.txt |  3 ++
 arch/arm64/include/asm/hypervisor.h   |  1 +
 arch/arm64/kernel/setup.c |  6 +++
 arch/arm64/mm/ioremap.c   | 38 +++
 4 files changed, 48 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bdb22006f713..a398585bed90 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2062,6 +2062,9 @@
1 - Bypass the IOMMU for DMA.
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
 
+	ioremap_guard	[ARM64] enable the KVM MMIO guard functionality
+			if available.
+
io7=[HW] IO7 for Marvel-based Alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index 8e77f411903f..b130c7b82eaa 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -7,5 +7,6 @@
 void kvm_init_hyp_services(void);
 bool kvm_arm_hyp_service_available(u32 func_id);
 void kvm_arm_init_hyp_services(void);
+void kvm_init_ioremap_services(void);
 
 #endif
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index be5f85b0a24d..c325647f675f 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -445,3 +446,8 @@ static int __init register_arm64_panic_block(void)
return 0;
 }
 device_initcall(register_arm64_panic_block);
+
+void kvm_arm_init_hyp_services(void)
+{
+   kvm_init_ioremap_services();
+}
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 0801fd92f0e3..d82b63bcc554 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -23,6 +23,44 @@
 
 static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key);
 
+static bool ioremap_guard;
+static int __init ioremap_guard_setup(char *str)
+{
+   ioremap_guard = true;
+
+   return 0;
+}
+early_param("ioremap_guard", ioremap_guard_setup);
+
+void kvm_init_ioremap_services(void)
+{
+   struct arm_smccc_res res;
+
+   if (!ioremap_guard)
+   return;
+
+   /* We need all the functions to be implemented */
+	if (!kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) ||
+	    !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) ||
+	    !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) ||
+	    !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP))
+		return;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID,
+			     &res);
+   if (res.a0 != PAGE_SIZE)
+   return;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID,
+			     &res);
+   if (res.a0 == SMCCC_RET_SUCCESS) {
+		static_branch_enable(&ioremap_guard_key);
+   pr_info("Using KVM MMIO guard for ioremap\n");
+   } else {
+   pr_warn("KVM MMIO guard registration failed (%ld)\n", res.a0);
+   }
+}
+
 void ioremap_page_range_hook(unsigned long addr, unsigned long end,
 phys_addr_t phys_addr, pgprot_t prot)
 {
-- 
2.30.2



[PATCH 12/16] mm/ioremap: Add arch-specific callbacks on ioremap/iounmap calls

2021-07-15 Thread Marc Zyngier
Add a pair of hooks (ioremap_page_range_hook/iounmap_page_range_hook)
that can be implemented by an architecture.
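
A hypothetical arch-side implementation, just to spell out the
contract (called after a successful ioremap mapping, and on teardown
of a VM_IOREMAP area; this version only logs):

    void ioremap_page_range_hook(unsigned long addr, unsigned long end,
                                 phys_addr_t phys_addr, pgprot_t prot)
    {
            pr_debug("ioremap [%lx..%lx] -> %pa\n", addr, end, &phys_addr);
    }

    void iounmap_page_range_hook(phys_addr_t phys_addr, size_t size)
    {
            pr_debug("iounmap %pa + %zx\n", &phys_addr, size);
    }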

Signed-off-by: Marc Zyngier 
---
 include/linux/io.h |  3 +++
 mm/ioremap.c   | 13 -
 mm/vmalloc.c   |  8 
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/io.h b/include/linux/io.h
index 9595151d800d..0ffc265f114c 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
 void __ioread32_copy(void *to, const void __iomem *from, size_t count);
 void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 
 #ifdef CONFIG_MMU
+void ioremap_page_range_hook(unsigned long addr, unsigned long end,
+phys_addr_t phys_addr, pgprot_t prot);
+void iounmap_page_range_hook(phys_addr_t phys_addr, size_t size);
 int ioremap_page_range(unsigned long addr, unsigned long end,
   phys_addr_t phys_addr, pgprot_t prot);
 #else
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 8ee0136f8cb0..bd77a86088f2 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,10 +28,21 @@ early_param("nohugeiomap", set_nohugeiomap);
 static const unsigned int iomap_max_page_shift = PAGE_SHIFT;
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
+void __weak ioremap_page_range_hook(unsigned long addr, unsigned long end,
+   phys_addr_t phys_addr, pgprot_t prot)
+{
+}
+
 int ioremap_page_range(unsigned long addr,
   unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
-   return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
+   int ret;
+
+   ret = vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
+   if (!ret)
+   ioremap_page_range_hook(addr, end, phys_addr, prot);
+
+   return ret;
 }
 
 #ifdef CONFIG_GENERIC_IOREMAP
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d5cd52805149..af18a6141093 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -2551,6 +2552,10 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
set_area_direct_map(area, set_direct_map_default_noflush);
 }
 
+void __weak iounmap_page_range_hook(phys_addr_t phys_addr, size_t size)
+{
+}
+
 static void __vunmap(const void *addr, int deallocate_pages)
 {
struct vm_struct *area;
@@ -2574,6 +2579,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
 
kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
 
+   if (area->flags & VM_IOREMAP)
+		iounmap_page_range_hook(area->phys_addr,
+					get_vm_area_size(area));
+
vm_remove_mappings(area, deallocate_pages);
 
if (deallocate_pages) {
-- 
2.30.2



[PATCH 15/16] arm64: Add a helper to retrieve the PTE of a fixmap

2021-07-15 Thread Marc Zyngier
In order to transfer the early mapping state into KVM's MMIO
guard infrastructure, provide a small helper that will retrieve
the associated PTE.
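
A sketch of the intended use, mirroring what the next patch does for
the earlycon mapping:

    pte_t *ptep = __get_fixmap_pte(FIX_EARLYCON_MEM_BASE);

    if (ptep) {
            phys_addr_t phys = __pte_to_phys(*ptep);
            pgprot_t prot = __pgprot(pte_val(*ptep) & PTE_ATTRINDX_MASK);

            /* hand [phys, phys + PAGE_SIZE) over to the MMIO guard */
    }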

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/fixmap.h |  2 ++
 arch/arm64/mm/mmu.c | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 4335800201c9..1aae625b944f 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -105,6 +105,8 @@ void __init early_fixmap_init(void);
 
 extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot);
 
+extern pte_t *__get_fixmap_pte(enum fixed_addresses idx);
+
 #include 
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d74586508448..f1b7abd04025 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1286,6 +1286,21 @@ void __set_fixmap(enum fixed_addresses idx,
}
 }
 
+pte_t *__get_fixmap_pte(enum fixed_addresses idx)
+{
+   unsigned long   addr = __fix_to_virt(idx);
+   pte_t *ptep;
+
+   BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
+
+   ptep = fixmap_pte(addr);
+
+   if (!pte_valid(*ptep))
+   return NULL;
+
+   return ptep;
+}
+
 void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 {
const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
-- 
2.30.2



[PATCH 16/16] arm64: Register earlycon fixmap with the MMIO guard

2021-07-15 Thread Marc Zyngier
On initialising the MMIO guard infrastructure, register the
earlycon mapping if present.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/mm/ioremap.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index d82b63bcc554..a27b58e03c93 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -32,6 +32,18 @@ static int __init ioremap_guard_setup(char *str)
 }
 early_param("ioremap_guard", ioremap_guard_setup);
 
+static void fixup_fixmap(void)
+{
+   unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+   pte_t *ptep = __get_fixmap_pte(FIX_EARLYCON_MEM_BASE);
+
+   if (!ptep)
+   return;
+
+   ioremap_page_range_hook(addr, addr + PAGE_SIZE, __pte_to_phys(*ptep),
+   __pgprot(pte_val(*ptep) & PTE_ATTRINDX_MASK));
+}
+
 void kvm_init_ioremap_services(void)
 {
struct arm_smccc_res res;
@@ -55,6 +67,7 @@ void kvm_init_ioremap_services(void)
 			     &res);
if (res.a0 == SMCCC_RET_SUCCESS) {
 		static_branch_enable(&ioremap_guard_key);
+   fixup_fixmap();
pr_info("Using KVM MMIO guard for ioremap\n");
} else {
pr_warn("KVM MMIO guard registration failed (%ld)\n", res.a0);
-- 
2.30.2



[PATCH 13/16] arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard

2021-07-15 Thread Marc Zyngier
Implement the previously defined ioremap/iounmap hooks for arm64,
calling into KVM's MMIO guard if available.
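
Both hooks are gated on a static key, which is the standard pattern
for a feature that is off by default (a generic sketch, not code
from this patch): the test compiles down to a patched NOP until the
key is enabled, so non-enrolled guests pay almost nothing.

    static DEFINE_STATIC_KEY_FALSE(my_feature_key);  /* hypothetical */

    void hot_path(void)
    {
            if (static_branch_unlikely(&my_feature_key))
                    do_extra_work();  /* only after the key is flipped */
    }

    void negotiate_feature(void)
    {
            static_branch_enable(&my_feature_key);  /* patches the branch */
    }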

Signed-off-by: Marc Zyngier 
---
 arch/arm64/mm/ioremap.c | 56 +
 1 file changed, 56 insertions(+)

diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index b7c81dacabf0..0801fd92f0e3 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -9,13 +9,69 @@
  * Copyright (C) 2012 ARM Ltd.
  */
 
+#define pr_fmt(fmt)"ioremap: " fmt
+
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
+#include 
+
+static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key);
+
+void ioremap_page_range_hook(unsigned long addr, unsigned long end,
+phys_addr_t phys_addr, pgprot_t prot)
+{
+   size_t size = end - addr;
+
+   if (!static_branch_unlikely(_guard_key))
+   return;
+
+   if (pfn_valid(__phys_to_pfn(phys_addr)))
+   return;
+
+   while (size) {
+   struct arm_smccc_res res;
+
+		arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID,
+				  phys_addr, prot, &res);
+   if (res.a0 != SMCCC_RET_SUCCESS) {
+   pr_warn_ratelimited("Failed to register %llx\n",
+   phys_addr);
+   return;
+   }
+
+   size -= PAGE_SIZE;
+   phys_addr += PAGE_SIZE;
+   }
+}
+
+void iounmap_page_range_hook(phys_addr_t phys_addr, size_t size)
+{
+   if (!static_branch_unlikely(_guard_key))
+   return;
+
+   VM_BUG_ON(phys_addr & ~PAGE_MASK || size & ~PAGE_MASK);
+
+   while (size) {
+   struct arm_smccc_res res;
+
+		arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID,
+				  phys_addr, &res);
+   if (res.a0 != SMCCC_RET_SUCCESS) {
+   pr_warn_ratelimited("Failed to unregister %llx\n",
+   phys_addr);
+   return;
+   }
+
+   size -= PAGE_SIZE;
+   phys_addr += PAGE_SIZE;
+   }
+}
 
 static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
  pgprot_t prot, void *caller)
-- 
2.30.2



[PATCH 10/16] KVM: arm64: Add some documentation for the MMIO guard feature

2021-07-15 Thread Marc Zyngier
Document the hypercalls used for the MMIO guard infrastructure.

Signed-off-by: Marc Zyngier 
---
 Documentation/virt/kvm/arm/index.rst  |  1 +
 Documentation/virt/kvm/arm/mmio-guard.rst | 73 +++
 2 files changed, 74 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/mmio-guard.rst

diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index 78a9b670aafe..e77a0ee2e2d4 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -11,3 +11,4 @@ ARM
psci
pvtime
ptp_kvm
+   mmio-guard
diff --git a/Documentation/virt/kvm/arm/mmio-guard.rst b/Documentation/virt/kvm/arm/mmio-guard.rst
new file mode 100644
index ..a5563a3e12cc
--- /dev/null
+++ b/Documentation/virt/kvm/arm/mmio-guard.rst
@@ -0,0 +1,73 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+KVM MMIO guard
+==
+
+KVM implements device emulation by handling translation faults to any
+IPA range that is not contained in a memory slot. Such a translation
+fault is in most cases passed on to userspace (or in rare cases to
+the host kernel) with the address, size and possibly data of the
+access for emulation.
+
+Should the guest exit with an address that is not one that corresponds
+to an emulatable device, userspace may take measures that are not the
+most graceful as far as the guest is concerned (such as terminating it
+or delivering a fatal exception).
+
+There is also an element of trust: by forwarding the request to
+userspace, the kernel assumes that the guest trusts userspace to do
+the right thing.
+
+The KVM MMIO guard offers a way to mitigate this last point: a guest
+can request that only certain regions of the IPA space are valid as
+MMIO. Only these regions will be handled as MMIO, and any other
+access will result in an exception being delivered to the guest.
+
+This relies on a set of hypercalls defined in the KVM-specific range,
+using the HVC64 calling convention.
+
+* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO
+
+==============    ========    ==================================
+Function ID:      (uint32)    0xC6000002
+Arguments:        none
+Return Values:    (int64)     NOT_SUPPORTED(-1) on error, or
+                  (uint64)    Protection Granule (PG) size in
+                              bytes (r0)
+==============    ========    ==================================
+
+* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL
+
+==============    ========    ==================================
+Function ID:      (uint32)    0xC6000003
+Arguments:        none
+Return Values:    (int64)     NOT_SUPPORTED(-1) on error, or
+                              RET_SUCCESS(0) (r0)
+==============    ========    ==================================
+
+* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP
+
+==============    ========    ==================================
+Function ID:      (uint32)    0xC6000004
+Arguments:        (uint64)    The base of the PG-sized IPA range
+                              that is allowed to be accessed as
+                              MMIO. Must be aligned to the PG
+                              size (r1)
+                  (uint64)    Index in the MAIR_EL1 register
+                              providing the memory attribute that
+                              is used by the guest (r2)
+Return Values:    (int64)     NOT_SUPPORTED(-1) on error, or
+                              RET_SUCCESS(0) (r0)
+==============    ========    ==================================
+
+* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP
+
+==============    ========    ==================================
+Function ID:      (uint32)    0xC6000005
+Arguments:        (uint64)    The base of the PG-sized IPA range
+                              that is forbidden to be accessed as
+                              MMIO. Must be aligned to the PG size
+                              and have been previously mapped (r1)
+Return Values:    (int64)     NOT_SUPPORTED(-1) on error, or
+                              RET_SUCCESS(0) (r0)
+==============    ========    ==================================
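+
+An illustrative guest-side sequence, pieced together from the
+function IDs above (pseudo-code; a real guest goes through the
+SMCCC 1.1 helpers and checks each return value):
+
+* res = hvc(0xC6000002)                -- MMIO_GUARD_INFO, PG size
+* res = hvc(0xC6000003)                -- MMIO_GUARD_ENROLL
+* res = hvc(0xC6000004, ipa, mair_idx) -- MMIO_GUARD_MAP, per PG
+* res = hvc(0xC6000005, ipa)           -- MMIO_GUARD_UNMAP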
-- 
2.30.2



[PATCH 08/16] KVM: arm64: Add tracepoint for failed MMIO guard check

2021-07-15 Thread Marc Zyngier
In order to make debugging easier, expose a new trace point
that triggers when an MMIO check fails.
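
Once merged, the event can be consumed like any other KVM tracepoint
(assuming tracefs is mounted in the usual place):

    echo 1 > /sys/kernel/tracing/events/kvm/kvm_failed_mmio_check/enable
    cat /sys/kernel/tracing/trace_pipe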

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/mmu.c   |  4 +++-
 arch/arm64/kvm/trace_arm.h | 17 +
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 638827c8842b..c2a23457552b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1229,8 +1229,10 @@ bool kvm_check_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa)
ret = __check_ioguard_page(vcpu, ipa & PAGE_MASK);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-   if (!ret)
+   if (!ret) {
+   trace_kvm_failed_mmio_check(*vcpu_pc(vcpu), ipa);
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+   }
 
return ret;
 }
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 33e4e7dd2719..e40cfeb251ad 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -89,6 +89,23 @@ TRACE_EVENT(kvm_access_fault,
TP_printk("IPA: %lx", __entry->ipa)
 );
 
+TRACE_EVENT(kvm_failed_mmio_check,
+   TP_PROTO(unsigned long vcpu_pc, unsigned long ipa),
+   TP_ARGS(vcpu_pc, ipa),
+
+   TP_STRUCT__entry(
+   __field(unsigned long,  vcpu_pc )
+   __field(unsigned long,  ipa )
+   ),
+
+   TP_fast_assign(
+   __entry->vcpu_pc= vcpu_pc;
+   __entry->ipa= ipa;
+   ),
+
+   TP_printk("PC: %lx IPA: %lx", __entry->vcpu_pc, __entry->ipa)
+);
+
 TRACE_EVENT(kvm_irq_line,
TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
TP_ARGS(type, vcpu_idx, irq_num, level),
-- 
2.30.2



[PATCH 09/16] KVM: arm64: Advertise a capability for MMIO guard

2021-07-15 Thread Marc Zyngier
In order for userspace to find out whether the MMIO guard is
exposed to a guest, expose a capability that says so.
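
Userspace probes it with the standard KVM_CHECK_EXTENSION ioctl; a
minimal sketch (error handling omitted):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int has_mmio_guard(void)
    {
            int fd = open("/dev/kvm", O_RDWR);
            int r = ioctl(fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_MMIO_GUARD);

            close(fd);
            return r > 0;
    }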

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/arm.c | 1 +
 include/uapi/linux/kvm.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b0d2225190d2..72ebad749b0c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -214,6 +214,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_PTP_KVM:
+   case KVM_CAP_ARM_MMIO_GUARD:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d9e4aabcb31a..d4a5715c5c8f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1112,6 +1112,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_BINARY_STATS_FD 203
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_ARM_MMIO_GUARD 206
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.30.2



[PATCH 06/16] KVM: arm64: Force a full unmap on vcpu reinit

2021-07-15 Thread Marc Zyngier
As we now keep information in the S2PT, we must be careful not
to keep it across a VM reboot, which could otherwise lead to
interesting problems.

Make sure that the S2 is completely discarded on reset of
a vcpu, and remove the flag that enforces the MMIO check.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/arm.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 97ab1512c44f..b0d2225190d2 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1096,12 +1096,18 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 * ensuring that the data side is always coherent. We still
 * need to invalidate the I-cache though, as FWB does *not*
 * imply CTR_EL0.DIC.
+*
+* If the MMIO guard was enabled, we pay the price of a full
+* unmap to get back to a sane state (and clear the flag).
 */
if (vcpu->arch.has_run_once) {
-   if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
+   if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) ||
+		    test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags))
stage2_unmap_vm(vcpu->kvm);
else
icache_inval_all_pou();
+
+		clear_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags);
}
 
vcpu_reset_hcr(vcpu);
-- 
2.30.2



[PATCH 03/16] KVM: arm64: Turn kvm_pgtable_stage2_set_owner into kvm_pgtable_stage2_annotate

2021-07-15 Thread Marc Zyngier
kvm_pgtable_stage2_set_owner() could be generalised into a way
to store up to 63 bits in the page tables, as long as we don't
set bit 0.

Let's just do that.
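
The constraints follow from the PTE encoding: bit 0 is the valid bit,
so it must stay clear for the entry to remain invalid-but-annotated,
and the value as a whole must be non-zero so it can be told apart
from an empty entry. The MMIO guard patch later in the series uses an
ASCII tag that satisfies both; a sketch of such a caller:

    /* 'MMI0': bit 0 clear ('0' is 0x30), value as a whole non-zero */
    #define MMIO_NOTE   ('M' << 24 | 'M' << 16 | 'I' << 8 | '0')

    ret = kvm_pgtable_stage2_annotate(pgt, ipa, PAGE_SIZE, mc, MMIO_NOTE);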

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_pgtable.h  | 12 +++-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 --
 arch/arm64/kvm/hyp/pgtable.c  | 20 ++--
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index f004c0115d89..9579e8c2793b 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -274,14 +274,16 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
   void *mc);
 
 /**
- * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
- *				     track ownership.
+ * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space
+ *				    to track ownership (and more).
  * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:  Base intermediate physical address to annotate.
  * @size:  Size of the annotated range.
  * @mc:	Cache of pre-allocated and zeroed memory from which to allocate
  *		page-table pages.
- * @owner_id:  Unique identifier for the owner of the page.
+ * @annotation:	A 63 bit value that will be stored in the page tables.
+ *		@annotation[0] must be 0, and @annotation[63:1] is stored
+ *		in the page tables. @annotation as a whole must not be 0.
  *
  * By default, all page-tables are owned by identifier 0. This function can be
  * used to mark portions of the IPA space as owned by other entities. When a
@@ -290,8 +292,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
-void *mc, u8 owner_id);
+int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
+   void *mc, kvm_pte_t annotation);
 
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 
page-table.
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index d938ce95d3bd..ffe482c3b818 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -245,6 +245,15 @@ static int host_stage2_idmap(u64 addr)
return ret;
 }
 
+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
+#define KVM_MAX_OWNER_ID   1
+
+static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
+{
+   BUG_ON(owner_id > KVM_MAX_OWNER_ID);
+   return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
 int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
 {
int ret;
@@ -257,8 +266,9 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
return -EINVAL;
 
 	hyp_spin_lock(&host_kvm.lock);
-	ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
-					   &host_s2_pool, pkvm_hyp_id);
+	ret = kvm_pgtable_stage2_annotate(&host_kvm.pgt, start, end - start,
+					  &host_s2_pool,
+					  kvm_init_invalid_leaf_owner(pkvm_hyp_id));
 	hyp_spin_unlock(&host_kvm.lock);
 
return ret != -EAGAIN ? ret : 0;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index a5874ebd0354..a065f6d960af 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -50,9 +50,6 @@
 
 #define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
 
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
-#define KVM_MAX_OWNER_ID   1
-
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -206,11 +203,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
return pte;
 }
 
-static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
-{
-   return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
-}
-
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
  u32 level, kvm_pte_t *ptep,
  enum kvm_pgtable_walk_flags flag)
@@ -466,7 +458,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
 struct stage2_map_data {
u64 phys;
kvm_pte_t   attr;
-   u8  owner_id;
+   u64 annotation;
 
kvm_pte_t   *anchor;
kvm_pte_t   *childp;

[PATCH 07/16] KVM: arm64: Wire MMIO guard hypercalls

2021-07-15 Thread Marc Zyngier
Plumb in the hypercall interface to allow a guest to discover,
enroll, map and unmap MMIO regions.
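
On the guest side, availability is discovered through the existing
FEATURES bitmap before any of the new calls are issued; roughly (this
mirrors what drivers/firmware/smccc/kvm_guest.c already does):

    struct arm_smccc_res res;

    arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
    if (res.a0 & BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL))
            /* the enroll hypercall is implemented */;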

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hypercalls.c | 20 
 include/linux/arm-smccc.h   | 28 
 2 files changed, 48 insertions(+)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 30da78f72b3b..a3deeb907fdd 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -5,6 +5,7 @@
 #include 
 
 #include 
+#include 
 
 #include 
 #include 
@@ -129,10 +130,29 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO);
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL);
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP);
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP);
break;
case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
kvm_ptp_get_time(vcpu, val);
break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID:
+   val[0] = PAGE_SIZE;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID:
+		set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags);
+   val[0] = SMCCC_RET_SUCCESS;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID:
+   if (kvm_install_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
+   val[0] = SMCCC_RET_SUCCESS;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID:
+   if (kvm_remove_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
+   val[0] = SMCCC_RET_SUCCESS;
+   break;
case ARM_SMCCC_TRNG_VERSION:
case ARM_SMCCC_TRNG_FEATURES:
case ARM_SMCCC_TRNG_GET_UUID:
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 7d1cabe15262..4aab2078d8d3 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -107,6 +107,10 @@
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES0
 #define ARM_SMCCC_KVM_FUNC_PTP 1
+#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO 2
+#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL   3
+#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP  4
+#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP5
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2  127
 #define ARM_SMCCC_KVM_NUM_FUNCS128
 
@@ -133,6 +137,30 @@
 #define KVM_PTP_VIRT_COUNTER   0
 #define KVM_PTP_PHYS_COUNTER   1
 
+#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID   \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_64,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO)
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_64,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL)
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID		\
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_64,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP)
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID  \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_64,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP)
+
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES  \
ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
-- 
2.30.2



[PATCH 05/16] KVM: arm64: Plumb MMIO checking into the fault handling

2021-07-15 Thread Marc Zyngier
Plumb the MMIO checking code into the MMIO fault handling code.
Nothing allows a region to be registered yet, so there should be
no functional change either.
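
The page-boundary case deserves a worked example: with 4k pages, an
8-byte access faulting at IPA 0x40000ffc also touches 0x40001000, so
both pages must pass the check. A standalone illustration of the test
used below:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* same expression as in io_mem_abort() */
    static bool crosses_page(uint64_t ipa, unsigned int len)
    {
            return ((ipa + len - 1) & PAGE_MASK) != (ipa & PAGE_MASK);
    }

    int main(void)
    {
            printf("%d\n", crosses_page(0x40000ffcULL, 8)); /* 1 */
            printf("%d\n", crosses_page(0x40000ff8ULL, 8)); /* 0 */
            return 0;
    }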

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/mmio.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 3dd38a151d2a..fd5747279d27 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include "trace.h"
@@ -130,6 +131,10 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
int len;
u8 data_buf[8];
 
+   /* Check failed? Return to the guest for debriefing... */
+   if (!kvm_check_ioguard_page(vcpu, fault_ipa))
+   return 1;
+
/*
 * No valid syndrome? Ask userspace for help if it has
 * volunteered to do so, and bail out otherwise.
@@ -156,6 +161,11 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
len = kvm_vcpu_dabt_get_as(vcpu);
rt = kvm_vcpu_dabt_get_rd(vcpu);
 
+   /* If we cross a page boundary, check that too... */
+   if (((fault_ipa + len - 1) & PAGE_MASK) != (fault_ipa & PAGE_MASK) &&
+   !kvm_check_ioguard_page(vcpu, fault_ipa + len - 1))
+   return 1;
+
if (is_write) {
data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
   len);
-- 
2.30.2



[PATCH 00/16] KVM: arm64: MMIO guard PV services

2021-07-15 Thread Marc Zyngier
KVM/arm64 currently considers that any memory access outside of a
memslot is a MMIO access. This so far has served us very well, but
obviously relies on the guest trusting the host, and especially
userspace to do the right thing.

As we keep on hacking away at pKVM, it becomes obvious that this trust
model is not really fit for a confidential computing environment, and
that the guest would require some guarantees that emulation only
occurs on portions of the address space that have clearly been
identified for this purpose.

This series aims at providing the two sides of the above coin:

- a set of PV services (collectively called 'MMIO guard' -- better
  name required!) where the guest can flag portions of its address
  space that it considers as MMIO, with map/unmap semantics. Any
  attempt to access an MMIO range outside of these regions will result
  in an external abort being injected.

- a set of hooks into the ioremap code allowing a Linux guest to tell
  KVM about things it wants to consider as MMIO. I definitely hate this
  part of the series, as it feels clumsy and brittle.

For now, the enrolment in this scheme is controlled by a guest kernel
command-line parameter, but it is expected that KVM will enforce this
for protected VMs.

Note that this crucially misses a save/restore interface for non
protected VMs, and I currently don't have a good solution for
that. Ideas welcome.

I also plan to use this series as a base for some other purposes,
namely to trick the guest in telling us how it maps things like
prefetchable BARs (see the discussion at [1]). That part is not
implemented yet, but there is already some provision to pass the MAIR
index across.

Patches on top of 5.14-rc1, branch pushed at the usual location.

[1] 20210429162906.32742-1-sdonthin...@nvidia.com

Marc Zyngier (16):
  KVM: arm64: Generalise VM features into a set of flags
  KVM: arm64: Don't issue CMOs when the physical address is invalid
  KVM: arm64: Turn kvm_pgtable_stage2_set_owner into
kvm_pgtable_stage2_annotate
  KVM: arm64: Add MMIO checking infrastructure
  KVM: arm64: Plumb MMIO checking into the fault handling
  KVM: arm64: Force a full unmap on vcpu reinit
  KVM: arm64: Wire MMIO guard hypercalls
  KVM: arm64: Add tracepoint for failed MMIO guard check
  KVM: arm64: Advertise a capability for MMIO guard
  KVM: arm64: Add some documentation for the MMIO guard feature
  firmware/smccc: Call arch-specific hook on discovering KVM services
  mm/ioremap: Add arch-specific callbacks on ioremap/iounmap calls
  arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard
  arm64: Enroll into KVM's MMIO guard if required
  arm64: Add a helper to retrieve the PTE of a fixmap
  arm64: Register earlycon fixmap with the MMIO guard

 .../admin-guide/kernel-parameters.txt |   3 +
 Documentation/virt/kvm/arm/index.rst  |   1 +
 Documentation/virt/kvm/arm/mmio-guard.rst |  73 +++
 arch/arm/include/asm/hypervisor.h |   1 +
 arch/arm64/include/asm/fixmap.h   |   2 +
 arch/arm64/include/asm/hypervisor.h   |   2 +
 arch/arm64/include/asm/kvm_host.h |  14 ++-
 arch/arm64/include/asm/kvm_mmu.h  |   5 +
 arch/arm64/include/asm/kvm_pgtable.h  |  12 +-
 arch/arm64/kernel/setup.c |   6 +
 arch/arm64/kvm/arm.c  |  14 ++-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c |  14 ++-
 arch/arm64/kvm/hyp/pgtable.c  |  36 +++---
 arch/arm64/kvm/hypercalls.c   |  20 +++
 arch/arm64/kvm/mmio.c |  13 +-
 arch/arm64/kvm/mmu.c  | 117 ++
 arch/arm64/kvm/trace_arm.h|  17 +++
 arch/arm64/mm/ioremap.c   | 107 
 arch/arm64/mm/mmu.c   |  15 +++
 drivers/firmware/smccc/kvm_guest.c|   4 +
 include/linux/arm-smccc.h |  28 +
 include/linux/io.h|   3 +
 include/uapi/linux/kvm.h  |   1 +
 mm/ioremap.c  |  13 +-
 mm/vmalloc.c  |   8 ++
 25 files changed, 492 insertions(+), 37 deletions(-)
 create mode 100644 Documentation/virt/kvm/arm/mmio-guard.rst

-- 
2.30.2



[PATCH 04/16] KVM: arm64: Add MMIO checking infrastructure

2021-07-15 Thread Marc Zyngier
Introduce the infrastructure required to identify an IPA region
that is expected to be used as an MMIO window.

This includes mapping, unmapping and checking the regions. Nothing
calls into it yet, so no functional change is expected.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/include/asm/kvm_mmu.h  |   5 ++
 arch/arm64/kvm/mmu.c  | 115 ++
 3 files changed, 122 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 4add6c27251f..914c1b7bb3ad 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -125,6 +125,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
/* Memory Tagging Extension enabled for the guest */
 #define KVM_ARCH_FLAG_MTE_ENABLED  1
+	/* Guest has bought into the MMIO guard extension */
+#define KVM_ARCH_FLAG_MMIO_GUARD   2
unsigned long flags;
 
/*
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index b52c5c4b9a3d..f6b8fc1671b3 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -170,6 +170,11 @@ phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(u32 *hyp_va_bits);
 
+/* MMIO guard */
+bool kvm_install_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
+bool kvm_remove_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
+bool kvm_check_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa);
+
 static inline void *__kvm_vector_slot2addr(void *base,
   enum arm64_hyp_spectre_vector slot)
 {
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3155c9e778f0..638827c8842b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1120,6 +1120,121 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
kvm_set_pfn_accessed(pte_pfn(pte));
 }
 
+#define MMIO_NOTE  ('M' << 24 | 'M' << 16 | 'I' << 8 | '0')
+
+bool kvm_install_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa)
+{
+   struct kvm_mmu_memory_cache *memcache;
+   struct kvm_memory_slot *memslot;
+   int ret, idx;
+
+	if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags))
+   return false;
+
+   /* Must be page-aligned */
+   if (ipa & ~PAGE_MASK)
+   return false;
+
+   /*
+* The page cannot be in a memslot. At some point, this will
+* have to deal with device mappings though.
+*/
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	memslot = gfn_to_memslot(vcpu->kvm, ipa >> PAGE_SHIFT);
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+   if (memslot)
+   return false;
+
+   /* Guest has direct access to the GICv2 virtual CPU interface */
+   if (irqchip_in_kernel(vcpu->kvm) &&
+   vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+   ipa == vcpu->kvm->arch.vgic.vgic_cpu_base)
+   return true;
+
+	memcache = &vcpu->arch.mmu_page_cache;
+   if (kvm_mmu_topup_memory_cache(memcache,
+  kvm_mmu_cache_min_pages(vcpu->kvm)))
+   return false;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+   ret = kvm_pgtable_stage2_annotate(vcpu->arch.hw_mmu->pgt,
+ ipa, PAGE_SIZE, memcache,
+ MMIO_NOTE);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+   return ret == 0;
+}
+
+struct s2_walk_data {
+   kvm_pte_t   pteval;
+   u32 level;
+};
+
+static int s2_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+   struct s2_walk_data *data = arg;
+
+   data->level = level;
+   data->pteval = *ptep;
+   return 0;
+}
+
+/* Assumes mmu_lock taken */
+static bool __check_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa)
+{
+   struct s2_walk_data data;
+   struct kvm_pgtable_walker walker = {
+   .cb = s2_walker,
+   .flags  = KVM_PGTABLE_WALK_LEAF,
+		.arg	= &data,
+   };
+
+   kvm_pgtable_walk(vcpu->arch.hw_mmu->pgt, ALIGN_DOWN(ipa, PAGE_SIZE),
+			 PAGE_SIZE, &walker);
+
+   /* Must be a PAGE_SIZE mapping with our annotation */
+   return (BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(data.level)) == PAGE_SIZE &&
+   data.pteval == MMIO_NOTE);
+}
+
+bool kvm_remove_ioguard_page(struct kvm_vcpu *vcpu, gpa_t ipa)
+{
+   bool ret;
+
+	if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags))
+   return false;
+
+   /* Keep the PT locked across the two walks */
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+   ret = __check_ioguard_page(vcpu, ipa);
+	if (ret)	/* Drop the annotation */
+		kvm_pgtable_stage2_unmap(vcpu->arch.hw_mmu->pgt,
+					 ALIGN_DOWN(ipa, PAGE_SIZE), PAGE_SIZE);
+
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	return ret;
+}

[PATCH 01/16] KVM: arm64: Generalise VM features into a set of flags

2021-07-15 Thread Marc Zyngier
We currently deal with a set of booleans for VM features,
while they could be better represented as a set of flags
contained in an unsigned long, similarly to what we are
doing on the CPU side.
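
The resulting pattern, in a nutshell (illustrative only):

    /* before: one struct member per feature */
    kvm->arch.mte_enabled = true;

    /* after: one flags word, atomic bitops, trivially extensible */
    set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
    if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags))
            /* feature is on */;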

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/kvm_host.h | 12 +++-
 arch/arm64/kvm/arm.c  |  5 +++--
 arch/arm64/kvm/mmio.c |  3 ++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..4add6c27251f 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -122,7 +122,10 @@ struct kvm_arch {
 * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
 * supported.
 */
-   bool return_nisv_io_abort_to_user;
+#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0
+   /* Memory Tagging Extension enabled for the guest */
+#define KVM_ARCH_FLAG_MTE_ENABLED  1
+   unsigned long flags;
 
/*
 * VM-wide PMU filter, implemented as a bitmap and big enough for
@@ -133,9 +136,6 @@ struct kvm_arch {
 
u8 pfr0_csv2;
u8 pfr0_csv3;
-
-   /* Memory Tagging Extension enabled for the guest */
-   bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -777,7 +777,9 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
-#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
+#define kvm_has_mte(kvm)   \
+   (system_supports_mte() &&   \
+test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))
 #define kvm_vcpu_has_pmu(vcpu) \
(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e9a2b8f27792..97ab1512c44f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -91,13 +91,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
switch (cap->cap) {
case KVM_CAP_ARM_NISV_TO_USER:
r = 0;
-   kvm->arch.return_nisv_io_abort_to_user = true;
+   set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
+			&kvm->arch.flags);
break;
case KVM_CAP_ARM_MTE:
if (!system_supports_mte() || kvm->created_vcpus)
return -EINVAL;
r = 0;
-   kvm->arch.mte_enabled = true;
+		set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
break;
default:
r = -EINVAL;
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index 3e2d8ba11a02..3dd38a151d2a 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -135,7 +135,8 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 * volunteered to do so, and bail out otherwise.
 */
if (!kvm_vcpu_dabt_isvalid(vcpu)) {
-   if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
+   if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
+		     &vcpu->kvm->arch.flags)) {
run->exit_reason = KVM_EXIT_ARM_NISV;
 			run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
run->arm_nisv.fault_ipa = fault_ipa;
-- 
2.30.2



[PATCH 02/16] KVM: arm64: Don't issue CMOs when the physical address is invalid

2021-07-15 Thread Marc Zyngier
Make sure we don't issue CMOs when mapping something that
is not a memory address in the S2 page tables.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/kvm/hyp/pgtable.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 05321f4165e3..a5874ebd0354 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -619,12 +619,16 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
}
 
/* Perform CMOs before installation of the guest stage-2 PTE */
-   if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
-   mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
-   granule);
-
-   if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
-   mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
+   if (kvm_phys_is_valid(phys)) {
+   if (mm_ops->dcache_clean_inval_poc &&
+   stage2_pte_cacheable(pgt, new))
+   mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new,
+ mm_ops),
+  granule);
+   if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+   mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops),
+granule);
+   }
 
smp_store_release(ptep, new);
if (stage2_pte_is_counted(new))
-- 
2.30.2



Any way to disable KVM VHE extension?

2021-07-15 Thread Qu Wenruo

Hi,

Recently I'm playing around the Nvidia Xavier AGX board, which has VHE 
extension support.


In theory, considering the CPU and memory, it should be pretty powerful 
compared to boards like RPI CM4.


But to my surprise, KVM runs pretty poor on Xavier.

Just booting the edk2 firmware could take over 10s, and 20s to fully 
boot the kernel.
Even my VM on RPI CM4 has way faster boot time, even just running on a
PCIe 2.0 x1 lane NVMe, with just four 2.1GHz A72 cores.


This is definitely out of my expectation, I double checked to be sure 
that it's running in KVM mode.


But further digging shows that, since Xavier AGX CPU supports VHE, kvm 
is running in VHE mode other than HYP mode on CM4.


Is there any way to manually disable VHE mode to test the more common HYP 
mode on Xavier?


BTW, this is the dmesg related to KVM on Xavier, running v5.13 upstream 
kernel, with 64K page size:

[0.852357] kvm [1]: IPA Size Limit: 40 bits
[0.857378] kvm [1]: vgic interrupt IRQ9
[0.862122] kvm: pmu event creation failed -2
[0.866734] kvm [1]: VHE mode initialized successfully

While on CM4, the host runs v5.12.10 upstream kernel (with downstream 
dtb), with 4K page size:

[1.276818] kvm [1]: IPA Size Limit: 44 bits
[1.278425] kvm [1]: vgic interrupt IRQ9
[1.278620] kvm [1]: Hyp mode initialized successfully

Could it be the PAGE size causing the problem?

Thanks,
Qu



Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Qu Wenruo



On 2021/7/15 5:28 PM, Robin Murphy wrote:

On 2021-07-15 09:55, Qu Wenruo wrote:

Hi,

Recently I'm playing around the Nvidia Xavier AGX board, which has VHE 
extension support.


In theory, considering the CPU and memory, it should be pretty 
powerful compared to boards like RPI CM4.


But to my surprise, KVM runs pretty poor on Xavier.

Just booting the edk2 firmware could take over 10s, and 20s to fully 
boot the kernel.
Even my VM on RPI CM4 has way faster boot time, even just running on 
PCIE2.0 x1 lane NVME, and just 4 2.1Ghz A72 core.


This is definitely out of my expectation, I double checked to be sure 
that it's running in KVM mode.


But further digging shows that, since Xavier AGX CPU supports VHE, kvm 
is running in VHE mode other than HYP mode on CM4.


Is there anyway to manually disable VHE mode to test the more common 
HYP mode on Xavier?


According to kernel-parameters.txt, "kvm-arm.mode=nvhe" (or its 
low-level equivalent "id_aa64mmfr1.vh=0") on the command line should do 
that.


Thanks for this one, I stupidly only searched modinfo of kvm, and didn't 
even bother to search arch/arm64/kvm...




However I'd imagine the discrepancy is likely to be something more 
fundamental to the wildly different microarchitectures. There's 
certainly no harm in giving non-VHE a go for comparison, but I wouldn't 
be surprised if it turns out even slower...


You're totally right, with nvhe mode, it's still the same slow speed.

BTW, what did you mean by the "wildly different microarch"?
Is ARMv8.2 arch that different from ARMv8 of RPI4?

And any extra methods I could try to explore the reason of the slowness?

At least RPI CM4 is beyond my expectation and is working pretty fine.

Thanks,
Qu



Robin.

BTW, this is the dmesg related to KVM on Xavier, running v5.13 
upstream kernel, with 64K page size:

[    0.852357] kvm [1]: IPA Size Limit: 40 bits
[    0.857378] kvm [1]: vgic interrupt IRQ9
[    0.862122] kvm: pmu event creation failed -2
[    0.866734] kvm [1]: VHE mode initialized successfully

While on CM4, the host runs v5.12.10 upstream kernel (with downstream 
dtb), with 4K page size:

[    1.276818] kvm [1]: IPA Size Limit: 44 bits
[    1.278425] kvm [1]: vgic interrupt IRQ9
[    1.278620] kvm [1]: Hyp mode initialized successfully

Could it be the PAGE size causing problem?

Thanks,
Qu








Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Qu Wenruo



On 2021/7/15 4:55 PM, Qu Wenruo wrote:

Hi,

Recently I'm playing around the Nvidia Xavier AGX board, which has VHE 
extension support.


In theory, considering the CPU and memory, it should be pretty powerful 
compared to boards like RPI CM4.


But to my surprise, KVM runs pretty poor on Xavier.

Just booting the edk2 firmware could take over 10s, and 20s to fully 
boot the kernel.
Even my VM on RPI CM4 has way faster boot time, even just running on 
PCIE2.0 x1 lane NVME, and just 4 2.1Ghz A72 core.


This is definitely out of my expectation, I double checked to be sure 
that it's running in KVM mode.


But further digging shows that, since Xavier AGX CPU supports VHE, kvm 
is running in VHE mode other than HYP mode on CM4.


Is there anyway to manually disable VHE mode to test the more common HYP 
mode on Xavier?


BTW, this is the dmesg related to KVM on Xavier, running v5.13 upstream 
kernel, with 64K page size:

[    0.852357] kvm [1]: IPA Size Limit: 40 bits
[    0.857378] kvm [1]: vgic interrupt IRQ9
[    0.862122] kvm: pmu event creation failed -2
[    0.866734] kvm [1]: VHE mode initialized successfully


Wait, the kernel I'm currently running on Xavier is still using 4K page 
size, just like CM4.


Thus it should not be a problem in page size.

Thanks,
Qu


While on CM4, the host runs v5.12.10 upstream kernel (with downstream 
dtb), with 4K page size:

[    1.276818] kvm [1]: IPA Size Limit: 44 bits
[    1.278425] kvm [1]: vgic interrupt IRQ9
[    1.278620] kvm [1]: Hyp mode initialized successfully

Could it be the PAGE size causing problem?

Thanks,
Qu




Re: [PATCH 1/3] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-15 Thread Marc Zyngier
On Thu, 15 Jul 2021 12:51:49 +0100,
Robin Murphy  wrote:
> 
> On 2021-07-15 12:11, Marc Zyngier wrote:
> > Hi Alex,
> > 
> > On Wed, 14 Jul 2021 16:48:07 +0100,
> > Alexandru Elisei  wrote:
> >> 
> >> Hi Marc,
> >> 
> >> On 7/13/21 2:58 PM, Marc Zyngier wrote:
> >>> A number of the PMU sysregs expose reset values that are not
> >>> compliant with the architecture (set bits in the RES0 ranges,
> >>> for example).
> >>> 
> >>> This in turn has the effect that we need to pointlessly mask
> >>> some registers when using them.
> >>> 
> >>> Let's start by making sure we don't have illegal values in the
> >>> shadow registers at reset time. This affects all the registers
> >>> that dedicate one bit per counter, the counters themselves,
> >>> PMEVTYPERn_EL0 and PMSELR_EL0.
> >>> 
> >>> Reported-by: Alexandre Chartre 
> >>> Signed-off-by: Marc Zyngier 
> >>> ---
> >>>   arch/arm64/kvm/sys_regs.c | 46 ---
> >>>   1 file changed, 43 insertions(+), 3 deletions(-)
> >>> 
> >>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> >>> index f6f126eb6ac1..95ccb8f45409 100644
> >>> --- a/arch/arm64/kvm/sys_regs.c
> >>> +++ b/arch/arm64/kvm/sys_regs.c
> >>> @@ -603,6 +603,44 @@ static unsigned int pmu_visibility(const struct 
> >>> kvm_vcpu *vcpu,
> >>>   return REG_HIDDEN;
> >>>   }
> >>>   +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct
> >>> sys_reg_desc *r)
> >>> +{
> >>> + u64 n, mask;
> >>> +
> >>> + /* No PMU available, any PMU reg may UNDEF... */
> >>> + if (!kvm_arm_support_pmu_v3())
> >>> + return;
> >>> +
> >>> + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
> >> 
> >> Isn't this going to cause a lot of unnecessary traps with NV? Is
> >> that going to be a problem?
> > 
> > We'll get new traps at L2 VM creation if we expose a PMU to the L1
> > guest, and if L2 gets one too. I don't think that's a real problem, as
> > the performance of an L2 PMU is bound to be hilarious, and if we are
> > really worried about that, we can always cache it locally. Which is
> > likely the best thing to do if you think of big-little.
> > 
> > Let's not think of big-little.
> > 
> > Another thing is that we could perfectly ignore the number of counters
> > on the host and always expose the architectural maximum, given that
> > the PMU is completely emulated. With that, no trap.
> 
> Although that would deliberately exacerbate the existing problem of
> guest counters mysteriously under-reporting due to the host event
> getting multiplexed, thus arguably make the L2 PMU even less useful.

Oh, absolutely. But the current implementation of the PMU emulation
would be pretty terrible on NV anyway.

> But then trying to analyse application performance under NV at all
> seems to stand a high chance of being akin to shovelling fog, so...

Indeed. Not to mention that there is no (publicly available) HW to
measure performance on anyway...

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 1/3] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-15 Thread Robin Murphy

On 2021-07-15 12:11, Marc Zyngier wrote:

Hi Alex,

On Wed, 14 Jul 2021 16:48:07 +0100,
Alexandru Elisei  wrote:


Hi Marc,

On 7/13/21 2:58 PM, Marc Zyngier wrote:

A number of the PMU sysregs expose reset values that are not
compliant with the architecture (set bits in the RES0 ranges,
for example).

This in turn has the effect that we need to pointlessly mask
some registers when using them.

Let's start by making sure we don't have illegal values in the
shadow registers at reset time. This affects all the registers
that dedicate one bit per counter, the counters themselves,
PMEVTYPERn_EL0 and PMSELR_EL0.

Reported-by: Alexandre Chartre 
Signed-off-by: Marc Zyngier 
---
  arch/arm64/kvm/sys_regs.c | 46 ---
  1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f6f126eb6ac1..95ccb8f45409 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -603,6 +603,44 @@ static unsigned int pmu_visibility(const struct kvm_vcpu 
*vcpu,
return REG_HIDDEN;
  }
  
+static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)

+{
+   u64 n, mask;
+
+   /* No PMU available, any PMU reg may UNDEF... */
+   if (!kvm_arm_support_pmu_v3())
+   return;
+
+   n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;


Isn't this going to cause a lot of unnecessary traps with NV? Is
that going to be a problem?


We'll get new traps at L2 VM creation if we expose a PMU to the L1
guest, and if L2 gets one too. I don't think that's a real problem, as
the performance of an L2 PMU is bound to be hilarious, and if we are
really worried about that, we can always cache it locally. Which is
likely the best thing to do if you think of big-little.

Let's not think of big-little.

Another thing is that we could perfectly ignore the number of counters
on the host and always expose the architectural maximum, given that
the PMU is completely emulated. With that, no trap.


Although that would deliberately exacerbate the existing problem of 
guest counters mysteriously under-reporting due to the host event 
getting multiplexed, thus arguably make the L2 PMU even less useful.


But then trying to analyse application performance under NV at all seems 
to stand a high chance of being akin to shovelling fog, so...


Robin.


Re: [PATCH 1/3] KVM: arm64: Narrow PMU sysreg reset values to architectural requirements

2021-07-15 Thread Marc Zyngier
Hi Alex,

On Wed, 14 Jul 2021 16:48:07 +0100,
Alexandru Elisei  wrote:
> 
> Hi Marc,
> 
> On 7/13/21 2:58 PM, Marc Zyngier wrote:
> > A number of the PMU sysregs expose reset values that are not
> > compliant with the architecture (set bits in the RES0 ranges,
> > for example).
> >
> > This in turn has the effect that we need to pointlessly mask
> > some registers when using them.
> >
> > Let's start by making sure we don't have illegal values in the
> > shadow registers at reset time. This affects all the registers
> > that dedicate one bit per counter, the counters themselves,
> > PMEVTYPERn_EL0 and PMSELR_EL0.
> >
> > Reported-by: Alexandre Chartre 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/arm64/kvm/sys_regs.c | 46 ---
> >  1 file changed, 43 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> > index f6f126eb6ac1..95ccb8f45409 100644
> > --- a/arch/arm64/kvm/sys_regs.c
> > +++ b/arch/arm64/kvm/sys_regs.c
> > @@ -603,6 +603,44 @@ static unsigned int pmu_visibility(const struct 
> > kvm_vcpu *vcpu,
> > return REG_HIDDEN;
> >  }
> >  
> > +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
> > *r)
> > +{
> > +   u64 n, mask;
> > +
> > +   /* No PMU available, any PMU reg may UNDEF... */
> > +   if (!kvm_arm_support_pmu_v3())
> > +   return;
> > +
> > +   n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
> 
> Isn't this going to cause a lot of unnecessary traps with NV? Is
> that going to be a problem?

We'll get new traps at L2 VM creation if we expose a PMU to the L1
guest, and if L2 gets one too. I don't think that's a real problem, as
the performance of an L2 PMU is bound to be hilarious, and if we are
really worried about that, we can always cache it locally. Which is
likely the best thing to do if you think of big-little.

Let's not think of big-little.

Another thing is that we could perfectly well ignore the number of
counters on the host and always expose the architectural maximum, given that
the PMU is completely emulated. With that, no trap.
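
Something like this, as a completely untested sketch, assuming the tail
of reset_pmu_reg() keeps deriving the mask from n as in this patch:

static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 n, mask;

        /* No PMU available, any PMU reg may UNDEF... */
        if (!kvm_arm_support_pmu_v3())
                return;

        /*
         * The guest PMU is fully emulated, so expose the architectural
         * maximum number of counters instead of reading the host's
         * PMCR_EL0: no sysreg access, hence no trap under NV.
         */
        n = ARMV8_PMU_PMCR_N_MASK;

        reset_unknown(vcpu, r);

        mask = BIT(ARMV8_PMU_CYCLE_IDX);
        if (n)
                mask |= GENMASK(n - 1, 0);

        __vcpu_sys_reg(vcpu, r->reg) &= mask;
}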

> Because at the moment I can't think of an elegant way to avoid it,
> other than special casing PMCR_EL0 in kvm_reset_sys_regs() and using
> here __vcpu_sys_reg(vcpu, PMCR_EL0). Or, even better, using
> kvm_pmu_valid_counter_mask(vcpu), since this is identical to what
> that function does.

I looked into that and bailed out, as it creates interesting ordering
problems...
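
Roughly, you end up with something like this (untested sketch), which
is only valid if PMCR_EL0 is guaranteed to have been reset first:

static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        if (!kvm_arm_support_pmu_v3())
                return;

        reset_unknown(vcpu, r);

        /* Reads the shadow PMCR_EL0.N, so that must be valid by now */
        __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_valid_counter_mask(vcpu);
}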

> 
> > +   n &= ARMV8_PMU_PMCR_N_MASK;
> > +
> > +   reset_unknown(vcpu, r);
> > +
> > +   mask = BIT(ARMV8_PMU_CYCLE_IDX);
> 
> PMSWINC_EL0 has bit 31 RES0. Other than that, looked at all the PMU
> registers and everything looks correct to me.

PMSWINC_EL0 is a RAZ/WO register, which really shouldn't have a shadow
counterpart (the storage is completely unused). Let me get rid of this
sucker in v2.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Marc Zyngier
On Thu, 15 Jul 2021 10:44:32 +0100,
Qu Wenruo  wrote:
> 
> 
> 
> On 2021/7/15 5:28 PM, Robin Murphy wrote:
> > On 2021-07-15 09:55, Qu Wenruo wrote:
> >> Hi,
> >> 
> >> Recently I'm playing around with the Nvidia Xavier AGX board, which has
> >> VHE extension support.
> >> 
> >> In theory, considering the CPU and memory, it should be pretty
> >> powerful compared to boards like RPI CM4.
> >> 
> >> But to my surprise, KVM runs pretty poorly on Xavier.
> >> 
> >> Just booting the edk2 firmware could take over 10s, and 20s to
> >> fully boot the kernel.
> >> Even my VM on RPI CM4 has way faster boot time, even just running
> >> on PCIE2.0 x1 lane NVME, and just 4 2.1GHz A72 cores.
> >> 
> >> This is definitely not what I expected; I double checked to be
> >> sure that it's running in KVM mode.
> >> 
> >> But further digging shows that, since Xavier AGX CPU supports VHE,
> >> kvm is running in VHE mode rather than the HYP mode used on CM4.
> >> 
> >> Is there any way to manually disable VHE mode to test the more
> >> common HYP mode on Xavier?
> > 
> > According to kernel-parameters.txt, "kvm-arm.mode=nvhe" (or its
> > low-level equivalent "id_aa64mmfr1.vh=0") on the command line should
> > do that.
> 
> Thanks for this one, I stupidly only searched modinfo of kvm, and
> didn't even bother to search arch/arm64/kvm...
> 
> > 
> > However I'd imagine the discrepancy is likely to be something more
> > fundamental to the wildly different microarchitectures. There's
> > certainly no harm in giving non-VHE a go for comparison, but I
> > wouldn't be surprised if it turns out even slower...
> 
> You're totally right, with nvhe mode, it's still the same slow speed.

My experience with Denver (Nvidia's previous core) is that it is
horribly slow when running KVM. I guess that the JIT-like microarch
fares poorly with exceptions and save-restore operations.

> BTW, what did you mean by the "wildly different microarch"?
> Is ARMv8.2 arch that different from ARMv8 of RPI4?
> 
> And any extra methods I could try to explore the reason for the slowness?
> 
> At least RPI CM4 is beyond my expectations and is working pretty well.
> 
> Thanks,
> Qu
> 
> > 
> > Robin.
> > 
> >> BTW, this is the dmesg related to KVM on Xavier, running v5.13
> >> upstream kernel, with 64K page size:
> >> [    0.852357] kvm [1]: IPA Size Limit: 40 bits
> >> [    0.857378] kvm [1]: vgic interrupt IRQ9
> >> [    0.862122] kvm: pmu event creation failed -2

And this isn't going to help find the bottleneck, as the kernel
doesn't find a PMU. On Denver, once the PMU is enabled, profiling
anything makes the whole thing even slower. At which point, I just
parked the board and forgot about it.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Mark Rutland
On Thu, Jul 15, 2021 at 11:00:42AM +0100, Robin Murphy wrote:
> On 2021-07-15 10:44, Qu Wenruo wrote:
> > 
> > 
> > On 2021/7/15 5:28 PM, Robin Murphy wrote:
> > > On 2021-07-15 09:55, Qu Wenruo wrote:
> > > > Hi,
> > > > 
> > > > Recently I'm playing around with the Nvidia Xavier AGX board, which
> > > > has VHE extension support.
> > > > 
> > > > In theory, considering the CPU and memory, it should be pretty
> > > > powerful compared to boards like RPI CM4.
> > > > 
> > > > But to my surprise, KVM runs pretty poorly on Xavier.
> > > > 
> > > > Just booting the edk2 firmware could take over 10s, and 20s to
> > > > fully boot the kernel.
> > > > Even my VM on RPI CM4 has way faster boot time, even just
> > > > running on PCIE2.0 x1 lane NVME, and just 4 2.1GHz A72 cores.
> > > > 
> > > > This is definitely not what I expected; I double checked to be
> > > > sure that it's running in KVM mode.
> > > > 
> > > > But further digging shows that, since Xavier AGX CPU supports
> > > > VHE, kvm is running in VHE mode rather than the HYP mode used on CM4.
> > > > 
> > > > Is there any way to manually disable VHE mode to test the more
> > > > common HYP mode on Xavier?
> > > 
> > > According to kernel-parameters.txt, "kvm-arm.mode=nvhe" (or its
> > > low-level equivalent "id_aa64mmfr1.vh=0") on the command line should
> > > do that.
> > 
> > Thanks for this one, I stupidly only searched modinfo of kvm, and didn't
> > even bother to search arch/arm64/kvm...
> > 
> > > 
> > > However I'd imagine the discrepancy is likely to be something more
> > > fundamental to the wildly different microarchitectures. There's
> > > certainly no harm in giving non-VHE a go for comparison, but I
> > > wouldn't be surprised if it turns out even slower...
> > 
> > You're totally right, with nvhe mode, it's still the same slow speed.
> > 
> > BTW, what did you mean by the "wildly different microarch"?
> > Is ARMv8.2 arch that different from ARMv8 of RPI4?
> 
> I don't mean Armv8.x architectural features, I mean the actual
> implementation of NVIDIA's Carmel core is very, very different from
> Cortex-A72 or indeed our newer v8.2 Cortex-A designs.
> 
> > And any extra methods I could try to explore the reason for the slowness?
> 
> I guess the first check would be whether you're trapping and exiting the VM
> significantly more. I believe there are stats somewhere, but I don't know
> exactly where, sorry - I know very little about actually *using* KVM :)
> 
> If it's not that, then it might just be that EDK2 is doing a lot of cache
> maintenance or system register modification or some other operation that
> happens to be slower on Carmel compared to Cortex-A72.

It would also be worth checking that the CPUs are running at the speed you
expect, in case e.g. the lack of a DVFS driver means they're running
slow, and this just happens to be more noticeable in a VM.

You can estimate that by using `perf stat` on the host on a busy loop,
and looking at what the cpu-cycles count implies.
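
For example, with a trivial busy loop (a hypothetical busy.c, built
with -O0 so the loop survives):

/* busy.c - spin for a fixed number of iterations */
int main(void)
{
        volatile unsigned long i;

        for (i = 0; i < 1000000000UL; i++)
                ;
        return 0;
}

Dividing the cpu-cycles count from `perf stat ./busy` by the elapsed
time gives a rough effective clock rate to compare against the nominal
frequency.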

Thanks,
Mark.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Robin Murphy

On 2021-07-15 10:44, Qu Wenruo wrote:



On 2021/7/15 5:28 PM, Robin Murphy wrote:

On 2021-07-15 09:55, Qu Wenruo wrote:

Hi,

Recently I'm playing around with the Nvidia Xavier AGX board, which has 
VHE extension support.


In theory, considering the CPU and memory, it should be pretty 
powerful compared to boards like RPI CM4.


But to my surprise, KVM runs pretty poorly on Xavier.

Just booting the edk2 firmware could take over 10s, and 20s to fully 
boot the kernel.
Even my VM on RPI CM4 has way faster boot time, even just running on 
PCIE2.0 x1 lane NVME, and just 4 2.1GHz A72 cores.


This is definitely not what I expected; I double checked to be sure 
that it's running in KVM mode.


But further digging shows that, since Xavier AGX CPU supports VHE, 
kvm is running in VHE mode rather than the HYP mode used on CM4.


Is there any way to manually disable VHE mode to test the more common 
HYP mode on Xavier?


According to kernel-parameters.txt, "kvm-arm.mode=nvhe" (or its 
low-level equivalent "id_aa64mmfr1.vh=0") on the command line should 
do that.


Thanks for this one, I stupidly only searched modinfo of kvm, and didn't 
even bother to search arch/arm64/kvm...




However I'd imagine the discrepancy is likely to be something more 
fundamental to the wildly different microarchitectures. There's 
certainly no harm in giving non-VHE a go for comparison, but I 
wouldn't be surprised if it turns out even slower...


You're totally right, with nvhe mode, it's still the same slow speed.

BTW, what did you mean by the "wildly different microarch"?
Is ARMv8.2 arch that different from ARMv8 of RPI4?


I don't mean Armv8.x architectural features, I mean the actual 
implementation of NVIDIA's Carmel core is very, very different from 
Cortex-A72 or indeed our newer v8.2 Cortex-A designs.



And any extra methods I could try to explore the reason for the slowness?


I guess the first check would be whether you're trapping and exiting the 
VM significantly more. I believe there are stats somewhere, but I don't 
know exactly where, sorry - I know very little about actually *using* KVM :)


If it's not that, then it might just be that EDK2 is doing a lot of 
cache maintenance or system register modification or some other 
operation that happens to be slower on Carmel compared to Cortex-A72.


Robin.


At least RPI CM4 is beyond my expectations and is working pretty well.

Thanks,
Qu



Robin.

BTW, this is the dmesg related to KVM on Xavier, running v5.13 
upstream kernel, with 64K page size:

[    0.852357] kvm [1]: IPA Size Limit: 40 bits
[    0.857378] kvm [1]: vgic interrupt IRQ9
[    0.862122] kvm: pmu event creation failed -2
[    0.866734] kvm [1]: VHE mode initialized successfully

While on CM4, the host runs v5.12.10 upstream kernel (with downstream 
dtb), with 4K page size:

[    1.276818] kvm [1]: IPA Size Limit: 44 bits
[    1.278425] kvm [1]: vgic interrupt IRQ9
[    1.278620] kvm [1]: Hyp mode initialized successfully

Could it be the PAGE size causing the problem?

Thanks,
Qu


___
linux-arm-kernel mailing list
linux-arm-ker...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel





___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 5/5] KVM: arm64: nVHE: Remove unneeded isb() when modifying PMSCR_EL1

2021-07-15 Thread Alexandru Elisei
Hi Will,

On 7/14/21 7:20 PM, Will Deacon wrote:
> On Wed, Jul 14, 2021 at 10:56:01AM +0100, Alexandru Elisei wrote:
>> According to ARM DDI 0487G.a, page D9-2930, profiling is disabled when
>> the PE is executing at a higher exception level than the profiling
>> buffer owning exception level. This is also confirmed by the pseudocode
>> for the StatisticalProfilingEnabled() function.
>>
>> During the world switch and before activating guest traps, KVM executes
>> at EL2 with the buffer owning exception level being EL1 (MDCR_EL2.E2PB =
>> 0b11). As a result, profiling is already disabled when draining the
>> buffer, making the isb() after the write to PMSCR_EL1 unnecessary.
>>
>> CC: Will Deacon 
>> Signed-off-by: Alexandru Elisei 
>> ---
>>  arch/arm64/kvm/hyp/nvhe/debug-sr.c | 1 -
>>  1 file changed, 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
>> index 7d3f25868cae..fdf0e0ba17e9 100644
>> --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
>> +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
>> @@ -33,7 +33,6 @@ static void __debug_save_spe(u64 *pmscr_el1)
>>  /* Yes; save the control register and disable data generation */
>>  *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1);
>>  write_sysreg_s(0, SYS_PMSCR_EL1);
>> -isb();
> Hmm, but we still need an ISB somewhere between clearing pmscr_el1 and
> mdcr_el2.e2pb, right? Where does that occur?

Yes, we do need an ISB to make sure we don't start profiling using the EL2&0
translation regime, but with a buffer pointer programmed by the host at EL1
which is most likely not even mapped at EL2.

When I wrote the patch, I reasoned that the ISB in
__sysreg_restore_state_nvhe->__sysreg_restore_el1_state and the isb from
__load_stage2 will make sure that PMSCR_EL1 is cleared before the change to the
buffer owning regime.

As I was double checking that just now, I realized that *both* ISBs are
executed only if the system has ARM64_WORKAROUND_SPECULATIVE_AT. No ISB
gets executed when the workaround is not needed. We could make the ISB
here depend on the system not having the workaround, but it looks to me
like there's little to be gained from that (just one less ISB when the
workaround is applied), at the expense of making the code even more
difficult to reason about.
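
For completeness, the conditional version would look something like
this (sketch only):

        /* Yes; save the control register and disable data generation */
        *pmscr_el1 = read_sysreg_s(SYS_PMSCR_EL1);
        write_sysreg_s(0, SYS_PMSCR_EL1);
        /*
         * The ISBs on the restore path only run when the
         * SPECULATIVE_AT workaround is applied, so synchronize here
         * ourselves otherwise.
         */
        if (!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT))
                isb();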

My preference would be to drop this patch.

Thanks,

Alex

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Any way to disable KVM VHE extension?

2021-07-15 Thread Robin Murphy

On 2021-07-15 09:55, Qu Wenruo wrote:

Hi,

Recently I'm playing around with the Nvidia Xavier AGX board, which has VHE 
extension support.


In theory, considering the CPU and memory, it should be pretty powerful 
compared to boards like RPI CM4.


But to my surprise, KVM runs pretty poorly on Xavier.

Just booting the edk2 firmware could take over 10s, and 20s to fully 
boot the kernel.
Even my VM on RPI CM4 has way faster boot time, even just running on 
PCIE2.0 x1 lane NVME, and just 4 2.1GHz A72 cores.


This is definitely not what I expected; I double checked to be sure 
that it's running in KVM mode.


But further digging shows that, since Xavier AGX CPU supports VHE, kvm 
is running in VHE mode rather than the HYP mode used on CM4.


Is there any way to manually disable VHE mode to test the more common HYP 
mode on Xavier?


According to kernel-parameters.txt, "kvm-arm.mode=nvhe" (or its 
low-level equivalent "id_aa64mmfr1.vh=0") on the command line should do 
that.


However I'd imagine the discrepancy is likely to be something more 
fundamental to the wildly different microarchitectures. There's 
certainly no harm in giving non-VHE a go for comparison, but I wouldn't 
be surprised if it turns out even slower...


Robin.

BTW, this is the dmesg related to KVM on Xavier, running v5.13 upstream 
kernel, with 64K page size:

[    0.852357] kvm [1]: IPA Size Limit: 40 bits
[    0.857378] kvm [1]: vgic interrupt IRQ9
[    0.862122] kvm: pmu event creation failed -2
[    0.866734] kvm [1]: VHE mode initialized successfully

While on CM4, the host runs v5.12.10 upstream kernel (with downstream 
dtb), with 4K page size:

[    1.276818] kvm [1]: IPA Size Limit: 44 bits
[    1.278425] kvm [1]: vgic interrupt IRQ9
[    1.278620] kvm [1]: Hyp mode initialized successfully

Could it be the PAGE size causing the problem?

Thanks,
Qu


___
linux-arm-kernel mailing list
linux-arm-ker...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 0/3] kvm-arm64: Fix PMU reset values (and more)

2021-07-15 Thread Alexandre Chartre




On 7/13/21 3:58 PM, Marc Zyngier wrote:

Hi all,

After some back and forth with Alexandre about patch #3 of this
series, it became apparent that some of the PMU code paths perform
some unnecessary masking, only to hide the fact that some of the PMU
register reset values are not architecturally compliant (RES0 bits get
set, among other things).

The first patch of this series addresses the reset value problem, the
second one rids us of the pointless masking, and Alexandre's patch
(which depends on the first two) is slapped on top, with a small
cosmetic change.



Thanks Marc.

You can add my Reviewed-by to patches 1 and 2:

Reviewed-by: Alexandre Chartre 

alex.



Alexandre Chartre (1):
   KVM: arm64: Disabling disabled PMU counters wastes a lot of time

Marc Zyngier (2):
   KVM: arm64: Narrow PMU sysreg reset values to architectural
 requirements
   KVM: arm64: Drop unnecessary masking of PMU registers

  arch/arm64/kvm/pmu-emul.c |  8 +++---
  arch/arm64/kvm/sys_regs.c | 52 ++-
  2 files changed, 50 insertions(+), 10 deletions(-)


___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm