[PATCH 01/17] arm64: Turn kaslr_feature_override into a generic SW feature override

2022-10-20 Thread Marc Zyngier
Disabling KASLR from the command line is implemented as a feature
override. Repaint it slightly so that it can further be used as
more generic infrastructure for SW override purposes.
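
For readers less familiar with the override machinery, here is a
minimal user-space sketch (illustrative only, not the kernel code) of
how a 4-bit field is carved out of the {val, mask} pair that an
arm64_ftr_override carries; shift 0 stands in for
ARM64_SW_FEATURE_OVERRIDE_NOKASLR:

#include <stdint.h>
#include <stdio.h>

struct ftr_override {
	uint64_t val;	/* override values, one 4-bit field per feature */
	uint64_t mask;	/* which fields have actually been overridden */
};

/* mirrors what cpuid_feature_extract_unsigned_field() does for 4 bits */
static unsigned int extract_field(uint64_t reg, unsigned int shift)
{
	return (reg >> shift) & 0xf;
}

int main(void)
{
	struct ftr_override sw = { 0, 0 };

	/* "nokaslr" on the command line sets field 0 to 1 */
	sw.val  |= 1ULL << 0;
	sw.mask |= 0xfULL << 0;

	if (extract_field(sw.val & sw.mask, 0))
		printf("KASLR disabled on command line\n");
	return 0;
}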

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h |  4 ++++
 arch/arm64/kernel/cpufeature.c      |  2 ++
 arch/arm64/kernel/idreg-override.c  | 16 ++++++----------
 arch/arm64/kernel/kaslr.c           |  6 +++---
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index f73f11b55042..f44a7860636f 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -15,6 +15,8 @@
 #define MAX_CPU_FEATURES   128
 #define cpu_feature(x) KERNEL_HWCAP_ ## x
 
+#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR  0
+
 #ifndef __ASSEMBLY__
 
 #include <linux/bug.h>
@@ -914,6 +916,8 @@ extern struct arm64_ftr_override id_aa64smfr0_override;
 extern struct arm64_ftr_override id_aa64isar1_override;
 extern struct arm64_ftr_override id_aa64isar2_override;
 
+extern struct arm64_ftr_override arm64_sw_feature_override;
+
 u32 get_kvm_ipa_limit(void);
 void dump_cpu_features(void);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6062454a9067..a3959e9f7d55 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -620,6 +620,8 @@ struct arm64_ftr_override __ro_after_init id_aa64smfr0_override;
 struct arm64_ftr_override __ro_after_init id_aa64isar1_override;
 struct arm64_ftr_override __ro_after_init id_aa64isar2_override;
 
+struct arm64_ftr_override arm64_sw_feature_override;
+
 static const struct __ftr_reg_entry {
u32 sys_id;
	struct arm64_ftr_reg	*reg;
diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c
index 95133765ed29..4e8ef5e05db7 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -137,15 +137,11 @@ static const struct ftr_set_desc smfr0 __initconst = {
},
 };
 
-extern struct arm64_ftr_override kaslr_feature_override;
-
-static const struct ftr_set_desc kaslr __initconst = {
-   .name   = "kaslr",
-#ifdef CONFIG_RANDOMIZE_BASE
-   .override   = &kaslr_feature_override,
-#endif
+static const struct ftr_set_desc sw_features __initconst = {
+   .name   = "arm64_sw",
+   .override   = &arm64_sw_feature_override,
.fields = {
-   FIELD("disabled", 0, NULL),
+   FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL),
{}
},
 };
@@ -157,7 +153,7 @@ static const struct ftr_set_desc * const regs[] __initconst = {
    &isar1,
    &isar2,
    &smfr0,
-   &kaslr,
+   &sw_features,
 };
 
 static const struct {
@@ -174,7 +170,7 @@ static const struct {
  "id_aa64isar1.api=0 id_aa64isar1.apa=0 "
  "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0"},
{ "arm64.nomte","id_aa64pfr1.mte=0" },
-   { "nokaslr","kaslr.disabled=1" },
+   { "nokaslr","arm64_sw.nokaslr=1" },
 };
 
 static int __init find_field(const char *cmdline,
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 325455d16dbc..7b39283278e5 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -23,8 +23,6 @@
 u64 __ro_after_init module_alloc_base;
 u16 __initdata memstart_offset_seed;
 
-struct arm64_ftr_override kaslr_feature_override __initdata;
-
 static int __init kaslr_init(void)
 {
u64 module_range;
@@ -36,7 +34,9 @@ static int __init kaslr_init(void)
 */
module_alloc_base = (u64)_etext - MODULES_VSIZE;
 
-   if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) {
+   if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val &
+                                            arm64_sw_feature_override.mask,
+                                            ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) {
pr_info("KASLR disabled on command line\n");
return 0;
}
-- 
2.34.1



[PATCH 00/17] KVM: arm64: Allow using VHE in the nVHE hypervisor

2022-10-20 Thread Marc Zyngier
KVM (on ARMv8.0) and pKVM (on all revisions of the architecture) use
the split hypervisor model that makes the EL2 code more or less
standalone. For this, we totally ignore the VHE mode and stick with
the good old v8.0 EL2 setup.

This is all good, but means that the EL2 code is limited in what it
can do with its own address space. This series proposes to remove this
limitation and to allow VHE to be used even with the split hypervisor
model. This has some potential isolation benefits[1], and *maybe*
allows deviant systems to eventually run pKVM.

It introduces a new "mode" for KVM called hVHE, in reference to the
nVHE mode, and indicating that only the hypervisor is using VHE. Note
that this is all this series does. No effort is made to improve the VA
space management, which will be the subject of another series if this
one ever makes it.

This has been lightly tested on an M1 box, with no measurable change in
performance.

Thanks,

M.

[1] https://www.youtube.com/watch?v=1F_Mf2j9eIo&list=PLbzoR-pLrL6qWL3v2KOcvwZ54-w0z5uXV&index=11

Marc Zyngier (17):
  arm64: Turn kaslr_feature_override into a generic SW feature override
  arm64: Add KVM_HVHE capability and has_hvhe() predicate
  arm64: Don't enable VHE for the kernel if OVERRIDE_HVHE is set
  arm64: Prevent the use of is_kernel_in_hyp_mode() in hypervisor code
  arm64: Allow EL1 physical timer access when running VHE
  arm64: Use CPACR_EL1 format to set CPTR_EL2 when E2H is set
  KVM: arm64: Elide kern_hyp_va() in VHE-specific parts of the
hypervisor
  KVM: arm64: Remove alternatives from sysreg accessors in VHE
hypervisor context
  KVM: arm64: Key use of VHE instructions in nVHE code off
ARM64_KVM_HVHE
  KVM: arm64: Force HCR_EL2.E2H when ARM64_KVM_HVHE is set
  KVM: arm64: Disable TTBR1_EL2 when using ARM64_KVM_HVHE
  KVM: arm64: Adjust EL2 stage-1 leaf AP bits when ARM64_KVM_HVHE is set
  KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration
  KVM: arm64: Program the timer traps with VHE layout in hVHE mode
  KVM: arm64: Force HCR_E2H in guest context when ARM64_KVM_HVHE is set
  arm64: Allow arm64_sw.hvhe on command line
  KVM: arm64: Terrible timer hack for M1 with hVHE

 arch/arm64/include/asm/arch_timer.h |  8 
 arch/arm64/include/asm/cpufeature.h |  5 +++
 arch/arm64/include/asm/el2_setup.h  | 16 +++-
 arch/arm64/include/asm/kvm_arm.h|  3 --
 arch/arm64/include/asm/kvm_asm.h|  1 +
 arch/arm64/include/asm/kvm_emulate.h| 33 +++-
 arch/arm64/include/asm/kvm_hyp.h| 37 +-
 arch/arm64/include/asm/kvm_mmu.h|  4 ++
 arch/arm64/include/asm/virt.h   | 15 +++-
 arch/arm64/kernel/cpufeature.c  | 17 +
 arch/arm64/kernel/hyp-stub.S| 21 ++-
 arch/arm64/kernel/idreg-override.c  | 25 -
 arch/arm64/kernel/image-vars.h  |  3 ++
 arch/arm64/kernel/kaslr.c   |  6 +--
 arch/arm64/kvm/arch_timer.c |  5 +++
 arch/arm64/kvm/arm.c| 12 +-
 arch/arm64/kvm/fpsimd.c |  4 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h |  2 +-
 arch/arm64/kvm/hyp/nvhe/hyp-main.c  | 17 -
 arch/arm64/kvm/hyp/nvhe/pkvm.c  | 27 ++---
 arch/arm64/kvm/hyp/nvhe/switch.c| 28 --
 arch/arm64/kvm/hyp/nvhe/timer-sr.c  | 29 ++
 arch/arm64/kvm/hyp/pgtable.c|  6 ++-
 arch/arm64/kvm/hyp/vhe/switch.c |  2 +-
 arch/arm64/tools/cpucaps|  1 +
 drivers/irqchip/irq-apple-aic.c | 50 -
 26 files changed, 312 insertions(+), 65 deletions(-)

-- 
2.34.1



[PATCH 02/17] arm64: Add KVM_HVHE capability and has_hvhe() predicate

2022-10-20 Thread Marc Zyngier
Expose a capability keying the hVHE feature as well as a new
predicate testing it. Nothing is so far using it, and nothing
is enabling it yet.
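
As a rough illustration of the lifecycle of such a capability (a
user-space sketch under assumed semantics, not the actual cpufeature
framework), the entry is evaluated via its matches() hook at boot time
and then frozen, which is loosely what
ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE amounts to:

#include <stdbool.h>
#include <stdio.h>

struct cpu_capability {
	const char *desc;
	bool (*matches)(void);
	bool enabled;		/* frozen after boot */
};

static bool hvhe_possible(void)
{
	/* the real hook inspects arm64_sw_feature_override val/mask */
	return true;
}

static struct cpu_capability kvm_hvhe_cap = {
	.desc    = "VHE for hypervisor only",
	.matches = hvhe_possible,
};

int main(void)
{
	/* boot-time finalisation: evaluate once, never re-evaluate */
	kvm_hvhe_cap.enabled = kvm_hvhe_cap.matches();
	printf("%s: %s\n", kvm_hvhe_cap.desc,
	       kvm_hvhe_cap.enabled ? "enabled" : "disabled");
	return 0;
}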

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h |  1 +
 arch/arm64/include/asm/virt.h       |  8 ++++++++
 arch/arm64/kernel/cpufeature.c      | 15 +++++++++++++++
 arch/arm64/tools/cpucaps            |  1 +
 4 files changed, 25 insertions(+)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index f44a7860636f..2d41015429b3 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -16,6 +16,7 @@
 #define cpu_feature(x) KERNEL_HWCAP_ ## x
 
 #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR  0
+#define ARM64_SW_FEATURE_OVERRIDE_HVHE 4
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 4eb601e7de50..b11a1c8c8b36 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -140,6 +140,14 @@ static __always_inline bool is_protected_kvm_enabled(void)
return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
 }
 
+static __always_inline bool has_hvhe(void)
+{
+   if (is_vhe_hyp_code())
+   return false;
+
+   return cpus_have_final_cap(ARM64_KVM_HVHE);
+}
+
 static inline bool is_hyp_nvhe(void)
 {
return is_hyp_mode_available() && !is_kernel_in_hyp_mode();
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a3959e9f7d55..efac89c4c548 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1932,6 +1932,15 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused)
write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
 }
 
+static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
+ int __unused)
+{
+   u64 val;
+
+   val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask;
+   return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE);
+}
+
 #ifdef CONFIG_ARM64_PAN
 static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
 {
@@ -2642,6 +2651,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_cpuid_feature,
.cpu_enable = cpu_trap_el0_impdef,
},
+   {
+   .desc = "VHE for hypervisor only",
+   .capability = ARM64_KVM_HVHE,
+   .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+   .matches = hvhe_possible,
+   },
{},
 };
 
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index f1c0347ec31a..cee2be85b89b 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -43,6 +43,7 @@ HAS_TLB_RANGE
 HAS_VIRT_HOST_EXTN
 HAS_WFXT
 HW_DBM
+KVM_HVHE
 KVM_PROTECTED_MODE
 MISMATCHED_CACHE_TYPE
 MTE
-- 
2.34.1



[PATCH 09/17] KVM: arm64: Key use of VHE instructions in nVHE code off ARM64_KVM_HVHE

2022-10-20 Thread Marc Zyngier
We can now start with the fun stuff: if we enable VHE *only* for
the hypervisor, we need to generate the VHE instructions when
accessing the system registers.

For this, repurpose the alternative sequence to be keyed off
ARM64_KVM_HVHE in the nVHE hypervisor code, and only there.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_hyp.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 461fc0dc4a70..e45215a62b43 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -33,12 +33,18 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
 
 #else // !__KVM_VHE_HYPERVISOR__
 
+#if defined(__KVM_NVHE_HYPERVISOR__)
+#define VHE_ALT_KEY    ARM64_KVM_HVHE
+#else
+#define VHE_ALT_KEY    ARM64_HAS_VIRT_HOST_EXTN
+#endif
+
 #define read_sysreg_elx(r,nvh,vh)  \
({  \
u64 reg;\
-   asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
+   asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
 __mrs_s("%0", r##vh),  \
-ARM64_HAS_VIRT_HOST_EXTN)  \
+VHE_ALT_KEY)   \
 : "=r" (reg)); \
reg;\
})
@@ -48,7 +54,7 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
u64 __val = (u64)(v);   \
asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"),\
 __msr_s(r##vh, "%x0"), \
-ARM64_HAS_VIRT_HOST_EXTN)  \
+VHE_ALT_KEY)   \
 : : "rZ" (__val)); \
} while (0)
 
-- 
2.34.1



[PATCH 06/17] arm64: Use CPACR_EL1 format to set CPTR_EL2 when E2H is set

2022-10-20 Thread Marc Zyngier
When HCR_EL2.E2H is set, the CPTR_EL2 register takes the CPACR_EL1
format. Yes, this is good fun.

Hack the bits of startup code that assume E2H=0 while setting up
CPTR_EL2 to make them grok the CPACR_EL1 format.
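
To see what this means in practice, here is a user-space sketch
(illustrative only; the bit positions are the architectural ones,
CPACR_EL1.FPEN at bits [21:20] and ZEN at bits [17:16]) of the two
values the boot code now has to choose between:

#include <stdint.h>
#include <stdio.h>

#define CPACR_EL1_ZEN_EL1EN	(1UL << 16)	/* SVE: no trap from EL1 */
#define CPACR_EL1_ZEN_EL0EN	(1UL << 17)	/* SVE: no trap from EL0 */
#define CPACR_EL1_FPEN_EL1EN	(1UL << 20)	/* FP/SIMD: no trap from EL1 */
#define CPACR_EL1_FPEN_EL0EN	(1UL << 21)	/* FP/SIMD: no trap from EL0 */

int main(void)
{
	int e2h = 1;	/* HCR_EL2.E2H, as set up by the boot code */
	uint64_t cptr;

	if (e2h)	/* E2H=1: CPTR_EL2 uses the CPACR_EL1 layout */
		cptr = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN;
	else		/* E2H=0: traditional CPTR_EL2 layout, RES1 bits */
		cptr = 0x33ff;

	printf("cptr_el2 = 0x%llx\n", (unsigned long long)cptr);
	return 0;
}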

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/el2_setup.h | 11 +++++++++--
 arch/arm64/kernel/hyp-stub.S       | 11 +++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index fa1045f709bb..605ff1a482db 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -129,8 +129,15 @@
 .endm
 
 /* Coprocessor traps */
-.macro __init_el2_nvhe_cptr
+.macro __init_el2_cptr
+   mrs x1, hcr_el2
+   and x1, x1, #HCR_E2H
+   cbz x1, .LnVHE_\@
+   mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN)
+   b   .Lset_cptr_\@
+.LnVHE_\@:
mov x0, #0x33ff
+.Lset_cptr_\@:
	msr cptr_el2, x0		// Disable copro. traps to EL2
 .endm
 
@@ -196,7 +203,7 @@
__init_el2_gicv3
__init_el2_hstr
__init_el2_nvhe_idregs
-   __init_el2_nvhe_cptr
+   __init_el2_cptr
__init_el2_fgt
__init_el2_nvhe_prepare_eret
 .endm
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 0601cc9592bd..7d2f24ae8c98 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -102,7 +102,18 @@ SYM_CODE_START_LOCAL(__finalise_el2)
 
 .Linit_sve:/* SVE register access */
	mrs x0, cptr_el2		// Disable SVE traps
+
+   mrs x1, hcr_el2
+   and x1, x1, #HCR_E2H
+   cbz x1, .Lcptr_nvhe
+
+   // VHE case
+   orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
+   b   .Lset_cptr
+
+.Lcptr_nvhe: // nVHE case
bic x0, x0, #CPTR_EL2_TZ
+.Lset_cptr:
msr cptr_el2, x0
isb
mov x1, #ZCR_ELx_LEN_MASK   // SVE: Enable full vector
-- 
2.34.1



[PATCH 08/17] KVM: arm64: Remove alternatives from sysreg accessors in VHE hypervisor context

2022-10-20 Thread Marc Zyngier
In the VHE hypervisor code, we should be using the remapped VHE
accessors, no ifs, no buts. No need to generate any alternative.
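
Concretely, read_sysreg_el1(SYS_SCTLR) in VHE hypervisor code now
compiles straight to an access to the _EL12 encoding, with no runtime
patching. A small user-space sketch of the token-pasting at work
(illustrative, printing the register name instead of emitting an MRS):

#include <stdio.h>

#define read_sysreg_s(r)	printf("mrs x0, %s\n", #r)

/* VHE hypervisor context: _EL1 accessors use the _EL12 encodings */
#define read_sysreg_el1(r)	read_sysreg_s(r##_EL12)

int main(void)
{
	read_sysreg_el1(SYS_SCTLR);	/* prints: mrs x0, SYS_SCTLR_EL12 */
	return 0;
}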

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_hyp.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index aa7fa2a08f06..461fc0dc4a70 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -16,6 +16,23 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
 DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
 DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
 
+/*
+ * Unified accessors for registers that have a different encoding
+ * between VHE and non-VHE. They must be specified without their "ELx"
+ * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
+ */
+
+#if defined(__KVM_VHE_HYPERVISOR__)
+
+#define read_sysreg_el0(r) read_sysreg_s(r##_EL02)
+#define write_sysreg_el0(v,r)  write_sysreg_s(v, r##_EL02)
+#define read_sysreg_el1(r) read_sysreg_s(r##_EL12)
+#define write_sysreg_el1(v,r)  write_sysreg_s(v, r##_EL12)
+#define read_sysreg_el2(r) read_sysreg_s(r##_EL1)
+#define write_sysreg_el2(v,r)  write_sysreg_s(v, r##_EL1)
+
+#else // !__KVM_VHE_HYPERVISOR__
+
 #define read_sysreg_elx(r,nvh,vh)  \
({  \
u64 reg;\
@@ -35,12 +52,6 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
 : : "rZ" (__val)); \
} while (0)
 
-/*
- * Unified accessors for registers that have a different encoding
- * between VHE and non-VHE. They must be specified without their "ELx"
- * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
- */
-
 #define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02)
 #define write_sysreg_el0(v,r)  write_sysreg_elx(v, r, _EL0, _EL02)
 #define read_sysreg_el1(r) read_sysreg_elx(r, _EL1, _EL12)
@@ -48,6 +59,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
 #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1)
 #define write_sysreg_el2(v,r)  write_sysreg_elx(v, r, _EL2, _EL1)
 
+#endif // __KVM_VHE_HYPERVISOR__
+
 /*
  * Without an __arch_swab32(), we fall back to ___constant_swab32(), but the
  * static inline can allow the compiler to out-of-line this. KVM always wants
-- 
2.34.1



[PATCH 04/17] arm64: Prevent the use of is_kernel_in_hyp_mode() in hypervisor code

2022-10-20 Thread Marc Zyngier
Using is_kernel_in_hyp_mode() in hypervisor code is a pretty bad
mistake. This helper only checks for CurrentEL being EL2, which
is always true.

Make the link fail if using the helper in hypervisor context
by referencing a non-existent function. Whilst we're at it,
flag the helper as __always_inline, which it really should be.
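
The pattern generalises to any "this must never be compiled in"
guard: a call to an extern function that has no definition anywhere
compiles fine, but the link fails the moment the forbidden
configuration actually emits the call. A minimal sketch (illustrative
names; flip the macro to 1 to see the undefined reference):

/* deliberately never defined anywhere */
extern void this_should_never_be_called(void);

#define FORBIDDEN_CONTEXT 0	/* stands in for __KVM_NVHE_HYPERVISOR__ */

static inline int checked_helper(void)
{
#if FORBIDDEN_CONTEXT
	this_should_never_be_called();	/* undefined reference at link time */
#endif
	return 42;
}

int main(void)
{
	return checked_helper() == 42 ? 0 : 1;
}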

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/virt.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index b11a1c8c8b36..5f84a87a6a2d 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -110,8 +110,13 @@ static inline bool is_hyp_mode_mismatched(void)
return __boot_cpu_mode[0] != __boot_cpu_mode[1];
 }
 
-static inline bool is_kernel_in_hyp_mode(void)
+extern void gotcha_is_kernel_in_hyp_mode(void);
+
+static __always_inline bool is_kernel_in_hyp_mode(void)
 {
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+   gotcha_is_kernel_in_hyp_mode();
+#endif
return read_sysreg(CurrentEL) == CurrentEL_EL2;
 }
 
-- 
2.34.1



[PATCH 05/17] arm64: Allow EL1 physical timer access when running VHE

2022-10-20 Thread Marc Zyngier
To initialise the timer access from EL2 when HCR_EL2.E2H is set,
we must make sure the CNTHCTL_EL2 format used is the appropriate one.

This amounts to shifting the timer/counter enable bits by 10
to the left.
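
A user-space sketch of the bit placement (illustrative; per the
architecture, the EL1 physical timer/counter enable bits sit at
CNTHCTL_EL2[1:0] when E2H=0 and at CNTHCTL_EL2[11:10] when E2H=1):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int e2h = 1;		/* HCR_EL2.E2H */
	uint64_t cnthctl = 0x3;	/* EL1PCTEN | EL1PCEN, E2H=0 layout */

	if (e2h)
		cnthctl <<= 10;	/* same enables live at [11:10] with E2H=1 */

	printf("cnthctl_el2 = 0x%llx\n", (unsigned long long)cnthctl);
	return 0;
}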

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/el2_setup.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 668569adf4d3..fa1045f709bb 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -34,6 +34,11 @@
  */
 .macro __init_el2_timers
mov x0, #3  // Enable EL1 physical timers
+   mrs x1, hcr_el2
+   and x1, x1, #HCR_E2H
+   cbz x1, .LnVHE_\@
+   lsl x0, x0, #10
+.LnVHE_\@:
msr cnthctl_el2, x0
msr cntvoff_el2, xzr// Clear virtual offset
 .endm
-- 
2.34.1



[PATCH 03/17] arm64: Don't enable VHE for the kernel if OVERRIDE_HVHE is set

2022-10-20 Thread Marc Zyngier
If the OVERRIDE_HVHE SW override is set (as a precursor of
the KVM_HVHE capability), do not enable VHE for the kernel
and drop to EL1 as if VHE was either disabled or unavailable.

Further changes will enable VHE at EL2 only, with the kernel
still running at EL1.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kernel/hyp-stub.S | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 2ee18c860f2a..0601cc9592bd 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -157,7 +157,15 @@ SYM_CODE_START_LOCAL(__finalise_el2)
	tbnz	x1, #0, 1f
 
// Needs to be VHE capable, obviously
-   check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f
+   check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f
+
+0: // Check whether we only want the hypervisor to run VHE, not the kernel
+   adr_l   x1, arm64_sw_feature_override
+   ldr x2, [x1, FTR_OVR_VAL_OFFSET]
+   ldr x1, [x1, FTR_OVR_MASK_OFFSET]
+   and x2, x2, x1
+	ubfx	x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4
+   cbz x2, 2f
 
 1: mov_q   x0, HVC_STUB_ERR
eret
-- 
2.34.1



[PATCH 07/17] KVM: arm64: Elide kern_hyp_va() in VHE-specific parts of the hypervisor

2022-10-20 Thread Marc Zyngier
For VHE-specific hypervisor code, kern_hyp_va() is a NOP.

Actually, it is a whole range of NOPs. It'd be much better if
this code simply didn't exist. Let's just do that.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_mmu.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7784081088e7..e32725e90360 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -63,6 +63,7 @@
  * specific registers encoded in the instructions).
  */
 .macro kern_hyp_va reg
+#ifndef __KVM_VHE_HYPERVISOR__
 alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
and \reg, \reg, #1  /* mask with va_mask */
ror \reg, \reg, #1  /* rotate to the first tag bit */
@@ -70,6 +71,7 @@ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_update_va_mask
add \reg, \reg, #0, lsl 12  /* insert the top 12 bits of the tag */
ror \reg, \reg, #63 /* rotate back */
 alternative_cb_end
+#endif
 .endm
 
 /*
@@ -126,6 +128,7 @@ void kvm_apply_hyp_relocations(void);
 
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
+#ifndef __KVM_VHE_HYPERVISOR__
asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
"ror %0, %0, #1\n"
"add %0, %0, #0\n"
@@ -134,6 +137,7 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
ARM64_ALWAYS_SYSTEM,
kvm_update_va_mask)
 : "+r" (v));
+#endif
return v;
 }
 
-- 
2.34.1



Re: [PATCH 0/3] KVM: arm64: nv: Fixes for Nested Virtualization issues

2022-10-19 Thread Marc Zyngier
On Mon, 10 Oct 2022 06:56:31 +0100,
Ganapatrao Kulkarni  wrote:
>
> Hi Marc,
> 
> Any review comments on this series?

Not yet. So far, the NV stuff is put on ice until I can source some
actual HW to make the development less painful.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3] KVM: arm64: nvhe: Fix build with profile optimization

2022-10-15 Thread Marc Zyngier
On Fri, 14 Oct 2022 11:45:32 -0700, Denis Nikitin wrote:
> Kernel build with clang and KCFLAGS=-fprofile-sample-use= fails with:
> 
> error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL
> section ".rel.llvm.call-graph-profile"
> 
> Starting from 13.0.0 llvm can generate SHT_REL section, see
> https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086.
> gen-hyprel does not support SHT_REL relocation section.
> 
> [...]

Applied to fixes, thanks!

[1/1] KVM: arm64: nvhe: Fix build with profile optimization
  commit: bde971a83bbff78561458ded236605a365411b87

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH] KVM: arm64: vgic: fix wrong loop condition in scan_its_table()

2022-10-14 Thread Marc Zyngier
On Thu, 13 Oct 2022 17:42:31 +0100,
Eric Auger  wrote:
> 
> Hi Eric,
> 
> On 10/12/22 20:33, Marc Zyngier wrote:
> > Hi Eric,
> > 
> > Before I comment on this patch, a couple of things that need
> > addressing:
> > 
> >> "Cc: marc.zyng...@arm.com, cd...@linaro.org"
> > 
> > None of these two addresses are valid anymore, and haven't been for
> > several years.
> > 
> > Please consult the MAINTAINERS file for up-to-date addresses for
> > current maintainers and reviewers, all of whom should be Cc'd on this
> > email. I've now added them as well as Eric Auger who has written most
> > of the ITS migration code, and the new mailing list (the Columbia list
> > is about to be killed).

Duh, I never CC'd the new list... Now hopefully done.

> > 
> > On Wed, 12 Oct 2022 17:59:25 +0100,
> > Eric Ren  wrote:
> >>
> >> Reproducer hints:
> >> 1. Create ARM virt VM with pxb-pcie bus which adds
> >>extra host bridges, with qemu command like:
> >>
> >> ```
> >>   -device pxb-pcie,bus_nr=8,id=pci.x,numa_node=0,bus=pcie.0 \
> >>   -device pcie-root-port,..,bus=pci.x \
> >>   ...
> >>   -device pxb-pcie,bus_nr=37,id=pci.y,numa_node=1,bus=pcie.0 \
> >>   -device pcie-root-port,..,bus=pci.y \
> >>   ...
> >>
> >> ```
> >> 2. Perform VM migration which calls save/restore device tables.
> >>
> >> In that setup, we get a big "offset" between 2 device_ids (
> >> one is small, another is big), which makes unsigned "len" round
> >> up a big positive number, causing loop to continue exceptionally.
> > 
> > You'll have to spell it out for me here. If you have a very sparse
> > device ID and you are only using a single level device table, you are
> > bound to have a large len. Now, is the issue that 'size' is so large
> > that it is negative as an 'int'? Describing the exact situation you're
> > in would help a lot.
> > 
> >>
> >> Signed-off-by: Eric Ren 
> >> ---
> >>  arch/arm64/kvm/vgic/vgic-its.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
> >> index 24d7778d1ce6..673554ef02f9 100644
> >> --- a/arch/arm64/kvm/vgic/vgic-its.c
> >> +++ b/arch/arm64/kvm/vgic/vgic-its.c
> >> @@ -2141,7 +2141,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
> >>  int start_id, entry_fn_t fn, void *opaque)
> >>  {
> >>struct kvm *kvm = its->dev->kvm;
> >> -  unsigned long len = size;
> >> +  ssize_t len = size;
> > 
> > This feels wrong, really. If anything, all these types should be
> > unsigned, not signed. Signed types in this context make very little
> > sense...
> 
> After digging into the code back again, I realized I told you something
> wrong. The next_offset is the delta between the current device id and
> the next one. The next device can perfectly be in a different L1 device

A different L2 table, surely? By definition, we only have a single L1
table.

> table, - it is your case actually- , in which case the code is
> definitely broken.
> 
> So I guess we should rather have a
> while (true) {
>   ../..
>   if (byte_offset >= len)
>   break;
>   len -= byte_offset;
> }
> 
> You can add a Fixes tag too:
> Fixes: 920a7a8fa92a ("KVM: arm64: vgic-its: Add infrastructure for table
> lookup")
> and cc stable@vger.kernel.org

Just to make it clear, do you mean this:

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 9d3299a70242..e722cafdff60 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2162,6 +2162,9 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
return next_offset;
 
byte_offset = next_offset * esz;
+   if (byte_offset >= len)
+   break;
+
id += next_offset;
gpa += byte_offset;
len -= byte_offset;


If so, then I agree that this is a sensible fix. EricR, do you mind
respinning this ASAP so that I can get it merged and backported?
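
For the record, the failure mode being fixed is plain unsigned
wrap-around: once byte_offset exceeds the remaining len, the
subtraction wraps to a huge value and the walk carries on well past
the end of the table. A reduced user-space demonstration (illustrative
values only):

#include <stdio.h>

int main(void)
{
	unsigned long len = 64;			/* bytes left in this table */
	unsigned long byte_offset = 4096;	/* sparse IDs: next entry far away */

	/* without the "byte_offset >= len" check above, this wraps... */
	len -= byte_offset;
	printf("len is now %lu -> the loop keeps going\n", len);
	return 0;
}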

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


[REPOST][URGENT] kvmarm mailing list migration

2022-10-13 Thread Marc Zyngier
[Reposting this, as it has been almost two weeks since the initial
 announcement and we're still at sub-10% of the users having
 subscribed to the new list]
 
Hi all,

As you probably all know, the kvmarm mailing list has been hosted on
Columbia's machines for as long as the project existed (over 13
years). After all this time, the university has decided to retire the
list infrastructure and asked us to find a new hosting.

A new mailing list has been created on lists.linux.dev[1], and I'm
kindly asking everyone interested in following the KVM/arm64
developments to start subscribing to it (and start posting your
patches there). I hope that people will move over to it quickly enough
that we can soon give Columbia the green light to turn their systems
off.

Note that the new list will only get archived automatically once we
fully switch over, but I'll make sure we fill any gap and not lose any
message. In the meantime, please Cc both lists.

I would like to thank Columbia University for their long lasting
support and willingness to help during this transition, as well as
Konstantin (and the kernel.org crew) for quickly stepping up to the
challenge and giving us a new home!

Thanks,

M.

[1] https://subspace.kernel.org/lists.linux.dev.html

-- 
Without deviation from the norm, progress is not possible.


[GIT PULL] KVM/arm64 fixes for 6.1, take #1

2022-10-13 Thread Marc Zyngier
Paolo,

Here's the first set of fixes for 6.1. The most interesting bit is
Oliver's fix limiting the S2 invalidation batch size to the largest
block mapping, solving (at least for now) the RCU stall problems we
have been seeing for a while. We may have to find another solution
when (and if) we decide to allow 4TB mapping at S2...

The rest is a set of minor selftest fixes as well as enabling stack
protection and profiling in the VHE code.

Please pull,

   M.

The following changes since commit b302ca52ba8235ff0e18c0fa1fa92b51784aef6a:

  Merge branch kvm-arm64/misc-6.1 into kvmarm-master/next (2022-10-01 10:19:36 
+0100)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
tags/kvmarm-fixes-6.1-1

for you to fetch changes up to 05c2224d4b049406b0545a10be05280ff4b8ba0a:

  KVM: selftests: Fix number of pages for memory slot in 
memslot_modification_stress_test (2022-10-13 11:46:51 +0100)


KVM/arm64 fixes for 6.1, take #1

- Fix for stage-2 invalidation holding the VM MMU lock
  for too long by limiting the walk to the largest
  block mapping size

- Enable stack protection and branch profiling for VHE

- Two selftest fixes


Gavin Shan (1):
  KVM: selftests: Fix number of pages for memory slot in 
memslot_modification_stress_test

Oliver Upton (2):
  KVM: arm64: Work out supported block level at compile time
  KVM: arm64: Limit stage2_apply_range() batch size to largest block

Vincent Donnefort (1):
  KVM: arm64: Enable stack protection and branch profiling for VHE

Zenghui Yu (1):
  KVM: arm64: selftests: Fix multiple versions of GIC creation

 arch/arm64/include/asm/kvm_pgtable.h | 18 +-
 arch/arm64/include/asm/stage2_pgtable.h  | 20 
 arch/arm64/kvm/hyp/Makefile  |  5 +
 arch/arm64/kvm/hyp/nvhe/Makefile |  3 +++
 arch/arm64/kvm/mmu.c |  9 -
 tools/testing/selftests/kvm/aarch64/vgic_init.c  |  4 ++--
 .../selftests/kvm/memslot_modification_stress_test.c |  2 +-
 7 files changed, 28 insertions(+), 33 deletions(-)


Re: [PATCH] KVM: selftests: Fix number of pages for memory slot in memslot_modification_stress_test

2022-10-13 Thread Marc Zyngier
On Thu, 13 Oct 2022 14:30:20 +0800, Gavin Shan wrote:
> It's required by vm_userspace_mem_region_add() that memory size
> should be aligned to host page size. However, one guest page is
> provided by memslot_modification_stress_test. It triggers failure
> in the scenario of 64KB-page-size-host and 4KB-page-size-guest,
> as the following messages indicate.
> 
>  # ./memslot_modification_stress_test
>  Testing guest mode: PA-bits:40,  VA-bits:48,  4K pages
>  guest physical test memory: [0xffbfff, 0xff)
>  Finished creating vCPUs
>  Started all vCPUs
>   ==== Test Assertion Failure ====
>lib/kvm_util.c:824: vm_adjust_num_guest_pages(vm->mode, npages) == npages
>pid=5712 tid=5712 errno=0 - Success
>   1   0x00404eeb: vm_userspace_mem_region_add at 
> kvm_util.c:822
>   2   0x00401a5b: add_remove_memslot at 
> memslot_modification_stress_test.c:82
>   3(inlined by) run_test at memslot_modification_stress_test.c:110
>   4   0x00402417: for_each_guest_mode at guest_modes.c:100
>   5   0x004016a7: main at 
> memslot_modification_stress_test.c:187
>   6   0xb8cd4383: ?? ??:0
>   7   0x00401827: _start at :?
>Number of guest pages is not compatible with the host. Try npages=16
> 
> [...]

Applied to fixes, thanks!

[1/1] KVM: selftests: Fix number of pages for memory slot in 
memslot_modification_stress_test
  commit: 05c2224d4b049406b0545a10be05280ff4b8ba0a

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH v2] KVM: arm64: nvhe: Fix build with profile optimization

2022-10-13 Thread Marc Zyngier
On Tue, 11 Oct 2022 03:15:36 +0100,
Denis Nikitin  wrote:
> 
> On Sat, Oct 8, 2022 at 7:22 PM Marc Zyngier  wrote:
> >
> > On Thu, 06 Oct 2022 17:28:17 +0100,
> > Denis Nikitin  wrote:
> > >
> > > Hi Mark,
> >
> > s/k/c/
> >
> > >
> > > This problem currently blocks the PGO roll on the ChromeOS kernel and
> > > we need some kind of a solution.
> >
> > I'm sorry, but I don't feel constrained by your internal deadlines. I
> > have my own...
> >
> > > Could you please take a look?
> >
> > I have asked for a reproducer. All I got for an answer is "this is
> > hard". Providing a profiling file would help, for example.
> 
> Could you please try the following profile on the 5.15 branch?
> 
> $ cat <<EOF > prof.txt
> kvm_pgtable_walk:100:10
>  2: 5
>  3: 5
>  5: 5
>  6: 5
>  10: 5
>  10: _kvm_pgtable_walk:50
>   5: 5
>   7: 5
>   10: 5
>   13.2: 5
>   14: 5
>   16: 5 __kvm_pgtable_walk:5
>   13: kvm_pgd_page_idx:30
>2: __kvm_pgd_page_idx:30
> 2: 5
> 3: 5
> 5: 5
> 2: kvm_granule_shift:5
>  3: 5
> EOF
> 
> $ make LLVM=1 ARCH=arm64 KCFLAGS=-fprofile-sample-use=prof.txt -j8 vmlinux

Thanks, this was helpful, as I was able to reproduce the build failure.

FWIW, it seems pretty easy to work around by filtering out the
offending option, making it consistent with the mechanism we already
use for tracing and the like.

I came up with the hack below, which does the trick and is IMHO better
than dropping the section (extra work) or adding the negation of this
option (which depends on the compiler option evaluation order).

M.

diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 48f6ae7cc6e6..7df1b6afca7f 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -91,7 +91,7 @@ quiet_cmd_hypcopy = HYPCOPY $@
 
 # Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
 # This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
-KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI) -fprofile-sample-use=%, $(KBUILD_CFLAGS))
 
 # KVM nVHE code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may


-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] KVM: arm64: vgic: fix wrong loop condition in scan_its_table()

2022-10-12 Thread Marc Zyngier
Hi Eric,

Before I comment on this patch, a couple of things that need
addressing:

> "Cc: marc.zyng...@arm.com, cd...@linaro.org"

None of these two addresses are valid anymore, and haven't been for
several years.

Please consult the MAINTAINERS file for up-to-date addresses for
current maintainers and reviewers, all of whom should be Cc'd on this
email. I've now added them as well as Eric Auger who has written most
of the ITS migration code, and the new mailing list (the Columbia list
is about to be killed).

On Wed, 12 Oct 2022 17:59:25 +0100,
Eric Ren  wrote:
> 
> Reproducer hints:
> 1. Create ARM virt VM with pxb-pcie bus which adds
>extra host bridges, with qemu command like:
> 
> ```
>   -device pxb-pcie,bus_nr=8,id=pci.x,numa_node=0,bus=pcie.0 \
>   -device pcie-root-port,..,bus=pci.x \
>   ...
>   -device pxb-pcie,bus_nr=37,id=pci.y,numa_node=1,bus=pcie.0 \
>   -device pcie-root-port,..,bus=pci.y \
>   ...
> 
> ```
> 2. Perform VM migration which calls save/restore device tables.
> 
> In that setup, we get a big "offset" between 2 device_ids (
> one is small, another is big), which makes unsigned "len" round
> up a big positive number, causing loop to continue exceptionally.

You'll have to spell it out for me here. If you have a very sparse
device ID and you are only using a single level device table, you are
bound to have a large len. Now, is the issue that 'size' is so large
that it is negative as an 'int'? Describing the exact situation you're
in would help a lot.

>
> Signed-off-by: Eric Ren 
> ---
>  arch/arm64/kvm/vgic/vgic-its.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
> index 24d7778d1ce6..673554ef02f9 100644
> --- a/arch/arm64/kvm/vgic/vgic-its.c
> +++ b/arch/arm64/kvm/vgic/vgic-its.c
> @@ -2141,7 +2141,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
> int start_id, entry_fn_t fn, void *opaque)
>  {
>   struct kvm *kvm = its->dev->kvm;
> - unsigned long len = size;
> + ssize_t len = size;

This feels wrong, really. If anything, all these types should be
unsigned, not signed. Signed types in this context make very little
sense...

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] KVM: arm64: pkvm: Fixup boot mode to reflect that the kernel resumes from EL1

2022-10-11 Thread Marc Zyngier
On Tue, 11 Oct 2022 19:48:39 +0100,
Oliver Upton  wrote:
> 
> On Tue, Oct 11, 2022 at 05:54:00PM +0100, Marc Zyngier wrote:
> > The kernel has an awfully complicated boot sequence in order to cope
> > with the various EL2 configurations, including those that "enhanced"
> > the architecture. We go from EL2 to EL1, then back to EL2, staying
> > at EL2 if VHE capable and otherwise go back to EL1.
> > 
> > Here's a paracetamol tablet for you.
> 
> Heh, still have a bit of a headache from this :)
> 
> I'm having a hard time following where we skip the EL2 promotion based
> on __boot_cpu_mode.
> 
> On the cpu_resume() path it looks like we take the return of
> init_kernel_el() and pass that along to finalise_el2(). As we are in EL1
> at this point, it seems like we'd go init_kernel_el() -> init_el1().
> 
> What am I missing?

That I'm an idiot.

This is only necessary on pre-6.0, before 005e12676af0 ("arm64: head:
record CPU boot mode after enabling the MMU"), as this code-path
*used* to reload the boot mode from memory. Now, this is directly
passed as a parameter, making this patch useless.

The joys of looking at too many code bases at the same time... I'll
see how we can add it to 5.19.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


[PATCH] KVM: arm64: pkvm: Fixup boot mode to reflect that the kernel resumes from EL1

2022-10-11 Thread Marc Zyngier
The kernel has an awfully complicated boot sequence in order to cope
with the various EL2 configurations, including those that "enhanced"
the architecture. We go from EL2 to EL1, then back to EL2, staying
at EL2 if VHE capable and otherwise go back to EL1.

Here's a paracetamol tablet for you.

The cpu_resume path follows the same logic, because coming up with
two versions of a square wheel is hard.

However, things aren't this straightforward with pKVM, as the host
resume path is always proxied by the hypervisor, which means that
the kernel is always entered at EL1. Which contradicts what the
__boot_cpu_mode[] array contains (it obviously says EL2).

This thus triggers a HVC call from EL1 to EL2 in a vain attempt
to upgrade from EL1 to EL2 VHE, which we are, funnily enough,
reluctant to grant to the host kernel. This is also completely
unexpected, and puzzles your average EL2 hacker.

Address it by fixing up the boot mode at the point the host gets
deprivileged. is_hyp_mode_available() and co already have a static
branch to deal with this, making it pretty safe.

Reported-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/arm.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b6c9bfa8492f..cf075c9b9ab1 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2107,6 +2107,17 @@ static int pkvm_drop_host_privileges(void)
 * once the host stage 2 is installed.
 */
	static_branch_enable(&kvm_protected_mode_initialized);
+
+   /*
+* Fixup the boot mode so that we don't take spurious round
+* trips via EL2 on cpu_resume. Flush to the PoC for a good
+* measure, so that it can be observed by a CPU coming out of
+* suspend with the MMU off.
+*/
+   __boot_cpu_mode[0] = __boot_cpu_mode[1] = BOOT_CPU_MODE_EL1;
+   dcache_clean_poc((unsigned long)__boot_cpu_mode,
+(unsigned long)(__boot_cpu_mode + 2));
+
	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
return ret;
 }
-- 
2.34.1



Re: [PATCH] KVM: arm64: selftests: Fix multiple versions of GIC creation

2022-10-10 Thread Marc Zyngier
On Sun, 9 Oct 2022 11:31:31 +0800, Zenghui Yu wrote:
> Commit 98f94ce42ac6 ("KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to
> separate helper") wrongly converted a "real" GIC device creation to
> __kvm_test_create_device() and caused the test failure on my D05 (which
> supports v2 emulation). Fix it.

Applied to fixes, thanks!

[1/1] KVM: arm64: selftests: Fix multiple versions of GIC creation
  commit: 8a6ffcbe26fd14d58075dcf3cbdf1b5b69b20402

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH v2] KVM: arm64: nvhe: Fix build with profile optimization

2022-10-08 Thread Marc Zyngier
On Thu, 06 Oct 2022 17:28:17 +0100,
Denis Nikitin  wrote:
> 
> Hi Mark,

s/k/c/

> 
> This problem currently blocks the PGO roll on the ChromeOS kernel and
> we need some kind of a solution.

I'm sorry, but I don't feel constrained by your internal deadlines. I
have my own...

> Could you please take a look?

I have asked for a reproducer. All I got for an answer is "this is
hard". Providing a profiling file would help, for example.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 0/2] KVM: arm64: Limit stage2_apply_range() batch size to largest block

2022-10-08 Thread Marc Zyngier
On Fri, 7 Oct 2022 23:41:49 +, Oliver Upton wrote:
> Continuing with MMU patches to post, a small series fixing some soft
> lockups caused by stage2_apply_range(). Depending on the paging setup,
> we could walk a very large amount of memory before dropping the lock and
> rescheduling.
> 
> Applies to kvmarm-6.1. Tested with KVM selftests and kvm-unit-tests with
> all supported page sizes (4K, 16K, 64K). Additionally, I no longer saw
> soft lockups with the following:
> 
> [...]

Applied to fixes, thanks!

[1/2] KVM: arm64: Work out supported block level at compile time
  commit: 3b5c082bbfa20d9a57924edd655bbe63fe98ab06
[2/2] KVM: arm64: Limit stage2_apply_range() batch size to largest block
  commit: 5994bc9e05c2f8811f233aa434e391cd2783f0f5

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: arm64 build failure on kvm/next

2022-10-04 Thread Marc Zyngier
On Tue, 04 Oct 2022 22:02:40 +0100,
Oliver Upton  wrote:
> 
> Hey Paolo,
> 
> Just wanted to give you a heads up about a build failure on kvm/next.
> Marc pulled some of the sysreg refactoring updates from core arm64 to
> resolve a conflict, which resulted in:
> 
> drivers/perf/arm_spe_pmu.c:677:7: error: use of undeclared identifier 
> 'ID_AA64DFR0_PMSVER_8_2'
> case ID_AA64DFR0_PMSVER_8_2:
>  ^
> drivers/perf/arm_spe_pmu.c:679:7: error: use of undeclared identifier 
> 'ID_AA64DFR0_PMSVER_8_3'
> case ID_AA64DFR0_PMSVER_8_3:
>  ^
> drivers/perf/arm_spe_pmu.c:961:10: error: use of undeclared identifier 
> 'ID_AA64DFR0_PMSVER_SHIFT'
>ID_AA64DFR0_PMSVER_SHIFT);
> 
> The fix has since gone in on the arm64 side [1], in case you want to
> mention in your pull request.
> 
> [1]: 
> https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?h=for-next/sysreg&id=db74cd6337d2691ea932e36b84683090f0712ec1

Also worth noting that the SPE driver is not part of defconfig, which
is probably why it wasn't spotted in the first place. Anyway, odds are
that the arm64 pull-request will get in before the KVM one, making
this pretty much invisible...

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-10-04 Thread Marc Zyngier
On Tue, 04 Oct 2022 05:26:23 +0100,
Gavin Shan  wrote:

[...]

> > Why another capability? Just allowing dirty logging to be enabled
> > before we saving the GIC state should be enough, shouldn't it?
> > 
> 
> The GIC state would be just one case where no vcpu can be used to push
> dirty page information. As you mentioned, SMMMU HTTU feature could possibly
> be another case to ARM64. It's uncertain about other architectures where
> dirty-ring will be supported. In QEMU, the dirty (bitmap) logging is enabled
> at the beginning of migration and the bitmap is synchronized to global
> dirty bitmap and RAMBlock's dirty bitmap gradually, as the following
> backtrace shows. What we need to do for QEMU is probably retrieve the
> bitmap at point (A).
> 
> Without the new capability, we will have to rely on the return value
> from ioctls KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG to detect the
> capability. For example, -ENXIO is returned on old kernels.

Huh. Fair enough.

KVM_CAP_ALLOW_DIRTY_LOG_AND_DIRTY_RING_TOGETHER_UNTIL_THE_NEXT_TIME...


> 
>    migration_thread
>      qemu_savevm_state_setup
>        ram_save_setup
>          ram_init_all
>            ram_init_bitmaps
>              memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION)  // dirty logging enabled
>              migration_bitmap_sync_precopy(rs)
>                :
>      migration_iteration_run                                        // iteration 0
>        qemu_savevm_state_pending
>          migration_bitmap_sync_precopy
>        qemu_savevm_state_iterate
>          ram_save_iterate
>      migration_iteration_run                                        // iteration 1
>        qemu_savevm_state_pending
>          migration_bitmap_sync_precopy
>        qemu_savevm_state_iterate
>          ram_save_iterate
>      migration_iteration_run                                        // iteration 2
>        qemu_savevm_state_pending
>          migration_bitmap_sync_precopy
>        qemu_savevm_state_iterate
>          ram_save_iterate
>        :
>      migration_iteration_run                                        // iteration N
>        qemu_savevm_state_pending
>          migration_bitmap_sync_precopy
>        migration_completion
>          qemu_savevm_state_complete_precopy
>            qemu_savevm_state_complete_precopy_iterable
>              ram_save_complete
>                migration_bitmap_sync_precopy                        // A
>
> 
> Note: for post-copy and snapshot, I assume we need to save the dirty bitmap
>   in the last synchronization, right after the VM is stopped.

Not only the VM stopped, but also the devices made quiescent.

> >> If all of us agree on this, I can send another kernel patch to address
> >> this. QEMU still need more patches so that the feature can be supported.
> > 
> > Yes, this will also need some work.
> > 
>  
>  To me, this is just a relaxation of an arbitrary limitation, as the
>  current assumption that only vcpus can dirty memory doesn't hold at
>  all.
> >>> 
> >>> The initial dirty ring proposal has a per-vm ring, but after we
> >>> investigated x86 we found that all legal dirty paths are with a vcpu
> >>> context (except one outlier on kvmgt which fixed within itself), so we
> >>> dropped the per-vm ring.
> >>> 
> >>> One thing to mention is that DMAs should not count in this case because
> >>> that's from device perspective, IOW either IOMMU or SMMU dirty tracking
> >>> should be reported to the device driver that interacts with the userspace
> >>> not from KVM interfaces (e.g. vfio with VFIO_IOMMU_DIRTY_PAGES).  That 
> >>> even
> >>> includes emulated DMA like vhost (VHOST_SET_LOG_BASE).
> >>> 
> >> 
> >> Thanks to Peter for mentioning the per-vm ring's history. As I said above,
> >> lets use bitmap instead if all of us agree.
> >> 
> >> If I'm correct, Marc may be talking about SMMU, which is emulated in host
> >> instead of QEMU. In this case, the DMA target pages are similar to those
> >> pages for vgic/its tables. Both sets of pages are invisible from QEMU.
> > 
> > No, I'm talking about an actual HW SMMU using the HTTU feature that
> > set the Dirty bit in the PTEs. And people have been working on sharing
> > SMMU and CPU PTs for some time, which would give us the one true
> > source of dirty page.
> > 
> > In this configuration, the dirty ring mechanism will be pretty useless.
> > 
> 
> Ok. I don't know the details. Marc, the dirty bitmap is helpful in this case?

Yes, the dirty bitmap is useful if the source of dirty bits is
obtained from the page tables. The cost of collecting/resetting the
bits is pretty high though.
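
To illustrate where that cost comes from (a hedged sketch, not KVM
code): harvesting page-table dirty state means scanning every entry
and clearing the bit on each dirty one, O(number of PTEs) per
synchronisation round:

#include <stdint.h>
#include <stdio.h>

#define NR_PAGES	8
#define PTE_DIRTY	(1ULL << 55)	/* placeholder dirty bit */

static uint64_t ptes[NR_PAGES];
static uint64_t dirty_bitmap;		/* one bit per page */

static void sync_dirty_log(void)
{
	int i;

	/* the expensive part: walk everything, test and reset */
	for (i = 0; i < NR_PAGES; i++) {
		if (ptes[i] & PTE_DIRTY) {
			ptes[i] &= ~PTE_DIRTY;
			dirty_bitmap |= 1ULL << i;
		}
	}
}

int main(void)
{
	ptes[1] |= PTE_DIRTY;	/* as if the guest (or SMMU) wrote page 1 */
	ptes[5] |= PTE_DIRTY;
	sync_dirty_log();
	printf("dirty bitmap: 0x%llx\n", (unsigned long long)dirty_bitmap);
	return 0;
}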

M.

-- 
Without deviation from the norm, progress is not possible.


[GIT PULL] KVM/arm64 updates for 6.1

2022-10-02 Thread Marc Zyngier
Paolo,

Please find below the rather small set of KVM/arm64 updates
for 6.1. This is mostly a set of fixes for existing features,
the most interesting one being Reiji's really good work tracking
an annoying set of bugs in our single-stepping implementation.
Also, I've included the changes making it possible to enable
the dirty-ring tracking on arm64. Full details in the tag.

Note that this pull-request comes with a branch shared with the
arm64 tree, in order to avoid some bad conflicts due to the
ongoing repainting of all the system registers.

Please pull,

M.

The following changes since commit b90cb1053190353cc30f0fef0ef1f378ccc063c5:

  Linux 6.0-rc3 (2022-08-28 15:05:29 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
tags/kvmarm-6.1

for you to fetch changes up to b302ca52ba8235ff0e18c0fa1fa92b51784aef6a:

  Merge branch kvm-arm64/misc-6.1 into kvmarm-master/next (2022-10-01 10:19:36 
+0100)


KVM/arm64 updates for v6.1

- Fixes for single-stepping in the presence of an async
  exception as well as the preservation of PSTATE.SS

- Better handling of AArch32 ID registers on AArch64-only
  systems

- Fixes for the dirty-ring API, allowing it to work on
  architectures with relaxed memory ordering

- Advertise the new kvmarm mailing list

- Various minor cleanups and spelling fixes


Elliot Berman (1):
  KVM: arm64: Ignore kvm-arm.mode if !is_hyp_mode_available()

Gavin Shan (1):
  KVM: arm64: vgic: Remove duplicate check in update_affinity_collection()

Kristina Martsenko (3):
  arm64: cache: Remove unused CTR_CACHE_MINLINE_MASK
  arm64/sysreg: Standardise naming for ID_AA64MMFR1_EL1 fields
  arm64/sysreg: Convert ID_AA64MMFR1_EL1 to automatic generation

Marc Zyngier (12):
  Merge branch kvm-arm64/aarch32-raz-idregs into kvmarm-master/next
  Merge remote-tracking branch 'arm64/for-next/sysreg' into 
kvmarm-master/next
  Merge branch kvm-arm64/single-step-async-exception into kvmarm-master/next
  KVM: Use acquire/release semantics when accessing dirty ring GFN state
  KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option
  KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
  KVM: Document weakly ordered architecture requirements for dirty ring
  KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release 
semantics
  KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if available
  KVM: arm64: Advertise new kvmarm mailing list
  Merge branch kvm-arm64/dirty-log-ordered into kvmarm-master/next
  Merge branch kvm-arm64/misc-6.1 into kvmarm-master/next

Mark Brown (31):
  arm64/sysreg: Remove stray SMIDR_EL1 defines
  arm64/sysreg: Describe ID_AA64SMFR0_EL1.SMEVer as an enumeration
  arm64/sysreg: Add _EL1 into ID_AA64MMFR0_EL1 definition names
  arm64/sysreg: Add _EL1 into ID_AA64MMFR2_EL1 definition names
  arm64/sysreg: Add _EL1 into ID_AA64PFR0_EL1 definition names
  arm64/sysreg: Add _EL1 into ID_AA64PFR1_EL1 constant names
  arm64/sysreg: Standardise naming of ID_AA64MMFR0_EL1.BigEnd
  arm64/sysreg: Standardise naming of ID_AA64MMFR0_EL1.ASIDBits
  arm64/sysreg: Standardise naming for ID_AA64MMFR2_EL1.VARange
  arm64/sysreg: Standardise naming for ID_AA64MMFR2_EL1.CnP
  arm64/sysreg: Standardise naming for ID_AA64PFR0_EL1 constants
  arm64/sysreg: Standardise naming for ID_AA64PFR0_EL1.AdvSIMD constants
  arm64/sysreg: Standardise naming for SSBS feature enumeration
  arm64/sysreg: Standardise naming for MTE feature enumeration
  arm64/sysreg: Standardise naming of ID_AA64PFR1_EL1 fractional version 
fields
  arm64/sysreg: Standardise naming of ID_AA64PFR1_EL1 BTI enumeration
  arm64/sysreg: Standardise naming of ID_AA64PFR1_EL1 SME enumeration
  arm64/sysreg: Convert HCRX_EL2 to automatic generation
  arm64/sysreg: Convert ID_AA64MMFR0_EL1 to automatic generation
  arm64/sysreg: Convert ID_AA64MMFR2_EL1 to automatic generation
  arm64/sysreg: Convert ID_AA64PFR0_EL1 to automatic generation
  arm64/sysreg: Convert ID_AA64PFR1_EL1 to automatic generation
  arm64/sysreg: Convert TIPDR_EL1 to automatic generation
  arm64/sysreg: Convert SCXTNUM_EL1 to automatic generation
  arm64/sysreg: Add defintion for ALLINT
  arm64/sysreg: Align field names in ID_AA64DFR0_EL1 with architecture
  arm64/sysreg: Add _EL1 into ID_AA64DFR0_EL1 definition names
  arm64/sysreg: Use feature numbering for PMU and SPE revisions
  arm64/sysreg: Convert ID_AA64FDR0_EL1 to automatic generation
  arm64/sysreg: Convert ID_AA64DFR1_EL1 to automatic generation
  arm64/sysreg: Convert ID_AA64AFRn_EL1 to automatic generation

Oliver Upton (8):
  KVM: arm64: Use visibility hook to treat ID regs as RAZ

Re: [PATCH] KVM: arm64: Advertise new kvmarm mailing list

2022-10-01 Thread Marc Zyngier
On Sat, 1 Oct 2022 10:12:45 +0100, Marc Zyngier wrote:
> As announced on the kvmarm list, we're moving the mailing list over
> to kvmarm@lists.linux.dev:
> 
> 
> As you probably all know, the kvmarm mailing list has been hosted on
> Columbia's machines for as long as the project existed (over 13
> years). After all this time, the university has decided to retire the
> list infrastructure and asked us to find a new hosting.
> 
> [...]

Applied to kvm-arm64/misc-6.1, thanks!

[1/1] KVM: arm64: Advertise new kvmarm mailing list
  commit: ac107abef197660c9db529fe550080ad07b46a67

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




[PATCH] KVM: arm64: Advertise new kvmarm mailing list

2022-10-01 Thread Marc Zyngier
As announced on the kvmarm list, we're moving the mailing list over
to kvmarm@lists.linux.dev:


As you probably all know, the kvmarm mailing list has been hosted on
Columbia's machines for as long as the project existed (over 13
years). After all this time, the university has decided to retire the
list infrastructure and asked us to find a new hosting.

A new mailing list has been created on lists.linux.dev[1], and I'm
kindly asking everyone interested in following the KVM/arm64
developments to start subscribing to it (and start posting your
patches there). I hope that people will move over to it quickly enough
that we can soon give Columbia the green light to turn their systems
off.

Note that the new list will only get archived automatically once we
fully switch over, but I'll make sure we fill any gap and not lose any
message. In the meantime, please Cc both lists.

[...]

[1] https://subspace.kernel.org/lists.linux.dev.html


Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 589517372408..f29f27717de4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11124,7 +11124,8 @@ R:  Alexandru Elisei 
 R: Suzuki K Poulose 
 R: Oliver Upton 
 L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
-L: kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
+L: kvm...@lists.linux.dev
+L: kvmarm@lists.cs.columbia.edu (deprecated, moderated for non-subscribers)
 S: Maintained
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 F: arch/arm64/include/asm/kvm*
-- 
2.34.1



[URGENT] kvmarm mailing list migration

2022-09-30 Thread Marc Zyngier
Hi all,

As you probably all know, the kvmarm mailing list has been hosted on
Columbia's machines for as long as the project has existed (over 13
years). After all this time, the university has decided to retire the
list infrastructure and asked us to find a new host.

A new mailing list has been created on lists.linux.dev[1], and I'm
kindly asking everyone interested in following the KVM/arm64
developments to start subscribing to it (and start posting your
patches there). I hope that people will move over to it quickly enough
that we can soon give Columbia the green light to turn their systems
off.

Note that the new list will only get archived automatically once we
fully switch over, but I'll make sure we fill any gaps and don't lose
any messages. In the meantime, please Cc both lists.

I would like to thank Columbia University for their long lasting
support and willingness to help during this transition, as well as
Konstantin (and the kernel.org crew) for quickly stepping up to the
challenge and giving us a new home!

Thanks,

M.

[1] https://subspace.kernel.org/lists.linux.dev.html

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-30 Thread Marc Zyngier
On Thu, 29 Sep 2022 15:32:03 +0100,
Peter Xu  wrote:
> 
> > If I'm correct, Marc may be talking about the SMMU, which is emulated in the host
> > instead of QEMU. In this case, the DMA target pages are similar to those
> > pages for vgic/its tables. Both sets of pages are invisible from QEMU.
> 
> OK, I wasn't aware the SMMU could also be emulated in the host kernel, thanks
> Gavin.  If so then we really need to think as I mentioned above since we
> will only sync this when switchover, and if the DMA range can cover a large
> portion of guest mem (assuming the physical pages to DMA can be randomly
> allocated by guest device drivers) then it may be another problem to cause
> drastic downtime.

I don't know where this is coming from. The kernel has no SMMU
emulation at all.

  M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-29 Thread Marc Zyngier
On Thu, 29 Sep 2022 12:31:34 +0100,
Gavin Shan  wrote:
> 
> I have the following PATCH[v5 3/7] to reuse the bitmap for these particular
> cases. KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG ioctls are used to visit
> the bitmap. The new capability is advertised by KVM_CAP_DIRTY_LOG_RING_BITMAP.
> Note those two ioctls are disabled when dirty-ring is enabled, so we need
> to enable them accordingly.
> 
>PATCH[v5 3/7] KVM: x86: Use bitmap in ring-based dirty page tracking
> 
> I would like to post v5 after someone reviews or acks kvm/selftests part
> of this series.

Feel free to post the series. This is too late for 6.1 anyway (I'll
probably send the PR to Paolo tomorrow), so this gives us plenty of
time to sort this out.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-29 Thread Marc Zyngier
On Thu, 29 Sep 2022 10:50:12 +0100,
Gavin Shan  wrote:
> 
> Hi Marc and Peter,
> 
> On 9/29/22 12:52 AM, Peter Xu wrote:
> > On Wed, Sep 28, 2022 at 09:25:34AM +0100, Marc Zyngier wrote:
> >> On Wed, 28 Sep 2022 00:47:43 +0100,
> >> Gavin Shan  wrote:
> >> 
> >>> I have a rough idea as below. It's appreciated if you can comment before I'm
> >>> going ahead with the prototype. The overall idea is to introduce another
> >>> dirty ring for KVM (kvm-dirty-ring). It's updated and visited separately
> >>> from the dirty ring for vcpu (vcpu-dirty-ring).
> >>> 
> >>> - When the various VGIC/ITS table base addresses are specified, 
> >>> kvm-dirty-ring
> >>>   entries are added to mark those pages as 'always-dirty'. In 
> >>> mark_page_dirty_in_slot(),
> >>>   those 'always-dirty' pages will be skipped, no entries pushed to 
> >>> vcpu-dirty-ring.
> >>> 
> >>> - Similar to vcpu-dirty-ring, kvm-dirty-ring is accessed from 
> >>> userspace through
> >>>   mmap(kvm->fd). However, there won't be a similar reset interface. 
> >>> It means
> >>>   'struct kvm_dirty_gfn::flags' won't track any information as we do 
> >>> for
> >>> vcpu-dirty-ring. In this regard, kvm-dirty-ring is purely a shared 
> >>> buffer to
> >>>   advertise 'always-dirty' pages from host to userspace.
> >>>  - For QEMU, shutdown/suspend/resume cases won't be concerning
> >>> us any more. The
> >>>   only concerned case is migration. When the migration is about to 
> >>> complete,
> >>>   kvm-dirty-ring entries are fetched and the dirty bits are updated 
> >>> to global
> >>>   dirty page bitmap and RAMBlock's dirty page bitmap. For this, I'm 
> >>> still reading
> >>>   the code to find the best spot to do it.
> >> 
> >> I think it makes a lot of sense to have a way to log writes that are
> >> not generated by a vcpu, such as the GIC and maybe other things in the
> >> future, such as DMA traffic (some SMMUs are able to track dirty pages
> >> as well).
> >> 
> >> However, I don't really see the point in inventing a new mechanism for
> >> that. Why don't we simply allow non-vcpu dirty pages to be tracked in
> >> the dirty *bitmap*?
> >> 
> >>  From a kernel perspective, this is dead easy:
> >> 
> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >> index 5b064dbadaf4..ae9138f29d51 100644
> >> --- a/virt/kvm/kvm_main.c
> >> +++ b/virt/kvm/kvm_main.c
> >> @@ -3305,7 +3305,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
> >>struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> >> #ifdef CONFIG_HAVE_KVM_DIRTY_RING
> >> -  if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
> >> +  if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
> >>return;
> >>   #endif
> >>   @@ -3313,10 +3313,11 @@ void mark_page_dirty_in_slot(struct kvm
> >> *kvm,
> >>unsigned long rel_gfn = gfn - memslot->base_gfn;
> >>u32 slot = (memslot->as_id << 16) | memslot->id;
> >>   -if (kvm->dirty_ring_size)
> >> +  if (vcpu && kvm->dirty_ring_size)
> >>kvm_dirty_ring_push(&vcpu->dirty_ring,
> >>slot, rel_gfn);
> >> -  else
> >> +  /* non-vcpu dirtying ends up in the global bitmap */
> >> +  if (!vcpu && memslot->dirty_bitmap)
> >>set_bit_le(rel_gfn, memslot->dirty_bitmap);
> >>}
> >>   }
> >> 
> >> though I'm sure there are a few more things to it.
> > 
> > Yes, currently the bitmaps are not created when rings are enabled.
> > kvm_prepare_memory_region() has:
> > 
> > else if (!kvm->dirty_ring_size) {
> > r = kvm_alloc_dirty_bitmap(new);
> > 
> > But I think maybe that's a solution worth considering.  Using the rings
> > has a major challenge with the limitation of ring size, so that for e.g. an
> > ioctl we need to make sure the pages to dirty within an ioctl procedure
> > will not be more than the ring can take.  Using dirty bitmap for a last
> > phase sync of constant (but still very small amount of) dirty pages does
> > sound reasonable and can avoid that complexity.  The payoff is we'll need
> > to allocate both the rings and the bitmaps.

Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-29 Thread Marc Zyngier
On Wed, 28 Sep 2022 15:52:00 +0100,
Peter Xu  wrote:
> 
> On Wed, Sep 28, 2022 at 09:25:34AM +0100, Marc Zyngier wrote:
> > Hi Gavin,
> > 
> > On Wed, 28 Sep 2022 00:47:43 +0100,
> > Gavin Shan  wrote:
> > 
> > > I have a rough idea as below. It's appreciated if you can comment before I'm
> > > going ahead with the prototype. The overall idea is to introduce another
> > > dirty ring for KVM (kvm-dirty-ring). It's updated and visited separately
> > > from the dirty ring for vcpu (vcpu-dirty-ring).
> > > 
> > >- When the various VGIC/ITS table base addresses are specified, 
> > > kvm-dirty-ring
> > >  entries are added to mark those pages as 'always-dirty'. In 
> > > mark_page_dirty_in_slot(),
> > >  those 'always-dirty' pages will be skipped, no entries pushed to 
> > > vcpu-dirty-ring.
> > > 
> > >- Similar to vcpu-dirty-ring, kvm-dirty-ring is accessed from 
> > > userspace through
> > >  mmap(kvm->fd). However, there won't be a similar reset interface. It 
> > > means
> > >  'struct kvm_dirty_gfn::flags' won't track any information as we do 
> > > for
> > >  vcpu-dirty-ring. In this regard, kvm-dirty-ring is purely a shared 
> > > buffer to
> > >  advertise 'always-dirty' pages from host to userspace.
> > > - For QEMU, shutdown/suspend/resume cases won't be concerning
> > > us any more. The
> > >  only concerned case is migration. When the migration is about to 
> > > complete,
> > >  kvm-dirty-ring entries are fetched and the dirty bits are updated to 
> > > global
> > >  dirty page bitmap and RAMBlock's dirty page bitmap. For this, I'm 
> > > still reading
> > >  the code to find the best spot to do it.
> > 
> > I think it makes a lot of sense to have a way to log writes that are
> > not generated by a vcpu, such as the GIC and maybe other things in the
> > future, such as DMA traffic (some SMMUs are able to track dirty pages
> > as well).
> > 
> > However, I don't really see the point in inventing a new mechanism for
> > that. Why don't we simply allow non-vcpu dirty pages to be tracked in
> > the dirty *bitmap*?
> > 
> > From a kernel perspective, this is dead easy:
> > 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 5b064dbadaf4..ae9138f29d51 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3305,7 +3305,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
> > struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> >  
> >  #ifdef CONFIG_HAVE_KVM_DIRTY_RING
> > -   if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
> > +   if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
> > return;
> >  #endif
> >  
> > @@ -3313,10 +3313,11 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
> > unsigned long rel_gfn = gfn - memslot->base_gfn;
> > u32 slot = (memslot->as_id << 16) | memslot->id;
> >  
> > -   if (kvm->dirty_ring_size)
> > +   if (vcpu && kvm->dirty_ring_size)
> > kvm_dirty_ring_push(&vcpu->dirty_ring,
> > slot, rel_gfn);
> > -   else
> > +   /* non-vcpu dirtying ends up in the global bitmap */
> > +   if (!vcpu && memslot->dirty_bitmap)
> > set_bit_le(rel_gfn, memslot->dirty_bitmap);
> > }
> >  }
> > 
> > though I'm sure there are a few more things to it.
> 
> Yes, currently the bitmaps are not created when rings are enabled.
> kvm_prepare_memory_region() has:
> 
>   else if (!kvm->dirty_ring_size) {
>   r = kvm_alloc_dirty_bitmap(new);
> 
> But I think maybe that's a solution worth considering.  Using the rings
> has a major challenge with the limitation of ring size, so that for e.g. an
> ioctl we need to make sure the pages to dirty within an ioctl procedure
> will not be more than the ring can take.  Using dirty bitmap for a last
> phase sync of constant (but still very small amount of) dirty pages does
> sound reasonable and can avoid that complexity.  The payoff is we'll need
> to allocate both the rings and the bitmaps.

I think that's a reasonable price to pay.

> 
> > 
> > To me, this is just a relaxation of an arbitrary limitation, as the
> > current assumption that only vcpus can dirty memory doesn't hold at
> > all.
> 
> The in
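
To make the scheme being discussed concrete, here is a minimal,
self-contained C model of the proposed routing: writes with a running vcpu
go to that vcpu's dirty ring, while non-vcpu writes (vgic/ITS tables, and
potentially SMMU-tracked DMA) land in the memslot bitmap. The structures
and helpers below are simplified stand-ins for illustration only, not the
kernel's types.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in types; the real kvm_vcpu/kvm_memory_slot are far richer. */
struct ring { unsigned entries; };
struct slot { unsigned long *dirty_bitmap; };
struct vcpu { struct ring dirty_ring; };

static void ring_push(struct ring *r, uint32_t slot_id, uint64_t rel_gfn)
{
	r->entries++;	/* model only: the real code writes a kvm_dirty_gfn */
	printf("ring: slot %u gfn %llu\n", slot_id, (unsigned long long)rel_gfn);
}

static void bitmap_set_gfn(unsigned long *bm, uint64_t rel_gfn)
{
	bm[rel_gfn / 64] |= 1UL << (rel_gfn % 64);	/* cf. set_bit_le() */
}

/* The relaxation under discussion: ring for vcpus, bitmap for the rest. */
static void mark_dirty(struct vcpu *vcpu, struct slot *slot,
		       uint32_t slot_id, uint64_t rel_gfn, bool ring_enabled)
{
	if (vcpu && ring_enabled)
		ring_push(&vcpu->dirty_ring, slot_id, rel_gfn);
	else if (!vcpu && slot->dirty_bitmap)
		bitmap_set_gfn(slot->dirty_bitmap, rel_gfn);
}

int main(void)
{
	unsigned long bm[4] = { 0 };
	struct slot s = { .dirty_bitmap = bm };
	struct vcpu v = { { 0 } };

	mark_dirty(&v, &s, 0, 5, true);		/* vcpu write: ring */
	mark_dirty(NULL, &s, 0, 7, true);	/* vgic write: bitmap */
	printf("bitmap word 0 = %#lx\n", bm[0]);
	return 0;
}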

Re: [PATCH] KVM: arm64: Fix comment typo in nvhe/switch.c

2022-09-29 Thread Marc Zyngier
On Thu, 29 Sep 2022 12:28:39 +0800, Wei-Lin Chang wrote:
> Fix the comment of __hyp_vgic_restore_state() from saying VEH to VHE;
> also change the underscore to a dash to match the comment
> above __hyp_vgic_save_state().

Applied to next, thanks!

[1/1] KVM: arm64: Fix comment typo in nvhe/switch.c
  commit: 43b233b1582de501e441deb7c4ed1f944e60b1f9

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH v2 0/6] KVM: Fix dirty-ring ordering on weakly ordered architectures

2022-09-29 Thread Marc Zyngier
On Mon, 26 Sep 2022 15:51:14 +0100, Marc Zyngier wrote:
> [Same distribution list as Gavin's dirty-ring on arm64 series]
> 
> This is an update on the initial series posted as [0].
> 
> As Gavin started posting patches enabling the dirty-ring infrastructure
> on arm64 [1], it quickly became apparent that the API was never intended
> to work on relaxed memory ordering architectures (owing to its x86
> origins).
> 
> [...]

Applied to next, thanks!

[1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state
  commit: 8929bc9659640f35dd2ef8373263cbd885b4a072
[2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option
  commit: 17601bfed909fa080fcfd227b57da2bd4dc2d2a6
[3/6] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
  commit: fc0693d4e5afe3c110503c3afa9f60600f9e964b
[4/6] KVM: Document weakly ordered architecture requirements for dirty ring
  commit: 671c8c7f9f2349d8b2176ad810f1406794011f63
[5/6] KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release 
semantics
  commit: 4eb6486cb43c93382c27a2659ba978c660e98498
[6/6] KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if available
  commit: 4b3402f1f4d9860301d6d5cd7aff3b67f678d577

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [RESEND PATCH] KVM: selftests: Update top-of-file comment in psci_test

2022-09-28 Thread Marc Zyngier
On Fri, 19 Aug 2022 16:21:00 +0000, Oliver Upton wrote:
> Fix the comment to accurately describe the test and recently added
> SYSTEM_SUSPEND test case.
> 
> What was once psci_cpu_on_test was renamed and extended to squeeze in a
> test case for PSCI SYSTEM_SUSPEND. Nonetheless, the author of those
> changes (whoever they may be...) failed to update the file comment to
> reflect what had changed.

Applied to next, thanks!

[1/1] KVM: selftests: Update top-of-file comment in psci_test
  commit: 448e711693e48d03f7933ab3673334701b0c3f41

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH v2] KVM: arm64: Limit stage2_apply_range() batch size to 1GB

2022-09-28 Thread Marc Zyngier
On Tue, 27 Sep 2022 15:43:10 -0400,
Oliver Upton  wrote:
> 
> Hi Marc,
> 
> On Tue, Sep 27, 2022 at 07:34:22AM -0400, Marc Zyngier wrote:
> 
> [...]
> 
> > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > index c9a13e487187..5d05bb92e129 100644
> > > --- a/arch/arm64/kvm/mmu.c
> > > +++ b/arch/arm64/kvm/mmu.c
> > > @@ -31,6 +31,12 @@ static phys_addr_t hyp_idmap_vector;
> > >  
> > >  static unsigned long io_map_base;
> > >  
> > > +static inline phys_addr_t stage2_apply_range_next(phys_addr_t addr, 
> > > phys_addr_t end)
> > 
> > Please drop the inline. I'm sure the compiler will perform its
> > magic.
> > 
> > Can I also bikeshed a bit about the name? This doesn't "apply"
> > anything, nor does it return the next range. It really computes the
> > end of the current one.
> > 
> > Something like stage2_range_addr_end() would at least be consistent
> > with the rest of the arm64 code (grep for _addr_end ...).
> 
> Bikeshed all you want :) But yeah, I like your suggestion.
> 
> > > +{
> > > + phys_addr_t boundary = round_down(addr + SZ_1G, SZ_1G);
> > 
> > nit: the rest of the code is using ALIGN_DOWN(). Any reason why this
> > can't be used here?
> 
> Nope!
> 
> > > +
> > > + return (boundary - 1 < end - 1) ? boundary : end;
> > > +}
> > >  
> > >  /*
> > >   * Release kvm_mmu_lock periodically if the memory region is large. 
> > > Otherwise,
> > > @@ -52,7 +58,7 @@ static int stage2_apply_range(struct kvm *kvm, 
> > > phys_addr_t addr,
> > >   if (!pgt)
> > >   return -EINVAL;
> > >  
> > > - next = stage2_pgd_addr_end(kvm, addr, end);
> > > + next = stage2_apply_range_next(addr, end);
> > >   ret = fn(pgt, addr, next - addr);
> > >   if (ret)
> > >   break;
> > > 
> > 
> > The main problem I see with this is that some entries now get visited
> > multiple times if they cover more than a single 1GB entry (like a
> > 512GB level-0 entry with 4k pages and 48bit IPA) . As long as this
> > isn't destructive (CMOs, for example), this is probably OK. For
> > operations that are not idempotent (such as stage2_unmap_walker), this
> > is a significant change in behaviour.
> > 
> > My concern is that we have on one side a walker that is strictly
> > driven by the page-table sizes, and we now get an arbitrary value that
> > doesn't necessarily a multiple of block sizes. Yes, this works right
> > now because you can't create a block mapping larger than 1GB with any
> > of the supported page size.
> > 
> > But with 52bit VA/PA support, this changes: we can have 512GB (4k),
> > 64GB (16k) and 4TB (64k) block mappings at S2. We don't support this
> > yet at S2, but when this hits, we'll be in potential trouble.
> 
> Ah, I didn't fully capture the reasoning about the batch size. I had
> thought about batching by operating on at most 1 block of the largest
> supported granularity, but that felt like an inefficient walk restarting
> from root every 32M (for 16K paging).
> 
> OTOH, if/when we add support for larger blocks in S2 we will run into
> the same exact problem if we batch on the largest block size. If
> dirty logging caused the large blocks to be shattered down to leaf
> granularity then we will visit a crazy amount of PTEs before releasing
> the lock.
> 
> I guess what I'm getting at is we need to detect lock contention and the
> need to reschedule in the middle of the walk instead of at some
> predetermined boundary, though that ventures into the territory of 'TDP
> MMU' features...
> 
> So, seems to me we can crack this a few ways:
> 
>   1.Batch at the arbitrary 1GB since it works currently and produces a
> more efficient walk for all page sizes. I can rework some of the
> logic in kvm_level_supports_block_mapping() such that we can
> BUILD_BUG_ON() if the largest block size exceeds 1GB. Kicks the can
> down the road on a better implementation.
> 
>   2.Batch by the largest supported block mapping size. This will result
> in less efficient walks for !4K page sizes and likely produce soft
> lockups when we support even larger blocks. Nonetheless, the
> implementation will remain correct regardless of block size.
> 
>   3.Test for lock contention and need_resched() in the middle of the
> walk, rewalking from wherever we left off when scheduled again. TDP
> MMU already doe
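
For reference, a standalone sketch of option 1 above — batching at a fixed
1GB granule, with the ALIGN_DOWN-style rounding and the overflow-safe
comparison discussed earlier in the thread. SZ_1G, ALIGN_DOWN and the walk
loop are modelled locally here; this is an illustration of the arithmetic,
not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;

#define SZ_1G			((phys_addr_t)1 << 30)
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))	/* 'a': power of two */

/* End of the current (at most 1GB) batch; cf. stage2_range_addr_end(). */
static phys_addr_t range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + SZ_1G, SZ_1G);

	/* The -1 on both sides keeps the compare safe if 'end' wraps to 0. */
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	phys_addr_t addr = 0x40000000, end = 0x140080000;

	/* Walk the range in 1GB batches, as stage2_apply_range() would,
	 * dropping and re-taking the MMU lock between batches. */
	do {
		phys_addr_t next = range_addr_end(addr, end);
		printf("batch: %#llx - %#llx\n",
		       (unsigned long long)addr, (unsigned long long)next);
		addr = next;
	} while (addr < end);
	return 0;
}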

Re: [PATCH] KVM: arm64: Cleanup the __get_fault_info() to take out the code that validates HPFAR

2022-09-28 Thread Marc Zyngier
Mingwei,

On Tue, 27 Sep 2022 13:48:52 -0400,
Mingwei Zhang  wrote:
> 
> >
> > Honestly, I'd refrain from such changes *unless* they enable something
> > else. The current code is well understood by people hacking on it, and
> > although I don't mind revamping it, it has to be for a good reason.
> >
> > I'd be much more receptive to such a change if it was a prefix to
> > something that actually made a significant change.
> >
> > Thanks,
> >
> > M.
> >
> Hi Marc,
> 
> Thanks for the feedback.  I am not sure about the style of the KVM ARM
> side. But in general I think mixing the generic code for ARM and
> specific CPU errata handling is misleading. For instance, in this
> case:
> 
> + if ((esr & ESR_ELx_FSC_TYPE) == FSC_PERM)
> + return false;
> +
> + if (cpus_have_final_cap(ARM64_WORKAROUND_834220))
> + return false;
> 
> As shown it would be much cleaner to separate the two cases as the
> former case is suggested in ARMv8 Spec D13.2.55. The latter case would
> definitely come from a different source.

I think we're talking at cross purposes. I don't object to the change
per se. I simply question its value *in isolation*. One of the many
things that makes the kernel hard to maintain is churn. Refactoring
just for the sake of it *is* churn. In this case, cosmetic churn.

But if you make this is part of something touching this area and
improving things from a functional perspective, then I'll happily
merge it.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-28 Thread Marc Zyngier
Hi Gavin,

On Wed, 28 Sep 2022 00:47:43 +0100,
Gavin Shan  wrote:

> I have a rough idea as below. It's appreciated if you can comment before I'm
> going ahead with the prototype. The overall idea is to introduce another
> dirty ring for KVM (kvm-dirty-ring). It's updated and visited separately
> from the dirty ring for vcpu (vcpu-dirty-ring).
> 
>- When the various VGIC/ITS table base addresses are specified, 
> kvm-dirty-ring
>  entries are added to mark those pages as 'always-dirty'. In 
> mark_page_dirty_in_slot(),
>  those 'always-dirty' pages will be skipped, no entries pushed to 
> vcpu-dirty-ring.
> 
>- Similar to vcpu-dirty-ring, kvm-dirty-ring is accessed from userspace 
> through
>  mmap(kvm->fd). However, there won't be a similar reset interface. It 
> means
>  'struct kvm_dirty_gfn::flags' won't track any information as we do for
>  vcpu-dirty-ring. In this regard, kvm-dirty-ring is purely a shared buffer 
> to
>  advertise 'always-dirty' pages from host to userspace.
> - For QEMU, shutdown/suspend/resume cases won't be concerning
> us any more. The
>  only concerned case is migration. When the migration is about to 
> complete,
>  kvm-dirty-ring entries are fetched and the dirty bits are updated to 
> global
>  dirty page bitmap and RAMBlock's dirty page bitmap. For this, I'm still 
> reading
>  the code to find the best spot to do it.

I think it makes a lot of sense to have a way to log writes that are
not generated by a vcpu, such as the GIC and maybe other things in the
future, such as DMA traffic (some SMMUs are able to track dirty pages
as well).

However, I don't really see the point in inventing a new mechanism for
that. Why don't we simply allow non-vcpu dirty pages to be tracked in
the dirty *bitmap*?

From a kernel perspective, this is dead easy:

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b064dbadaf4..ae9138f29d51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3305,7 +3305,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
-   if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
+   if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
return;
 #endif
 
@@ -3313,10 +3313,11 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
unsigned long rel_gfn = gfn - memslot->base_gfn;
u32 slot = (memslot->as_id << 16) | memslot->id;
 
-   if (kvm->dirty_ring_size)
+   if (vcpu && kvm->dirty_ring_size)
kvm_dirty_ring_push(&vcpu->dirty_ring,
slot, rel_gfn);
-   else
+   /* non-vcpu dirtying ends up in the global bitmap */
+   if (!vcpu && memslot->dirty_bitmap)
set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
 }

though I'm sure there are a few more things to it.

To me, this is just a relaxation of an arbitrary limitation, as the
current assumption that only vcpus can dirty memory doesn't hold at
all.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-27 Thread Marc Zyngier
On Tue, 27 Sep 2022 12:02:52 -0400,
Peter Xu  wrote:
> 
> On Tue, Sep 27, 2022 at 08:54:36AM +0800, Gavin Shan wrote:
> > Enable ring-based dirty memory tracking on arm64 by selecting
> > CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL and providing the ring buffer's
> > physical page offset (KVM_DIRTY_LOG_PAGE_OFFSET).
> > 
> > Signed-off-by: Gavin Shan 
> 
> Gavin,
> 
> Any decision made on how to tackle the GIC status dirty bits?

Which dirty bits? Are you talking of the per-RD pending bits?

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2] KVM: arm64: Limit stage2_apply_range() batch size to 1GB

2022-09-27 Thread Marc Zyngier
On Mon, 26 Sep 2022 18:21:45 -0400,
Oliver Upton  wrote:
> 
> Presently stage2_apply_range() works on a batch of memory addressed by a
> stage 2 root table entry for the VM. Depending on the IPA limit of the
> VM and PAGE_SIZE of the host, this could address a massive range of
> memory. Some examples:
> 
>   4 level, 4K paging -> 512 GB batch size
> 
>   3 level, 64K paging -> 4TB batch size
> 
> Unsurprisingly, working on such a large range of memory can lead to soft
> lockups. When running dirty_log_perf_test:
> 
>   ./dirty_log_perf_test -m -2 -s anonymous_thp -b 4G -v 48
> 
>   watchdog: BUG: soft lockup - CPU#0 stuck for 45s! [dirty_log_perf_:16703]
>   Modules linked in: vfat fat cdc_ether usbnet mii xhci_pci xhci_hcd 
> sha3_generic gq(O)
>   CPU: 0 PID: 16703 Comm: dirty_log_perf_ Tainted: G   O   
> 6.0.0-smp-DEV #1
>   pstate: 8049 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>   pc : dcache_clean_inval_poc+0x24/0x38
>   lr : clean_dcache_guest_page+0x28/0x4c
>   sp : 800021763990
>   pmr_save: 00e0
>   x29: 800021763990 x28: 0005 x27: 0de0
>   x26: 0001 x25: 00400830b13bc77f x24: ad4f91ead9c0
>   x23:  x22: 882ad9c8 x21: fffafa7bc000
>   x20: ad4f9066ce50 x19: 0003 x18: ad4f92402000
>   x17: 011b x16: 011b x15: 0124
>   x14: 07ff8301d280 x13:  x12: 
>   x11: 00010001 x10: fc00 x9 : ad4f9069e580
>   x8 : 000c x7 :  x6 : 003f
>   x5 : 07ffa2076980 x4 : 0001 x3 : 003f
>   x2 : 0040 x1 : 0830313bd000 x0 : 0830313bcc40
>   Call trace:
>dcache_clean_inval_poc+0x24/0x38
>stage2_unmap_walker+0x138/0x1ec
>__kvm_pgtable_walk+0x130/0x1d4
>__kvm_pgtable_walk+0x170/0x1d4
>__kvm_pgtable_walk+0x170/0x1d4
>__kvm_pgtable_walk+0x170/0x1d4
>kvm_pgtable_stage2_unmap+0xc4/0xf8
>kvm_arch_flush_shadow_memslot+0xa4/0x10c
>kvm_set_memslot+0xb8/0x454
>__kvm_set_memory_region+0x194/0x244
>kvm_vm_ioctl_set_memory_region+0x58/0x7c
>kvm_vm_ioctl+0x49c/0x560
>__arm64_sys_ioctl+0x9c/0xd4
>invoke_syscall+0x4c/0x124
>el0_svc_common+0xc8/0x194
>do_el0_svc+0x38/0xc0
>el0_svc+0x2c/0xa4
>el0t_64_sync_handler+0x84/0xf0
>el0t_64_sync+0x1a0/0x1a4
> 
> Given the various paging configurations used by KVM at stage 2 there
> isn't a sensible page table level to use as the batch size. Use 1GB as
> the batch size instead, as it is evenly divisible by all supported
> hugepage sizes across 4K, 16K, and 64K paging.
> 
> Signed-off-by: Oliver Upton 
> ---
> 
> Applies to 6.0-rc3. Tested with 4K, 16K, and 64K pages with the above
> dirty_log_perf_test command and noticed no more soft lockups.
> 
> v1: 
> https://lore.kernel.org/kvmarm/20220920183630.3376939-1-oliver.up...@linux.dev/
> 
> v1 -> v2:
>  - Align down to the next 1GB boundary (Ricardo)
> 
>  arch/arm64/include/asm/stage2_pgtable.h | 20 
>  arch/arm64/kvm/mmu.c|  8 +++-
>  2 files changed, 7 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/stage2_pgtable.h 
> b/arch/arm64/include/asm/stage2_pgtable.h
> index fe341a6578c3..c8dca8ae359c 100644
> --- a/arch/arm64/include/asm/stage2_pgtable.h
> +++ b/arch/arm64/include/asm/stage2_pgtable.h
> @@ -10,13 +10,6 @@
>  
>  #include 
>  
> -/*
> - * PGDIR_SHIFT determines the size a top-level page table entry can map
> - * and depends on the number of levels in the page table. Compute the
> - * PGDIR_SHIFT for a given number of levels.
> - */
> -#define pt_levels_pgdir_shift(lvls)  ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
> -
>  /*
>   * The hardware supports concatenation of up to 16 tables at stage2 entry
>   * level and we use the feature whenever possible, which means we resolve 4
> @@ -30,11 +23,6 @@
>  #define stage2_pgtable_levels(ipa)   ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
>  #define kvm_stage2_levels(kvm)   VTCR_EL2_LVLS(kvm->arch.vtcr)
>  
> -/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the 
> VM */
> -#define stage2_pgdir_shift(kvm)  
> pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
> -#define stage2_pgdir_size(kvm)   (1ULL << 
> stage2_pgdir_shift(kvm))
> -#define stage2_pgdir_mask(kvm)   ~(stage2_pgdir_size(kvm) - 1)
> -
>  /*
>   * kvm_mmmu_cache_min_pages() is the number of pages required to install
>   * a stage-2 translation. We pre-allocate the entry level page table at
> @@ -42,12 +30,4 @@
>   */
>  #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
>  
> -static inline phys_addr_t
> -stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> -{
> - phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & 
> stage2_pgdir_mask(kvm);
> -
> - return (boundary - 1 < end - 1) ? boundary : end;

Re: [PATCH v4 0/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-27 Thread Marc Zyngier
+ Sean

On Mon, 26 Sep 2022 20:54:33 -0400,
Gavin Shan  wrote:
> 
> This series enables the ring-based dirty memory tracking for ARM64.
> The feature has been available and enabled on x86 for a while. It
> is beneficial when the number of dirty pages is small in a checkpointing
> system or live migration scenario. More details can be found from
> fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking").
> 
> This series is applied on top of Marc's v2 series [0], fixing dirty-ring
> ordering issue.

This looks good to me as it stands. If someone on the x86 side of
things is willing to ack the x86 changes (both here and in my
series[0]), I'm happy to queue the whole thing.

Thanks,

M.

[0] https://lore.kernel.org/kvmarm/20220926145120.27974-1-...@kernel.org

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v4 1/6] KVM: x86: Introduce KVM_REQ_RING_SOFT_FULL

2022-09-27 Thread Marc Zyngier
On Mon, 26 Sep 2022 20:54:34 -0400,
Gavin Shan  wrote:
> 
> This adds KVM_REQ_RING_SOFT_FULL, which is raised when the dirty
> ring of the specific VCPU becomes softly full in kvm_dirty_ring_push().
> The VCPU is forced to exit when the request is raised and its
> dirty ring is still softly full on entry.
> 
> The event is checked and handled in the newly introduced helper
> kvm_dirty_ring_check_request(). With this, kvm_dirty_ring_soft_full()
> becomes a private function.
> 
> Suggested-by: Marc Zyngier 
> Signed-off-by: Gavin Shan 
> ---
>  arch/x86/kvm/x86.c | 15 ++-
>  include/linux/kvm_dirty_ring.h | 13 +++--
>  include/linux/kvm_host.h   |  1 +
>  virt/kvm/dirty_ring.c  | 19 ++-
>  4 files changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b0c47b41c264..0dd0d32073e7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10260,16 +10260,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>   bool req_immediate_exit = false;
>  
> - /* Forbid vmenter if vcpu dirty ring is soft-full */
> - if (unlikely(vcpu->kvm->dirty_ring_size &&
> -  kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
> - vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> - trace_kvm_dirty_ring_exit(vcpu);
> - r = 0;
> - goto out;
> - }
> -
>   if (kvm_request_pending(vcpu)) {
> + /* Forbid vmenter if vcpu dirty ring is soft-full */
> + if (kvm_dirty_ring_check_request(vcpu)) {
> + r = 0;
> + goto out;
> + }
> +
>   if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
>   r = -EIO;
>   goto out;
> diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> index 906f899813dc..b188bfcf3a09 100644
> --- a/include/linux/kvm_dirty_ring.h
> +++ b/include/linux/kvm_dirty_ring.h
> @@ -54,6 +54,11 @@ static inline void kvm_dirty_ring_push(struct 
> kvm_dirty_ring *ring,
>  {
>  }
>  
> +static inline bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu)
> +{
> + return false;
> +}
> +

nit: I don't think this is needed at all. The dirty ring feature is
not user-selectable, and this is always called from arch code that is
fully aware of that option.

This can be fixed when applying the patch though, no need to resend
for this.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
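
A rough, self-contained model of the request flow being reviewed here: the
push side raises KVM_REQ_RING_SOFT_FULL when the ring crosses its soft
limit, and the entry path clears the request, re-raises it if the ring is
still softly full, and forces an exit. The names mirror the patch, but the
types are simplified stand-ins, not KVM's.

#include <stdbool.h>
#include <stdio.h>

#define KVM_REQ_RING_SOFT_FULL	(1u << 0)

struct dirty_ring { unsigned size, soft_limit, used; };
struct vcpu { unsigned requests; struct dirty_ring ring; };

static bool ring_soft_full(struct dirty_ring *r)
{
	return r->used >= r->soft_limit;
}

static void ring_push(struct vcpu *vcpu)
{
	vcpu->ring.used++;
	if (ring_soft_full(&vcpu->ring))
		vcpu->requests |= KVM_REQ_RING_SOFT_FULL; /* kvm_make_request() */
}

/* cf. the kvm_dirty_ring_check_request() introduced by this patch */
static bool dirty_ring_check_request(struct vcpu *vcpu)
{
	if (!(vcpu->requests & KVM_REQ_RING_SOFT_FULL))
		return false;
	vcpu->requests &= ~KVM_REQ_RING_SOFT_FULL;	/* kvm_check_request() */
	if (!ring_soft_full(&vcpu->ring))
		return false;	/* userspace harvested: let the vcpu run */
	vcpu->requests |= KVM_REQ_RING_SOFT_FULL;	/* still full: re-raise */
	return true;	/* caller exits with KVM_EXIT_DIRTY_RING_FULL */
}

int main(void)
{
	struct vcpu v = { .ring = { .size = 8, .soft_limit = 2 } };

	ring_push(&v);
	ring_push(&v);
	printf("exit? %d\n", dirty_ring_check_request(&v)); /* 1: softly full */
	v.ring.used = 0;	/* pretend userspace harvested and reset */
	printf("exit? %d\n", dirty_ring_check_request(&v)); /* 0: run again */
	return 0;
}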


Re: [PATCH] KVM: arm64: Cleanup the __get_fault_info() to take out the code that validates HPFAR

2022-09-27 Thread Marc Zyngier
On Tue, 27 Sep 2022 01:14:16 -0400,
Oliver Upton  wrote:
> 
> Hi Mingwei,
> 
> On Tue, Sep 27, 2022 at 12:27:15AM +, Mingwei Zhang wrote:
> > Cleanup __get_fault_info() to take out the code that checks HPFAR. The
> > conditions in __get_fault_info() that checks if HPFAR contains a valid IPA
> > is slightly messy in that several conditions are written within one IF
> > statement acrossing multiple lines and are connected with different logical
> > operators. Among them, some conditions come from ARM Spec, while others
>  ^~~~
> 
> Call it the ARM ARM or Arm ARM, depending on what stylization you
> subscribe to :)
> 
> > come from CPU erratum. This makes the code hard to read and
> > difficult to extend.
> 
> I'd recommend you avoid alluding to future changes unless they're posted
> on the mailing list.

Honestly, I'd refrain from such changes *unless* they enable something
else. The current code is well understood by people hacking on it, and
although I don't mind revamping it, it has to be for a good reason.

I'd be much more receptive to such a change if it was a prefix to
something that actually made a significant change.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v2 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option

2022-09-26 Thread Marc Zyngier
In order to differentiate between architectures that require no extra
synchronisation when accessing the dirty ring and those that do,
add a new capability (KVM_CAP_DIRTY_LOG_RING_ACQ_REL) that identifies
the latter sort. TSO architectures can obviously advertise both, while
relaxed architectures must only advertise the ACQ_REL version.

This requires some configuration symbol rejigging, with HAVE_KVM_DIRTY_RING
being only indirectly selected by two top-level config symbols:
- HAVE_KVM_DIRTY_RING_TSO for strongly ordered architectures (x86)
- HAVE_KVM_DIRTY_RING_ACQ_REL for weakly ordered architectures (arm64)

Suggested-by: Paolo Bonzini 
Signed-off-by: Marc Zyngier 
---
 arch/x86/kvm/Kconfig |  2 +-
 include/uapi/linux/kvm.h |  1 +
 virt/kvm/Kconfig | 14 ++
 virt/kvm/kvm_main.c  |  9 -
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd7706136..876748b236ff 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,7 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
-   select HAVE_KVM_DIRTY_RING
+   select HAVE_KVM_DIRTY_RING_TSO
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eed0315a77a6..0d5d4419139a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index a8c5c9f06b3c..800f9470e36b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -19,6 +19,20 @@ config HAVE_KVM_IRQ_ROUTING
 config HAVE_KVM_DIRTY_RING
bool
 
+# Only strongly ordered architectures can select this, as it doesn't
+# put any explicit constraint on userspace ordering. They can also
+# select the _ACQ_REL version.
+config HAVE_KVM_DIRTY_RING_TSO
+   bool
+   select HAVE_KVM_DIRTY_RING
+   depends on X86
+
+# Weakly ordered architectures can only select this, advertising
+# to userspace the additional ordering requirements.
+config HAVE_KVM_DIRTY_RING_ACQ_REL
+   bool
+   select HAVE_KVM_DIRTY_RING
+
 config HAVE_KVM_EVENTFD
bool
select EVENTFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 584a5bab3af3..5b064dbadaf4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4475,7 +4475,13 @@ static long kvm_vm_ioctl_check_extension_generic(struct 
kvm *kvm, long arg)
case KVM_CAP_NR_MEMSLOTS:
return KVM_USER_MEM_SLOTS;
case KVM_CAP_DIRTY_LOG_RING:
-#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
+   return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct 
kvm_dirty_gfn);
+#else
+   return 0;
+#endif
+   case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct 
kvm_dirty_gfn);
 #else
return 0;
@@ -4580,6 +4586,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm 
*kvm,
return 0;
}
case KVM_CAP_DIRTY_LOG_RING:
+   case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
default:
return kvm_vm_ioctl_enable_cap(kvm, cap);
-- 
2.34.1
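
For illustration, a sketch of how a VMM might probe for the new capability
and fall back to the original one, mirroring the selftest change later in
this series. The capability value (223) is taken from this patch and
defined locally in case the installed UAPI headers predate it; error
handling is deliberately minimal, and enabling the ring on a real VM fd is
only hinted at in the comment.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_CAP_DIRTY_LOG_RING_ACQ_REL
#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223	/* from this patch */
#endif

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Prefer the ACQ_REL flavour; fall back to the TSO-only one. */
	int cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
	if (ioctl(kvm, KVM_CHECK_EXTENSION, cap) <= 0)
		cap = KVM_CAP_DIRTY_LOG_RING;

	int max_bytes = ioctl(kvm, KVM_CHECK_EXTENSION, cap);
	printf("cap %d: up to %d dirty-ring bytes per vcpu\n", cap, max_bytes);

	/*
	 * A real VMM would then enable it on the VM fd, e.g.:
	 *   struct kvm_enable_cap ec = { .cap = cap, .args = { ring_bytes } };
	 *   ioctl(vm_fd, KVM_ENABLE_CAP, &ec);
	 */
	return 0;
}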



[PATCH v2 6/6] KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if available

2022-09-26 Thread Marc Zyngier
Pick KVM_CAP_DIRTY_LOG_RING_ACQ_REL if exposed by the kernel.

Signed-off-by: Marc Zyngier 
---
 tools/testing/selftests/kvm/dirty_log_test.c | 3 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c   | 5 -
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c 
b/tools/testing/selftests/kvm/dirty_log_test.c
index 53627add8a7c..b5234d6efbe1 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -265,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, 
int ret, int err)
 
 static bool dirty_ring_supported(void)
 {
-   return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING);
+   return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) ||
+   kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL));
 }
 
 static void dirty_ring_create_vm_done(struct kvm_vm *vm)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 9889fe0d8919..411a4c0bc81c 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -82,7 +82,10 @@ unsigned int kvm_check_cap(long cap)
 
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
 {
-   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
+   if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
+   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
+   else
+   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
vm->dirty_ring_size = ring_size;
 }
 
-- 
2.34.1



[PATCH v2 3/6] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL

2022-09-26 Thread Marc Zyngier
Since x86 is TSO (give or take), allow it to advertise the new
ACQ_REL version of the dirty ring capability. No other change is
required for it.

Signed-off-by: Marc Zyngier 
---
 arch/x86/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 876748b236ff..67be7f217e37 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_TSO
+   select HAVE_KVM_DIRTY_RING_ACQ_REL
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
-- 
2.34.1



[PATCH v2 4/6] KVM: Document weakly ordered architecture requirements for dirty ring

2022-09-26 Thread Marc Zyngier
Now that the kernel can expose to userspace that its dirty ring
management relies on explicit ordering, document these new requirements
for VMMs to do the right thing.

Signed-off-by: Marc Zyngier 
---
 Documentation/virt/kvm/api.rst | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index abd7c32126ce..32427ea160df 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8019,8 +8019,8 @@ guest according to the bits in the KVM_CPUID_FEATURES 
CPUID leaf
 (0x4001). Otherwise, a guest may use the paravirtual features
 regardless of what has actually been exposed through the CPUID leaf.
 
-8.29 KVM_CAP_DIRTY_LOG_RING
----------------------------
+8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
+----------------------------------------------------------
 
 :Architectures: x86
 :Parameters: args[0] - size of the dirty log ring
@@ -8078,6 +8078,11 @@ on to the next GFN.  The userspace should continue to do 
this until the
 flags of a GFN have the DIRTY bit cleared, meaning that it has harvested
 all the dirty GFNs that were available.
 
+Note that on weakly ordered architectures, userspace accesses to the
+ring buffer (and more specifically the 'flags' field) must be ordered,
+using load-acquire/store-release accessors when available, or any
+other memory barrier that will ensure this ordering.
+
 It's not necessary for userspace to harvest the all dirty GFNs at once.
 However it must collect the dirty GFNs in sequence, i.e., the userspace
 program cannot skip one dirty GFN to collect the one next to it.
@@ -8106,6 +8111,14 @@ KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring 
size, the virtual
 machine will switch to ring-buffer dirty page tracking and further
 KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
 
+NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that
+should be exposed by weakly ordered architectures, in order to indicate
+the additional memory ordering requirements imposed on userspace when
+reading the state of an entry and mutating it from DIRTY to HARVESTED.
+Architectures with TSO-like ordering (such as x86) are allowed to
+expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL
+to userspace.
+
 8.30 KVM_CAP_XEN_HVM
 
 
-- 
2.34.1
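
To show the userspace side of the requirement just documented, here is a
small C11 model of harvesting a single entry with explicit ordering: the
flags read is an acquire (so slot/offset are only read once DIRTY is
observed) and the DIRTY -> HARVESTED hand-back is a release. The structure
layout follows the KVM UAPI, but the _Atomic casts and the single-entry
"ring" are simplifications for illustration.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define KVM_DIRTY_GFN_F_DIRTY	(1u << 0)
#define KVM_DIRTY_GFN_F_RESET	(1u << 1)

struct kvm_dirty_gfn {		/* layout as in the KVM UAPI */
	uint32_t flags;
	uint32_t slot;
	uint64_t offset;
};

/* Harvest one entry if it is dirty; returns 1 if it was collected. */
static int harvest_one(struct kvm_dirty_gfn *gfn)
{
	/* Acquire: don't read slot/offset before the DIRTY flag is seen. */
	uint32_t flags = atomic_load_explicit(
			(_Atomic uint32_t *)&gfn->flags, memory_order_acquire);
	if (flags != KVM_DIRTY_GFN_F_DIRTY)
		return 0;

	printf("dirty: slot %u offset %llu\n",
	       gfn->slot, (unsigned long long)gfn->offset);

	/* Release: the reads above can't be reordered past the hand-back. */
	atomic_store_explicit((_Atomic uint32_t *)&gfn->flags,
			      KVM_DIRTY_GFN_F_RESET, memory_order_release);
	return 1;
}

int main(void)
{
	struct kvm_dirty_gfn e = {
		.flags = KVM_DIRTY_GFN_F_DIRTY, .slot = 1, .offset = 42,
	};
	printf("collected %d entries\n", harvest_one(&e));
	return 0;
}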



[PATCH v2 1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state

2022-09-26 Thread Marc Zyngier
The current implementation of the dirty ring has an implicit requirement
that stores to the dirty ring from userspace must be:

- ordered with one another

- visible from another CPU executing a ring reset

While these implicit requirements work well for x86 (and any other
TSO-like architecture), they do not work for more relaxed architectures
such as arm64 where stores to different addresses can be freely
reordered, and loads from these addresses not observing writes from
another CPU unless the required barriers (or acquire/release semantics)
are used.

In order to start fixing this, upgrade the ring reset accesses:

- the kvm_dirty_gfn_harvested() helper now uses acquire semantics
  so it is ordered after all previous writes, including that from
  userspace

- the kvm_dirty_gfn_set_invalid() helper now uses release semantics
  so that the next_slot and next_offset reads don't drift past
  the entry invalidation

This is only a partial fix as the userspace side also needs upgrading.

Signed-off-by: Marc Zyngier 
---
 virt/kvm/dirty_ring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index f4c2a6eb1666..d6fabf238032 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -74,7 +74,7 @@ int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int 
index, u32 size)
 
 static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
 {
-   gfn->flags = 0;
+   smp_store_release(&gfn->flags, 0);
 }
 
 static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
@@ -84,7 +84,7 @@ static inline void kvm_dirty_gfn_set_dirtied(struct 
kvm_dirty_gfn *gfn)
 
 static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
 {
-   return gfn->flags & KVM_DIRTY_GFN_F_RESET;
+   return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
 }
 
 int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
-- 
2.34.1



[PATCH v2 0/6] KVM: Fix dirty-ring ordering on weakly ordered architectures

2022-09-26 Thread Marc Zyngier
[Same distribution list as Gavin's dirty-ring on arm64 series]

This is an update on the initial series posted as [0].

As Gavin started posting patches enabling the dirty-ring infrastructure
on arm64 [1], it quickly became apparent that the API was never intended
to work on relaxed memory ordering architectures (owing to its x86
origins).

This series tries to retrofit some ordering into the existing API by:

- relying on acquire/release semantics which are the default on x86,
  but need to be explicit on arm64

- adding a new capability that indicates which flavor is supported, either
  with explicit ordering (arm64) or both implicit and explicit (x86),
  as suggested by Paolo at KVM Forum

- documenting the requirements for this new capability on weakly ordered
  architectures

- updating the selftests to do the right thing

Ideally, this series should be a prefix of Gavin's, plus a small change
to his series:

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 0309b2d0f2da..7785379c5048 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -32,7 +32,7 @@ menuconfig KVM
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
-   select HAVE_KVM_DIRTY_RING
+   select HAVE_KVM_DIRTY_RING_ACQ_REL
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING

This has been very lightly tested on an arm64 box with Gavin's v3 [2] series.

* From v1:
  - Repainted the config symbols and new capability so that their
naming is more acceptable and causes less churn
  - Fixed a couple of blunders as pointed out by Peter and Paolo
  - Updated the documentation

[0] https://lore.kernel.org/r/20220922170133.2617189-1-...@kernel.org
[1] https://lore.kernel.org/lkml/YyiV%2Fl7O23aw5aaO@xz-m1.local/T/
[2] https://lore.kernel.org/r/20220922003214.276736-1-gs...@redhat.com

Marc Zyngier (6):
  KVM: Use acquire/release semantics when accessing dirty ring GFN state
  KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option
  KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
  KVM: Document weakly ordered architecture requirements for dirty ring
  KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release
semantics
  KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ACQ_REL if
available

 Documentation/virt/kvm/api.rst   | 17 +++--
 arch/x86/kvm/Kconfig |  3 ++-
 include/uapi/linux/kvm.h |  1 +
 tools/testing/selftests/kvm/dirty_log_test.c |  8 +---
 tools/testing/selftests/kvm/lib/kvm_util.c   |  5 -
 virt/kvm/Kconfig | 14 ++
 virt/kvm/dirty_ring.c|  4 ++--
 virt/kvm/kvm_main.c  |  9 -
 8 files changed, 51 insertions(+), 10 deletions(-)

-- 
2.34.1



[PATCH v2 5/6] KVM: selftests: dirty-log: Upgrade flag accesses to acquire/release semantics

2022-09-26 Thread Marc Zyngier
In order to preserve ordering, make sure that the flag accesses
in the dirty log are done using acquire/release accessors.

Signed-off-by: Marc Zyngier 
---
 tools/testing/selftests/kvm/dirty_log_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c 
b/tools/testing/selftests/kvm/dirty_log_test.c
index 9c883c94d478..53627add8a7c 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -279,12 +280,12 @@ static void dirty_ring_create_vm_done(struct kvm_vm *vm)
 
 static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
 {
-   return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
+   return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
 }
 
 static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
 {
-   gfn->flags = KVM_DIRTY_GFN_F_RESET;
+   smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
 }
 
 static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
-- 
2.34.1



Re: [PATCH v5] KVM: arm64: Ignore kvm-arm.mode if !is_hyp_mode_available()

2022-09-26 Thread Marc Zyngier
On Tue, 20 Sep 2022 12:06:58 -0700, Elliot Berman wrote:
> Ignore kvm-arm.mode if !is_hyp_mode_available(). Specifically, we want
> to avoid switching kvm_mode to KVM_MODE_PROTECTED if hypervisor mode is
> not available. This prevents the "Protected KVM" cpu capability from being
> reported when Linux is booting in EL1 and would not have KVM enabled.
> Reasonably though, we should warn if the command line is requesting a
> KVM mode at all if KVM isn't actually available. Allow
> "kvm-arm.mode=none" to skip the warning since this would disable KVM
> anyway.

Applied to next, thanks!

[1/1] KVM: arm64: Ignore kvm-arm.mode if !is_hyp_mode_available()
  commit: b2a4d007c347b4cb4c60f7512733c3f8300a129c

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH] KVM: arm64: vgic: Remove duplicate check in update_affinity_collection()

2022-09-26 Thread Marc Zyngier
On Fri, 23 Sep 2022 14:54:47 +0800, Gavin Shan wrote:
> The ITS collection is guaranteed to be !NULL when update_affinity_collection()
> is called. So we needn't check the ITE's collection against NULL because the
> check has been included in the later one.
> 
> Remove the duplicate check in update_affinity_collection().

Applied to next, thanks!

[1/1] KVM: arm64: vgic: Remove duplicate check in update_affinity_collection()
  commit: 096560dd13251e351176aef54b7aee91c99920a3

Cheers,

M.
-- 
Without deviation from the norm, progress is not possible.




Re: [PATCH] KVM: arm64: vgic: Remove duplicate check in update_affinity_collection()

2022-09-26 Thread Marc Zyngier
On Mon, 26 Sep 2022 00:21:08 +0100,
Gavin Shan  wrote:
> 
> Hi Marc,
> 
> On 9/24/22 9:56 PM, Marc Zyngier wrote:
> > Side note: please make sure you always Cc all the KVM/arm64 reviewers
> > when sending patches (now added).
> > 
> 
> Sure. The reason why I didn't run './scripts/get_maintainer.pl' to get
> all reviewers is that the patch is a super simple one :)

Sure, but the whole point of having multiple reviewers is to share the
review load. If you only send it to a reduced number of them, you are
defeating this process (although this is more wishful thinking than a
process...).

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 3/6] KVM: arm64: Enable ring-based dirty memory tracking

2022-09-24 Thread Marc Zyngier
On Thu, 22 Sep 2022 01:32:11 +0100,
Gavin Shan  wrote:
> 
> This enables the ring-based dirty memory tracking on ARM64. The
> feature is configured by CONFIG_HAVE_KVM_DIRTY_RING, detected and
> enabled by KVM_CAP_DIRTY_LOG_RING. A ring buffer is created on every
> VCPU when the feature is enabled. Each entry in the ring buffer is
> described by 'struct kvm_dirty_gfn'.
> 
> A ring buffer entry is pushed when a page becomes dirty on the host,
> and pulled by userspace after the ring buffer is mapped at physical
> page offset KVM_DIRTY_LOG_PAGE_OFFSET. The specific VCPU is forced
> to exit if its ring buffer becomes softly full. Besides, the ring
> buffer can be reset by ioctl command KVM_RESET_DIRTY_RINGS to release
> those pulled ring buffer entries.

I think you can cut this message short. This description was useful
when the feature was initially merged, but this is only a "plumb the
damn thing" patch.

> 
> Signed-off-by: Gavin Shan 
> ---
>  Documentation/virt/kvm/api.rst| 2 +-
>  arch/arm64/include/uapi/asm/kvm.h | 1 +
>  arch/arm64/kvm/Kconfig| 1 +
>  arch/arm64/kvm/arm.c  | 8 
>  4 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index abd7c32126ce..19fa1ac017ed 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -8022,7 +8022,7 @@ regardless of what has actually been exposed through 
> the CPUID leaf.
>  8.29 KVM_CAP_DIRTY_LOG_RING
>  ---
>  
> -:Architectures: x86
> +:Architectures: x86, arm64
>  :Parameters: args[0] - size of the dirty log ring
>  
>  KVM is capable of tracking dirty memory using ring buffers that are
> diff --git a/arch/arm64/include/uapi/asm/kvm.h 
> b/arch/arm64/include/uapi/asm/kvm.h
> index 316917b98707..a7a857f1784d 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -43,6 +43,7 @@
>  #define __KVM_HAVE_VCPU_EVENTS
>  
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 64
>  
>  #define KVM_REG_SIZE(id) \
>   (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> index 815cc118c675..0309b2d0f2da 100644
> --- a/arch/arm64/kvm/Kconfig
> +++ b/arch/arm64/kvm/Kconfig
> @@ -32,6 +32,7 @@ menuconfig KVM
>   select KVM_VFIO
>   select HAVE_KVM_EVENTFD
>   select HAVE_KVM_IRQFD
> + select HAVE_KVM_DIRTY_RING
>   select HAVE_KVM_MSI
>   select HAVE_KVM_IRQCHIP
>   select HAVE_KVM_IRQ_ROUTING
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 2ff0ef62abad..76816f8e082b 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -747,6 +747,14 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
>  
>   if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
>   return kvm_vcpu_suspend(vcpu);
> +
> + if (kvm_check_request(KVM_REQ_RING_SOFT_FULL, vcpu) &&
> + kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
> + kvm_make_request(KVM_REQ_RING_SOFT_FULL, vcpu);
> + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> + trace_kvm_dirty_ring_exit(vcpu);
> + return 0;
> + }

This is *very* similar to the x86 code. Could we move it to common
code? Something like the diff below, to be for most of it squashed
into patch #1.

Thanks,

M.

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 76816f8e082b..93a16cdbe163 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -748,13 +748,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
return kvm_vcpu_suspend(vcpu);
 
-   if (kvm_check_request(KVM_REQ_RING_SOFT_FULL, vcpu) &&
-   kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
-   kvm_make_request(KVM_REQ_RING_SOFT_FULL, vcpu);
-   vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
-   trace_kvm_dirty_ring_exit(vcpu);
+   if (kvm_dirty_ring_check_request(vcpu))
return 0;
-   }
}
 
return 1;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eb7d0d7654bb..48f2519b1db7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10249,11 +10249,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
if (kvm_request_pending(vcpu)) {
/* Forbid vmenter if vcpu dirty ring is soft-full */
-   if (kvm_check_request(KVM_REQ_RING_SOFT_FULL, vcpu) &&
-   kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
-   kvm_make_request(KVM_REQ_RING_SOFT_FULL, vcpu);
-   vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;

Re: [PATCH v3 2/6] KVM: x86: Move declaration of kvm_cpu_dirty_log_size() to kvm_dirty_ring.h

2022-09-24 Thread Marc Zyngier
On Thu, 22 Sep 2022 01:32:10 +0100,
Gavin Shan  wrote:
> 
> Not all architectures (ARM64, for example) need to override the function.
> Move its declaration to kvm_dirty_ring.h to avoid the following compile
> warning on ARM64 when the feature is enabled.
> 
>   arch/arm64/kvm/../../../virt/kvm/dirty_ring.c:14:12:\
>   warning: no previous prototype for 'kvm_cpu_dirty_log_size' \
>   [-Wmissing-prototypes]  \
>   int __weak kvm_cpu_dirty_log_size(void)
>
> Reported-by: kernel test robot 
> Signed-off-by: Gavin Shan 
> ---
>  arch/x86/include/asm/kvm_host.h | 2 --
>  arch/x86/kvm/mmu/mmu.c  | 2 ++
>  include/linux/kvm_dirty_ring.h  | 1 +
>  3 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2c96c43c313a..4c0fd517282b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2082,8 +2082,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
>  #define GET_SMSTATE(type, buf, offset)   \
>   (*(type *)((buf) + (offset) - 0x7e00))
>  
> -int kvm_cpu_dirty_log_size(void);
> -
>  int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
>  
>  #define KVM_CLOCK_VALID_FLAGS \
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e418ef3ecfcb..b3eb6a3627ec 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1349,10 +1349,12 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>   kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
>  }
>  
> +#ifdef CONFIG_HAVE_KVM_DIRTY_RING

I think you can drop the ifdeffery, as HAVE_KVM_DIRTY_RING is
unconditionally selected by the arch Kconfig.

>  int kvm_cpu_dirty_log_size(void)
>  {
>   return kvm_x86_ops.cpu_dirty_log_size;
>  }
> +#endif
>  
>  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>   struct kvm_memory_slot *slot, u64 gfn,
> diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> index 906f899813dc..8c6755981c9b 100644
> --- a/include/linux/kvm_dirty_ring.h
> +++ b/include/linux/kvm_dirty_ring.h
> @@ -71,6 +71,7 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
>  
>  #else /* CONFIG_HAVE_KVM_DIRTY_RING */
>  
> +int kvm_cpu_dirty_log_size(void);
>  u32 kvm_dirty_ring_get_rsvd_entries(void);
>  int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option

2022-09-24 Thread Marc Zyngier
On Sat, 24 Sep 2022 14:22:32 +0100,
Peter Xu  wrote:
> 
> On Sat, Sep 24, 2022 at 12:26:53PM +0100, Marc Zyngier wrote:
> > On Sat, 24 Sep 2022 09:51:39 +0100,
> > Marc Zyngier  wrote:
> > > 
> > > I'm happy to bikeshed, but please spell it out for me. If we follow
> > > the current scheme, we need 3 configuration symbols (of which we
> > > already have one), and 2 capabilities (of which we already have one).
> 
> I hope it's not bikeshedding.  I normally don't comment on namings at all
> because many of them can be "bikeshedding" to me.  But this one is so
> special because it directly collides with KVM_GET_DIRTY_LOG, which is the
> other method of dirty tracking.

Fair enough. I'm notoriously bad at sticking a name to things, so I'm
always happy to receive suggestions.

> 
> > > 
> > > Do you have any concrete proposal for those?
> > 
> > In order to make some forward progress, I've reworked the series[1]
> > with another proposal for those:
> > 
> > Config symbols:
> > 
> > - HAVE_KVM_DIRTY_RING:
> >   * mostly the same meaning as today
> >   * not directly selected by any architecture
> >   * doesn't expose any capability on its own
> > 
> > - HAVE_KVM_DIRTY_RING_TSO:
> >   * only for strongly ordered architectures
> >   * selects HAVE_KVM_DIRTY_RING
> >   * exposes KVM_CAP_DIRTY_LOG_RING
> >   * selected by x86
> > 
> > - HAVE_KVM_DIRTY_RING_ACQ_REL:
> >   * selects HAVE_KVM_DIRTY_RING
> >   * exposes KVM_CAP_DIRTY_LOG_RING_ACQ_REL
> >   * selected by arm64 and x86
> > 
> > Capabilities:
> > 
> > - KVM_CAP_DIRTY_LOG_RING: the good old x86-specific stuff, advertised
> >   when HAVE_KVM_DIRTY_RING_TSO is selected
> > 
> > - KVM_CAP_DIRTY_LOG_RING_ACQ_REL: the new acquire/release semantics,
> >   advertised when HAVE_KVM_DIRTY_RING_ACQ_REL is selected
> > 
> > This significantly reduces the churn and makes things slightly more
> > explicit.
> 
> This looks good to me, thanks.

OK, thanks for having a quick look. I'll repost this shortly, after
I'm done reviewing Gavin's series.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] KVM: arm64: vgic: Remove duplicate check in update_affinity_collection()

2022-09-24 Thread Marc Zyngier
Gavin,

Side note: please make sure you always Cc all the KVM/arm64 reviewers
when sending patches (now added).

On Fri, 23 Sep 2022 07:54:47 +0100,
Gavin Shan  wrote:
> 
> The ITS collection is guaranteed to be !NULL when update_affinity_collection()
> is called. So we needn't check ITE's collection with NULL because the
> check has been included to the later one.

It took me a while to understand what you meant by this: the 'coll'
parameter to update_affinity_collection() is never NULL, so comparing
it with 'ite->collection' is enough to cover both the NULL case and
the "another collection" case.

If you agree with this, I can directly fix the commit message when
applying the patch.
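
For context, the simplified check looks roughly like this after the
patch (a sketch based on the description above; helper names follow
arch/arm64/kvm/vgic/vgic-its.c and may not match the code exactly):

	static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
					       struct its_collection *coll)
	{
		struct its_device *device;
		struct its_ite *ite;

		for_each_lpi_its(device, ite, its) {
			/*
			 * 'coll' is never NULL here, so a NULL
			 * ite->collection also compares unequal and is
			 * skipped by this single test.
			 */
			if (ite->collection != coll)
				continue;

			update_affinity_ite(kvm, ite);
		}
	}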

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option

2022-09-24 Thread Marc Zyngier
On Sat, 24 Sep 2022 09:51:39 +0100,
Marc Zyngier  wrote:
> 
> On Fri, 23 Sep 2022 19:26:18 +0100,
> Peter Xu  wrote:
> > 
> > On Fri, Sep 23, 2022 at 03:28:34PM +0100, Marc Zyngier wrote:
> > > On Thu, 22 Sep 2022 22:48:19 +0100,
> > > Peter Xu  wrote:
> > > > 
> > > > On Thu, Sep 22, 2022 at 06:01:29PM +0100, Marc Zyngier wrote:
> > > > > In order to differentiate between architectures that require no extra
> > > > > synchronisation when accessing the dirty ring and those that do,
> > > > > add a new capability (KVM_CAP_DIRTY_LOG_RING_ORDERED) that identifies
> > > > > the latter sort. TSO architectures can obviously advertise both, while
> > > > > relaxed architectures must only advertise the ORDERED version.
> > > > > 
> > > > > Suggested-by: Paolo Bonzini 
> > > > > Signed-off-by: Marc Zyngier 
> > > > > ---
> > > > >  include/linux/kvm_dirty_ring.h |  6 +++---
> > > > >  include/uapi/linux/kvm.h   |  1 +
> > > > >  virt/kvm/Kconfig   | 14 ++
> > > > >  virt/kvm/Makefile.kvm  |  2 +-
> > > > >  virt/kvm/kvm_main.c| 11 +--
> > > > >  5 files changed, 28 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> > > > > index 906f899813dc..7a0c90ae9a3f 100644
> > > > > --- a/include/linux/kvm_dirty_ring.h
> > > > > +++ b/include/linux/kvm_dirty_ring.h
> > > > > @@ -27,7 +27,7 @@ struct kvm_dirty_ring {
> > > > >   int index;
> > > > >  };
> > > > >  
> > > > > -#ifndef CONFIG_HAVE_KVM_DIRTY_RING
> > > > > +#ifndef CONFIG_HAVE_KVM_DIRTY_LOG
> > > > 
> > > > s/LOG/LOG_RING/ according to the commit message? Or the name seems too
> > > > generic.
> > > 
> > > The commit message talks about the capability, while the above is the
> > > config option. If you find the names inappropriate, feel free to
> > > suggest alternatives (for all I care, they could be called FOO, BAR
> > > and BAZ).
> > 
> > The existing name from David looks better than the new one.. to me.
> 
> I'm happy to bikeshed, but please spell it out for me. If we follow
> the current scheme, we need 3 configuration symbols (of which we
> already have one), and 2 capabilities (of which we already have one).
> 
> Do you have any concrete proposal for those?

In order to make some forward progress, I've reworked the series[1]
with another proposal for those:

Config symbols:

- HAVE_KVM_DIRTY_RING:
  * mostly the same meaning as today
  * not directly selected by any architecture
  * doesn't expose any capability on its own

- HAVE_KVM_DIRTY_RING_TSO:
  * only for strongly ordered architectures
  * selects HAVE_KVM_DIRTY_RING
  * exposes KVM_CAP_DIRTY_LOG_RING
  * selected by x86

- HAVE_KVM_DIRTY_RING_ACQ_REL:
  * selects HAVE_KVM_DIRTY_RING
  * exposes KVM_CAP_DIRTY_LOG_RING_ACQ_REL
  * selected by arm64 and x86

Capabilities:

- KVM_CAP_DIRTY_LOG_RING: the good old x86-specific stuff, advertised
  when HAVE_KVM_DIRTY_RING_TSO is selected

- KVM_CAP_DIRTY_LOG_RING_ACQ_REL: the new acquire/release semantics,
  advertised when HAVE_KVM_DIRTY_RING_ACQ_REL is selected

This significantly reduces the churn and makes things slightly more
explicit.
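
As a sketch, this scheme would translate into something like the
following in virt/kvm/Kconfig (illustrative only, with the option
bodies inferred from the bullet points above):

	config HAVE_KVM_DIRTY_RING
	       bool

	# Only strongly ordered architectures (x86) can select this,
	# as it puts no ordering constraint on userspace.
	config HAVE_KVM_DIRTY_RING_TSO
	       bool
	       select HAVE_KVM_DIRTY_RING
	       depends on X86

	# Advertises the explicit acquire/release ordering
	# requirements to userspace; selectable by everyone.
	config HAVE_KVM_DIRTY_RING_ACQ_REL
	       bool
	       select HAVE_KVM_DIRTY_RING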

Thoughts?

M.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/dirty-log-ordered-bikeshed

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option

2022-09-24 Thread Marc Zyngier
On Fri, 23 Sep 2022 19:26:18 +0100,
Peter Xu  wrote:
> 
> On Fri, Sep 23, 2022 at 03:28:34PM +0100, Marc Zyngier wrote:
> > On Thu, 22 Sep 2022 22:48:19 +0100,
> > Peter Xu  wrote:
> > > 
> > > On Thu, Sep 22, 2022 at 06:01:29PM +0100, Marc Zyngier wrote:
> > > > In order to differentiate between architectures that require no extra
> > > > synchronisation when accessing the dirty ring and those that do,
> > > > add a new capability (KVM_CAP_DIRTY_LOG_RING_ORDERED) that identifies
> > > > the latter sort. TSO architectures can obviously advertise both, while
> > > > relaxed architectures must only advertise the ORDERED version.
> > > > 
> > > > Suggested-by: Paolo Bonzini 
> > > > Signed-off-by: Marc Zyngier 
> > > > ---
> > > >  include/linux/kvm_dirty_ring.h |  6 +++---
> > > >  include/uapi/linux/kvm.h   |  1 +
> > > >  virt/kvm/Kconfig   | 14 ++
> > > >  virt/kvm/Makefile.kvm  |  2 +-
> > > >  virt/kvm/kvm_main.c| 11 +--
> > > >  5 files changed, 28 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> > > > index 906f899813dc..7a0c90ae9a3f 100644
> > > > --- a/include/linux/kvm_dirty_ring.h
> > > > +++ b/include/linux/kvm_dirty_ring.h
> > > > @@ -27,7 +27,7 @@ struct kvm_dirty_ring {
> > > > int index;
> > > >  };
> > > >  
> > > > -#ifndef CONFIG_HAVE_KVM_DIRTY_RING
> > > > +#ifndef CONFIG_HAVE_KVM_DIRTY_LOG
> > > 
> > > s/LOG/LOG_RING/ according to the commit message? Or the name seems too
> > > generic.
> > 
> > The commit message talks about the capability, while the above is the
> > config option. If you find the names inappropriate, feel free to
> > suggest alternatives (for all I care, they could be called FOO, BAR
> > and BAZ).
> 
> The existing name from David looks better than the new one.. to me.

I'm happy to bikeshed, but please spell it out for me. If we follow
the current scheme, we need 3 configuration symbols (of which we
already have one), and 2 capabilities (of which we already have one).

Do you have any concrete proposal for those?

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 3/6] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ORDERED

2022-09-24 Thread Marc Zyngier
On Fri, 23 Sep 2022 23:46:40 +0100,
Peter Xu  wrote:
> 
> On Thu, Sep 22, 2022 at 06:01:30PM +0100, Marc Zyngier wrote:
> > Since x86 is TSO (give or take), allow it to advertise the new
> > ORDERED version of the dirty ring capability. No other change is
> > required for it.
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  arch/x86/kvm/Kconfig | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > index e3cbd7706136..eb63bc31ed1d 100644
> > --- a/arch/x86/kvm/Kconfig
> > +++ b/arch/x86/kvm/Kconfig
> > @@ -29,6 +29,7 @@ config KVM
> > select HAVE_KVM_PFNCACHE
> > select HAVE_KVM_IRQFD
> > select HAVE_KVM_DIRTY_RING
> > +   select HAVE_KVM_DIRTY_RING_ORDERED
> > select IRQ_BYPASS_MANAGER
> > select HAVE_KVM_IRQ_BYPASS
> > select HAVE_KVM_IRQ_ROUTING
> 
> Before patch 2-3, we only have HAVE_KVM_DIRTY_RING.
> 
> After that, we'll have:
> 
> HAVE_KVM_DIRTY_LOG
> HAVE_KVM_DIRTY_RING
> HAVE_KVM_DIRTY_RING_ORDERED
> 
> I'm wondering whether we can just keep using the old HAVE_KVM_DIRTY_RING,
> but just declare a new KVM_CAP_DIRTY_LOG_RING_ORDERED only after all memory
> barrier patches merged (after patch 1).

The problem is to decide, on a per architecture basis, how to expose
the old property. I'm happy to key it on being x86 specific, but that
feels pretty gross, and totally unnecessary for strongly ordered
architectures (s390?).

> IIUC it's a matter of whether any of the arch would like to support
> !ORDERED version of dirty ring at all, but then IIUC we'll need to have the
> memory barriers conditional too or not sure how it'll help.

Memory barriers do not affect the semantics of userspace, while the
lack thereof does. On strongly ordered architectures,
acquire/release is usually "free", because it is implied by their
memory model. It is thus free for these to implement both versions of
the API.
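
For comparison, on x86 the acquire/release accessors boil down to
plain accesses plus a compiler barrier (a simplified sketch; the real
definitions live in the per-arch barrier headers):

	/* x86 (TSO): the hardware already provides the ordering, so
	 * only the compiler needs to be restrained. */
	#define __smp_store_release(p, v)			\
	do {							\
		barrier();					\
		WRITE_ONCE(*(p), (v));				\
	} while (0)

	#define __smp_load_acquire(p)				\
	({							\
		typeof(*(p)) ___v = READ_ONCE(*(p));		\
		barrier();					\
		___v;						\
	})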

So are we discussing the cost of a couple of (mostly invisible) config
options?

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 5/6] KVM: selftests: dirty-log: Upgrade dirty_gfn_set_collected() to store-release

2022-09-23 Thread Marc Zyngier
On Thu, 22 Sep 2022 22:38:58 +0100,
Paolo Bonzini  wrote:
> 
> On Thu, Sep 22, 2022 at 7:02 PM Marc Zyngier  wrote:
> > To make sure that all the writes to the log marking the entries
> > as being in need of reset are observed in order, use a
> > smp_store_release() when updating the log entry flags.
> >
> > Signed-off-by: Marc Zyngier 
> 
> You also need a load-acquire on the load of gfn->flags in
> dirty_gfn_is_dirtied. Otherwise reading cur->slot or cur->offset might
> see a stale value.

Ah, indeed. smp_wmb() is implemented as DMB ISHST, which only orders
writes, and not loads against writes. Global barriers are just
confusing.
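
For reference, the (simplified) arm64 mappings make the distinction
obvious (sketch; the real definitions live in
arch/arm64/include/asm/barrier.h):

	#define __smp_wmb()	asm volatile("dmb ishst" ::: "memory")	/* store/store only */
	#define __smp_rmb()	asm volatile("dmb ishld" ::: "memory")
	#define __smp_mb()	asm volatile("dmb ish"   ::: "memory")	/* full barrier */

A store-release, by contrast, compiles down to STLR, which orders all
prior loads *and* stores before the releasing store.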

/me goes and repaints the stuff...

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state

2022-09-23 Thread Marc Zyngier
On Fri, 23 Sep 2022 00:46:58 +0100,
Gavin Shan  wrote:
> 
> Hi Peter,
> 
> On 9/23/22 7:38 AM, Peter Xu wrote:
> > On Thu, Sep 22, 2022 at 06:01:28PM +0100, Marc Zyngier wrote:
> >> The current implementation of the dirty ring has an implicit requirement
> >> that stores to the dirty ring from userspace must be:
> >> 
> >> - ordered with one another
> >> 
> >> - visible from another CPU executing a ring reset
> >> 
> >> While these implicit requirements work well for x86 (and any other
> >> TSO-like architecture), they do not work for more relaxed architectures
> >> such as arm64, where stores to different addresses can be freely
> >> reordered, and loads from these addresses may not observe writes from
> >> another CPU unless the required barriers (or acquire/release semantics)
> >> are used.
> >> 
> >> In order to start fixing this, upgrade the ring reset accesses:
> >> 
> >> - the kvm_dirty_gfn_harvested() helper now uses acquire semantics
> >>so it is ordered after all previous writes, including that from
> >>userspace
> >> 
> >> - the kvm_dirty_gfn_set_invalid() helper now uses release semantics
> >>so that the next_slot and next_offset reads don't drift past
> >>the entry invalidation
> >> 
> >> This is only a partial fix as the userspace side also needs upgrading.
> > 
> > Paolo has one fix 4802bf910e ("KVM: dirty ring: add missing memory
> > barrier", 2022-09-01) which has already landed.
> > 
> > I think the other one to reset it was lost too.  I just posted a patch.
> > 
> > https://lore.kernel.org/qemu-devel/20220922213522.68861-1-pet...@redhat.com/
> > (link still not yet available so far, but should be)
> > 
> >> 
> >> Signed-off-by: Marc Zyngier 
> >> ---
> >>   virt/kvm/dirty_ring.c | 4 ++--
> >>   1 file changed, 2 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
> >> index f4c2a6eb1666..784bed80221d 100644
> >> --- a/virt/kvm/dirty_ring.c
> >> +++ b/virt/kvm/dirty_ring.c
> >> @@ -79,12 +79,12 @@ static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
> >> static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
> >>   {
> >> -  gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
> >> +  smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_DIRTY);
> > 
> > IIUC you meant kvm_dirty_gfn_set_invalid as the comment says?
> > 
> > kvm_dirty_gfn_set_dirtied() has been guarded by smp_wmb() and AFAICT that's
> > already safe.  Otherwise looks good to me.
> > 
> 
> If I'm understanding the full context, smp_store_release() also
> enforces guard on 'gfn->flags' itself. It is needed by user space
> for the synchronization.

There are multiple things at play here:

- userspace needs a store-release when making the flags 'harvested',
  so that the kernel using a load-acquire can observe this write (and
  avoid the roach-motel effect of a non-acquire load)

- the kernel needs a store-release when making the flags 'invalid',
  preventing this write from occurring before the next_* fields have
  been sampled

On the ring production side, there is a heavy handed smp_wmb(), which
makes things pretty safe.
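
Put together, the reset-side pairing looks like this (sketch; the
kernel side is from patch 1, and the userspace side mirrors what
patch 5 does in the selftests):

	/* Userspace, harvesting an entry: order the slot/offset reads
	 * and the dirty-page processing before publishing RESET. */
	static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
	{
		smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
	}

	/* Kernel, resetting the ring: this acquire pairs with the
	 * release above, so next_slot/next_offset are only consumed
	 * for entries that were really harvested. */
	static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
	{
		return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
	}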

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option

2022-09-23 Thread Marc Zyngier
On Thu, 22 Sep 2022 22:48:19 +0100,
Peter Xu  wrote:
> 
> On Thu, Sep 22, 2022 at 06:01:29PM +0100, Marc Zyngier wrote:
> > In order to differentiate between architectures that require no extra
> > synchronisation when accessing the dirty ring and those that do,
> > add a new capability (KVM_CAP_DIRTY_LOG_RING_ORDERED) that identifies
> > the latter sort. TSO architectures can obviously advertise both, while
> > relaxed architectures must only advertise the ORDERED version.
> > 
> > Suggested-by: Paolo Bonzini 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  include/linux/kvm_dirty_ring.h |  6 +++---
> >  include/uapi/linux/kvm.h   |  1 +
> >  virt/kvm/Kconfig   | 14 ++
> >  virt/kvm/Makefile.kvm  |  2 +-
> >  virt/kvm/kvm_main.c| 11 +--
> >  5 files changed, 28 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> > index 906f899813dc..7a0c90ae9a3f 100644
> > --- a/include/linux/kvm_dirty_ring.h
> > +++ b/include/linux/kvm_dirty_ring.h
> > @@ -27,7 +27,7 @@ struct kvm_dirty_ring {
> > int index;
> >  };
> >  
> > -#ifndef CONFIG_HAVE_KVM_DIRTY_RING
> > +#ifndef CONFIG_HAVE_KVM_DIRTY_LOG
> 
> s/LOG/LOG_RING/ according to the commit message? Or the name seems too
> generic.

The commit message talks about the capability, while the above is the
config option. If you find the names inappropriate, feel free to
suggest alternatives (for all I care, they could be called FOO, BAR
and BAZ).

> Pure question to ask: is it required to have a new cap just for the
> ordering?  IIUC if x86 was the only supported anyway before, it means all
> released old kvm binaries are always safe even without the strict
> orderings.  As long as we rework all the memory ordering bits before
> declaring support of yet another arch, we're good.  Or am I wrong?

Someone will show up with an old userspace which probes for the sole
existing capability, and things start failing subtly. It is quite
likely that the userspace code is built for all architectures, and we
want to make sure that userspace actively buys into the new ordering
requirements. A simple way to do this is to expose a new capability,
making the new requirement obvious. Architectures with relaxed
ordering semantics will only implement the new one, while x86 will
implement both.
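
In VMM terms, the probing then looks something like this (hypothetical
snippet mirroring what patch 6/6 does in the selftests;
KVM_CHECK_EXTENSION returns a positive value, the maximum ring size in
bytes, when a capability is supported):

	/* Prefer the new capability, fall back to the x86-only one. */
	int dirty_ring_cap(int kvm_fd)
	{
		if (ioctl(kvm_fd, KVM_CHECK_EXTENSION,
			  KVM_CAP_DIRTY_LOG_RING_ORDERED) > 0)
			return KVM_CAP_DIRTY_LOG_RING_ORDERED;
		if (ioctl(kvm_fd, KVM_CHECK_EXTENSION,
			  KVM_CAP_DIRTY_LOG_RING) > 0)
			return KVM_CAP_DIRTY_LOG_RING;
		return 0;	/* no dirty ring support */
	}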

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state

2022-09-23 Thread Marc Zyngier
On Thu, 22 Sep 2022 22:38:33 +0100,
Peter Xu  wrote:
> 
> Marc,
> 
> On Thu, Sep 22, 2022 at 06:01:28PM +0100, Marc Zyngier wrote:
> > The current implementation of the dirty ring has an implicit requirement
> > that stores to the dirty ring from userspace must be:
> > 
> > - ordered with one another
> > 
> > - visible from another CPU executing a ring reset
> > 
> > While these implicit requirements work well for x86 (and any other
> > TSO-like architecture), they do not work for more relaxed architectures
> > such as arm64, where stores to different addresses can be freely
> > reordered, and loads from these addresses may not observe writes from
> > another CPU unless the required barriers (or acquire/release semantics)
> > are used.
> > 
> > In order to start fixing this, upgrade the ring reset accesses:
> > 
> > - the kvm_dirty_gfn_harvested() helper now uses acquire semantics
> >   so it is ordered after all previous writes, including that from
> >   userspace
> > 
> > - the kvm_dirty_gfn_set_invalid() helper now uses release semantics
> >   so that the next_slot and next_offset reads don't drift past
> >   the entry invalidation
> > 
> > This is only a partial fix as the userspace side also needs upgrading.
> 
> Paolo has one fix 4802bf910e ("KVM: dirty ring: add missing memory
> barrier", 2022-09-01) which has already landed.

What is this commit? It doesn't exist in the kernel as far as I can see.

> 
> I think the other one to reset it was lost too.  I just posted a patch.
> 
> https://lore.kernel.org/qemu-devel/20220922213522.68861-1-pet...@redhat.com/
> (link still not yet available so far, but should be)

That's a QEMU patch, right?

> 
> > 
> > Signed-off-by: Marc Zyngier 
> > ---
> >  virt/kvm/dirty_ring.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
> > index f4c2a6eb1666..784bed80221d 100644
> > --- a/virt/kvm/dirty_ring.c
> > +++ b/virt/kvm/dirty_ring.c
> > @@ -79,12 +79,12 @@ static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
> >  
> >  static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
> >  {
> > -   gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
> > +   smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_DIRTY);
> 
> IIUC you meant kvm_dirty_gfn_set_invalid as the comment says?

Gah, you're right, I redid the patch at the last minute and messed it
up.

> kvm_dirty_gfn_set_dirtied() has been guarded by smp_wmb() and AFAICT that's
> already safe.  Otherwise looks good to me.

Indeed. Let me fix this.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


[PATCH 0/6] KVM: Fix dirty-ring ordering on weakly ordered architectures

2022-09-22 Thread Marc Zyngier
[Same distribution list as Gavin's dirty-ring on arm64 series]

As Gavin started posting patches enabling the dirty-ring infrastructure
on arm64 [1], it quickly became apparent that the API was never intended
to work on relaxed memory ordering architectures (owing to its x86
origins).

This series tries to retrofit some ordering into the existing API by:

- relying on acquire/release semantics which are the default on x86,
  but need to be explicit on arm64

- adding a new capability that indicates which flavor is supported, either
  with explicit ordering (arm64) or both implicit and explicit (x86),
  as suggested by Paolo at KVM Forum

- documenting the requirements for this new capability on weakly ordered
  architectures

- updating the selftests to do the right thing

Ideally, this series should be a prefix of Gavin's, plus a small change
to his series:

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 0309b2d0f2da..7785379c5048 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -32,7 +32,7 @@ menuconfig KVM
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
-   select HAVE_KVM_DIRTY_RING
+   select HAVE_KVM_DIRTY_RING_ORDERED
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING

This has been very lightly tested on an arm64 box with Gavin's v3 [2] series.

[1] https://lore.kernel.org/lkml/YyiV%2Fl7O23aw5aaO@xz-m1.local/T/
[2] https://lore.kernel.org/r/20220922003214.276736-1-gs...@redhat.com

Marc Zyngier (6):
  KVM: Use acquire/release semantics when accessing dirty ring GFN state
  KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option
  KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ORDERED
  KVM: Document weakly ordered architecture requirements for dirty ring
  KVM: selftests: dirty-log: Upgrade dirty_gfn_set_collected() to
store-release
  KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ORDERED if
available

 Documentation/virt/kvm/api.rst   | 16 +---
 arch/x86/kvm/Kconfig |  1 +
 include/linux/kvm_dirty_ring.h   |  6 +++---
 include/uapi/linux/kvm.h |  1 +
 tools/testing/selftests/kvm/dirty_log_test.c |  6 --
 tools/testing/selftests/kvm/lib/kvm_util.c   |  5 -
 virt/kvm/Kconfig | 14 ++
 virt/kvm/Makefile.kvm|  2 +-
 virt/kvm/dirty_ring.c|  4 ++--
 virt/kvm/kvm_main.c  | 11 +--
 10 files changed, 52 insertions(+), 14 deletions(-)

-- 
2.34.1



[PATCH 6/6] KVM: selftests: dirty-log: Use KVM_CAP_DIRTY_LOG_RING_ORDERED if available

2022-09-22 Thread Marc Zyngier
Pick KVM_CAP_DIRTY_LOG_RING_ORDERED if exposed by the kernel.

Signed-off-by: Marc Zyngier 
---
 tools/testing/selftests/kvm/dirty_log_test.c | 3 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c   | 5 -
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 3d29f4bf4f9c..30cdda41b8ec 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -265,7 +265,8 @@ static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err)
 
 static bool dirty_ring_supported(void)
 {
-   return kvm_has_cap(KVM_CAP_DIRTY_LOG_RING);
+   return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) ||
+   kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ORDERED));
 }
 
 static void dirty_ring_create_vm_done(struct kvm_vm *vm)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 9889fe0d8919..4c031f9fe717 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -82,7 +82,10 @@ unsigned int kvm_check_cap(long cap)
 
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
 {
-   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
+   if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ORDERED))
+   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ORDERED, ring_size);
+   else
+   vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
vm->dirty_ring_size = ring_size;
 }
 
-- 
2.34.1



[PATCH 5/6] KVM: selftests: dirty-log: Upgrade dirty_gfn_set_collected() to store-release

2022-09-22 Thread Marc Zyngier
To make sure that all the writes to the log marking the entries
as being in need of reset are observed in order, use a
smp_store_release() when updating the log entry flags.

Signed-off-by: Marc Zyngier 
---
 tools/testing/selftests/kvm/dirty_log_test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 9c883c94d478..3d29f4bf4f9c 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -284,7 +285,7 @@ static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
 
 static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
 {
-   gfn->flags = KVM_DIRTY_GFN_F_RESET;
+   smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
 }
 
 static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
-- 
2.34.1



[PATCH 3/6] KVM: x86: Select CONFIG_HAVE_KVM_DIRTY_RING_ORDERED

2022-09-22 Thread Marc Zyngier
Since x86 is TSO (give or take), allow it to advertise the new
ORDERED version of the dirty ring capability. No other change is
required for it.

Signed-off-by: Marc Zyngier 
---
 arch/x86/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd7706136..eb63bc31ed1d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING
+   select HAVE_KVM_DIRTY_RING_ORDERED
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
-- 
2.34.1



[PATCH 1/6] KVM: Use acquire/release semantics when accessing dirty ring GFN state

2022-09-22 Thread Marc Zyngier
The current implementation of the dirty ring has an implicit requirement
that stores to the dirty ring from userspace must be:

- ordered with one another

- visible from another CPU executing a ring reset

While these implicit requirements work well for x86 (and any other
TSO-like architecture), they do not work for more relaxed architectures
such as arm64, where stores to different addresses can be freely
reordered, and loads from these addresses may not observe writes from
another CPU unless the required barriers (or acquire/release semantics)
are used.

In order to start fixing this, upgrade the ring reset accesses:

- the kvm_dirty_gfn_harvested() helper now uses acquire semantics
  so it is ordered after all previous writes, including that from
  userspace

- the kvm_dirty_gfn_set_invalid() helper now uses release semantics
  so that the next_slot and next_offset reads don't drift past
  the entry invalidation

This is only a partial fix as the userspace side also needs upgrading.

Signed-off-by: Marc Zyngier 
---
 virt/kvm/dirty_ring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index f4c2a6eb1666..784bed80221d 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -79,12 +79,12 @@ static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
 
 static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
 {
-   gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
+   smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_DIRTY);
 }
 
 static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
 {
-   return gfn->flags & KVM_DIRTY_GFN_F_RESET;
+   return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
 }
 
 int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
-- 
2.34.1



[PATCH 4/6] KVM: Document weakly ordered architecture requirements for dirty ring

2022-09-22 Thread Marc Zyngier
Now that the kernel can expose to userspace that its dirty ring
management relies on explicit ordering, document these new requirements
for VMMs to do the right thing.

Signed-off-by: Marc Zyngier 
---
 Documentation/virt/kvm/api.rst | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index abd7c32126ce..0912db16425f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8019,8 +8019,8 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
 (0x4001). Otherwise, a guest may use the paravirtual features
 regardless of what has actually been exposed through the CPUID leaf.
 
-8.29 KVM_CAP_DIRTY_LOG_RING
----------------------------
+8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ORDERED
+----------------------------------------------------------
 
 :Architectures: x86
 :Parameters: args[0] - size of the dirty log ring
@@ -8076,7 +8076,10 @@ The userspace should harvest this GFN and mark the flags from state
 to show that this GFN is harvested and waiting for a reset), and move
 on to the next GFN.  The userspace should continue to do this until the
 flags of a GFN have the DIRTY bit cleared, meaning that it has harvested
-all the dirty GFNs that were available.
+all the dirty GFNs that were available.  Note that on relaxed memory
+architectures, userspace stores to the ring buffer must be ordered,
+for example by using a store-release when available, or by using any
+other memory barrier that will ensure this ordering.
 
 It's not necessary for userspace to harvest all the dirty GFNs at once.
 However it must collect the dirty GFNs in sequence, i.e., the userspace
@@ -8106,6 +8109,13 @@ KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
 machine will switch to ring-buffer dirty page tracking and further
 KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
 
+NOTE: KVM_CAP_DIRTY_LOG_RING_ORDERED is the only capability that can
+be exposed by relaxed memory architectures, in order to indicate the
+additional memory ordering requirements imposed on userspace when
+mutating an entry from DIRTY to HARVESTED state. Architectures with
+TSO-like ordering (such as x86) can expose both KVM_CAP_DIRTY_LOG_RING
+and KVM_CAP_DIRTY_LOG_RING_ORDERED to userspace.
+
 8.30 KVM_CAP_XEN_HVM
 --------------------
 
-- 
2.34.1



[PATCH 2/6] KVM: Add KVM_CAP_DIRTY_LOG_RING_ORDERED capability and config option

2022-09-22 Thread Marc Zyngier
In order to differentiate between architectures that require no extra
synchronisation when accessing the dirty ring and those that do,
add a new capability (KVM_CAP_DIRTY_LOG_RING_ORDERED) that identifies
the latter sort. TSO architectures can obviously advertise both, while
relaxed architectures must only advertise the ORDERED version.

Suggested-by: Paolo Bonzini 
Signed-off-by: Marc Zyngier 
---
 include/linux/kvm_dirty_ring.h |  6 +++---
 include/uapi/linux/kvm.h   |  1 +
 virt/kvm/Kconfig   | 14 ++
 virt/kvm/Makefile.kvm  |  2 +-
 virt/kvm/kvm_main.c| 11 +--
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index 906f899813dc..7a0c90ae9a3f 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -27,7 +27,7 @@ struct kvm_dirty_ring {
int index;
 };
 
-#ifndef CONFIG_HAVE_KVM_DIRTY_RING
+#ifndef CONFIG_HAVE_KVM_DIRTY_LOG
 /*
  * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should
  * not be included as well, so define these nop functions for the arch.
@@ -69,7 +69,7 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
return true;
 }
 
-#else /* CONFIG_HAVE_KVM_DIRTY_RING */
+#else /* CONFIG_HAVE_KVM_DIRTY_LOG */
 
 u32 kvm_dirty_ring_get_rsvd_entries(void);
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);
@@ -92,6 +92,6 @@ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
 void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
 bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
 
-#endif /* CONFIG_HAVE_KVM_DIRTY_RING */
+#endif /* CONFIG_HAVE_KVM_DIRTY_LOG */
 
 #endif /* KVM_DIRTY_RING_H */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eed0315a77a6..c1c9c0c8be5c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_DIRTY_LOG_RING_ORDERED 223
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index a8c5c9f06b3c..1023426bf7dd 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -16,8 +16,22 @@ config HAVE_KVM_IRQFD
 config HAVE_KVM_IRQ_ROUTING
bool
 
+config HAVE_KVM_DIRTY_LOG
+   bool
+
+# Only strongly ordered architectures can select this, as it doesn't
+# put any constraint on userspace ordering. They also can select the
+# _ORDERED version.
 config HAVE_KVM_DIRTY_RING
bool
+   select HAVE_KVM_DIRTY_LOG
+   depends on X86
+
+# Weakly ordered architectures can only select this, advertising
+# to userspace the additional ordering requirements.
+config HAVE_KVM_DIRTY_RING_ORDERED
+   bool
+   select HAVE_KVM_DIRTY_LOG
 
 config HAVE_KVM_EVENTFD
bool
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index 2c27d5d0c367..2bc6d0bb5e5c 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -10,5 +10,5 @@ kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
-kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
+kvm-$(CONFIG_HAVE_KVM_DIRTY_LOG) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 584a5bab3af3..cb1c103e2018 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3304,7 +3304,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
 {
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
-#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+#ifdef CONFIG_HAVE_KVM_DIRTY_LOG
if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
return;
 #endif
@@ -3758,7 +3758,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
 {
-#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+#ifdef CONFIG_HAVE_KVM_DIRTY_LOG
return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
(pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
 kvm->dirty_ring_size / PAGE_SIZE);
@@ -4479,6 +4479,12 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
	return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
 #else
return 0;
+#endif
+   case KVM_CAP_DIRTY_LOG_RING_ORDERED:
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ORDERED
+   return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
+#else
+   return 0;
 #endif
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
@@ -4580,6 +4586,7 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
return 0;
}
case KV

Re: [PATCH v4 12/12] KVM: remove KVM_REQ_UNHALT

2022-09-22 Thread Marc Zyngier
On Wed, 21 Sep 2022 01:32:01 +0100,
Sean Christopherson  wrote:
> 
> From: Paolo Bonzini 
> 
> KVM_REQ_UNHALT is now unnecessary because it is replaced by the return
> value of kvm_vcpu_block/kvm_vcpu_halt.  Remove it.
> 
> No functional change intended.
> 
> Signed-off-by: Paolo Bonzini 
> Signed-off-by: Sean Christopherson 

Acked-by: Marc Zyngier 

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2] KVM: arm64: nvhe: Fix build with profile optimization

2022-09-22 Thread Marc Zyngier
On Thu, 22 Sep 2022 06:31:45 +0100,
Denis Nikitin  wrote:
> 
> Kernel build with clang and KCFLAGS=-fprofile-sample-use fails with:
> 
> error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL
> section ".rel.llvm.call-graph-profile"
> 
> Starting from 13.0.0 llvm can generate SHT_REL section, see
> https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086.
> gen-hyprel does not support SHT_REL relocation section.
> 
> Remove ".llvm.call-graph-profile" SHT_REL relocation from kvm_nvhe
> to fix the build.
> 
> Signed-off-by: Denis Nikitin 
> ---
> V1 -> V2: Remove the relocation instead of disabling the profile-use flags.
> ---
>  arch/arm64/kvm/hyp/nvhe/Makefile | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index b5c5119c7396..49ec950ac57b 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -78,8 +78,10 @@ $(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE
>  
>  # The HYPREL command calls `gen-hyprel` to generate an assembly file with
>  # a list of relocations targeting hyp code/data.
> +# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile'
> +# when profile optimization is applied. gen-hyprel does not support SHT_REL.
>  quiet_cmd_hyprel = HYPREL  $@
> -  cmd_hyprel = $(obj)/gen-hyprel $< > $@
> + cmd_hyprel = $(OBJCOPY) -R .llvm.call-graph-profile $<; $(obj)/gen-hyprel $< > $@

I was really hoping that you'd just drop the flags from the CFLAGS
instead of removing the generated section. Something like:

diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index b5c5119c7396..e5b2d43925b4 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -88,7 +88,7 @@ quiet_cmd_hypcopy = HYPCOPY $@
 
 # Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
 # This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
-KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI) -fprofile-sample-use, $(KBUILD_CFLAGS))
 
 # KVM nVHE code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may

However, I even failed to reproduce your problem using LLVM 14 as
packaged by Debian (if that matters, I'm using an arm64 build
machine). I build the kernel with:

$ make LLVM=1 KCFLAGS=-fprofile-sample-use -j8 vmlinux

and the offending object only contains the following sections:

arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: file format elf64-littleaarch64

Sections:
Idx Name  Size  VMA   LMA   File off  Algn
  0 .hyp.idmap.text 0ae4      0800  2**11
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
  1 .hyp.text e988      1800  2**11
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
  2 .hyp.data..ro_after_init 0820      00010188  2**3
  CONTENTS, ALLOC, LOAD, DATA
  3 .hyp.rodata   2e70      000109a8  2**3
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
  4 .hyp.data..percpu 1ee0      00013820  2**4
  CONTENTS, ALLOC, LOAD, DATA
  5 .hyp.bss  1158      00015700  2**3
  ALLOC
  6 .comment  001f      00017830  2**0
  CONTENTS, READONLY
  7 .llvm_addrsig 00b8      0001784f  2**0
  CONTENTS, READONLY, EXCLUDE
  8 .altinstructions 1284      00015700  2**0
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
  9 __jump_table  0960      00016988  2**3
  CONTENTS, ALLOC, LOAD, RELOC, DATA
 10 __bug_table   051c      000172e8  2**2
  CONTENTS, ALLOC, LOAD, RELOC, DATA
 11 __kvm_ex_table 0028      00017808  2**3
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
 12 .note.GNU-stack       00027370  2**0
  CONTENTS, READONLY

So what am I missing to trigger this issue? Does it rely on something
like PGO, which is not upstream yet? A bit of handholding would be
much appreciated.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 3/7] arm64/fpsimd: Have KVM explicitly say which FP registers to save

2022-09-21 Thread Marc Zyngier
On Tue, 20 Sep 2022 19:32:49 +0100,
Mark Brown  wrote:
> 
> [1  ]
> On Tue, Sep 20, 2022 at 06:52:59PM +0100, Marc Zyngier wrote:
> > On Mon, 15 Aug 2022 23:55:25 +0100,
> > Mark Brown  wrote:
> 
> > >  enum fp_state {
> > > + FP_STATE_TASK,  /* Save based on current, invalid as fp_type */
> 
> > How is that related to the FP_TYPE_TASK in the commit message? What
> 
> TYPE in the commit message should be STATE.
> 
> > does this 'invalid as fp_type' mean?
> 
> It means that using FP_STATE_TASK as a value for the fp_type
> member of the task struct recording what type of state is
> currently stored for the task is not valid; one of the other two
> values representing what was actually saved must be chosen.

Then this definitely represents something else, and shouldn't be a
state or a type, whatever you decide to call it in the end. There is
the state of the FP/SVE unit, and what some piece of SW wants to
save. They match in some cases, and differ in others (the TASK
value). I'd rather you encode them as different types to lift
the ambiguity.
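
Something along these lines would lift it (purely illustrative
naming, not a concrete proposal):

	/* What is actually stored in the task's register buffer. */
	enum fp_type {
		FP_TYPE_FPSIMD,
		FP_TYPE_SVE,
	};

	/* What a caller asks to be saved; TASK means "derive it from
	 * the current task state". */
	enum fp_save_request {
		FP_SAVE_TASK,
		FP_SAVE_FPSIMD,
		FP_SAVE_SVE,
	};
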

> 
> > > + /*
> > > +  * For now we're just validating that the requested state is
> > > +  * consistent with what we'd otherwise work out.
> 
> > Nit: work out? or worked out? the "we'd" doesn't help disambiguate it
> > for a non-native speaker.
> 
> we'd == we would so work out to match the tense.
> 
> > >  void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
> > > unsigned int sve_vl, void *za_state,
> > > unsigned int sme_vl, u64 *svcr,
> > > -   enum fp_state *type)
> > > +   enum fp_state *type, enum fp_state to_save)
> 
> > OK, how many discrete arguments are we going to pass to this function,
> > most of which are part of the vcpu structure? It really feels like
> > what you want is a getter for the per-cpu structure, and let the KVM
> > code do the actual business. If this function was supposed to provide
> > some level of abstraction, well, it's a fail.
> 
> I agree that this is not an ideal interface; I am merely
> following the previously chosen idiom, since I haven't been able
> to figure out why we were doing it in the first place, and with a
> lot of these things it turns out that there's some actual reason.

Huh. If we're changing anything around this code, we'd better
understand what we are doing...

> It's not even like fpsimd_bind_task_to_cpu() has ever been
> written in terms of this function; there are two parallel
> implementations.  My best guess was that it was some combination
> of not peering at KVM internals and keeping struct
> fpsimd_last_state_struct internal to fpsimd.c (since we're
> effectively just passing one of those in in a more verbose form)
> but never anything solid enough to be sure.

Up to you, but adding extra parameters to this function really feels
like the wrong thing to do.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 1/7] KVM: arm64: Discard any SVE state when entering KVM guests

2022-09-21 Thread Marc Zyngier
On Tue, 20 Sep 2022 21:21:33 +0100,
Mark Brown  wrote:
> 
> [1  ]
> On Tue, Sep 20, 2022 at 05:44:01PM +0100, Marc Zyngier wrote:
> > Mark Brown  wrote:
> 
> > >  void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
> > >  {
> > >   BUG_ON(!current->mm);
> > > - BUG_ON(test_thread_flag(TIF_SVE));
> > > +
> > > + fpsimd_kvm_prepare();
> > 
> > Why is this *before* the check against system_supports_fpsimd()? I
> > don't think the architecture allows SVE without FP, for obvious
> > reasons...
> 
> Good point, though now that I think about it I can't think of a
> requirement for FP when implementing SME (there's certainly not
> one for SVE).

Even if the architecture was allowing this madness, KVM doesn't allow
SVE if FP is not available, just like the rest of the kernel.

> There's no use for that hook now though.

Care to clarify?

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] KVM: arm64: nvhe: Disable profile optimization

2022-09-21 Thread Marc Zyngier
On Wed, 21 Sep 2022 07:02:50 +0100,
Denis Nikitin  wrote:
> 
> Adding a few more comments...
> 
> On Tue, Sep 20, 2022 at 5:08 PM Denis Nikitin  wrote:
> >
> > Hi Mark,
> >
> > Thank you for a quick response.
> >
> > On Tue, Sep 20, 2022 at 2:34 AM Marc Zyngier  wrote:
> > >
> > > Hi Denis,
> > >
> > > On Tue, 20 Sep 2022 09:20:05 +0100,
> > > Denis Nikitin  wrote:
> > > >
> > > > Kernel build with -fprofile-sample-use raises the following failure:
> > > >
> > > > error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL
> > > > section ".rel.llvm.call-graph-profile"
> > >
> > > How is this flag provided? I don't see any occurrence of it in the
> > > kernel so far.
> >
> > On ChromeOS we build the kernel with sample profiles by adding
> > -fprofile-sample-use=/path/to/gcov.profile to KCFLAGS.
> >
> > >
> > > >
> > > > SHT_REL is generated by the latest lld, see
> > > > https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086.
> > >
> > > Is this part of a released toolchain? If so, can you spell out the
> > > first version where this occurs?
> >
> > Yes, it was added in llvm-13. I will update the patch.
> >
> > >
> > > > Disable profile optimization in kvm/nvhe to fix the build with
> > > > AutoFDO.
> > >
> > > It'd be good to at least mention how AutoFDO and -fprofile-sample-use
> > > relate to each other.
> >
> > Good point. AutoFDO is an example of sample profiles.
> > It's not actually relevant to the bug. I'd better remove it.
> >
> > >
> > > >
> > > > Signed-off-by: Denis Nikitin 
> > > > ---
> > > >  arch/arm64/kvm/hyp/nvhe/Makefile | 3 +++
> > > >  1 file changed, 3 insertions(+)
> > > >
> > > > diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> > > > index b5c5119c7396..6a6188374a52 100644
> > > > --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> > > > +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> > > > @@ -89,6 +89,9 @@ quiet_cmd_hypcopy = HYPCOPY $@
> > > >  # Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
> > > >  # This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
> > > >  KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
> > > > +# Profile optimization creates SHT_REL section '.llvm.call-graph-profile' for
> > > > +# the hot code. SHT_REL is currently not supported by the KVM tools.
> > >
> > > 'KVM tools' seems vague. Maybe call out the actual helper that
> > > processes the relocations?
> >
> > Agreed.
> >
> > >
> > > > +KBUILD_CFLAGS += $(call cc-option,-fno-profile-sample-use,-fno-profile-use)
> > >
> > > Why adding these options instead of filtering out the offending option
> > > as it is done just above?
> >
> > That was actually the alternative solution and it worked as well.
> > Let me double check if profile optimization doesn't mess up with other
> > sections and if it doesn't I will remove the '.llvm.call-graph-profile'
> > section instead.
> 
> When I remove the '.llvm.call-graph-profile' section the layout of other
> sections slightly changes (offsets and sizes) compared to
> `-fno-profile-sample-use`. But the list of sections remains the same.

If this method works well enough, I'd rather we stick to it, instead
of having two ways to disable this sort of thing.

> > > Also, is this the only place the kernel fails to compile? The EFI stub
> > > does similar things AFAIR, and could potentially fail the same way.
> >
> > This was the only place in 5.15 where we tested it.
> > Let me see if EFI has this section.
> 
> EFI code is not marked as hot in the profile.
> 
> Regarding "could potentially fail", I don't see any explicit manipulations
> with code sections in EFI.
> The hardcoded EFI stub entries should not be affected.

I was more worried by the runtime relocation that the EFI stub
performs for the kernel, but if you've checked that already, that
works for me.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH kernel v3] KVM: PPC: Make KVM_CAP_IRQFD_RESAMPLE support platform dependent

2022-09-21 Thread Marc Zyngier
On Wed, 21 Sep 2022 05:49:25 +0100,
Alexey Kardashevskiy  wrote:
> 
> When introduced, IRQFD resampling worked on POWER8 with XICS. However
> KVM on POWER9 has never implemented it - the compatibility mode code
> ("XICS-on-XIVE") misses the kvm_notify_acked_irq() call and the native
> XIVE mode does not handle INTx in KVM at all.
> 
> This moves the capability support advertising to platforms and stops
> advertising it on XIVE, i.e. POWER9 and later.
> 
> This stops advertising the capability on MIPS and RISC-V as these
> do not select HAVE_KVM_IRQFD and do not implement IRQFD resampling
> anyway.

This paragraph makes no sense. Not selecting HAVE_KVM_IRQFD, by
definition, prevents the advertising of the capability. Hell, you are
even removing it from a block guarded by "#ifdef CONFIG_HAVE_KVM_IRQFD".

> 
> Signed-off-by: Alexey Kardashevskiy 
> Acked-by: Nicholas Piggin 
> ---
> Changes:
> v3:
> * removed all ifdeferry
> * removed the capability for MIPS and RISCV
> * adjusted the commit log about MIPS and RISCV
> 
> v2:
> * removed ifdef for ARM64.
> ---
>  arch/arm64/kvm/arm.c   | 1 +
>  arch/powerpc/kvm/powerpc.c | 6 ++
>  arch/s390/kvm/kvm-s390.c   | 1 +
>  arch/x86/kvm/x86.c | 1 +
>  virt/kvm/kvm_main.c| 1 -
>  5 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 2ff0ef62abad..d2daa4d375b5 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -218,6 +218,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   case KVM_CAP_VCPU_ATTRIBUTES:
>   case KVM_CAP_PTP_KVM:
>   case KVM_CAP_ARM_SYSTEM_SUSPEND:
> + case KVM_CAP_IRQFD_RESAMPLE:
>   r = 1;
>   break;
>   case KVM_CAP_SET_GUEST_DEBUG2:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index fb1490761c87..908ce8bd91c9 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -593,6 +593,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   break;
>  #endif
>  
> +#ifdef CONFIG_HAVE_KVM_IRQFD
> + case KVM_CAP_IRQFD_RESAMPLE:
> + r = !xive_enabled();
> + break;
> +#endif
> +
>   case KVM_CAP_PPC_ALLOC_HTAB:
>   r = hv_enabled;
>   break;
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index edfd4bbd0cba..7521adadb81b 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -577,6 +577,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   case KVM_CAP_SET_GUEST_DEBUG:
>   case KVM_CAP_S390_DIAG318:
>   case KVM_CAP_S390_MEM_OP_EXTENSION:
> + case KVM_CAP_IRQFD_RESAMPLE:
>   r = 1;
>   break;
>   case KVM_CAP_SET_GUEST_DEBUG2:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 43a6a7efc6ec..2d6c5a8fdf14 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4395,6 +4395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   case KVM_CAP_VAPIC:
>   case KVM_CAP_ENABLE_CAP:
>   case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
> + case KVM_CAP_IRQFD_RESAMPLE:
>   r = 1;
>   break;
>   case KVM_CAP_EXIT_HYPERCALL:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 584a5bab3af3..05cf94013f02 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4447,7 +4447,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  #endif
>  #ifdef CONFIG_HAVE_KVM_IRQFD
>   case KVM_CAP_IRQFD:
> - case KVM_CAP_IRQFD_RESAMPLE:
>  #endif

Do you see what I mean?

>   case KVM_CAP_IOEVENTFD_ANY_LENGTH:
>   case KVM_CAP_CHECK_EXTENSION_VM:

So, with the nonsensical paragraph removed from the commit log:

Acked-by: Marc Zyngier 

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH kernel v2] KVM: PPC: Make KVM_CAP_IRQFD_RESAMPLE support platform dependent

2022-09-21 Thread Marc Zyngier
On Tue, 20 Sep 2022 22:46:21 +0100,
Alexey Kardashevskiy  wrote:
> 
> 
> 
> On 21/09/2022 02:08, Marc Zyngier wrote:
> > On Tue, 20 Sep 2022 13:51:43 +0100,
> > Alexey Kardashevskiy  wrote:
> >> 
> >> When introduced, IRQFD resampling worked on POWER8 with XICS. However
> >> KVM on POWER9 has never implemented it - the compatibility mode code
> >> ("XICS-on-XIVE") misses the kvm_notify_acked_irq() call and the native
> >> XIVE mode does not handle INTx in KVM at all.
> >> 
> >> This moved the capability support advertising to platforms and stops
> >> advertising it on XIVE, i.e. POWER9 and later.
> >> 
> >> Signed-off-by: Alexey Kardashevskiy 
> >> Acked-by: Nicholas Piggin 
> >> [For KVM RISC-V]
> >> Acked-by: Anup Patel 
> >> ---
> >> Changes:
> >> v2:
> >> * removed ifdef for ARM64.
> > 
> > The same argument applies to both x86 and s390, which do select
> > HAVE_KVM_IRQFD from the KVM config option. Only power allows this
> > option to be selected depending on the underlying interrupt controller
> > emulation.
> > 
> > As for riscv and mips, they don't select HAVE_KVM_IRQFD, and this
> > isn't a user-selectable option. So why do they get patched at all?
> 
> Before the patch, the capability was advertised on those, with your
> proposal it will stop.

No, they were never advertised, since none of these architectures
select CONFIG_HAVE_KVM_IRQFD, and this option is not selectable by the
user.

> Which is a change in behavior which some may
> say requires a separate patch (like, one per platform). I am
> definitely overthinking it though and you are right.

Well, either there is no change in behaviour (and therefore I am
right), or there is one (and I am wrong).

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 2/7] arm64/fpsimd: Track the saved FPSIMD state type separately to TIF_SVE

2022-09-20 Thread Marc Zyngier
On Tue, 20 Sep 2022 19:09:15 +0100,
Mark Brown  wrote:
> 
> On Tue, Sep 20, 2022 at 06:14:13PM +0100, Marc Zyngier wrote:
> > Mark Brown  wrote:
> 
> > > When we save the state for the floating point registers this can be done
> > > in the form visible through either the FPSIMD V registers or the SVE Z and
> > > P registers. At present we track which format is currently used based on
> > > TIF_SVE and the SME streaming mode state but particularly in the SVE case
> > > this limits our options for optimising things, especially around syscalls.
> > > Introduce a new enum in thread_struct which explicitly states which format
> > > is active and keep it up to date when we change it.
> 
> > > At present we do not use this state except to verify that it has the
> > > expected value when loading the state, future patches will introduce
> > > functional changes.
> 
> > > + enum fp_state fp_type;
> 
> > Is it a state or a type? Some consistency would help. Also, what does
> 
> We can bikeshed this either way - the state currently stored is
> of a particular type.  I'll probably go for type.

Then please do it consistently. At the moment, this is a bizarre mix
of the two, and this is already hard enough to reason about this that
we don't need extra complexity!

> 
> > this represent? Your commit message keeps talking about the FP/SVE
> > state for the host, but this is obviously a guest-related structure.
> > How do the two relate?
> 
> The commit message talks about saving the floating point state in
> general which is something we do for both the host and the guest.
> The optimisation cases I am focusing on right now are more on
> host usage but the complexity with tracking that currently blocks
> them crosses both host and guest, indeed the biggest improvement
> overall is probably that tracking the guest state stops requiring
> us to fiddle with the host task's state which to me at least
> makes things clearer.

At least for the KVM part, I want a clear comment explaining what this
tracks and how this is used, because at the moment, I'm only guessing.
And I've had enough guessing with this code...
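
A sketch of the sort of comment I mean (wording is mine, only an
assumption about what the series intends):

	/*
	 * Format in which the guest's FP registers were last saved:
	 * FP_STATE_FPSIMD (V regs in ctxt.fp_regs only) or
	 * FP_STATE_SVE (full Z/P/FFR in sve_state).
	 */
	enum fp_state fp_type;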

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 5/7] arm64/fpsimd: Load FP state based on recorded data type

2022-09-20 Thread Marc Zyngier
On Mon, 15 Aug 2022 23:55:27 +0100,
Mark Brown  wrote:
> 
> Now that we are recording the type of floating point register state we
> are saving when we save it we can use that information when we load to
> decide which register state is required and bring the TIF_SVE state into
> sync with the loaded register state.

Really, this sentence makes zero sense to me. Please at least add some
punctuation, because the only words that spring to mind here are "DOES
NOT COMPUTE".

> 
> The SME state details are already recorded directly in the saved
> SVCR and handled based on the information there.
> 
> Since we are not changing any of the save paths there should be no
> functional change from this patch, further patches will make use of this
> to optimise and clarify the code.
> 
> Signed-off-by: Mark Brown 
> ---
>  arch/arm64/kernel/fpsimd.c | 39 ++
>  1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index aaea2dc02cbd..4096530dd4c6 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -392,11 +392,36 @@ static void task_fpsimd_load(void)
>   WARN_ON(!system_supports_fpsimd());
>   WARN_ON(!have_cpu_fpsimd_context());
>  
> - /* Check if we should restore SVE first */
> - if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) {
> - sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1);
> - restore_sve_regs = true;
> - restore_ffr = true;
> + if (system_supports_sve()) {
> + switch (current->thread.fp_type) {
> + case FP_STATE_FPSIMD:
> + /* Stop tracking SVE for this task until next use. */
> + if (test_and_clear_thread_flag(TIF_SVE))
> + sve_user_disable();
> + break;
> + case FP_STATE_SVE:
> + if (!thread_sm_enabled(&current->thread) &&
> + !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)))
> + sve_user_enable();
> +
> + if (test_thread_flag(TIF_SVE))
> +         sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1);
> +
> + restore_sve_regs = true;
> + restore_ffr = true;
> + break;
> + default:
> + /*
> +  * This should never happen, we should always
> +  * record what we saved when we save. We
> +  * always at least have the memory allocated
> +  * for FPSIMD registers so try that and hope
> +  * for the best.
> +  */
> + WARN_ON_ONCE(1);
> + clear_thread_flag(TIF_SVE);
> + break;

What makes it impossible for FP_STATE_TASK to reach this point? If
that's indeed an impossible case, please document it.
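
One way to make the invariant explicit (sketch only, assuming
fp_type is only ever assigned the format that was actually saved,
so FP_STATE_TASK can never be stored in it):

	case FP_STATE_TASK:
		/* to_save-only value, never stored in fp_type */
		fallthrough;
	default:
		WARN_ON_ONCE(1);
		clear_thread_flag(TIF_SVE);
		break;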

> + }
>   }
>  
>   /* Restore SME, override SVE register configuration if needed */
> @@ -412,10 +437,8 @@ static void task_fpsimd_load(void)
>   if (thread_za_enabled(&current->thread))
>   za_load_state(current->thread.za_state);
>  
> - if (thread_sm_enabled(&current->thread)) {
> - restore_sve_regs = true;
> + if (thread_sm_enabled(&current->thread))
>   restore_ffr = system_supports_fa64();
> - }
>   }
>  
>   if (restore_sve_regs) {

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 4/7] arm64/fpsimd: Stop using TIF_SVE to manage register saving in KVM

2022-09-20 Thread Marc Zyngier
On Mon, 15 Aug 2022 23:55:26 +0100,
Mark Brown  wrote:
> 
> Now that we are explicitly telling the host FP code which register state
> it needs to save we can remove the manipulation of TIF_SVE from the KVM
> code, simplifying it and allowing us to optimise our handling of normal
> tasks. Remove the manipulation of TIF_SVE from KVM and instead rely on
> to_save to ensure we save the correct data for it.
> 
> Signed-off-by: Mark Brown 
> ---
>  arch/arm64/kernel/fpsimd.c | 22 --
>  arch/arm64/kvm/fpsimd.c|  3 ---
>  2 files changed, 4 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 7be20ced2c45..aaea2dc02cbd 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -436,8 +436,8 @@ static void task_fpsimd_load(void)
>   * last, if KVM is involved this may be the guest VM context rather
>   * than the host thread for the VM pointed to by current. This means
>   * that we must always reference the state storage via last rather
> - * than via current, other than the TIF_ flags which KVM will
> - * carefully maintain for us.
> + * than via current, if we are saving KVM state then it will have
> + * ensured that the type of registers to save is set in last->to_save.
>   */
>  static void fpsimd_save(void)
>  {
> @@ -454,27 +454,13 @@ static void fpsimd_save(void)
>   if (test_thread_flag(TIF_FOREIGN_FPSTATE))
>   return;
>  
> - if (test_thread_flag(TIF_SVE)) {
> + if ((last->to_save == FP_STATE_TASK && test_thread_flag(TIF_SVE)) ||
> + last->to_save == FP_STATE_SVE) {
>   save_sve_regs = true;
>   save_ffr = true;
>   vl = last->sve_vl;
>   }
>  
> - /*
> -  * For now we're just validating that the requested state is
> -  * consistent with what we'd otherwise work out.
> -  */
> - switch (last->to_save) {
> - case FP_STATE_TASK:
> - break;
> - case FP_STATE_FPSIMD:
> - WARN_ON_ONCE(save_sve_regs);
> - break;
> - case FP_STATE_SVE:
> - WARN_ON_ONCE(!save_sve_regs);
> - break;
> - }
> -

Given how short-lived this code is, consider dropping it altogether.
Actually, the previous patch would make a lot more sense if it was
merged with this one.

>   if (system_supports_sme()) {
>   u64 *svcr = last->svcr;
>  
> diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
> index db0b2bacaeb8..8a79823fce68 100644
> --- a/arch/arm64/kvm/fpsimd.c
> +++ b/arch/arm64/kvm/fpsimd.c
> @@ -151,7 +151,6 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
>>arch.fp_type, fp_type);
>  
>   clear_thread_flag(TIF_FOREIGN_FPSTATE);
> - update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu));
>   }
>  }
>  
> @@ -208,7 +207,5 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
>   sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0);
>   }
>  
> - update_thread_flag(TIF_SVE, 0);
> -
>   local_irq_restore(flags);
>  }

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 3/7] arm64/fpsimd: Have KVM explicitly say which FP registers to save

2022-09-20 Thread Marc Zyngier
On Mon, 15 Aug 2022 23:55:25 +0100,
Mark Brown  wrote:
> 
> In order to avoid needlessly saving and restoring the guest registers KVM
> relies on the host FPSIMD code to save the guest registers when we context
> switch away from the guest. This is done by binding the KVM guest state to
> the CPU on top of the task state that was originally there, then carefully
> managing the TIF_SVE flag for the task to cause the host to save the full
> SVE state when needed regardless of the needs of the host task. This works
> well enough but isn't terribly direct about what is going on and makes it
> much more complicated to try to optimise what we're doing with the SVE
> register state.
> 
> Let's instead have KVM pass in the register state it wants saving when it
> binds to the CPU. We introduce a new FP_TYPE_TASK for use during normal
> task binding to indicate that we should base our decisions on the current
> task. In order to ease any future debugging that might be required this
> patch does not actually update any of the decision making about what to
> save, it merely starts tracking the new information and warns if the
> requested state is not what we would otherwise have decided to save.
> 
> Signed-off-by: Mark Brown 
> ---
>  arch/arm64/include/asm/fpsimd.h|  3 ++-
>  arch/arm64/include/asm/processor.h |  1 +
>  arch/arm64/kernel/fpsimd.c | 20 +++-
>  arch/arm64/kvm/fpsimd.c|  9 -
>  4 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index b74103a79052..21a1dd320ca5 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -61,7 +61,8 @@ extern void fpsimd_kvm_prepare(void);
>  extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
>void *sve_state, unsigned int sve_vl,
>void *za_state, unsigned int sme_vl,
> -  u64 *svcr, enum fp_state *type);
> +  u64 *svcr, enum fp_state *type,
> +  enum fp_state to_save);
>  
>  extern void fpsimd_flush_task_state(struct task_struct *target);
>  extern void fpsimd_save_and_flush_cpu_state(void);
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 4818a6b77f39..89c248b8d4ba 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -123,6 +123,7 @@ enum vec_type {
>  };
>  
>  enum fp_state {
> + FP_STATE_TASK,  /* Save based on current, invalid as fp_type */

How is that related to the FP_TYPE_TASK in the commit message? What
does this 'invalid as fp_type' mean?

>   FP_STATE_FPSIMD,
>   FP_STATE_SVE,
>  };
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 6544ae00230f..7be20ced2c45 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -126,6 +126,7 @@ struct fpsimd_last_state_struct {
>   unsigned int sve_vl;
>   unsigned int sme_vl;
>   enum fp_state *fp_type;
> + enum fp_state to_save;
>  };
>  
>  static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
> @@ -459,6 +460,21 @@ static void fpsimd_save(void)
>   vl = last->sve_vl;
>   }
>  
> + /*
> +  * For now we're just validating that the requested state is
> +  * consistent with what we'd otherwise work out.

Nit: work out? or worked out? the "we'd" doesn't help disambiguate it
for a non-native speaker.

> +  */
> + switch (last->to_save) {
> + case FP_STATE_TASK:
> + break;
> + case FP_STATE_FPSIMD:
> + WARN_ON_ONCE(save_sve_regs);
> + break;
> + case FP_STATE_SVE:
> + WARN_ON_ONCE(!save_sve_regs);
> + break;
> + }
> +
>   if (system_supports_sme()) {
>   u64 *svcr = last->svcr;
>  
> @@ -1693,6 +1709,7 @@ static void fpsimd_bind_task_to_cpu(void)
>   last->sme_vl = task_get_sme_vl(current);
> + last->svcr = &current->thread.svcr;
> + last->fp_type = &current->thread.fp_type;
> + last->to_save = FP_STATE_TASK;
>   current->thread.fpsimd_cpu = smp_processor_id();
>  
>   /*
> @@ -1717,7 +1734,7 @@ static void fpsimd_bind_task_to_cpu(void)
>  void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
> unsigned int sve_vl, void *za_state,
> unsigned int sme_vl, u64 *svcr,
> -   enum fp_state *type)
> +   enum fp_state *type, enum fp_state to_save)

OK, how many discrete arguments are we going to pass to this function,
most of which are part of the vcpu structure? It really feels like
what you want is a getter for the per-cpu structure, and let the KVM
code do the actual business. If this function was supposed to provide
some 

Re: [PATCH v3 2/7] arm64/fpsimd: Track the saved FPSIMD state type separately to TIF_SVE

2022-09-20 Thread Marc Zyngier
On Mon, 15 Aug 2022 23:55:24 +0100,
Mark Brown  wrote:
> 
> When we save the state for the floating point registers this can be done
> in the form visible through either the FPSIMD V registers or the SVE Z and
> P registers. At present we track which format is currently used based on
> TIF_SVE and the SME streaming mode state but particularly in the SVE case
> this limits our options for optimising things, especially around syscalls.
> Introduce a new enum in thread_struct which explicitly states which format
> is active and keep it up to date when we change it.
> 
> At present we do not use this state except to verify that it has the
> expected value when loading the state, future patches will introduce
> functional changes.
> 
> Signed-off-by: Mark Brown 
> ---
>  arch/arm64/include/asm/fpsimd.h|  2 +-
>  arch/arm64/include/asm/kvm_host.h  |  1 +
>  arch/arm64/include/asm/processor.h |  6 
>  arch/arm64/kernel/fpsimd.c | 58 ++
>  arch/arm64/kernel/process.c|  2 ++
>  arch/arm64/kernel/ptrace.c |  3 ++
>  arch/arm64/kernel/signal.c |  7 +++-
>  arch/arm64/kvm/fpsimd.c|  3 +-
>  8 files changed, 64 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index c07e4abaca3d..b74103a79052 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -61,7 +61,7 @@ extern void fpsimd_kvm_prepare(void);
>  extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
>void *sve_state, unsigned int sve_vl,
>void *za_state, unsigned int sme_vl,
> -  u64 *svcr);
> +  u64 *svcr, enum fp_state *type);
>  
>  extern void fpsimd_flush_task_state(struct task_struct *target);
>  extern void fpsimd_save_and_flush_cpu_state(void);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index f38ef299f13b..ebd37f97aeb4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -310,6 +310,7 @@ struct kvm_vcpu_arch {
>   void *sve_state;
>   unsigned int sve_max_vl;
>   u64 svcr;
> + enum fp_state fp_type;

Is it a state or a type? Some consistency would help. Also, what does
this represent? Your commit message keeps talking about the FP/SVE
state for the host, but this is obviously a guest-related structure.
How do the two relate?

Finally, before this patch, pahole shows this:

struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt; /* 0  1824 */
/* --- cacheline 28 boundary (1792 bytes) was 32 bytes ago --- */
void * sve_state;/*  1824 8 */
unsigned int   sve_max_vl;   /*  1832 4 */

/* XXX 4 bytes hole, try to pack */

u64svcr; /*  1840 8 */
struct kvm_s2_mmu *hw_mmu;   /*  1848 8 */
[...]
};

After it, we gain an extra hole:

struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt; /* 0  1824 */
/* --- cacheline 28 boundary (1792 bytes) was 32 bytes ago --- */
void * sve_state;/*  1824 8 */
unsigned int   sve_max_vl;   /*  1832 4 */

/* XXX 4 bytes hole, try to pack */

u64svcr; /*  1840 8 */
enum fp_state  fp_type;  /*  1848 4 */

/* XXX 4 bytes hole, try to pack */

/* --- cacheline 29 boundary (1856 bytes) --- */
struct kvm_s2_mmu *hw_mmu;   /*  1856 8 */
[...]
};

Packing things wouldn't hurt.
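
For instance (sketch only), declaring the new field right after
sve_max_vl would reuse the pre-existing 4-byte hole instead of
creating a second one:

	void *sve_state;
	unsigned int sve_max_vl;
	enum fp_state fp_type;	/* fits in the existing hole */
	u64 svcr;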

>  
>   /* Stage 2 paging state used by the hardware on next switch */
>   struct kvm_s2_mmu *hw_mmu;
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 86eb0bfe3b38..4818a6b77f39 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -122,6 +122,11 @@ enum vec_type {
>   ARM64_VEC_MAX,
>  };
>  
> +enum fp_state {
> + FP_STATE_FPSIMD,
> + FP_STATE_SVE,
> +};
> +
>  struct cpu_context {
>   unsigned long x19;
>   unsigned long x20;
> @@ -152,6 +157,7 @@ struct thread_struct {
>   struct user_fpsimd_state fpsimd_state;
>   } uw;
>  
> + enum fp_state   fp_type;/* registers FPSIMD or SVE? */

Same comment about the state vs type.

>   unsigned intfpsimd_cpu;
>   void*sve_state; /* SVE registers, if any */
>   void*za_state;  /* ZA register, if any */
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 

Re: [PATCH v3 1/7] KVM: arm64: Discard any SVE state when entering KVM guests

2022-09-20 Thread Marc Zyngier
On Mon, 15 Aug 2022 23:55:23 +0100,
Mark Brown  wrote:
> 
> Since 8383741ab2e773a99 (KVM: arm64: Get rid of host SVE tracking/saving)
> KVM has not tracked the host SVE state, relying on the fact that we
> currently disable SVE whenever we perform a syscall. This may not be true
> in future since performance optimisation may result in us keeping SVE
> enabled in order to avoid needing to take access traps to reenable it.
> Handle this by clearing TIF_SVE and converting the stored task state to
> FPSIMD format when preparing to run the guest.  This is done with a new
> call fpsimd_kvm_prepare() to keep the direct state manipulation
> functions internal to fpsimd.c.
> 
> Signed-off-by: Mark Brown 
> ---
>  arch/arm64/include/asm/fpsimd.h |  1 +
>  arch/arm64/kernel/fpsimd.c  | 23 +++
>  arch/arm64/kvm/fpsimd.c |  3 ++-
>  3 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index 6f86b7ab6c28..c07e4abaca3d 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -56,6 +56,7 @@ extern void fpsimd_signal_preserve_current_state(void);
>  extern void fpsimd_preserve_current_state(void);
>  extern void fpsimd_restore_current_state(void);
>  extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
> +extern void fpsimd_kvm_prepare(void);
>  
>  extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
>void *sve_state, unsigned int sve_vl,
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 23834d96d1e7..549e11645e0f 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -1627,6 +1627,29 @@ void fpsimd_signal_preserve_current_state(void)
>   sve_to_fpsimd(current);
>  }
>  
> +/*
> + * Called by KVM when entering the guest.
> + */
> +void fpsimd_kvm_prepare(void)
> +{
> + if (!system_supports_sve())
> + return;
> +
> + /*
> +  * KVM does not save host SVE state since we can only enter
> +  * the guest from a syscall so the ABI means that only the
> +  * non-saved SVE state needs to be saved.  If we have left
> +  * SVE enabled for performance reasons then update the task
> +  * state to be FPSIMD only.
> +  */
> + get_cpu_fpsimd_context();
> +
> + if (test_and_clear_thread_flag(TIF_SVE))
> + sve_to_fpsimd(current);
> +
> + put_cpu_fpsimd_context();
> +}
> +
>  /*
>   * Associate current's FPSIMD context with this cpu
>   * The caller must have ownership of the cpu FPSIMD context before calling
> diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
> index ec8e4494873d..1c1b309ef420 100644
> --- a/arch/arm64/kvm/fpsimd.c
> +++ b/arch/arm64/kvm/fpsimd.c
> @@ -75,7 +75,8 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
>  void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
>  {
>   BUG_ON(!current->mm);
> - BUG_ON(test_thread_flag(TIF_SVE));
> +
> + fpsimd_kvm_prepare();

Why is this *before* the check against system_supports_fpsimd()? I
don't think the architecture allows SVE without FP, for obvious
reasons...
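
i.e. something like this (sketch):

	BUG_ON(!current->mm);

	if (!system_supports_fpsimd())
		return;

	fpsimd_kvm_prepare();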

>  
>   if (!system_supports_fpsimd())
>   return;

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 3/7] mm: Add PG_arch_3 page flag

2022-09-20 Thread Marc Zyngier
On Tue, 20 Sep 2022 16:39:47 +0100,
Catalin Marinas  wrote:
> 
> On Mon, Sep 19, 2022 at 07:12:53PM +0100, Marc Zyngier wrote:
> > On Mon, 05 Sep 2022 18:01:55 +0100,
> > Catalin Marinas  wrote:
> > > Peter, please let me know if you want to pick this series up together
> > > with your other KVM patches. Otherwise I can post it separately, it's
> > > worth merging it on its own as it clarifies the page flag vs tag setting
> > > ordering.
> > 
> > I'm looking at queuing this, but I'm confused by this comment. Do I
> > need to pick this as part of the series? Or is this an independent
> > thing (my hunch is that it is actually required not to break other
> > architectures...).
> 
> This series series (at least the first patches) won't apply cleanly on
> top of 6.0-rc1 and, of course, we shouldn't break other architectures. I
> can repost the whole series but I don't have the setup to test the
> MAP_SHARED KVM option (unless Peter plans to post it soon).

I don't feel brave enough to take a series affecting all architectures
so late in the game, and the whole thing had very little arm64
exposure. The latest QEMU doesn't seem to work anymore, so I don't
have any MTE-capable emulation (and using the FVP remotely is a pain
in the proverbial neck).

I'll come back to this after the merge window, should Peter decide to
respin the series.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH kernel v2] KVM: PPC: Make KVM_CAP_IRQFD_RESAMPLE support platform dependent

2022-09-20 Thread Marc Zyngier
On Tue, 20 Sep 2022 13:51:43 +0100,
Alexey Kardashevskiy  wrote:
> 
> When introduced, IRQFD resampling worked on POWER8 with XICS. However
> KVM on POWER9 has never implemented it - the compatibility mode code
> ("XICS-on-XIVE") misses the kvm_notify_acked_irq() call and the native
> XIVE mode does not handle INTx in KVM at all.
> 
> This moved the capability support advertising to platforms and stops
> advertising it on XIVE, i.e. POWER9 and later.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Acked-by: Nicholas Piggin 
> [For KVM RISC-V]
> Acked-by: Anup Patel 
> ---
> Changes:
> v2:
> * removed ifdef for ARM64.

The same argument applies to both x86 and s390, which do select
HAVE_KVM_IRQFD from the KVM config option. Only power allows this
option to be selected depending on the underlying interrupt controller
emulation.

As for riscv and mips, they don't select HAVE_KVM_IRQFD, and this
isn't a user-selectable option. So why do they get patched at all?

My conclusion is that:

- only power needs the #ifdefery in the arch-specific code
- arm64, s390 and x86 can use KVM_CAP_IRQFD_RESAMPLE without a #ifdef
- mips and riscv should be left alone

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH] KVM: arm64: nvhe: Disable profile optimization

2022-09-20 Thread Marc Zyngier
Hi Denis,

On Tue, 20 Sep 2022 09:20:05 +0100,
Denis Nikitin  wrote:
> 
> Kernel build with -fprofile-sample-use raises the following failure:
> 
> error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL
> section ".rel.llvm.call-graph-profile"

How is this flag provided? I don't see any occurrence of it in the
kernel so far.

> 
> SHT_REL is generated by the latest lld, see
> https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086.

Is this part of a released toolchain? If so, can you spell out the
first version where this occurs?

> Disable profile optimization in kvm/nvhe to fix the build with
> AutoFDO.

It'd be good to at least mention how AutoFDO and -fprofile-sample-use
relate to each other.

> 
> Signed-off-by: Denis Nikitin 
> ---
>  arch/arm64/kvm/hyp/nvhe/Makefile | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index b5c5119c7396..6a6188374a52 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -89,6 +89,9 @@ quiet_cmd_hypcopy = HYPCOPY $@
>  # Remove ftrace, Shadow Call Stack, and CFI CFLAGS.
>  # This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations.
>  KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
> +# Profile optimization creates SHT_REL section '.llvm.call-graph-profile' for
> +# the hot code. SHT_REL is currently not supported by the KVM tools.

'KVM tools' seems vague. Maybe call out the actual helper that
processes the relocations?

> +KBUILD_CFLAGS += $(call cc-option,-fno-profile-sample-use,-fno-profile-use)

Why adding these options instead of filtering out the offending option
as it is done just above?
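
i.e. something like this (untested sketch; the patterns assume the
flags are always passed in their -fprofile-*-use=<file> form):

KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))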

Also, is this the only place the kernel fails to compile? The EFI stub
does similar things AFAIR, and could potentially fail the same way.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 3/7] mm: Add PG_arch_3 page flag

2022-09-19 Thread Marc Zyngier
On Mon, 05 Sep 2022 18:01:55 +0100,
Catalin Marinas  wrote:
> 
> On Thu, Sep 01, 2022 at 06:59:23PM +0100, Catalin Marinas wrote:
> > On Thu, Aug 11, 2022 at 03:16:08PM +0800, kernel test robot wrote:
> > > Thank you for the patch! Perhaps something to improve:
> > > 
> > > [auto build test WARNING on arm64/for-next/core]
> > > [also build test WARNING on linus/master next-20220811]
> > > [cannot apply to kvmarm/next arm/for-next soc/for-next xilinx-xlnx/master 
> > > v5.19]
> > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > And when submitting patch, we suggest to use '--base' as documented in
> > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > > 
> > > url:
> > > https://github.com/intel-lab-lkp/linux/commits/Peter-Collingbourne/KVM-arm64-permit-MAP_SHARED-mappings-with-MTE-enabled/20220811-033310
> > > base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git 
> > > for-next/core
> > > config: loongarch-defconfig 
> > > (https://download.01.org/0day-ci/archive/20220811/202208111500.62e0bl2l-...@intel.com/config)
> > > compiler: loongarch64-linux-gcc (GCC) 12.1.0
> > > reproduce (this is a W=1 build):
> > > wget 
> > > https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross 
> > > -O ~/bin/make.cross
> > > chmod +x ~/bin/make.cross
> > > # 
> > > https://github.com/intel-lab-lkp/linux/commit/1a400517d8428df0ec9f86f8d303b2227ee9702f
> > > git remote add linux-review https://github.com/intel-lab-lkp/linux
> > > git fetch --no-tags linux-review 
> > > Peter-Collingbourne/KVM-arm64-permit-MAP_SHARED-mappings-with-MTE-enabled/20220811-033310
> > > git checkout 1a400517d8428df0ec9f86f8d303b2227ee9702f
> > > # save the config file
> > > mkdir build_dir && cp config build_dir/.config
> > > COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross 
> > > W=1 O=build_dir ARCH=loongarch SHELL=/bin/bash
> > > 
> > > If you fix the issue, kindly add following tag where applicable
> > > Reported-by: kernel test robot 
> > > 
> > > All warnings (new ones prefixed by >>):
> > > 
> > > >> mm/memory.c:92:2: warning: #warning Unfortunate NUMA and NUMA 
> > > >> Balancing config, growing page-frame for last_cpupid. [-Wcpp]
> > >   92 | #warning Unfortunate NUMA and NUMA Balancing config, growing 
> > > page-frame for last_cpupid.
> > >  |  ^~~
> > > 
> > > 
> > > vim +92 mm/memory.c
> > > 
> > > 42b7772812d15b Jan Beulich2008-07-23  90  
> > > af27d9403f5b80 Arnd Bergmann  2018-02-16  91  #if 
> > > defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
> > > 90572890d20252 Peter Zijlstra 2013-10-07 @92  #warning Unfortunate NUMA 
> > > and NUMA Balancing config, growing page-frame for last_cpupid.
> > > 75980e97daccfc Peter Zijlstra 2013-02-22  93  #endif
> > > 75980e97daccfc Peter Zijlstra 2013-02-22  94  
> > 
> > It looks like ith CONFIG_NUMA_BALANCING=y on loongarch we run out of
> > spare bits in page->flags to fit last_cpupid. The reason we don't see it
> > on arm64 is that we select SPARSEMEM_VMEMMAP and SECTIONS_WIDTH becomes
> > 0. On loongarch SECTIONS_WIDTH takes 19 bits (48 - 29) in page->flags.
> > 
> > I think instead of always defining PG_arch_{2,3} if CONFIG_64BIT, we
> > could add a CONFIG_ARCH_WANTS_PG_ARCH_23 option and only select it on
> > arm64 for the time being.
> 
> I pushed a patch as the first one on the arm64 devel/mte-pg-flags
> branch. Also updated the last patch on this branch following Steven's
> comments.
> 
> Peter, please let me know if you want to pick this series up together
> with your other KVM patches. Otherwise I can post it separately, it's
> worth merging it on its own as it clarifies the page flag vs tag setting
> ordering.

I'm looking at queuing this, but I'm confused by this comment. Do I
need to pick this as part of the series? Or is this an independent
thing (my hunch is that it is actually required not to break other
architectures...).

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


[GIT PULL] KVM/arm64 fixes for 6.0, take #2

2022-09-19 Thread Marc Zyngier
Paolo,

Here's the last KVM/arm64 pull request for this cycle, with
a small fix for pKVM and kmemleak.

Please pull,

M.

The following changes since commit 1c23f9e627a7b412978b4e852793c5e3c3efc555:

  Linux 6.0-rc2 (2022-08-21 17:32:54 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git 
tags/kvmarm-fixes-6.0-2

for you to fetch changes up to 522c9a64c7049f50c7b1299741c13fac3f231cd4:

  KVM: arm64: Use kmemleak_free_part_phys() to unregister hyp_mem_base 
(2022-09-19 17:59:48 +0100)


KVM/arm64 fixes for 6.0, take #2

- Fix kmemleak usage in Protected KVM (again)


Zenghui Yu (1):
  KVM: arm64: Use kmemleak_free_part_phys() to unregister hyp_mem_base

 arch/arm64/kvm/arm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


Re: [PATCH v6 00/13] KVM: selftests: Add aarch64/page_fault_test

2022-09-19 Thread Marc Zyngier
On Mon, 19 Sep 2022 17:38:14 +0100,
Sean Christopherson  wrote:
> 
> On Mon, Sep 19, 2022, Marc Zyngier wrote:
> > On Tue, 6 Sep 2022 18:09:17 +, Ricardo Koller wrote:
> > > This series adds a new aarch64 selftest for testing stage 2 fault 
> > > handling for
> > > various combinations of guest accesses (e.g., write, S1PTW), backing 
> > > sources
> > > (e.g., anon), and types of faults (e.g., read on hugetlbfs with a hole, 
> > > write
> > > on a readonly memslot). Each test tries a different combination and then 
> > > checks
> > > that the access results in the right behavior (e.g., uffd faults with the 
> > > right
> > > address and write/read flag). Some interesting combinations are:
> > > 
> > > [...]
> > 
> > Given how long this has been around, I've picked this series up, applying
> > Oliver's fixes in the process.
> 
> Any chance this can be undone?  A big reason why this is at v6 is
> because of the common API changes, and due to KVM Forum I've
> effectively had three working days since this was posted, and others
> have probably had even less, i.e. lack of reviews on v6 isn't
> because no one cares.

Hey, I'm still not back at work, and won't be for another week! But
fair enough, if there is going to be a respin, I'd rather see that
(and I'm less hung up on tests having been in -next for some time
before sending out a PR that eventually reaches Linus).

> It's not the end of the world if we have to fix things up on top,
> but we'd avoid a decent amount of churn if we can instead unwind and
> do a v7.

No skin off my nose, as this lives on its own topic branch. Now
dropped.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v6 00/13] KVM: selftests: Add aarch64/page_fault_test

2022-09-19 Thread Marc Zyngier
On Tue, 6 Sep 2022 18:09:17 +, Ricardo Koller wrote:
> This series adds a new aarch64 selftest for testing stage 2 fault handling for
> various combinations of guest accesses (e.g., write, S1PTW), backing sources
> (e.g., anon), and types of faults (e.g., read on hugetlbfs with a hole, write
> on a readonly memslot). Each test tries a different combination and then 
> checks
> that the access results in the right behavior (e.g., uffd faults with the 
> right
> address and write/read flag). Some interesting combinations are:
> 
> [...]

Given how long this has been around, I've picked this series up, applying
Oliver's fixes in the process. Any additional fixes will be added on top.

Applied to next, thanks!

[01/13] KVM: selftests: Add a userfaultfd library
commit: b9e9f0060d69ace412846b3dda318189e74d80ea
[02/13] KVM: selftests: aarch64: Add virt_get_pte_hva() library function
commit: b3e02786e384d83637fc29dca2af9604a6314e62
[03/13] KVM: selftests: Add missing close and munmap in __vm_mem_region_delete()
commit: 77c071a5376c9fdc0c510138d081af5e6ead754d
[04/13] KVM: selftests: aarch64: Construct DEFAULT_MAIR_EL1 using sysreg.h 
macros
commit: a68c2b6545af95729eb57fe39f173557d1f22649
[05/13] tools: Copy bitfield.h from the kernel sources
commit: 8998ed5834af0a9cc3de8d5bd485576c654620fc
[06/13] KVM: selftests: Stash backing_src_type in struct userspace_mem_region
commit: 20a8d07c0828592d72c756c98f2708e905bfabd3
[07/13] KVM: selftests: Change vm_create() to take struct kvm_vm_mem_params
commit: cb7f9c457b94b2ad71eaf9621a9e7f3e9c3832db
[08/13] KVM: selftests: Use the right memslot for code, page-tables, and data 
allocations
commit: 72ad71ddb5fae4dc26a2fa7e885213598baab9ad
[09/13] KVM: selftests: aarch64: Add aarch64/page_fault_test
commit: fa7b86adf85be5de36a797df7b1509542af1f119
[10/13] KVM: selftests: aarch64: Add userfaultfd tests into page_fault_test
commit: 596fcc0f6888241b0b2f02577a2b608f274b299d
[11/13] KVM: selftests: aarch64: Add dirty logging tests into page_fault_test
commit: 0740d435146f69be9950df5dd45a31fbaec943ba
[12/13] KVM: selftests: aarch64: Add readonly memslot tests into page_fault_test
commit: a9246644455d51faa4063749bb17aa7e060adc70
[13/13] KVM: selftests: aarch64: Add mix of tests into page_fault_test
commit: 383d48a1442ca477732ea77d6231b3cc73b9d7f8

Cheers,

M.


Re: [PATCH v2 0/4] KVM: arm64: Fix bugs of single-step execution enabled by userspace

2022-09-19 Thread Marc Zyngier
On Fri, 16 Sep 2022 18:05:56 -0700, Reiji Watanabe wrote:
> This series fixes two bugs of single-step execution enabled by
> userspace, and add a test case for KVM_GUESTDBG_SINGLESTEP to
> the debug-exception test to verify the single-step behavior.
> 
> Patch 1 fixes a bug that KVM might unintentionally change PSTATE.SS
> for the guest when single-step execution is enabled for the vCPU by
> userspace.
> 
> [...]

Applied to next, thanks!

[1/4] KVM: arm64: Preserve PSTATE.SS for the guest while single-step is enabled
  commit: 34fbdee086cfcc20fe889d2b83afddfbe2ac3096
[2/4] KVM: arm64: Clear PSTATE.SS when the Software Step state was 
Active-pending
  commit: 370531d1e95be57c62fdf065fb04fd8db7ade8f9
[3/4] KVM: arm64: selftests: Refactor debug-exceptions to make it amenable to 
new test cases
  commit: ff00e737090e0f015059e59829aaa58565b16321
[4/4] KVM: arm64: selftests: Add a test case for KVM_GUESTDBG_SINGLESTEP
  commit: b18e4d4aeb05810ceb2f73d7f72afcd11b41

Cheers,

M.
-- 
Marc Zyngier 



Re: [PATCH v2 4/4] KVM: arm64: selftests: Add a test case for KVM_GUESTDBG_SINGLESTEP

2022-09-19 Thread Marc Zyngier
On Sat, 17 Sep 2022 02:06:00 +0100,
Reiji Watanabe  wrote:
> 
> Add a test case for KVM_GUESTDBG_SINGLESTEP to the debug-exceptions test.
> The test enables single-step execution from userspace, and check if the
> exit to userspace occurs for each instruction that is stepped.
> Set the default number of the test iterations to a number of iterations
> sufficient to always reproduce the problem that the previous patch fixes
> on an Ampere Altra machine.

A possibly more aggressive version of this test would be to force a
(short lived) timer to fire on the same CPU, forcing an exit. This
should hopefully result in a more predictable way to trigger the
issue. But that's a reasonable test as a start.
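
Host-side, that could be as simple as arming a periodic timer on the
vcpu thread so it keeps being kicked out of KVM_RUN (untested sketch,
assumes a SIGALRM handler is installed):

	struct itimerval it = {
		.it_value    = { .tv_usec = 100 },
		.it_interval = { .tv_usec = 100 },
	};

	setitimer(ITIMER_REAL, &it, NULL); /* SIGALRM interrupts KVM_RUN */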

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2 1/5] KVM: x86: Introduce KVM_REQ_RING_SOFT_FULL

2022-09-19 Thread Marc Zyngier
On Mon, 19 Sep 2022 00:58:10 +0100,
Gavin Shan  wrote:
> 
> On 9/18/22 7:00 PM, Marc Zyngier wrote:
> > On Fri, 16 Sep 2022 19:09:52 +0100,
> > Peter Xu  wrote:
> >> 
> >> On Fri, Sep 16, 2022 at 12:51:31PM +0800, Gavin Shan wrote:
> >>> This adds KVM_REQ_RING_SOFT_FULL, which is raised when the dirty
> >>> ring of the specific VCPU becomes softly full in kvm_dirty_ring_push().
> >>> The VCPU is enforced to exit when the request is raised and its
> >>> dirty ring is softly full on its entrance.
> >>> 
> >>> Suggested-by: Marc Zyngier 
> >>> Signed-off-by: Gavin Shan 
> >>> ---
> >>>   arch/x86/kvm/x86.c   | 5 +++--
> >>>   include/linux/kvm_host.h | 1 +
> >>>   virt/kvm/dirty_ring.c| 4 
> >>>   3 files changed, 8 insertions(+), 2 deletions(-)
> >>> 
> >>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >>> index 43a6a7efc6ec..7f368f59f033 100644
> >>> --- a/arch/x86/kvm/x86.c
> >>> +++ b/arch/x86/kvm/x86.c
> >>> @@ -10265,8 +10265,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >>>   bool req_immediate_exit = false;
> >>>   /* Forbid vmenter if vcpu dirty ring is soft-full */
> >>> - if (unlikely(vcpu->kvm->dirty_ring_size &&
> >>> -  kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
> >>> + if (kvm_check_request(KVM_REQ_RING_SOFT_FULL, vcpu) &&
> >>> + kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
> >>> + kvm_make_request(KVM_REQ_RING_SOFT_FULL, vcpu);
> >>>   vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> >>>   trace_kvm_dirty_ring_exit(vcpu);
> >>>   r = 0;
> >> 
> >> As commented previously - can we use kvm_test_request() instead? because we
> >> don't want to unconditionally clear the bit.  Instead of making the request
> >> again, we can clear request only if !full.
> > 
> > I have the feeling that this is a micro-optimisation that won't lead
> > to much benefit in practice. You already have the cache line, just not
> > in exclusive mode, and given that this is per-vcpu, you'd only see the
> > cost if someone else is setting a request to this vcpu while
> > evaluating the local requests.
> > 
> > And now you need extra barriers...
> > 
> > Also, can we please refrain from changing things without data showing
> > that this actually is worse than what we had before? The point below
> > makes me think that this is actually beneficial as is.
> > 
> 
> I think Marc's explanation makes sense. It won't make difference in terms
> of performance. We need to explicitly handle barrier when kvm_test_request()
> is used. So I prefer to keep the code if Peter agrees.
> 
> >> We can also safely move this into the block of below kvm_request_pending()
> >> as Marc used to suggest.
> > 
> > This, on the other hand, makes sure that we share the cost across all
> > requests. Requests should be extremely rare anyway (and if they
> > aren't, you have a whole lot of performance issues on your hands
> > anyway).
> > 
> 
> Yeah, We shouldn't have too much requests. I missed the comment from Marc
> to move this chunk to kvm_request_pending(). I will fix it in v3.
> 
> >> 
> >> To explicitly use kvm_clear_request(), we may need to be careful on the
> >> memory barriers.  I'm wondering whether we should have moved
> >> smp_mb__after_atomic() into kvm_clear_request() because kvm_clear_request()
> >> is used outside kvm_check_request() and IIUC all the call sites should
> >> better have that barrier too to be safe.
> >> 
> >> Side note: when I read the code around I also see some mis-use of clear
> >> request where it can be omitted, e.g.:
> >> 
> >>if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
> >>kvm_clear_request(KVM_REQ_UNHALT, vcpu);
> >>vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
> >>}
> >> 
> >> Maybe it's a sign of bad naming, so we should renamed kvm_check_request()
> >> to kvm_test_clear_request() too to show that clearing after that is not
> >> needed?
> > 
> > Yeah, this kvm_clear_request() is superfluous. But this is rather well
> > documented, for once, and I don't think we should repaint it based on
> > a sample of one.
> > 
> 
> Yeah, I 

Re: [PATCH v2 1/5] KVM: x86: Introduce KVM_REQ_RING_SOFT_FULL

2022-09-18 Thread Marc Zyngier
On Fri, 16 Sep 2022 19:09:52 +0100,
Peter Xu  wrote:
> 
> On Fri, Sep 16, 2022 at 12:51:31PM +0800, Gavin Shan wrote:
> > This adds KVM_REQ_RING_SOFT_FULL, which is raised when the dirty
> > ring of the specific VCPU becomes softly full in kvm_dirty_ring_push().
> > The VCPU is enforced to exit when the request is raised and its
> > dirty ring is softly full on its entrance.
> > 
> > Suggested-by: Marc Zyngier 
> > Signed-off-by: Gavin Shan 
> > ---
> >  arch/x86/kvm/x86.c   | 5 +++--
> >  include/linux/kvm_host.h | 1 +
> >  virt/kvm/dirty_ring.c| 4 
> >  3 files changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 43a6a7efc6ec..7f368f59f033 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -10265,8 +10265,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > bool req_immediate_exit = false;
> >  
> > /* Forbid vmenter if vcpu dirty ring is soft-full */
> > -   if (unlikely(vcpu->kvm->dirty_ring_size &&
> > -    kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
> > +   if (kvm_check_request(KVM_REQ_RING_SOFT_FULL, vcpu) &&
> > +   kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
> > +   kvm_make_request(KVM_REQ_RING_SOFT_FULL, vcpu);
> > vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> > trace_kvm_dirty_ring_exit(vcpu);
> > r = 0;
> 
> As commented previously - can we use kvm_test_request() instead? because we
> don't want to unconditionally clear the bit.  Instead of making the request
> again, we can clear request only if !full.

I have the feeling that this is a micro-optimisation that won't lead
to much benefit in practice. You already have the cache line, just not
in exclusive mode, and given that this is per-vcpu, you'd only see the
cost if someone else is setting a request to this vcpu while
evaluating the local requests.

And now you need extra barriers...
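
For reference, kvm_check_request() is already a test-and-clear with
the barrier folded in (abridged from include/linux/kvm_host.h):

	static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
	{
		if (kvm_test_request(req, vcpu)) {
			kvm_clear_request(req, vcpu);
			/* pairs with the barrier in kvm_make_request() */
			smp_mb__after_atomic();
			return true;
		}
		return false;
	}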

Also, can we please refrain from changing things without data showing
that this actually is worse than what we had before? The point below
makes me think that this is actually beneficial as is.

> We can also safely move this into the block of below kvm_request_pending()
> as Marc used to suggest.

This, on the other hand, makes sure that we share the cost across all
requests. Requests should be extremely rare anyway (and if they
aren't, you have a whole lot of performance issues on your hands
anyway).

> 
> To explicitly use kvm_clear_request(), we may need to be careful on the
> memory barriers.  I'm wondering whether we should have moved
> smp_mb__after_atomic() into kvm_clear_request() because kvm_clear_request()
> is used outside kvm_check_request() and IIUC all the call sites should
> better have that barrier too to be safe.
>
> Side note: when I read the code around I also see some mis-use of clear
> request where it can be omitted, e.g.:
> 
>   if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
>   kvm_clear_request(KVM_REQ_UNHALT, vcpu);
>   vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
>   }
> 
> Maybe it's a sign of bad naming, so we should renamed kvm_check_request()
> to kvm_test_clear_request() too to show that clearing after that is not
> needed?

Yeah, this kvm_clear_request() is superfluous. But this is rather well
documented, for once, and I don't think we should repaint it based on
a sample of one.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v3 0/7] KVM: arm64: Use visibility hook to treat ID regs as RAZ

2022-09-14 Thread Marc Zyngier
On Tue, 13 Sep 2022 09:44:33 +, Oliver Upton wrote:
> For reasons unknown, the Arm architecture defines the 64-bit views of
> the 32-bit ID registers as UNKNOWN [1]. This combines poorly with the
> fact that KVM unconditionally exposes these registers to userspace,
> which could throw a wrench in migration between 64-bit only systems.
> 
> This series reworks KVM's definition of these registers to RAZ/WI with
> the goal of providing consistent register values across 64-bit machines.
> 
> [...]

Applied to kvm-arm64/next, thanks!

[1/7] KVM: arm64: Use visibility hook to treat ID regs as RAZ
  commit: 34b4d20399e6fad2e3379b11e68dff1d1549274e
[2/7] KVM: arm64: Remove internal accessor helpers for id regs
  commit: 4782ccc8ef50fabb70bab9fa73186285dba6d91d
[3/7] KVM: arm64: Drop raz parameter from read_id_reg()
  commit: cdd5036d048ca96ef5212fb37f4f56db40cb1bc2
[4/7] KVM: arm64: Spin off helper for calling visibility hook
  commit: 5d9a718b64e428a40939806873ecf16f072008b3
[5/7] KVM: arm64: Add a visibility bit to ignore user writes
  commit: 4de06e4c1dc949c35c16e4423b4ccd735264b0a9
[6/7] KVM: arm64: Treat 32bit ID registers as RAZ/WI on 64bit-only system
  commit: d5efec7ed826b3b29c6847bf59383d8d07347a4e
[7/7] KVM: selftests: Add test for AArch32 ID registers
  commit: 797b84517c190053597e3f7e03ead15da872e04d

Cheers,

M.
-- 
Marc Zyngier 



Re: [PATCH 1/3] KVM: arm64: Don't set PSTATE.SS when Software Step state is Active-pending

2022-09-14 Thread Marc Zyngier
On Wed, 14 Sep 2022 07:13:04 +0100,
Reiji Watanabe  wrote:
> 
> Hi Marc,
> 
> Thank you for the review!
> 
> On Sat, Sep 10, 2022 at 3:36 AM Marc Zyngier  wrote:
> >
> > On Fri, 09 Sep 2022 05:46:34 +0100,
> > Reiji Watanabe  wrote:
> > >
> > > Currently, PSTATE.SS is set on every guest entry if single-step is
> > > enabled for the vCPU by userspace.  However, it could cause extra
> > > single-step execution without returning to userspace, which shouldn't
> > > be performed, if the Software Step state at the last guest exit was
> > > Active-pending (i.e. the last exit was not triggered by Software Step
> > > exception, but by an asynchronous exception after the single-step
> > > execution is performed).
> > >
> > > Fix this by not setting PSTATE.SS on guest entry if the Software
> > > Step state at the last exit was Active-pending.
> > >
> > > Fixes: 337b99bf7edf ("KVM: arm64: guest debug, add support for 
> > > single-step")
> > > Signed-off-by: Reiji Watanabe 
> >
> > Now that I'm a bit more clued about what the architecture actually
> > mandates, I can try and review this patch.
> >
> > > ---
> > >  arch/arm64/include/asm/kvm_host.h |  3 +++
> > >  arch/arm64/kvm/debug.c| 19 ++-
> > >  arch/arm64/kvm/guest.c|  1 +
> > >  arch/arm64/kvm/handle_exit.c  |  2 ++
> > >  4 files changed, 24 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/arm64/include/asm/kvm_host.h 
> > > b/arch/arm64/include/asm/kvm_host.h
> > > index e9c9388ccc02..4cf6eef02565 100644
> > > --- a/arch/arm64/include/asm/kvm_host.h
> > > +++ b/arch/arm64/include/asm/kvm_host.h
> > > @@ -535,6 +535,9 @@ struct kvm_vcpu_arch {
> > >  #define IN_WFIT  __vcpu_single_flag(sflags, BIT(3))
> > >  /* vcpu system registers loaded on physical CPU */
> > >  #define SYSREGS_ON_CPU   __vcpu_single_flag(sflags, BIT(4))
> > > +/* Software step state is Active-pending */
> > > +#define DBG_SS_ACTIVE_PENDING  __vcpu_single_flag(sflags, BIT(5))
> > > +
> > >
> > >  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
> > >  #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +   \
> > > diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
> > > index 0b28d7db7c76..125cfb94b4ad 100644
> > > --- a/arch/arm64/kvm/debug.c
> > > +++ b/arch/arm64/kvm/debug.c
> > > @@ -188,7 +188,16 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
> > >* debugging the system.
> > >*/
> > >   if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
> > > - *vcpu_cpsr(vcpu) |=  DBG_SPSR_SS;
> > > + /*
> > > +  * If the software step state at the last guest exit
> > > +  * was Active-pending, we don't set DBG_SPSR_SS so
> > > +  * that the state is maintained (to not run another
> > > +  * single-step until the pending Software Step
> > > +  * exception is taken).
> > > +  */
> > > + if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING))
> > > + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
> > > +
> > >   mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
> > >   mdscr |= DBG_MDSCR_SS;
> > >   vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
> > > @@ -279,6 +288,14 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
> > >   &vcpu->arch.debug_ptr->dbg_wcr[0],
> > >   &vcpu->arch.debug_ptr->dbg_wvr[0]);
> > >   }
> > > +
> > > + if ((vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) &&
> > > + !(*vcpu_cpsr(vcpu) & DBG_SPSR_SS))
> > > + /*
> > > +  * Mark the vcpu as ACTIVE_PENDING
> > > +  * until Software Step exception is confirmed.
> >
> > s/confirmed/taken/? This would match the comment in the previous hunk.
> 
> Yes, I will fix that.
> 
> >
> > > +  */
> > > + vcpu_set_flag(vcpu,

Re: [PATCH v3 6/7] KVM: arm64: permit all VM_MTE_ALLOWED mappings with MTE enabled

2022-09-12 Thread Marc Zyngier
On Wed, 10 Aug 2022 20:30:32 +0100,
Peter Collingbourne  wrote:
> 
> Certain VMMs such as crosvm have features (e.g. sandboxing) that depend
> on being able to map guest memory as MAP_SHARED. The current restriction
> on sharing MAP_SHARED pages with the guest is preventing the use of
> those features with MTE. Now that the races between tasks concurrently
> clearing tags on the same page have been fixed, remove this restriction.
> 
> Signed-off-by: Peter Collingbourne 
> ---
>  arch/arm64/kvm/mmu.c | 8 
>  1 file changed, 8 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d54be80e31dd..fc65dc20655d 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1075,14 +1075,6 @@ static void sanitise_mte_tags(struct kvm *kvm, 
> kvm_pfn_t pfn,
>  
>  static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>  {
> - /*
> -  * VM_SHARED mappings are not allowed with MTE to avoid races
> -  * when updating the PG_mte_tagged page flag, see
> -  * sanitise_mte_tags for more details.
> -  */
> - if (vma->vm_flags & VM_SHARED)
> - return false;
> -
>   return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>  

Can you provide a pointer to some VMM making use of this functionality
and enabling MTE? A set of crosvm patches (for example) would be
useful to evaluate this series.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v2] KVM: arm64: Only set KVM_MODE_PROTECTED if is_hyp_mode_available()

2022-09-10 Thread Marc Zyngier
On Sat, 10 Sep 2022 14:43:44 +0100,
Will Deacon  wrote:
> 
> On Sat, Sep 10, 2022 at 10:09:31AM +0100, Marc Zyngier wrote:
> > On Fri, 09 Sep 2022 18:55:18 +0100,
> > Elliot Berman  wrote:
> > > 
> > > 
> > > 
> > > On 9/9/2022 10:28 AM, Catalin Marinas wrote:
> > > > On Fri, Sep 09, 2022 at 07:45:52AM -0700, Elliot Berman wrote:
> > > >> Do not switch kvm_mode to KVM_MODE_PROTECTED if hypervisor mode is not
> > > >> available. This prevents "Protected KVM" cpu capability being reported
> > > >> when Linux is booting in EL1 and would not have KVM enabled.
> > > >> 
> > > >> Signed-off-by: Elliot Berman 
> > > >> ---
> > > >>   arch/arm64/kvm/arm.c | 4 +++-
> > > >>   1 file changed, 3 insertions(+), 1 deletion(-)
> > > >> 
> > > >> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > > >> index 8fe73ee5fa84..861f4b388879 100644
> > > >> --- a/arch/arm64/kvm/arm.c
> > > >> +++ b/arch/arm64/kvm/arm.c
> > > >> @@ -2272,7 +2272,9 @@ static int __init early_kvm_mode_cfg(char *arg)
> > > >>return -EINVAL;
> > > >>if (strcmp(arg, "protected") == 0) {
> > > >> -  if (!is_kernel_in_hyp_mode())
> > > >> +  if (!is_hyp_mode_available())
> > > >> +  kvm_mode = KVM_MODE_DEFAULT;
> > > > 
> > > > I think kvm_mode is already KVM_MODE_DEFAULT at this point. You may want
> > > > to print a warning instead.
> > > > 
> > > 
> > > Does it make sense to print warning for kvm-arm.mode=nvhe as well?
> > 
> > In general, specifying a kvm-arm.mode when no hypervisor mode is
> > available should be reported as a warning.
> 
> As long as this is pr_warn() rather than WARN() then I agree. Otherwise,
> kernels with a kvm-arm.mode hardcoded in CONFIG_CMDLINE (e.g. Android's
> GKI) will make for noisy guests.

Indeed, pr_warn() is what I had in mind. A WARN() would be pretty
overkill, as there is nothing majorly wrong with booting at EL1, just
an impossibility to honour the request from the command line.

M.

-- 
Without deviation from the norm, progress is not possible.

