Re: [PATCH v6 06/21] arm64: Move VHE-specific SPE setup to mutate_to_vhe()

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 09:30:08AM +, Marc Zyngier wrote:
> Hi Will,
> 
> On 2021-02-03 21:13, Will Deacon wrote:
> > Hi Marc,
> > 
> > On Mon, Feb 01, 2021 at 11:56:22AM +, Marc Zyngier wrote:
> > > There isn't much that a VHE kernel needs on top of whatever has
> > > been done for nVHE, so let's move the little we need to the
> > > VHE stub (the SPE setup), and drop the init_el2_state macro.
> > > 
> > > No expected functional change.
> > > 
> > > Signed-off-by: Marc Zyngier 
> > > Acked-by: David Brazdil 
> > > Acked-by: Catalin Marinas 
> > > ---
> > >  arch/arm64/kernel/hyp-stub.S | 28 +---
> > >  1 file changed, 25 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kernel/hyp-stub.S
> > > b/arch/arm64/kernel/hyp-stub.S
> > > index 373ed2213e1d..6b5c73cf9d52 100644
> > > --- a/arch/arm64/kernel/hyp-stub.S
> > > +++ b/arch/arm64/kernel/hyp-stub.S
> > > @@ -92,9 +92,6 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
> > >   msr hcr_el2, x0
> > >   isb
> > > 
> > > - // Doesn't do much on VHE, but still, worth a shot
> > > - init_el2_state vhe
> > > -
> > >   // Use the EL1 allocated stack, per-cpu offset
> > >   mrs x0, sp_el1
> > >   mov sp, x0
> > > @@ -107,6 +104,31 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
> > >   mrs_s   x0, SYS_VBAR_EL12
> > >   msr vbar_el1, x0
> > > 
> > > + // Fixup SPE configuration, if supported...
> > > + mrs x1, id_aa64dfr0_el1
> > > + ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
> > > + mov x2, xzr
> > > + cbz x1, skip_spe
> > > +
> > > + // ... and not owned by EL3
> > > + mrs_s   x0, SYS_PMBIDR_EL1
> > > + and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
> > > + cbnz x0, skip_spe
> > > +
> > > + // Let the SPE driver in control of the sampling
> > > + mrs_s   x0, SYS_PMSCR_EL1
> > > + bic x0, x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT)
> > > + bic x0, x0, #(1 << SYS_PMSCR_EL2_PA_SHIFT)
> > > + msr_s   SYS_PMSCR_EL1, x0
> > 
> > Why do we need to touch pmscr_el1 at all? The SPE driver should take
> > care of
> > that, no? If you drop the pmscr_el1 accesses, I think you can drop the
> > pmbidr_el1 check as well.
> 
> That's definitely a brain fart, and is what we need on the nVHE path,
> not here. Doing the same thing twice isn't exactly helpful.
> 
> > And actually, then why even check dfr0? Doing the
> > bic for the mdcr_el2.e2pb bits is harmless.
> > 
> > > + mov x2, #MDCR_EL2_TPMS
> > > +
> > > +skip_spe:
> > > + // For VHE, use EL2 translation and disable access from EL1
> > > + mrs x0, mdcr_el2
> > > + bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
> > > + orr x0, x0, x2
> > > + msr mdcr_el2, x0
> > 
> > Doesn't this undo the setting of mdcr_el2.hpmn if SPE is not present or
> > unavailable? (This wouldn't be an issue if we removed the skip_spe paths
> > altogether).
> 
> I don't think it does. We only clear the E2PB bits (harmless, as you point
> out above), and OR something that is either 0 (no SPE) or MDCR_EL2_TPMS.
> In any case, the HPMN bits are preserved (having been set during the nVHE
> setup).

Duh, OR not AND. Yes, sorry.

> I think the following patch addresses the above issue, which I'll squash
> with the original patch. Please shout if I missed anything.
> 
> Thanks,
> 
> M.
> 
> diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
> index aa7e8d592295..3e08dcc924b5 100644
> --- a/arch/arm64/kernel/hyp-stub.S
> +++ b/arch/arm64/kernel/hyp-stub.S
> @@ -115,29 +115,9 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
>   mrs_s   x0, SYS_VBAR_EL12
>   msr vbar_el1, x0
> 
> - // Fixup SPE configuration, if supported...
> - mrs x1, id_aa64dfr0_el1
> - ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
> - mov x2, xzr
> - cbz x1, skip_spe
> -
> - // ... and not owned by EL3
> - mrs_s   x0, SYS_PMBIDR_EL1
> - and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
> - cbnz x0, skip_spe
> -
> - // Let the SPE driver in control of the sampling
> - mrs_s   x0, SYS_PMSCR_EL1
> - bic x0, x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT)
> - bic x0, x0, #(1 << SYS_PMSCR_EL2_PA_SHIFT)
> - msr_s   SYS_PMSCR_EL1, x0
> - mov x2, #MDCR_EL2_TPMS
> -
> -skip_spe:
> - // For VHE, use EL2 translation and disable access from EL1
> + // Use EL2 translations for SPE and disable access from EL1
>   mrs x0, mdcr_el2
>   bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
> - orr x0, x0, x2
>   msr mdcr_el2, x0

Looks a tonne better, thanks! Be nice if somebody could test it for us.

Will


Re: [PATCH v6 06/21] arm64: Move VHE-specific SPE setup to mutate_to_vhe()

2021-02-04 Thread Marc Zyngier

Hi Will,

On 2021-02-03 21:13, Will Deacon wrote:

Hi Marc,

On Mon, Feb 01, 2021 at 11:56:22AM +, Marc Zyngier wrote:

There isn't much that a VHE kernel needs on top of whatever has
been done for nVHE, so let's move the little we need to the
VHE stub (the SPE setup), and drop the init_el2_state macro.

No expected functional change.

Signed-off-by: Marc Zyngier 
Acked-by: David Brazdil 
Acked-by: Catalin Marinas 
---
 arch/arm64/kernel/hyp-stub.S | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 373ed2213e1d..6b5c73cf9d52 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -92,9 +92,6 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
msr hcr_el2, x0
isb

-   // Doesn't do much on VHE, but still, worth a shot
-   init_el2_state vhe
-
// Use the EL1 allocated stack, per-cpu offset
mrs x0, sp_el1
mov sp, x0
@@ -107,6 +104,31 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
mrs_s   x0, SYS_VBAR_EL12
msr vbar_el1, x0

+   // Fixup SPE configuration, if supported...
+   mrs x1, id_aa64dfr0_el1
+   ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
+   mov x2, xzr
+   cbz x1, skip_spe
+
+   // ... and not owned by EL3
+   mrs_s   x0, SYS_PMBIDR_EL1
+   and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
+   cbnz x0, skip_spe
+
+   // Let the SPE driver in control of the sampling
+   mrs_s   x0, SYS_PMSCR_EL1
+   bic x0, x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT)
+   bic x0, x0, #(1 << SYS_PMSCR_EL2_PA_SHIFT)
+   msr_s   SYS_PMSCR_EL1, x0


Why do we need to touch pmscr_el1 at all? The SPE driver should take care of
that, no? If you drop the pmscr_el1 accesses, I think you can drop the
pmbidr_el1 check as well.


That's definitely a brain fart, and is what we need on the nVHE path,
not here. Doing the same thing twice isn't exactly helpful.


And actually, then why even check dfr0? Doing the
bic for the mdcr_el2.e2pb bits is harmless.


+   mov x2, #MDCR_EL2_TPMS
+
+skip_spe:
+   // For VHE, use EL2 translation and disable access from EL1
+   mrs x0, mdcr_el2
+   bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+   orr x0, x0, x2
+   msr mdcr_el2, x0


Doesn't this undo the setting of mdcr_el2.hpmn if SPE is not present or
unavailable? (This wouldn't be an issue if we removed the skip_spe paths
altogether).


I don't think it does. We only clear the E2PB bits (harmless, as you point
out above), and OR something that is either 0 (no SPE) or MDCR_EL2_TPMS.
In any case, the HPMN bits are preserved (having been set during the nVHE
setup).

I think the following patch addresses the above issue, which I'll squash
with the original patch. Please shout if I missed anything.

Thanks,

M.

diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index aa7e8d592295..3e08dcc924b5 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -115,29 +115,9 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
mrs_s   x0, SYS_VBAR_EL12
msr vbar_el1, x0

-   // Fixup SPE configuration, if supported...
-   mrs x1, id_aa64dfr0_el1
-   ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
-   mov x2, xzr
-   cbz x1, skip_spe
-
-   // ... and not owned by EL3
-   mrs_s   x0, SYS_PMBIDR_EL1
-   and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
-   cbnz x0, skip_spe
-
-   // Let the SPE driver in control of the sampling
-   mrs_s   x0, SYS_PMSCR_EL1
-   bic x0, x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT)
-   bic x0, x0, #(1 << SYS_PMSCR_EL2_PA_SHIFT)
-   msr_s   SYS_PMSCR_EL1, x0
-   mov x2, #MDCR_EL2_TPMS
-
-skip_spe:
-   // For VHE, use EL2 translation and disable access from EL1
+   // Use EL2 translations for SPE and disable access from EL1
mrs x0, mdcr_el2
bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
-   orr x0, x0, x2
msr mdcr_el2, x0

// Transfer the MM state from EL1 to EL2

--
Jazz is not dead. It just smells funny...


Re: [PATCH v6 06/21] arm64: Move VHE-specific SPE setup to mutate_to_vhe()

2021-02-04 Thread Marc Zyngier

On 2021-02-04 09:34, Will Deacon wrote:

On Thu, Feb 04, 2021 at 09:30:08AM +, Marc Zyngier wrote:


[...]

I think the following patch addresses the above issue, which I'll squash
with the original patch. Please shout if I missed anything.

Thanks,

M.

diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index aa7e8d592295..3e08dcc924b5 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -115,29 +115,9 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
mrs_s   x0, SYS_VBAR_EL12
msr vbar_el1, x0

-   // Fixup SPE configuration, if supported...
-   mrs x1, id_aa64dfr0_el1
-   ubfx x1, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4
-   mov x2, xzr
-   cbz x1, skip_spe
-
-   // ... and not owned by EL3
-   mrs_s   x0, SYS_PMBIDR_EL1
-   and x0, x0, #(1 << SYS_PMBIDR_EL1_P_SHIFT)
-   cbnz x0, skip_spe
-
-   // Let the SPE driver in control of the sampling
-   mrs_s   x0, SYS_PMSCR_EL1
-   bic x0, x0, #(1 << SYS_PMSCR_EL2_PCT_SHIFT)
-   bic x0, x0, #(1 << SYS_PMSCR_EL2_PA_SHIFT)
-   msr_s   SYS_PMSCR_EL1, x0
-   mov x2, #MDCR_EL2_TPMS
-
-skip_spe:
-   // For VHE, use EL2 translation and disable access from EL1
+   // Use EL2 translations for SPE and disable access from EL1
mrs x0, mdcr_el2
bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
-   orr x0, x0, x2
msr mdcr_el2, x0


Looks a tonne better, thanks! Be nice if somebody could test it for us.


SPE-equipped machines are the silicon equivalent of hen's teeth...

Alex, any chance you could give this a go?

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH v2 6/7] KVM: arm64: Upgrade PMU support to ARMv8.4

2021-02-04 Thread Alexandru Elisei
Hi,

On 2/3/21 1:28 PM, Marc Zyngier wrote:
> On 2021-02-03 12:39, Auger Eric wrote:
>> Hi,
>>
>> On 2/3/21 12:20 PM, Marc Zyngier wrote:
>>> On 2021-02-03 11:07, Auger Eric wrote:
 Hi Marc,
 On 2/3/21 11:36 AM, Marc Zyngier wrote:
> Hi Eric,
>
> On 2021-01-27 17:53, Auger Eric wrote:
>> Hi Marc,
>>
>> On 1/25/21 1:26 PM, Marc Zyngier wrote:
>>> Upgrading the PMU code from ARMv8.1 to ARMv8.4 turns out to be
>>> pretty easy. All that is required is support for PMMIR_EL1, which
>>> is read-only, and for which returning 0 is a valid option as long
>>> as we don't advertise STALL_SLOT as an implemented event.
>>>
>>> Let's just do that and adjust what we return to the guest.
>>>
>>> Signed-off-by: Marc Zyngier 
>>> ---
>>>  arch/arm64/include/asm/sysreg.h |  3 +++
>>>  arch/arm64/kvm/pmu-emul.c   |  6 ++
>>>  arch/arm64/kvm/sys_regs.c   | 11 +++
>>>  3 files changed, 16 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/sysreg.h
>>> b/arch/arm64/include/asm/sysreg.h
>>> index 8b5e7e5c3cc8..2fb3f386588c 100644
>>> --- a/arch/arm64/include/asm/sysreg.h
>>> +++ b/arch/arm64/include/asm/sysreg.h
>>> @@ -846,7 +846,10 @@
>>>
>>>  #define ID_DFR0_PERFMON_SHIFT    24
>>>
>>> +#define ID_DFR0_PERFMON_8_0    0x3
>>>  #define ID_DFR0_PERFMON_8_1    0x4
>>> +#define ID_DFR0_PERFMON_8_4    0x5
>>> +#define ID_DFR0_PERFMON_8_5    0x6
>>>
>>>  #define ID_ISAR4_SWP_FRAC_SHIFT    28
>>>  #define ID_ISAR4_PSR_M_SHIFT    24
>>> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
>>> index 398f6df1bbe4..72cd704a8368 100644
>>> --- a/arch/arm64/kvm/pmu-emul.c
>>> +++ b/arch/arm64/kvm/pmu-emul.c
>>> @@ -795,6 +795,12 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu,
>>> bool pmceid1)
>>>  base = 0;
>>>  } else {
>>>  val = read_sysreg(pmceid1_el0);
>>> +    /*
>>> + * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled
>>> + * as RAZ
>>> + */
>>> +    if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4)
>>> +    val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32);
>> what about the STALL_SLOT_BACKEND and FRONTEND events then?
>
> Aren't these a mandatory ARMv8.1 feature? I don't see a reason to
> drop them.

 I understand the 3 are linked together.

 In D7.11 it is said
 "
 When any of the following common events are implemented, all three of
 them are implemented:
 0x003D , STALL_SLOT_BACKEND, No operation sent for execution on a Slot
 due to the backend,
 0x003E , STALL_SLOT_FRONTEND, No operation sent for execution on a Slot
 due to the frontend.
 0x003F , STALL_SLOT, No operation sent for execution on a Slot.
 "
>>>
>>> They are linked in the sense that they report related events, but they
>>> don't have to be implemented in the same level of the architecture, if only
>>> because BACKEND/FRONTEND were introduced way before ARMv8.4.
>>>
>>> What the architecture says is:
>>>
>>> - For FEAT_PMUv3p1 (ARMv8.1):
>>>   "The STALL_FRONTEND and STALL_BACKEND events are required to be
>>>    implemented." (A2.4.1, DDI0487G.a)
>> OK
>>>
>>> - For FEAT_PMUv3p4 (ARMv8.4):
>>>   "If FEAT_PMUv3p4 is implemented:
>>>    - If STALL_SLOT is not implemented, it is IMPLEMENTATION DEFINED
>>> whether the PMMIR System registers are implemented.
>>>    - If STALL_SLOT is implemented, then the PMMIR System registers are
>>> implemented." (D7-2873, DDI0487G.a)
>>>
>>> So while BACKEND/FRONTEND are required in an ARMv8.4 implementation
>>> by virtue of being mandatory in ARMv8.1, STALL_SLOT isn't at any point.
>> But then how do you understand "When any of the following common events
>> are implemented, all three of them are implemented"?
>
> I think that's wholly inconsistent, because it would mean that STALL_SLOT
> isn't optional on ARMv8.4, and would make PMMIR mandatory.

I think there's some confusion regarding the event names. From my reading of the
architecture, STALL != STALL_SLOT, STALL_BACKEND != STALL_SLOT_BACKEND,
STALL_FRONTEND != STALL_SLOT_FRONTEND.

STALL{, _BACKEND, _FRONTEND} count the number of CPU cycles where no
instructions are being executed on the PE (page D7-2872),
STALL_SLOT{, _BACKEND, _FRONTEND} count the number of slots where no
instructions are being executed (page D7-2873).

STALL_{BACKEND, FRONTEND} are required by ARMv8.1-PMU (pages A2-76, D7-2913);
STALL is required by ARMv8.4-PMU (page D7-2914).

STALL_SLOT{, _BACKEND, _FRONTEND} are optional in ARMv8.4-PMU (pages D7-2913,
D7-2914), but if one of them is implemented, all of them must be implemented
(page D7-2914).

The problem I see with this patch is that it doesn't clear the
STALL_SLOT_{BACKEND, FRONTEND} event bits along with the STALL_SLOT bit from
PMCEID1_EL0.
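
For reference, a minimal sketch of the extra masking being asked for here, on
top of the hunk quoted above (the ARMV8_PMUV3_PERFCTR_STALL_SLOT_{FRONTEND,
BACKEND} constant names are assumed to be available alongside
ARMV8_PMUV3_PERFCTR_STALL_SLOT):

	/* Events 0x3D-0x3F are reported in PMCEID1_EL0, hence the "- 32" */
	if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4) {
		val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32);
		val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32);
		val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32);
	}
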
Re: [PATCH v7 2/3] arm64: kvm: Introduce MTE VCPU feature

2021-02-04 Thread Steven Price

On 02/02/2021 17:12, Marc Zyngier wrote:

On 2021-01-15 15:28, Steven Price wrote:

Add a new VM feature 'KVM_ARM_CAP_MTE' which enables memory tagging
for a VM. This exposes the feature to the guest and automatically tags
memory pages touched by the VM as PG_mte_tagged (and clears the tags
storage) to ensure that the guest cannot see stale tags, and so that the
tags are correctly saved/restored across swap.

Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/kvm_emulate.h |  3 +++
 arch/arm64/include/asm/kvm_host.h    |  3 +++
 arch/arm64/include/asm/pgtable.h |  2 +-
 arch/arm64/kernel/mte.c  | 36 +---
 arch/arm64/kvm/arm.c |  9 +++
 arch/arm64/kvm/hyp/exception.c   |  3 ++-
 arch/arm64/kvm/mmu.c | 16 +
 arch/arm64/kvm/sys_regs.c    |  6 -
 include/uapi/linux/kvm.h |  1 +
 9 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h
b/arch/arm64/include/asm/kvm_emulate.h
index f612c090f2e4..6bf776c2399c 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu 
*vcpu)

 if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
 vcpu_el1_is_32bit(vcpu))
 vcpu->arch.hcr_el2 |= HCR_TID2;
+
+    if (kvm_has_mte(vcpu->kvm))
+    vcpu->arch.hcr_el2 |= HCR_ATA;
 }

 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h
b/arch/arm64/include/asm/kvm_host.h
index 51590a397e4b..1ca5785fb0e9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -132,6 +132,8 @@ struct kvm_arch {

 u8 pfr0_csv2;
 u8 pfr0_csv3;
+    /* Memory Tagging Extension enabled for the guest */
+    bool mte_enabled;
 };

 struct kvm_vcpu_fault_info {
@@ -749,6 +751,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu 
*vcpu);

 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)

+#define kvm_has_mte(kvm) (system_supports_mte() && 
(kvm)->arch.mte_enabled)

 #define kvm_vcpu_has_pmu(vcpu)    \
 (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))

diff --git a/arch/arm64/include/asm/pgtable.h 
b/arch/arm64/include/asm/pgtable.h

index 501562793ce2..27416d52f6a9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -312,7 +312,7 @@ static inline void set_pte_at(struct mm_struct
*mm, unsigned long addr,
 __sync_icache_dcache(pte);

 if (system_supports_mte() &&
-    pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
+    pte_present(pte) && pte_valid_user(pte) && !pte_special(pte))
 mte_sync_tags(ptep, pte);


Care to elaborate on this change?


Sorry I should have called this out in the commit message. The change 
here is instead of only calling mte_sync_tags() on pages which are 
already tagged in the PTE, it is called for all (normal) user pages 
instead. See below for why.




 __check_racy_pte_update(mm, ptep, pte);
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index dc9ada64feed..f9e089be1603 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -25,27 +25,33 @@

 u64 gcr_kernel_excl __ro_after_init;

-static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool 
check_swap)
+static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool 
check_swap,

+   bool pte_is_tagged)
 {
 pte_t old_pte = READ_ONCE(*ptep);

 if (check_swap && is_swap_pte(old_pte)) {
 swp_entry_t entry = pte_to_swp_entry(old_pte);

-    if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
+    if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) {
+    set_bit(PG_mte_tagged, &page->flags);
 return;
+    }
 }

-    page_kasan_tag_reset(page);
-    /*
- * We need smp_wmb() in between setting the flags and clearing the
- * tags because if another thread reads page->flags and builds a
- * tagged address out of it, there is an actual dependency to the
- * memory access, but on the current thread we do not guarantee that
- * the new page->flags are visible before the tags were updated.
- */
-    smp_wmb();
-    mte_clear_page_tags(page_address(page));
+    if (pte_is_tagged) {
+    set_bit(PG_mte_tagged, &page->flags);
+    page_kasan_tag_reset(page);
+    /*
+ * We need smp_wmb() in between setting the flags and 
clearing the

+ * tags because if another thread reads page->flags and builds a
+ * tagged address out of it, there is an actual dependency to 
the
+ * memory access, but on the current thread we do not 
guarantee that

+ * the new page->flags are visible before the tags were updated.
+ */
+    smp_wmb();
+    

Re: [PATCH v7 1/3] arm64: kvm: Save/restore MTE registers

2021-02-04 Thread Steven Price

On 02/02/2021 15:36, Marc Zyngier wrote:

On 2021-01-15 15:28, Steven Price wrote:

Define the new system registers that MTE introduces and context switch
them. The MTE feature is still hidden from the ID register as it isn't
supported in a VM yet.

Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/kvm_host.h  |  4 ++
 arch/arm64/include/asm/kvm_mte.h   | 74 ++
 arch/arm64/include/asm/sysreg.h    |  3 +-
 arch/arm64/kernel/asm-offsets.c    |  3 +
 arch/arm64/kvm/hyp/entry.S |  7 ++
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h |  4 ++
 arch/arm64/kvm/sys_regs.c  | 14 ++--
 7 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_mte.h

diff --git a/arch/arm64/include/asm/kvm_host.h
b/arch/arm64/include/asm/kvm_host.h
index 11beda85ee7e..51590a397e4b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -148,6 +148,8 @@ enum vcpu_sysreg {
 SCTLR_EL1,    /* System Control Register */
 ACTLR_EL1,    /* Auxiliary Control Register */
 CPACR_EL1,    /* Coprocessor Access Control */
+    RGSR_EL1,    /* Random Allocation Tag Seed Register */
+    GCR_EL1,    /* Tag Control Register */
 ZCR_EL1,    /* SVE Control */
 TTBR0_EL1,    /* Translation Table Base Register 0 */
 TTBR1_EL1,    /* Translation Table Base Register 1 */
@@ -164,6 +166,8 @@ enum vcpu_sysreg {
 TPIDR_EL1,    /* Thread ID, Privileged */
 AMAIR_EL1,    /* Aux Memory Attribute Indirection Register */
 CNTKCTL_EL1,    /* Timer Control Register (EL1) */
+    TFSRE0_EL1,    /* Tag Fault Status Register (EL0) */
+    TFSR_EL1,    /* Tag Fault Stauts Register (EL1) */


s/Stauts/Status/

Is there any reason why the MTE registers aren't grouped together?


I had been under the impression this list is sorted by the encoding of 
the system registers, although double checking I've screwed up the order 
of TFSRE0_EL1/TFSR_EL1, and not all the other fields are sorted that way.


I'll move them together in their own section.


 PAR_EL1,    /* Physical Address Register */
 MDSCR_EL1,    /* Monitor Debug System Control Register */
 MDCCINT_EL1,    /* Monitor Debug Comms Channel Interrupt Enable 
Reg */
diff --git a/arch/arm64/include/asm/kvm_mte.h 
b/arch/arm64/include/asm/kvm_mte.h

new file mode 100644
index ..62bbfae77f33
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_mte.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 ARM Ltd.
+ */
+#ifndef __ASM_KVM_MTE_H
+#define __ASM_KVM_MTE_H
+
+#ifdef __ASSEMBLY__
+
+#include 
+
+#ifdef CONFIG_ARM64_MTE
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+    b    .L__skip_switch\@
+alternative_else_nop_endif
+    mrs    \reg1, hcr_el2
+    and    \reg1, \reg1, #(HCR_ATA)
+    cbz    \reg1, .L__skip_switch\@
+
+    mrs_s    \reg1, SYS_RGSR_EL1
+    str    \reg1, [\h_ctxt, #CPU_RGSR_EL1]
+    mrs_s    \reg1, SYS_GCR_EL1
+    str    \reg1, [\h_ctxt, #CPU_GCR_EL1]
+    mrs_s    \reg1, SYS_TFSRE0_EL1
+    str    \reg1, [\h_ctxt, #CPU_TFSRE0_EL1]
+
+    ldr    \reg1, [\g_ctxt, #CPU_RGSR_EL1]
+    msr_s    SYS_RGSR_EL1, \reg1
+    ldr    \reg1, [\g_ctxt, #CPU_GCR_EL1]
+    msr_s    SYS_GCR_EL1, \reg1
+    ldr    \reg1, [\g_ctxt, #CPU_TFSRE0_EL1]
+    msr_s    SYS_TFSRE0_EL1, \reg1
+
+.L__skip_switch\@:
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+    b    .L__skip_switch\@
+alternative_else_nop_endif
+    mrs    \reg1, hcr_el2
+    and    \reg1, \reg1, #(HCR_ATA)
+    cbz    \reg1, .L__skip_switch\@
+
+    mrs_s    \reg1, SYS_RGSR_EL1
+    str    \reg1, [\g_ctxt, #CPU_RGSR_EL1]
+    mrs_s    \reg1, SYS_GCR_EL1
+    str    \reg1, [\g_ctxt, #CPU_GCR_EL1]
+    mrs_s    \reg1, SYS_TFSRE0_EL1
+    str    \reg1, [\g_ctxt, #CPU_TFSRE0_EL1]


Can't the EL0 state save/restore be moved to the C code?


True, that should be safe. I'm not sure how I missed that.


+
+    ldr    \reg1, [\h_ctxt, #CPU_RGSR_EL1]
+    msr_s    SYS_RGSR_EL1, \reg1
+    ldr    \reg1, [\h_ctxt, #CPU_GCR_EL1]
+    msr_s    SYS_GCR_EL1, \reg1
+    ldr    \reg1, [\h_ctxt, #CPU_TFSRE0_EL1]
+    msr_s    SYS_TFSRE0_EL1, \reg1
+
+.L__skip_switch\@:
+.endm
+
+#else /* CONFIG_ARM64_MTE */
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+.endm
+
+#endif /* CONFIG_ARM64_MTE */
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_KVM_MTE_H */
diff --git a/arch/arm64/include/asm/sysreg.h 
b/arch/arm64/include/asm/sysreg.h

index 8b5e7e5c3cc8..0a01975d331d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -574,7 +574,8 @@
 #define SCTLR_ELx_M    (BIT(0))

 #define SCTLR_ELx_FLAGS    (SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
- SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
+ SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB | \
+   

Re: [PATCH v2 6/7] KVM: arm64: Upgrade PMU support to ARMv8.4

2021-02-04 Thread Marc Zyngier

On 2021-02-04 12:32, Alexandru Elisei wrote:

Hi,

On 2/3/21 1:28 PM, Marc Zyngier wrote:

On 2021-02-03 12:39, Auger Eric wrote:

Hi,

On 2/3/21 12:20 PM, Marc Zyngier wrote:

On 2021-02-03 11:07, Auger Eric wrote:

Hi Marc,
On 2/3/21 11:36 AM, Marc Zyngier wrote:

Hi Eric,

On 2021-01-27 17:53, Auger Eric wrote:

Hi Marc,

On 1/25/21 1:26 PM, Marc Zyngier wrote:

Upgrading the PMU code from ARMv8.1 to ARMv8.4 turns out to be
pretty easy. All that is required is support for PMMIR_EL1, 
which
is read-only, and for which returning 0 is a valid option as 
long

as we don't advertise STALL_SLOT as an implemented event.

Let's just do that and adjust what we return to the guest.

Signed-off-by: Marc Zyngier 
---
 arch/arm64/include/asm/sysreg.h |  3 +++
 arch/arm64/kvm/pmu-emul.c   |  6 ++
 arch/arm64/kvm/sys_regs.c   | 11 +++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h
b/arch/arm64/include/asm/sysreg.h
index 8b5e7e5c3cc8..2fb3f386588c 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -846,7 +846,10 @@

 #define ID_DFR0_PERFMON_SHIFT    24

+#define ID_DFR0_PERFMON_8_0    0x3
 #define ID_DFR0_PERFMON_8_1    0x4
+#define ID_DFR0_PERFMON_8_4    0x5
+#define ID_DFR0_PERFMON_8_5    0x6

 #define ID_ISAR4_SWP_FRAC_SHIFT    28
 #define ID_ISAR4_PSR_M_SHIFT    24
diff --git a/arch/arm64/kvm/pmu-emul.c 
b/arch/arm64/kvm/pmu-emul.c

index 398f6df1bbe4..72cd704a8368 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -795,6 +795,12 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu 
*vcpu,

bool pmceid1)
 base = 0;
 } else {
 val = read_sysreg(pmceid1_el0);
+    /*
+ * Don't advertise STALL_SLOT, as PMMIR_EL0 is handled
+ * as RAZ
+ */
+    if (vcpu->kvm->arch.pmuver >= ID_AA64DFR0_PMUVER_8_4)
+    val &= ~BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 
32);

what about the STALL_SLOT_BACKEND and FRONTEND events then?


Aren't these a mandatory ARMv8.1 feature? I don't see a reason to
drop them.


I understand the 3 are linked together.

In D7.11 it is said
"
When any of the following common events are implemented, all three of
them are implemented:
0x003D , STALL_SLOT_BACKEND, No operation sent for execution on a Slot
due to the backend,
0x003E , STALL_SLOT_FRONTEND, No operation sent for execution on a Slot
due to the frontend.
0x003F , STALL_SLOT, No operation sent for execution on a Slot.
"


They are linked in the sense that they report related events, but they
don't have to be implemented in the same level of the architecture, if only
because BACKEND/FRONTEND were introduced way before ARMv8.4.

What the architecture says is:

- For FEAT_PMUv3p1 (ARMv8.1):
  "The STALL_FRONTEND and STALL_BACKEND events are required to be
   implemented." (A2.4.1, DDI0487G.a)

OK


- For FEAT_PMUv3p4 (ARMv8.4):
  "If FEAT_PMUv3p4 is implemented:
   - If STALL_SLOT is not implemented, it is IMPLEMENTATION DEFINED
whether the PMMIR System registers are implemented.
   - If STALL_SLOT is implemented, then the PMMIR System registers 
are

implemented." (D7-2873, DDI0487G.a)

So while BACKEND/FRONTEND are required in an ARMv8.4 implementation
by virtue of being mandatory in ARMv8.1, STALL_SLOT isn't at any 
point.
But then how do you understand "When any of the following common events
are implemented, all three of them are implemented"?


I think that's wholly inconsistent, because it would mean that STALL_SLOT
isn't optional on ARMv8.4, and would make PMMIR mandatory.


I think there's some confusion regarding the event names. From my reading
of the architecture, STALL != STALL_SLOT, STALL_BACKEND != STALL_SLOT_BACKEND,
STALL_FRONTEND != STALL_SLOT_FRONTEND.


Dammit, I hadn't realised that at all. Thanks for putting me right.



STALL{, _BACKEND, _FRONTEND} count the number of CPU cycles where no
instructions are being executed on the PE (page D7-2872),
STALL_SLOT{, _BACKEND, _FRONTEND} count the number of slots where no
instructions are being executed (page D7-2873).

STALL_{BACKEND, FRONTEND} are required by ARMv8.1-PMU (pages A2-76, D7-2913);
STALL is required by ARMv8.4-PMU (page D7-2914).

STALL_SLOT{, _BACKEND, _FRONTEND} are optional in ARMv8.4-PMU (pages D7-2913,
D7-2914), but if one of them is implemented, all of them must be
implemented (page D7-2914).

The problem I see with this patch is that it doesn't clear the
STALL_SLOT_{BACKEND, FRONTEND} event bits along with the STALL_SLOT bit from
PMCEID1_EL0.


OK, so Eric was right, and I'm an illiterate idiot! I'll fix the patch
and repost the whole thing.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Will Deacon
On Wed, Feb 03, 2021 at 06:33:30PM +, Quentin Perret wrote:
> On Tuesday 02 Feb 2021 at 18:13:08 (+), Will Deacon wrote:
> > On Fri, Jan 08, 2021 at 12:15:10PM +, Quentin Perret wrote:
> > > + *   __find_buddy(pool, page 0, order 0) => page 1
> > > + *   __find_buddy(pool, page 0, order 1) => page 2
> > > + *   __find_buddy(pool, page 1, order 0) => page 0
> > > + *   __find_buddy(pool, page 2, order 0) => page 3
> > > + */
> > > +static struct hyp_page *__find_buddy(struct hyp_pool *pool, struct 
> > > hyp_page *p,
> > > +  unsigned int order)
> > > +{
> > > + phys_addr_t addr = hyp_page_to_phys(p);
> > > +
> > > + addr ^= (PAGE_SIZE << order);
> > > + if (addr < pool->range_start || addr >= pool->range_end)
> > > + return NULL;
> > 
> > Are these range checks only needed because the pool isn't required to be
> > an exact power-of-2 pages in size? If so, maybe it would be more
> > straightforward to limit the max order on a per-pool basis depending upon
> > its size?
> 
> More importantly, it is because pages outside of the pool are not
> guaranteed to be covered by the hyp_vmemmap, so I really need to make
> sure I don't dereference them.

Wouldn't having a per-pool max order help with that?

> > > + return hyp_phys_to_page(addr);
> > > +}
> > > +
> > > +static void __hyp_attach_page(struct hyp_pool *pool,
> > > +   struct hyp_page *p)
> > > +{
> > > + unsigned int order = p->order;
> > > + struct hyp_page *buddy;
> > > +
> > > + p->order = HYP_NO_ORDER;
> > 
> > Why is this needed?
> 
> If p->order is say 3, I may be able to coalesce with the buddy of order
> 3 to form a higher order page of order 4. And that higher order page
> will be represented by the 'first' of the two order-3 pages (let's call
> it the head), and the other order 3 page (let's say the tail) will be
> assigned 'HYP_NO_ORDER'.
> 
> And basically at this point I don't know if 'p' is going be the head or
> the tail, so I set it to HYP_NO_ORDER a priori so I don't have to think
> about this in the loop below. Is that helping?
> 
> I suppose this could use more comments as well ...

Comments would definitely help, but perhaps even having a simple function to
do the coalescing, which you could call from the loop body and which would
deal with marking the tail pages as HYP_NO_ORDER?

> > > + for (; order < HYP_MAX_ORDER; order++) {
> > > + /* Nothing to do if the buddy isn't in a free-list */
> > > + buddy = __find_buddy(pool, p, order);
> > > + if (!buddy || list_empty(&buddy->node) || buddy->order != order)
> > 
> > Could we move the "buddy->order" check into __find_buddy()?
> 
> I think might break __hyp_extract_page() below. The way I think about
> __find_buddy() is as a low level function which gives you the buddy page
> blindly if it exists in the hyp_vmemmap, and it's up to the callers to
> decide whether the buddy is in the right state for their use or not.

Just feels a bit backwards having __find_buddy() take an order parameter,
yet then return a page of the wrong order! __hyp_extract_page() always
passes the p->order as the order, so I think it would be worth having a
separate function that just takes the pool and the page for that.

Will
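
For reference, a rough sketch of the helper being suggested for the
__hyp_extract_page() case (the name __find_free_buddy is made up, not from
the series): it wraps the raw vmemmap lookup and only returns a buddy that
is actually free and of the same order as @p:

	static struct hyp_page *__find_free_buddy(struct hyp_pool *pool,
						  struct hyp_page *p)
	{
		unsigned int order = p->order;
		struct hyp_page *buddy = __find_buddy(pool, p, order);

		/* Nothing to coalesce with unless the buddy is free at this order */
		if (!buddy || list_empty(&buddy->node) || buddy->order != order)
			return NULL;

		return buddy;
	}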


Re: [RFC PATCH v2 17/26] KVM: arm64: Elevate Hyp mappings creation at EL2

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 15:31:39 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:15PM +, Quentin Perret wrote:
> > Previous commits have introduced infrastructure at EL2 to enable the Hyp
> > code to manage its own memory, and more specifically its stage 1 page
> > tables. However, this was preliminary work, and none of it is currently
> > in use.
> > 
> > Put all of this together by elevating the hyp mappings creation at EL2
> > when memory protection is enabled. In this case, the host kernel running
> > at EL1 still creates _temporary_ Hyp mappings, only used while
> > initializing the hypervisor, but frees them right after.
> > 
> > As such, all calls to create_hyp_mappings() after kvm init has finished
> > turn into hypercalls, as the host now has no 'legal' way to modify the
> > hypevisor page tables directly.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_mmu.h |  1 -
> >  arch/arm64/kvm/arm.c | 62 +---
> >  arch/arm64/kvm/mmu.c | 34 ++
> >  3 files changed, 92 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> > b/arch/arm64/include/asm/kvm_mmu.h
> > index d7ebd73ec86f..6c8466a042a9 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -309,6 +309,5 @@ static __always_inline void __load_guest_stage2(struct 
> > kvm_s2_mmu *mmu)
> >  */
> > asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
> >  }
> > -
> >  #endif /* __ASSEMBLY__ */
> >  #endif /* __ARM64_KVM_MMU_H__ */
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 6af9204bcd5b..e524682c2ccf 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -1421,7 +1421,7 @@ static void cpu_prepare_hyp_mode(int cpu)
> > kvm_flush_dcache_to_poc(params, sizeof(*params));
> >  }
> >  
> > -static void cpu_init_hyp_mode(void)
> > +static void kvm_set_hyp_vector(void)
> 
> Please do something about the naming: now we have both cpu_set_hyp_vector()
> and kvm_set_hyp_vector()!

I'll try to find something different, but no guarantees it'll be much
better :) Suggestions welcome.

> >  {
> > struct kvm_nvhe_init_params *params;
> > struct arm_smccc_res res;
> > @@ -1439,6 +1439,11 @@ static void cpu_init_hyp_mode(void)
> > params = this_cpu_ptr_nvhe_sym(kvm_init_params);
> > arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), 
> > virt_to_phys(params), &res);
> > WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
> > +}
> > +
> > +static void cpu_init_hyp_mode(void)
> > +{
> > +   kvm_set_hyp_vector();
> >  
> > /*
> >  * Disabling SSBD on a non-VHE system requires us to enable SSBS
> > @@ -1481,7 +1486,10 @@ static void cpu_set_hyp_vector(void)
> > struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
> > void *vector = hyp_spectre_vector_selector[data->slot];
> >  
> > -   *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
> > +   if (!is_protected_kvm_enabled())
> > +   *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
> > +   else
> > +   kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
> 
> *Very* minor nit, but it might be cleaner to have static inline functions
> with the same prototypes as the hypercalls, just to make the code even
> easier to read. e.g
> 
>   if (!is_protected_kvm_enabled())
>   _cpu_set_vector(data->slot);
>   else
>   kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
> 
> you could then conceivably wrap that in a macro and avoid having the
> "is_protected_kvm_enabled()" checks explicit every time.

Happy to do this here, but are you suggesting to generalize this pattern
to other places as well?
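
For illustration, one shape the wrapper suggested above could take (the macro
name and the pairing of an EL1 helper with its __pkvm_ hypercall are
assumptions, not code from this series):

	/*
	 * Call the EL1 helper directly, or the equivalent nVHE hypercall
	 * when protected KVM is enabled.
	 */
	#define pkvm_call(host_fn, hyp_fn, ...)					\
		do {								\
			if (is_protected_kvm_enabled())				\
				kvm_call_hyp_nvhe(hyp_fn, ##__VA_ARGS__);	\
			else							\
				host_fn(__VA_ARGS__);				\
		} while (0)

	/* e.g. pkvm_call(cpu_set_vector, __pkvm_cpu_set_vector, data->slot); */
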

> >  }
> >  
> >  static void cpu_hyp_reinit(void)
> > @@ -1489,13 +1497,14 @@ static void cpu_hyp_reinit(void)
> > 
> > kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
> >  
> > cpu_hyp_reset();
> > -   cpu_set_hyp_vector();
> >  
> > if (is_kernel_in_hyp_mode())
> > kvm_timer_init_vhe();
> > else
> > cpu_init_hyp_mode();
> >  
> > +   cpu_set_hyp_vector();
> > +
> > kvm_arm_init_debug();
> >  
> > if (vgic_present)
> > @@ -1714,13 +1723,52 @@ static int copy_cpu_ftr_regs(void)
> > return 0;
> >  }
> >  
> > +static int kvm_hyp_enable_protection(void)
> > +{
> > +   void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
> > +   int ret, cpu;
> > +   void *addr;
> > +
> > +   if (!is_protected_kvm_enabled())
> > +   return 0;
> 
> Maybe I'm hung up on my previous suggestion, but I feel like we shouldn't
> get here if protected kvm isn't enabled.

The alternative is to move this check next to the call site, but it
won't help much IMO.

> 
> > +   if (!hyp_mem_base)
> > +   return -ENOMEM;
> > +
> > +   addr = phys_to_virt(hyp_mem_base);
> > +   ret = 

Re: [RFC PATCH v2 24/26] KVM: arm64: Make memcache anonymous in pgtable allocator

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 02:24:44PM +, Quentin Perret wrote:
> On Wednesday 03 Feb 2021 at 15:59:44 (+), Will Deacon wrote:
> > On Fri, Jan 08, 2021 at 12:15:22PM +, Quentin Perret wrote:
> > > The current stage2 page-table allocator uses a memcache to get
> > > pre-allocated pages when it needs any. To allow re-using this code at
> > > EL2 which uses a concept of memory pools, make the memcache argument to
> > > kvm_pgtable_stage2_map() anonymous. and let the mm_ops zalloc_page()
> > > callbacks use it the way they need to.
> > > 
> > > Signed-off-by: Quentin Perret 
> > > ---
> > >  arch/arm64/include/asm/kvm_pgtable.h | 6 +++---
> > >  arch/arm64/kvm/hyp/pgtable.c | 4 ++--
> > >  2 files changed, 5 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> > > b/arch/arm64/include/asm/kvm_pgtable.h
> > > index 8e8f1d2c5e0e..d846bc3d3b77 100644
> > > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > > @@ -176,8 +176,8 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable 
> > > *pgt);
> > >   * @size:Size of the mapping.
> > >   * @phys:Physical address of the memory to map.
> > >   * @prot:Permissions and attributes for the mapping.
> > > - * @mc:  Cache of pre-allocated GFP_PGTABLE_USER memory from 
> > > which to
> > > - *   allocate page-table pages.
> > > + * @mc:  Cache of pre-allocated memory from which to allocate 
> > > page-table
> > > + *   pages.
> > 
> > We should probably mention that this memory must be zeroed, since I don't
> > think the page-table code takes care of that.
> 
> OK, though I think this is unrelated to this change -- this is already
> true today I believe. Anyhow, I'll pile a change on top.

It is, but GFP_PGTABLE_USER implies __GFP_ZERO, so the existing comment
captures that.

Will


Re: [RFC PATCH v2 26/26] KVM: arm64: Wrap the host with a stage 2

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 02:26:35PM +, Quentin Perret wrote:
> On Wednesday 03 Feb 2021 at 16:11:47 (+), Will Deacon wrote:
> > On Fri, Jan 08, 2021 at 12:15:24PM +, Quentin Perret wrote:
> > > When KVM runs in protected nVHE mode, make use of a stage 2 page-table
> > > to give the hypervisor some control over the host memory accesses. At
> > > the moment all memory aborts from the host will be instantly idmapped
> > > RWX at stage 2 in a lazy fashion. Later patches will make use of that
> > > infrastructure to implement access control restrictions to e.g. protect
> > > guest memory from the host.
> > > 
> > > Signed-off-by: Quentin Perret 
> > > ---
> > >  arch/arm64/include/asm/kvm_cpufeature.h   |   2 +
> > >  arch/arm64/kernel/image-vars.h|   3 +
> > >  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  33 +++
> > >  arch/arm64/kvm/hyp/nvhe/Makefile  |   2 +-
> > >  arch/arm64/kvm/hyp/nvhe/hyp-init.S|   1 +
> > >  arch/arm64/kvm/hyp/nvhe/hyp-main.c|   6 +
> > >  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 191 ++
> > >  arch/arm64/kvm/hyp/nvhe/setup.c   |   6 +
> > >  arch/arm64/kvm/hyp/nvhe/switch.c  |   7 +-
> > >  arch/arm64/kvm/hyp/nvhe/tlb.c |   4 +-
> > >  10 files changed, 248 insertions(+), 7 deletions(-)
> > >  create mode 100644 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> > >  create mode 100644 arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > 
> > [...]
> > 
> > > +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
> > > +{
> > > + enum kvm_pgtable_prot prot;
> > > + u64 far, hpfar, esr, ipa;
> > > + int ret;
> > > +
> > > + esr = read_sysreg_el2(SYS_ESR);
> > > + if (!__get_fault_info(esr, &far, &hpfar))
> > > + hyp_panic();
> > > +
> > > + prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W | KVM_PGTABLE_PROT_X;
> > > + ipa = (hpfar & HPFAR_MASK) << 8;
> > > + ret = host_stage2_map(ipa, PAGE_SIZE, prot);
> > 
> > Can we try to put down a block mapping if the whole thing falls within
> > memory?
> 
> Yes we can! And in fact we can do that outside of memory too. It's
> queued for v3 already, so stay tuned ... :)

Awesome! The Stage-2 TLB thanks you.

Will


Re: [RFC PATCH v2 16/26] KVM: arm64: Prepare Hyp memory protection

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 14:37:10 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:14PM +, Quentin Perret wrote:
> > +static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
> > +{
> > +   unsigned long total = 0, i;
> > +
> > +   /* Provision the worst case scenario with 4 levels of page-table */
> > +   for (i = 0; i < 4; i++) {
> 
> Looks like you want KVM_PGTABLE_MAX_LEVELS, so maybe move that into a
> header?

Will do.

> 
> > +   nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
> > +   total += nr_pages;
> > +   }
> 
> ... that said, I'm not sure this needs to iterate at all. What exactly are
> you trying to compute?

I'm trying to figure out how many pages I will need to construct a
page-table covering nr_pages contiguous pages. The first iteration tells
me how many level 0 pages I need to cover nr_pages, the second iteration
how many level 1 pages I need to cover the level 0 pages, and so on...

I might be doing this naively though. Got a better idea?
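
(To put rough numbers on it, assuming 4K pages and PTRS_PER_PTE = 512:
covering 1 GiB, i.e. 262144 pages, the loop yields 512 + 1 + 1 + 1 = 515
page-table pages, since the per-level count quickly collapses to 1.)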

> > +
> > +   return total;
> > +}
> > +
> > +static inline unsigned long hyp_s1_pgtable_size(void)
> > +{
> > +   struct hyp_memblock_region *reg;
> > +   unsigned long nr_pages, res = 0;
> > +   int i;
> > +
> > +   if (kvm_nvhe_sym(hyp_memblock_nr) <= 0)
> > +   return 0;
> 
> It's a bit grotty having this be signed. Why do we need to encode the error
> case differently from the 0 case?

Here specifically we don't, but it is needed in early_init_dt_add_memory_hyp()
to distinguish the overflow case from the first memblock being added.

> 
> > +
> > +   for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
> > > + reg = &kvm_nvhe_sym(hyp_memory)[i];
> 
> You could declare reg in the loop body.

I found it prettier like that and assumed the compiler would optimize it
anyway, but ok.

> > +   nr_pages = (reg->end - reg->start) >> PAGE_SHIFT;
> > +   nr_pages = __hyp_pgtable_max_pages(nr_pages);
> 
> Maybe it would make more sense for __hyp_pgtable_max_pages to take the
> size in bytes rather than pages, since most callers seem to have to do the
> conversion?

Yes, and it seems I can apply small cleanups in other places, so I'll
fix this throughout the patch.

> > +   res += nr_pages << PAGE_SHIFT;
> > +   }
> > +
> > +   /* Allow 1 GiB for private mappings */
> > +   nr_pages = (1 << 30) >> PAGE_SHIFT;
> 
> SZ_1G >> PAGE_SHIFT

Much nicer, thanks. I was also considering adding a Kconfig option for
that, because 1GiB is totally arbitrary. Thoughts?

> > +   nr_pages = __hyp_pgtable_max_pages(nr_pages);
> > +   res += nr_pages << PAGE_SHIFT;
> > +
> > +   return res;
> 
> Might make more sense to keep res in pages until here, then just shift when
> returning.
> 
> > +}
> > +
> > +#endif /* __KVM_HYP_MM_H */
> > diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile 
> > b/arch/arm64/kvm/hyp/nvhe/Makefile
> > index 72cfe53f106f..d7381a503182 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> > +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> > @@ -11,9 +11,9 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
> >  
> >  obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o 
> > host.o \
> >  hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
> > -cache.o cpufeature.o
> > +cache.o cpufeature.o setup.o mm.o
> >  obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
> > -../fpsimd.o ../hyp-entry.o ../exception.o
> > +../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
> >  obj-y += $(lib-objs)
> >  
> >  ##
> > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S 
> > b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > index 31b060a44045..ad943966c39f 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
> > @@ -251,4 +251,35 @@ alternative_else_nop_endif
> >  
> >  SYM_CODE_END(__kvm_handle_stub_hvc)
> >  
> > +SYM_FUNC_START(__pkvm_init_switch_pgd)
> > +   /* Turn the MMU off */
> > +   pre_disable_mmu_workaround
> > +   mrs x2, sctlr_el2
> > +   bic x3, x2, #SCTLR_ELx_M
> > +   msr sctlr_el2, x3
> > +   isb
> > +
> > +   tlbi alle2
> > +
> > +   /* Install the new pgtables */
> > +   ldr x3, [x0, #NVHE_INIT_PGD_PA]
> > +   phys_to_ttbr x4, x3
> > +alternative_if ARM64_HAS_CNP
> > +   orr x4, x4, #TTBR_CNP_BIT
> > +alternative_else_nop_endif
> > +   msr ttbr0_el2, x4
> > +
> > +   /* Set the new stack pointer */
> > +   ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
> > +   mov sp, x0
> > +
> > +   /* And turn the MMU back on! */
> > +   dsb nsh
> > +   isb
> > +   msr sctlr_el2, x2
> > +   ic  iallu
> > +   isb
> > +   ret x1
> > +SYM_FUNC_END(__pkvm_init_switch_pgd)
> > +
> > .popsection
> > diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c 
> > b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > index a906f9e2ff34..3075f117651c 100644
> > --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> > @@ -6,12 

Re: [PATCH v7 1/3] arm64: kvm: Save/restore MTE registers

2021-02-04 Thread Marc Zyngier

On 2021-02-04 14:33, Steven Price wrote:

On 02/02/2021 15:36, Marc Zyngier wrote:

On 2021-01-15 15:28, Steven Price wrote:
Define the new system registers that MTE introduces and context 
switch
them. The MTE feature is still hidden from the ID register as it 
isn't

supported in a VM yet.

Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/kvm_host.h  |  4 ++
 arch/arm64/include/asm/kvm_mte.h   | 74 
++

 arch/arm64/include/asm/sysreg.h    |  3 +-
 arch/arm64/kernel/asm-offsets.c    |  3 +
 arch/arm64/kvm/hyp/entry.S |  7 ++
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h |  4 ++
 arch/arm64/kvm/sys_regs.c  | 14 ++--
 7 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_mte.h

diff --git a/arch/arm64/include/asm/kvm_host.h
b/arch/arm64/include/asm/kvm_host.h
index 11beda85ee7e..51590a397e4b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -148,6 +148,8 @@ enum vcpu_sysreg {
 SCTLR_EL1,    /* System Control Register */
 ACTLR_EL1,    /* Auxiliary Control Register */
 CPACR_EL1,    /* Coprocessor Access Control */
+    RGSR_EL1,    /* Random Allocation Tag Seed Register */
+    GCR_EL1,    /* Tag Control Register */
 ZCR_EL1,    /* SVE Control */
 TTBR0_EL1,    /* Translation Table Base Register 0 */
 TTBR1_EL1,    /* Translation Table Base Register 1 */
@@ -164,6 +166,8 @@ enum vcpu_sysreg {
 TPIDR_EL1,    /* Thread ID, Privileged */
 AMAIR_EL1,    /* Aux Memory Attribute Indirection Register */
 CNTKCTL_EL1,    /* Timer Control Register (EL1) */
+    TFSRE0_EL1,    /* Tag Fault Status Register (EL0) */
+    TFSR_EL1,    /* Tag Fault Stauts Register (EL1) */


s/Stauts/Status/

Is there any reason why the MTE registers aren't grouped together?


I had been under the impression this list is sorted by the encoding of
the system registers, although double checking I've screwed up the
order of TFSRE0_EL1/TFSR_EL1, and not all the other fields are sorted
that way.


It grew organically, and was initially matching the original order
of the save/restore sequence. This order has long disappeared with
VHE, and this is essentially nothing more than a bag of indices
(although NV does bring some order back to deal with VNCR-backed
registers).

[...]


diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index cce43bfe158f..94d9736f0133 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -45,6 +45,8 @@ static inline void __sysreg_save_el1_state(struct
kvm_cpu_context *ctxt)
 ctxt_sys_reg(ctxt, CNTKCTL_EL1)    = 
read_sysreg_el1(SYS_CNTKCTL);

 ctxt_sys_reg(ctxt, PAR_EL1)    = read_sysreg_par();
 ctxt_sys_reg(ctxt, TPIDR_EL1)    = read_sysreg(tpidr_el1);
+    if (system_supports_mte())
+    ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);


I already asked for it, and I'm going to ask for it again:
Most of the sysreg save/restore is guarded by a per-vcpu check
(HCR_EL2.ATA), while this one is unconditionally saved/restore
if the host is MTE capable. Why is that so?


Sorry, I thought your concern was for registers that affect the host
(as they are obviously more performance critical as they are hit on
every guest exit). Although I guess that's incorrect for nVHE which is
what all the cool kids want now ;)


I think we want both correctness *and* performance, for both VHE
and nVHE. Things like EL0 registers should be able to be moved
to load/put on all implementations, and the correct switching
be done at the right spot only when required.

Thanks,

M.
--
Jazz is not dead. It just smells funny...
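
For reference, a sketch of the per-VM-guarded save that this exchange points
towards (ctxt_has_mte() is an assumed helper built on the kvm_has_mte() macro
from patch 2/3, not code from this series):

	static inline void __sysreg_save_mte_state(struct kvm_cpu_context *ctxt)
	{
		if (!ctxt_has_mte(ctxt))
			return;

		ctxt_sys_reg(ctxt, TFSR_EL1)   = read_sysreg_el1(SYS_TFSR);
		ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
	}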


Re: [RFC] Use SMMU HTTU for DMA dirty page tracking

2021-02-04 Thread Keqian Zhu
Hi Jean and Kevin,

FYI, I sent out the SMMUv3 HTTU support for DMA dirty tracking[1] a week
ago.

Thanks,
Keqian

[1] 
https://lore.kernel.org/linux-iommu/20210128151742.18840-1-zhukeqi...@huawei.com/

On 2020/5/27 17:14, Jean-Philippe Brucker wrote:
> On Wed, May 27, 2020 at 08:40:47AM +, Tian, Kevin wrote:
>>> From: Xiang Zheng 
>>> Sent: Wednesday, May 27, 2020 2:45 PM
>>>
>>>
>>> On 2020/5/27 11:27, Tian, Kevin wrote:
> From: Xiang Zheng
> Sent: Monday, May 25, 2020 7:34 PM
>
> [+cc Kirti, Yan, Alex]
>
> On 2020/5/23 1:14, Jean-Philippe Brucker wrote:
>> Hi,
>>
>> On Tue, May 19, 2020 at 05:42:55PM +0800, Xiang Zheng wrote:
>>> Hi all,
>>>
>>> Is there any plan for enabling SMMU HTTU?
>>
>> Not outside of SVA, as far as I know.
>>
>
>>> I have seen the patch locates in the SVA series patch, which adds
>>> support for HTTU:
>>> https://www.spinics.net/lists/arm-kernel/msg798694.html
>>>
>>> HTTU reduces the number of access faults on SMMU fault queue
>>> (permission faults also benefit from it).
>>>
>>> Besides reducing the faults, HTTU also helps to track dirty pages for
>>> device DMA. Is it feasible to utilize HTTU to get dirty pages on device
>>> DMA during VFIO live migration?
>>
>> As you know there is a VFIO interface for this under discussion:
>> https://lore.kernel.org/kvm/1589781397-28368-1-git-send-email-
> kwankh...@nvidia.com/
>> It doesn't implement an internal API to communicate with the IOMMU
> driver
>> about dirty pages.

 We plan to add such API later, e.g. to utilize A/D bit in VT-d 2nd-level
 page tables (Rev 3.0).

>>>
>>> Thank you, Kevin.
>>>
>>> When will you send this series patches? Maybe(Hope) we can also support
>>> hardware-based dirty pages tracking via common APIs based on your
>>> patches. :)
>>
>> Yan is working with Kirti on basic live migration support now. After that
>> part is done, we will start working on A/D bit support. Yes, common APIs
>> are definitely the goal here.
>>
>>>
>
>>
>>> If SMMU can track dirty pages, devices are not required to implement
>>> additional dirty pages tracking to support VFIO live migration.
>>
>> It seems feasible, though tracking it in the device might be more
>> efficient. I might have misunderstood but I think for live migration of
>> the Intel NIC they trap guest accesses to the device and introspect its
>> state to figure out which pages it is accessing.

 Does HTTU implement A/D-like mechanism in SMMU page tables, or just
 report dirty pages in a log buffer? Either way tracking dirty pages in 
 IOMMU
 side is generic thus doesn't require device-specific tweak like in Intel 
 NIC.

>>>
>>> Currently HTTU just implement A/D-like mechanism in SMMU page tables.
>>> We certainly
>>> expect SMMU can also implement PML-like feature so that we can avoid
>>> walking the
>>> whole page table to get the dirty pages.
> 
> There is no reporting of dirty pages in log buffer. It might be possible
> to do software logging based on PRI or Stall, but that requires special
> support in the endpoint as well as the SMMU.
> 
>> Is there a link to HTTU introduction?
> 
> I don't know any gentle introduction, but there are sections D5.4.11
> "Hardware management of the Access flag and dirty state" in the ARM
> Architecture Reference Manual (DDI0487E), and section 3.13 "Translation
> table entries and Access/Dirty flags" in the SMMU specification
> (IHI0070C). HTTU stands for "Hardware Translation Table Update".
> 
> In short, when HTTU is enabled, the SMMU translation performs an atomic
> read-modify-write on the leaf translation table descriptor, setting some
> bits depending on the type of memory access. This can be enabled
> independently on both stage-1 and stage-2 tables (equivalent to your 1st
> and 2nd page tables levels, I think).
> 
> Thanks,
> Jean


RE: [RFC] Use SMMU HTTU for DMA dirty page tracking

2021-02-04 Thread Tian, Kevin
> From: Keqian Zhu 
> Sent: Friday, February 5, 2021 11:31 AM
> 
> Hi Jean and Kevin,
> 
> FYI, I sent out the SMMUv3 HTTU support for DMA dirty tracking[1] a
> week ago.
> 
> Thanks,
> Keqian
> 
> [1] https://lore.kernel.org/linux-iommu/20210128151742.18840-1-
> zhukeqi...@huawei.com/

Thanks for the pointer. We will take a look.

> 
> On 2020/5/27 17:14, Jean-Philippe Brucker wrote:
> > On Wed, May 27, 2020 at 08:40:47AM +, Tian, Kevin wrote:
> >>> From: Xiang Zheng 
> >>> Sent: Wednesday, May 27, 2020 2:45 PM
> >>>
> >>>
> >>> On 2020/5/27 11:27, Tian, Kevin wrote:
> > From: Xiang Zheng
> > Sent: Monday, May 25, 2020 7:34 PM
> >
> > [+cc Kirti, Yan, Alex]
> >
> > On 2020/5/23 1:14, Jean-Philippe Brucker wrote:
> >> Hi,
> >>
> >> On Tue, May 19, 2020 at 05:42:55PM +0800, Xiang Zheng wrote:
> >>> Hi all,
> >>>
> >>> Is there any plan for enabling SMMU HTTU?
> >>
> >> Not outside of SVA, as far as I know.
> >>
> >
> >>> I have seen the patch locates in the SVA series patch, which adds
> >>> support for HTTU:
> >>> https://www.spinics.net/lists/arm-kernel/msg798694.html
> >>>
> >>> HTTU reduces the number of access faults on SMMU fault queue
> >>> (permission faults also benefit from it).
> >>>
> >>> Besides reducing the faults, HTTU also helps to track dirty pages for
> >>> device DMA. Is it feasible to utilize HTTU to get dirty pages on 
> >>> device
> >>> DMA during VFIO live migration?
> >>
> >> As you know there is a VFIO interface for this under discussion:
> >> https://lore.kernel.org/kvm/1589781397-28368-1-git-send-email-
> > kwankh...@nvidia.com/
> >> It doesn't implement an internal API to communicate with the
> IOMMU
> > driver
> >> about dirty pages.
> 
>  We plan to add such API later, e.g. to utilize A/D bit in VT-d 2nd-level
>  page tables (Rev 3.0).
> 
> >>>
> >>> Thank you, Kevin.
> >>>
> >>> When will you send this patch series? Maybe (hopefully) we can also
> >>> support hardware-based dirty page tracking via common APIs based on
> >>> your patches. :)
> >>
> >> Yan is working with Kirti on basic live migration support now. After that
> >> part is done, we will start working on A/D bit support. Yes, common APIs
> >> are definitely the goal here.
> >>
> >>>
> >
> >>
> >>> If SMMU can track dirty pages, devices are not required to implement
> >>> additional dirty pages tracking to support VFIO live migration.
> >>
> >> It seems feasible, though tracking it in the device might be more
> >> efficient. I might have misunderstood but I think for live migration of
> >> the Intel NIC they trap guest accesses to the device and introspect its
> >> state to figure out which pages it is accessing.
> 
>  Does HTTU implement an A/D-like mechanism in the SMMU page tables, or just
>  report dirty pages in a log buffer? Either way, tracking dirty pages on the
>  IOMMU side is generic and thus doesn't require a device-specific tweak like
>  the Intel NIC needs.
> 
> >>>
> >>> Currently HTTU just implements an A/D-like mechanism in SMMU page tables.
> >>> We certainly expect the SMMU to also implement a PML-like feature so that
> >>> we can avoid walking the whole page table to get the dirty pages.
> >
> > There is no reporting of dirty pages in log buffer. It might be possible
> > to do software logging based on PRI or Stall, but that requires special
> > support in the endpoint as well as the SMMU.
> >
> >> Is there a link to HTTU introduction?
> >
> > I don't know any gentle introduction, but there are sections D5.4.11
> > "Hardware management of the Access flag and dirty state" in the ARM
> > Architecture Reference Manual (DDI0487E), and section 3.13 "Translation
> > table entries and Access/Dirty flags" in the SMMU specification
> > (IHI0070C). HTTU stands for "Hardware Translation Table Update".
> >
> > In short, when HTTU is enabled, the SMMU translation performs an atomic
> > read-modify-write on the leaf translation table descriptor, setting some
> > bits depending on the type of memory access. This can be enabled
> > independently on both stage-1 and stage-2 tables (equivalent to your 1st
> > and 2nd page tables levels, I think).
> >
> > Thanks,
> > Jean


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Quentin Perret
On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> Just feels a bit backwards having __find_buddy() take an order parameter,
> yet then return a page of the wrong order! __hyp_extract_page() always
> passes the p->order as the order,

Gotcha, so maybe this is just a naming problem. __find_buddy() is simply
a helper to lookup/index the vmemmap, but it's perfectly possible that
the 'destination' page that is being indexed has already been allocated,
and split up multiple times (and so at a different order), etc ... And
that is the caller's job to decide.

How about __lookup_potential_buddy()? Any suggestions?


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Quentin Perret
On Thursday 04 Feb 2021 at 18:24:05 (+), Will Deacon wrote:
> On Thu, Feb 04, 2021 at 06:19:36PM +, Quentin Perret wrote:
> > On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> > > Just feels a bit backwards having __find_buddy() take an order parameter,
> > > yet then return a page of the wrong order! __hyp_extract_page() always
> > > passes the p->order as the order,
> > 
> > Gotcha, so maybe this is just a naming problem. __find_buddy() is simply
> > a helper to lookup/index the vmemmap, but it's perfectly possible that
> > the 'destination' page that is being indexed has already been allocated,
> > and split up multiple times (and so at a different order), etc ... And
> > that is the caller's job to decide.
> > 
> > How about __lookup_potential_buddy()? Any suggestions?
> 
> Hey, my job here is to waffle incoherently and hope that you find bugs in
> your own code. Now you want me to _name_ something! Jeez...

Hey, that's my special -- I already got Marc to make a suggestion on v1
and it's been my favorite function name so far, so why not try again?

https://lore.kernel.org/kvmarm/d6a674a0e8e259161ab741d78924c...@kernel.org/

> Ok, how about __find_buddy() does what it does today but doesn't take an
> order argument, whereas __find_buddy_of_order() takes the order argument
> and checks the page order before returning?

Sounds like a plan!

Cheers,
Quentin
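
A rough sketch of that split, building on the __find_buddy() from the quoted
patch (the hyp_page 'order' field is taken from the series; whether an
availability check also belongs here is left to the caller):

/*
 * Sketch only: __find_buddy() keeps doing the vmemmap lookup as today,
 * while the checked variant below only returns a buddy that really is
 * a block of the requested order.
 */
static struct hyp_page *__find_buddy_of_order(struct hyp_pool *pool,
                                              struct hyp_page *p,
                                              unsigned int order)
{
        struct hyp_page *buddy = __find_buddy(pool, p, order);

        if (!buddy || buddy->order != order)
                return NULL;

        return buddy;
}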


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Quentin Perret
On Thursday 04 Feb 2021 at 17:48:49 (+), Will Deacon wrote:
> On Thu, Feb 04, 2021 at 02:52:52PM +, Quentin Perret wrote:
> > On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> > > On Wed, Feb 03, 2021 at 06:33:30PM +, Quentin Perret wrote:
> > > > On Tuesday 02 Feb 2021 at 18:13:08 (+), Will Deacon wrote:
> > > > > On Fri, Jan 08, 2021 at 12:15:10PM +, Quentin Perret wrote:
> > > > > > + *   __find_buddy(pool, page 0, order 0) => page 1
> > > > > > + *   __find_buddy(pool, page 0, order 1) => page 2
> > > > > > + *   __find_buddy(pool, page 1, order 0) => page 0
> > > > > > + *   __find_buddy(pool, page 2, order 0) => page 3
> > > > > > + */
> > > > > > +static struct hyp_page *__find_buddy(struct hyp_pool *pool, struct 
> > > > > > hyp_page *p,
> > > > > > +unsigned int order)
> > > > > > +{
> > > > > > +   phys_addr_t addr = hyp_page_to_phys(p);
> > > > > > +
> > > > > > +   addr ^= (PAGE_SIZE << order);
> > > > > > +   if (addr < pool->range_start || addr >= pool->range_end)
> > > > > > +   return NULL;
> > > > > 
> > > > > Are these range checks only needed because the pool isn't required to 
> > > > > be
> > > > > an exact power-of-2 pages in size? If so, maybe it would be more
> > > > > straightforward to limit the max order on a per-pool basis depending 
> > > > > upon
> > > > > its size?
> > > > 
> > > > More importantly, it is because pages outside of the pool are not
> > > > guaranteed to be covered by the hyp_vmemmap, so I really need to make
> > > > sure I don't dereference them.
> > > 
> > > Wouldn't having a per-pool max order help with that?
> > 
> > The issue is, I have no alignment guarantees for the pools, so I may end
> > up with max_order = 0 ...
> 
> Yeah, so you would still need the range tracking,

Hmm actually I don't think I would, but that would essentially mean the
'buddy' allocator is now turned into a free list of single pages
(because we cannot create pages of order 1).

> but it would at least help
> to reduce HYP_MAX_ORDER failed searches each time. Still, we can always do
> that later.

Sorry but I am not following. In which case do we have HYP_MAX_ORDER
failed searches?

Thanks,
Quentin


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Quentin Perret
On Thursday 04 Feb 2021 at 18:13:18 (+), Will Deacon wrote:
> I was going from memory, but the loop in __hyp_alloc_pages() searches up to
> HYP_MAX_ORDER, whereas this is _never_ going to succeed beyond some per-pool
> order determined by the size of the pool. But I doubt it matters -- I
> thought we did more than just check a list.

Ah, I see -- I was looking at the __hyp_attach_page() loop.

I think it's a good point, I should be able to figure out a max order
based on the size and alignment of the pool, and cache that in struct
hyp_pool to optimize cases where this is < HYP_MAX_ORDER.
Should be easy enough, I'll see what I can do in v3.

Thanks!
Quentin
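
One way such a cap could be computed, sketched here under the assumption that
hyp_pool keeps its range_start/range_end fields and that HYP_MAX_ORDER is the
global limit: pick the largest order for which at least one naturally aligned
block fits entirely inside the pool.

static unsigned int hyp_pool_max_order(struct hyp_pool *pool)
{
        unsigned int order;

        for (order = HYP_MAX_ORDER; order > 0; order--) {
                phys_addr_t start = ALIGN(pool->range_start, PAGE_SIZE << order);

                /* Does a naturally aligned block of this order fit in the pool? */
                if (start + (PAGE_SIZE << order) <= pool->range_end)
                        break;
        }

        return order;   /* can legitimately be 0 for tiny or misaligned pools */
}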


Re: [RFC PATCH v2 21/26] KVM: arm64: Refactor kvm_arm_setup_stage2()

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 15:53:54 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:19PM +, Quentin Perret wrote:
> > In order to re-use some of the stage 2 setup at EL2, factor parts of
> > kvm_arm_setup_stage2() out into static inline functions.
> > 
> > No functional change intended.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_mmu.h | 48 
> >  arch/arm64/kvm/reset.c   | 42 +++-
> >  2 files changed, 52 insertions(+), 38 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> > b/arch/arm64/include/asm/kvm_mmu.h
> > index 662f0415344e..83b4c5cf4768 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -280,6 +280,54 @@ static inline int kvm_write_guest_lock(struct kvm 
> > *kvm, gpa_t gpa,
> > return ret;
> >  }
> >  
> > +static inline u64 kvm_get_parange(u64 mmfr0)
> > +{
> > +   u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
> > +   ID_AA64MMFR0_PARANGE_SHIFT);
> > +   if (parange > ID_AA64MMFR0_PARANGE_MAX)
> > +   parange = ID_AA64MMFR0_PARANGE_MAX;
> > +
> > +   return parange;
> > +}
> > +
> > +/*
> > + * The VTCR value is common across all the physical CPUs on the system.
> > + * We use system wide sanitised values to fill in different fields,
> > + * except for Hardware Management of Access Flags. HA Flag is set
> > + * unconditionally on all CPUs, as it is safe to run with or without
> > + * the feature and the bit is RES0 on CPUs that don't support it.
> > + */
> > +static inline u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
> > +{
> > +   u64 vtcr = VTCR_EL2_FLAGS;
> > +   u8 lvls;
> > +
> > +   vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
> > +   vtcr |= VTCR_EL2_T0SZ(phys_shift);
> > +   /*
> > +* Use a minimum 2 level page table to prevent splitting
> > +* host PMD huge pages at stage2.
> > +*/
> > +   lvls = stage2_pgtable_levels(phys_shift);
> > +   if (lvls < 2)
> > +   lvls = 2;
> > +   vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
> > +
> > +   /*
> > +* Enable the Hardware Access Flag management, unconditionally
> > +* on all CPUs. The features is RES0 on CPUs without the support
> > +* and must be ignored by the CPUs.
> > +*/
> > +   vtcr |= VTCR_EL2_HA;
> > +
> > +   /* Set the vmid bits */
> > +   vtcr |= (get_vmid_bits(mmfr1) == 16) ?
> > +   VTCR_EL2_VS_16BIT :
> > +   VTCR_EL2_VS_8BIT;
> > +
> > +   return vtcr;
> > +}
> 
> Although I think this is functionally fine, I think it's unusual to see
> large "static inline" functions like this in shared header files. One
> alternative approach would be to follow the example of
> kernel/locking/qspinlock_paravirt.h, where the header is guarded in such a
> way that is only ever included by kernel/locking/qspinlock.c and therefore
> doesn't need the "inline" at all. That separation really helps, I think.

Alternatively, I might be able to have an mmu.c file in the hyp/ folder,
and to compile it for both the host kernel and the EL2 obj as we do for
a few things already. Or maybe I'll just stick it in pgtable.c. Either
way, it'll add a function call, but I can't really see that having any
measurable impact, so we should be fine.

Cheers,
Quentin
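
For context, with those helpers the caller reduces to roughly the following
(a sketch only; the phys_shift validation against kvm_ipa_limit is elided):

int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
{
        u64 mmfr0, mmfr1;
        u32 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);

        /* ... validate phys_shift as today ... */

        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

        return 0;
}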


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 06:01:12PM +, Quentin Perret wrote:
> On Thursday 04 Feb 2021 at 17:48:49 (+), Will Deacon wrote:
> > On Thu, Feb 04, 2021 at 02:52:52PM +, Quentin Perret wrote:
> > > On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> > > > On Wed, Feb 03, 2021 at 06:33:30PM +, Quentin Perret wrote:
> > > > > On Tuesday 02 Feb 2021 at 18:13:08 (+), Will Deacon wrote:
> > > > > > On Fri, Jan 08, 2021 at 12:15:10PM +, Quentin Perret wrote:
> > > > > > > + *   __find_buddy(pool, page 0, order 0) => page 1
> > > > > > > + *   __find_buddy(pool, page 0, order 1) => page 2
> > > > > > > + *   __find_buddy(pool, page 1, order 0) => page 0
> > > > > > > + *   __find_buddy(pool, page 2, order 0) => page 3
> > > > > > > + */
> > > > > > > +static struct hyp_page *__find_buddy(struct hyp_pool *pool, 
> > > > > > > struct hyp_page *p,
> > > > > > > +  unsigned int order)
> > > > > > > +{
> > > > > > > + phys_addr_t addr = hyp_page_to_phys(p);
> > > > > > > +
> > > > > > > + addr ^= (PAGE_SIZE << order);
> > > > > > > + if (addr < pool->range_start || addr >= pool->range_end)
> > > > > > > + return NULL;
> > > > > > 
> > > > > > Are these range checks only needed because the pool isn't required 
> > > > > > to be
> > > > > > an exact power-of-2 pages in size? If so, maybe it would be more
> > > > > > straightforward to limit the max order on a per-pool basis 
> > > > > > depending upon
> > > > > > its size?
> > > > > 
> > > > > More importantly, it is because pages outside of the pool are not
> > > > > guaranteed to be covered by the hyp_vmemmap, so I really need to make
> > > > > sure I don't dereference them.
> > > > 
> > > > Wouldn't having a per-pool max order help with that?
> > > 
> > > The issue is, I have no alignment guarantees for the pools, so I may end
> > > up with max_order = 0 ...
> > 
> > Yeah, so you would still need the range tracking,
> 
> Hmm actually I don't think I would, but that would essentially mean the
> 'buddy' allocator is now turned into a free list of single pages
> (because we cannot create pages of order 1).

Right, I'm not suggesting we do that.

> > but it would at least help
> > to reduce HYP_MAX_ORDER failed searches each time. Still, we can always do
> > that later.
> 
> Sorry but I am not following. In which case do we have HYP_MAX_ORDER
> failed searches?

I was going from memory, but the loop in __hyp_alloc_pages() searches up to
HYP_MAX_ORDER, whereas this is _never_ going to succeed beyond some per-pool
order determined by the size of the pool. But I doubt it matters -- I
thought we did more than just check a list.

Will


Re: [RFC PATCH v2 23/26] KVM: arm64: Refactor __populate_fault_info()

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 15:58:32 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:21PM +, Quentin Perret wrote:
> > Refactor __populate_fault_info() to introduce __get_fault_info() which
> > will be used once the host is wrapped in a stage 2.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/kvm/hyp/include/hyp/switch.h | 36 +++--
> >  1 file changed, 22 insertions(+), 14 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h 
> > b/arch/arm64/kvm/hyp/include/hyp/switch.h
> > index 84473574c2e7..e9005255d639 100644
> > --- a/arch/arm64/kvm/hyp/include/hyp/switch.h
> > +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
> > @@ -157,19 +157,9 @@ static inline bool __translate_far_to_hpfar(u64 far, 
> > u64 *hpfar)
> > return true;
> >  }
> >  
> > -static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
> > +static inline bool __get_fault_info(u64 esr, u64 *far, u64 *hpfar)
> 
> Could this take a pointer to a struct kvm_vcpu_fault_info instead?

The disr_el1 field will be unused in this case, but yes, that should
work.

Cheers,
Quentin
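
A minimal sketch of that shape (the HPFAR/translate-FAR fallback and the EC
filtering are elided and stay whatever the current code does):

static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
{
        fault->far_el2 = read_sysreg_el2(SYS_FAR);
        /* Simplified: the real code may fall back to __translate_far_to_hpfar() */
        fault->hpfar_el2 = read_sysreg(hpfar_el2);

        return true;
}

static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
{
        /* The EC filtering done by the current function is elided here */
        return __get_fault_info(kvm_vcpu_get_esr(vcpu), &vcpu->arch.fault);
}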


Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Quentin Perret
On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> On Wed, Feb 03, 2021 at 06:33:30PM +, Quentin Perret wrote:
> > On Tuesday 02 Feb 2021 at 18:13:08 (+), Will Deacon wrote:
> > > On Fri, Jan 08, 2021 at 12:15:10PM +, Quentin Perret wrote:
> > > > + *   __find_buddy(pool, page 0, order 0) => page 1
> > > > + *   __find_buddy(pool, page 0, order 1) => page 2
> > > > + *   __find_buddy(pool, page 1, order 0) => page 0
> > > > + *   __find_buddy(pool, page 2, order 0) => page 3
> > > > + */
> > > > +static struct hyp_page *__find_buddy(struct hyp_pool *pool, struct 
> > > > hyp_page *p,
> > > > +unsigned int order)
> > > > +{
> > > > +   phys_addr_t addr = hyp_page_to_phys(p);
> > > > +
> > > > +   addr ^= (PAGE_SIZE << order);
> > > > +   if (addr < pool->range_start || addr >= pool->range_end)
> > > > +   return NULL;
> > > 
> > > Are these range checks only needed because the pool isn't required to be
> > > an exact power-of-2 pages in size? If so, maybe it would be more
> > > straightforward to limit the max order on a per-pool basis depending upon
> > > its size?
> > 
> > More importantly, it is because pages outside of the pool are not
> > guaranteed to be covered by the hyp_vmemmap, so I really need to make
> > sure I don't dereference them.
> 
> Wouldn't having a per-pool max order help with that?

The issue is, I have no alignment guarantees for the pools, so I may end
up with max_order = 0 ...


Re: [RFC PATCH 06/11] iommu/arm-smmu-v3: Scan leaf TTD to sync hardware dirty log

2021-02-04 Thread Robin Murphy

On 2021-01-28 15:17, Keqian Zhu wrote:

From: jiangkunkun 

During dirty log tracking, the user will try to retrieve the dirty log from
the IOMMU if it supports hardware dirty logging. This adds a new interface
named sync_dirty_log in the IOMMU layer, and arm-smmu-v3 implements it: it
scans leaf TTDs and treats an entry as dirty if it is writable (as we only
enable HTTU for stage 1, we check that AP[2] is not set).

Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 27 +++
  drivers/iommu/io-pgtable-arm.c  | 90 +
  drivers/iommu/iommu.c   | 41 ++
  include/linux/io-pgtable.h  |  4 +
  include/linux/iommu.h   | 17 
  5 files changed, 179 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 2434519e4bb6..43d0536b429a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2548,6 +2548,32 @@ static size_t arm_smmu_merge_page(struct iommu_domain 
*domain, unsigned long iov
return ops->merge_page(ops, iova, paddr, size, prot);
  }
  
+static int arm_smmu_sync_dirty_log(struct iommu_domain *domain,

+  unsigned long iova, size_t size,
+  unsigned long *bitmap,
+  unsigned long base_iova,
+  unsigned long bitmap_pgshift)
+{
+   struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+   struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+
+   if (!(smmu->features & ARM_SMMU_FEAT_HTTU_HD)) {
+   dev_err(smmu->dev, "don't support HTTU_HD and sync dirty 
log\n");
+   return -EPERM;
+   }
+
+   if (!ops || !ops->sync_dirty_log) {
+   pr_err("don't support sync dirty log\n");
+   return -ENODEV;
+   }
+
+   /* To ensure all inflight transactions are completed */
+   arm_smmu_flush_iotlb_all(domain);


What about transactions that arrive between the point that this 
completes, and the point - potentially much later - that we actually 
access any given PTE during the walk? I don't see what this is supposed 
to be synchronising against, even if it were just a CMD_SYNC (I 
especially don't see why we'd want to knock out the TLBs).



+
+   return ops->sync_dirty_log(ops, iova, size, bitmap,
+   base_iova, bitmap_pgshift);
+}
+
  static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
  {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2649,6 +2675,7 @@ static struct iommu_ops arm_smmu_ops = {
.domain_set_attr= arm_smmu_domain_set_attr,
.split_block= arm_smmu_split_block,
.merge_page = arm_smmu_merge_page,
+   .sync_dirty_log = arm_smmu_sync_dirty_log,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
.put_resv_regions   = generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 17390f258eb1..6cfe1ef3fedd 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -877,6 +877,95 @@ static size_t arm_lpae_merge_page(struct io_pgtable_ops 
*ops, unsigned long iova
return __arm_lpae_merge_page(data, iova, paddr, size, lvl, ptep, prot);
  }
  
+static int __arm_lpae_sync_dirty_log(struct arm_lpae_io_pgtable *data,

+unsigned long iova, size_t size,
+int lvl, arm_lpae_iopte *ptep,
+unsigned long *bitmap,
+unsigned long base_iova,
+unsigned long bitmap_pgshift)
+{
+   arm_lpae_iopte pte;
+   struct io_pgtable *iop = &data->iop;
+   size_t base, next_size;
+   unsigned long offset;
+   int nbits, ret;
+
+   if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+   return -EINVAL;
+
+   ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+   pte = READ_ONCE(*ptep);
+   if (WARN_ON(!pte))
+   return -EINVAL;
+
+   if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
+   if (iopte_leaf(pte, lvl, iop->fmt)) {
+   if (pte & ARM_LPAE_PTE_AP_RDONLY)
+   return 0;
+
+   /* It is writable, set the bitmap */
+   nbits = size >> bitmap_pgshift;
+   offset = (iova - base_iova) >> bitmap_pgshift;
+   bitmap_set(bitmap, offset, nbits);
+   return 0;
+   } else {
+   /* To traverse next level */
+   next_size = ARM_LPAE_BLOCK_SIZE(lvl + 1, data);
+  

Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 06:19:36PM +, Quentin Perret wrote:
> On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> > Just feels a bit backwards having __find_buddy() take an order parameter,
> > yet then return a page of the wrong order! __hyp_extract_page() always
> > passes the p->order as the order,
> 
> Gotcha, so maybe this is just a naming problem. __find_buddy() is simply
> a helper to lookup/index the vmemmap, but it's perfectly possible that
> the 'destination' page that is being indexed has already been allocated,
> and split up multiple times (and so at a different order), etc ... And
> that is the caller's job to decide.
> 
> How about __lookup_potential_buddy()? Any suggestions?

Hey, my job here is to waffle incoherently and hope that you find bugs in
your own code. Now you want me to _name_ something! Jeez...

Ok, how about __find_buddy() does what it does today but doesn't take an
order argument, whereas __find_buddy_of_order() takes the order argument
and checks the page order before returning?

Will


Re: [RFC PATCH v2 24/26] KVM: arm64: Make memcache anonymous in pgtable allocator

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 15:59:44 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:22PM +, Quentin Perret wrote:
> > The current stage2 page-table allocator uses a memcache to get
> > pre-allocated pages when it needs any. To allow re-using this code at
> > EL2 which uses a concept of memory pools, make the memcache argument to
> > kvm_pgtable_stage2_map() anonymous. and let the mm_ops zalloc_page()
> > callbacks use it the way they need to.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_pgtable.h | 6 +++---
> >  arch/arm64/kvm/hyp/pgtable.c | 4 ++--
> >  2 files changed, 5 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> > b/arch/arm64/include/asm/kvm_pgtable.h
> > index 8e8f1d2c5e0e..d846bc3d3b77 100644
> > --- a/arch/arm64/include/asm/kvm_pgtable.h
> > +++ b/arch/arm64/include/asm/kvm_pgtable.h
> > @@ -176,8 +176,8 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable 
> > *pgt);
> >   * @size:  Size of the mapping.
> >   * @phys:  Physical address of the memory to map.
> >   * @prot:  Permissions and attributes for the mapping.
> > - * @mc:Cache of pre-allocated GFP_PGTABLE_USER memory from 
> > which to
> > - * allocate page-table pages.
> > + * @mc:Cache of pre-allocated memory from which to allocate 
> > page-table
> > + * pages.
> 
> We should probably mention that this memory must be zeroed, since I don't
> think the page-table code takes care of that.

OK, though I think this is unrelated to this change -- this is already
true today I believe. Anyhow, I'll pile a change on top.

Cheers,
Quentin
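
For reference, the host-side callback can then stay a thin wrapper; a sketch,
relying on the cache having been topped up with __GFP_ZERO so the pages
arrive already zeroed:

static void *stage2_memcache_zalloc_page(void *arg)
{
        struct kvm_mmu_memory_cache *mc = arg;

        /* Pages were allocated with __GFP_ZERO when the cache was topped up */
        return kvm_mmu_memory_cache_alloc(mc);
}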


Re: [PATCHv2] kvm: arm64: Add SVE support for nVHE.

2021-02-04 Thread Dave Martin
On Tue, Feb 02, 2021 at 07:52:54PM +0100, Daniel Kiss wrote:
> CPUs that support SVE are architecturally required to support the
> Virtualization Host Extensions (VHE), so until now the kernel has only
> supported SVE alongside KVM with VHE enabled. In some cases it is desirable
> to run an nVHE config even when VHE is available.
> This patch adds support for SVE in the nVHE configuration too.
> 
> Tested on FVP with a Linux guest VM that runs with a different VL than
> the host system.
> 
> Signed-off-by: Daniel Kiss 
> ---
>  arch/arm64/Kconfig  |  7 -
>  arch/arm64/include/asm/fpsimd.h |  6 
>  arch/arm64/include/asm/fpsimdmacros.h   | 24 +--
>  arch/arm64/include/asm/kvm_host.h   | 17 +++
>  arch/arm64/kernel/entry-fpsimd.S|  5 
>  arch/arm64/kvm/arm.c|  5 
>  arch/arm64/kvm/fpsimd.c | 39 -
>  arch/arm64/kvm/hyp/fpsimd.S | 15 ++
>  arch/arm64/kvm/hyp/include/hyp/switch.h | 34 +++--
>  arch/arm64/kvm/hyp/nvhe/switch.c| 29 +-
>  arch/arm64/kvm/reset.c  |  6 +---
>  11 files changed, 126 insertions(+), 61 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index f39568b28ec1..049428f1bf27 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1676,7 +1676,6 @@ endmenu
>  config ARM64_SVE
>   bool "ARM Scalable Vector Extension support"
>   default y
> - depends on !KVM || ARM64_VHE
>   help
> The Scalable Vector Extension (SVE) is an extension to the AArch64
> execution state which complements and extends the SIMD functionality
> @@ -1705,12 +1704,6 @@ config ARM64_SVE
> booting the kernel.  If unsure and you are not observing these
> symptoms, you should assume that it is safe to say Y.
>  
> -   CPUs that support SVE are architecturally required to support the
> -   Virtualization Host Extensions (VHE), so the kernel makes no
> -   provision for supporting SVE alongside KVM without VHE enabled.
> -   Thus, you will need to enable CONFIG_ARM64_VHE if you want to support
> -   KVM in the same kernel image.
> -
>  config ARM64_MODULE_PLTS
>   bool "Use PLTs to allow module memory to spill over into vmalloc area"
>   depends on MODULES
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index bec5f14b622a..526d69f3eeb3 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -69,6 +69,12 @@ static inline void *sve_pffr(struct thread_struct *thread)
>  extern void sve_save_state(void *state, u32 *pfpsr);
>  extern void sve_load_state(void const *state, u32 const *pfpsr,
>  unsigned long vq_minus_1);
> +/*
> + * sve_load_state_nvhe function for the hyp code where the SVE registers are
> + * handled from the EL2, vector length is governed by ZCR_EL2.
> + */
> +extern void sve_load_state_nvhe(void const *state, u32 const *pfpsr,
> +unsigned long vq_minus_1);
>  extern void sve_flush_live(void);
>  extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state,
>  unsigned long vq_minus_1);
> diff --git a/arch/arm64/include/asm/fpsimdmacros.h 
> b/arch/arm64/include/asm/fpsimdmacros.h
> index af43367534c7..d309c6071bce 100644
> --- a/arch/arm64/include/asm/fpsimdmacros.h
> +++ b/arch/arm64/include/asm/fpsimdmacros.h
> @@ -205,6 +205,17 @@
>  921:
>  .endm
>  
> +/* Update ZCR_EL2.LEN with the new VQ */
> +.macro sve_load_vq_nvhe xvqminus1, xtmp, xtmp2
> + mrs_s   \xtmp, SYS_ZCR_EL2
> + bic \xtmp2, \xtmp, ZCR_ELx_LEN_MASK
> + orr \xtmp2, \xtmp2, \xvqminus1
> + cmp \xtmp2, \xtmp
> + b.eq922f
> + msr_s   SYS_ZCR_EL2, \xtmp2 //self-synchronising
> +922:
> +.endm
> +

This looks a little better, but can we just give sve_load_vq an extra
argument, say

.macro sve_load_vq ... , el=EL1

[...]

> +.macro sve_load_nvhe nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
> + sve_load_vq_nvhe\xvqminus1, x\nxtmp, \xtmp2

and do sve_load_vq \xvqminus1, x\nxtmp, \xtmp2, EL2

> + _sve_load\nxbase, \xpfpsr, \nxtmp
> +.endm
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 8fcfab0c2567..11a058c81c1d 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -376,6 +376,10 @@ struct kvm_vcpu_arch {
>  #define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
> sve_ffr_offset((vcpu)->arch.sve_max_vl)))
>  
> +#define vcpu_sve_pffr_hyp(vcpu) ((void *)((char *) \
> + (kern_hyp_va((vcpu)->arch.sve_state)) + \
> + 

Re: [RFC PATCH v2 12/26] KVM: arm64: Introduce a Hyp buddy page allocator

2021-02-04 Thread Will Deacon
On Thu, Feb 04, 2021 at 02:52:52PM +, Quentin Perret wrote:
> On Thursday 04 Feb 2021 at 14:31:08 (+), Will Deacon wrote:
> > On Wed, Feb 03, 2021 at 06:33:30PM +, Quentin Perret wrote:
> > > On Tuesday 02 Feb 2021 at 18:13:08 (+), Will Deacon wrote:
> > > > On Fri, Jan 08, 2021 at 12:15:10PM +, Quentin Perret wrote:
> > > > > + *   __find_buddy(pool, page 0, order 0) => page 1
> > > > > + *   __find_buddy(pool, page 0, order 1) => page 2
> > > > > + *   __find_buddy(pool, page 1, order 0) => page 0
> > > > > + *   __find_buddy(pool, page 2, order 0) => page 3
> > > > > + */
> > > > > +static struct hyp_page *__find_buddy(struct hyp_pool *pool, struct 
> > > > > hyp_page *p,
> > > > > +  unsigned int order)
> > > > > +{
> > > > > + phys_addr_t addr = hyp_page_to_phys(p);
> > > > > +
> > > > > + addr ^= (PAGE_SIZE << order);
> > > > > + if (addr < pool->range_start || addr >= pool->range_end)
> > > > > + return NULL;
> > > > 
> > > > Are these range checks only needed because the pool isn't required to be
> > > > an exact power-of-2 pages in size? If so, maybe it would be more
> > > > straightforward to limit the max order on a per-pool basis depending 
> > > > upon
> > > > its size?
> > > 
> > > More importantly, it is because pages outside of the pool are not
> > > guaranteed to be covered by the hyp_vmemmap, so I really need to make
> > > sure I don't dereference them.
> > 
> > Wouldn't having a per-pool max order help with that?
> 
> The issue is, I have no alignment guarantees for the pools, so I may end
> up with max_order = 0 ...

Yeah, so you would still need the range tracking, but it would at least help
to reduce HYP_MAX_ORDER failed searches each time. Still, we can always do
that later.

Will


Re: [RFC PATCH 04/11] iommu/arm-smmu-v3: Split block descriptor to a span of page

2021-02-04 Thread Robin Murphy

On 2021-01-28 15:17, Keqian Zhu wrote:

From: jiangkunkun 

A block descriptor is not a proper granule for dirty log tracking. This
adds a new interface named split_block in the IOMMU layer, and arm-smmu-v3
implements it: it splits a block descriptor into an equivalent span of
page descriptors.

While a block is being split, other interfaces are not expected to be in
use, so no race condition exists. We flush all IOTLBs after the split
procedure is completed to ease the pressure on the IOMMU, as we will
generally split a huge range of block mappings.


"Not expected to be" is not the same thing as "can not". Presumably the 
whole point of dirty log tracking is that it can be run speculatively in 
the background, so is there any actual guarantee that the guest can't, 
say, issue a hotplug event that would cause some memory to be released 
back to the host and unmapped while a scan might be in progress? Saying 
effectively "there is no race condition as long as you assume there is 
no race condition" isn't all that reassuring...


That said, it's not very clear why patches #4 and #5 are here at all, 
given that patches #6 and #7 appear quite happy to handle block entries.



Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  20 
  drivers/iommu/io-pgtable-arm.c  | 122 
  drivers/iommu/iommu.c   |  40 +++
  include/linux/io-pgtable.h  |   2 +
  include/linux/iommu.h   |  10 ++
  5 files changed, 194 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 9208881a571c..5469f4fca820 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2510,6 +2510,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
return ret;
  }
  
+static size_t arm_smmu_split_block(struct iommu_domain *domain,

+  unsigned long iova, size_t size)
+{
+   struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+   struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+
+   if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
+   dev_err(smmu->dev, "don't support BBML1/2 and split block\n");
+   return 0;
+   }
+
+   if (!ops || !ops->split_block) {
+   pr_err("don't support split block\n");
+   return 0;
+   }
+
+   return ops->split_block(ops, iova, size);
+}
+
  static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
  {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2609,6 +2628,7 @@ static struct iommu_ops arm_smmu_ops = {
.device_group   = arm_smmu_device_group,
.domain_get_attr= arm_smmu_domain_get_attr,
.domain_set_attr= arm_smmu_domain_set_attr,
+   .split_block= arm_smmu_split_block,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
.put_resv_regions   = generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index e299a44808ae..f3b7f7115e38 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -79,6 +79,8 @@
  #define ARM_LPAE_PTE_SH_IS(((arm_lpae_iopte)3) << 8)
  #define ARM_LPAE_PTE_NS   (((arm_lpae_iopte)1) << 5)
  #define ARM_LPAE_PTE_VALID(((arm_lpae_iopte)1) << 0)
+/* Block descriptor bits */
+#define ARM_LPAE_PTE_NT		(((arm_lpae_iopte)1) << 16)
  
  #define ARM_LPAE_PTE_ATTR_LO_MASK	(((arm_lpae_iopte)0x3ff) << 2)

  /* Ignore the contiguous bit for block splitting */
@@ -679,6 +681,125 @@ static phys_addr_t arm_lpae_iova_to_phys(struct 
io_pgtable_ops *ops,
return iopte_to_paddr(pte, data) | iova;
  }
  
+static size_t __arm_lpae_split_block(struct arm_lpae_io_pgtable *data,

+unsigned long iova, size_t size, int lvl,
+arm_lpae_iopte *ptep);
+
+static size_t arm_lpae_do_split_blk(struct arm_lpae_io_pgtable *data,
+   unsigned long iova, size_t size,
+   arm_lpae_iopte blk_pte, int lvl,
+   arm_lpae_iopte *ptep)
+{
+   struct io_pgtable_cfg *cfg = &data->iop.cfg;
+   arm_lpae_iopte pte, *tablep;
+   phys_addr_t blk_paddr;
+   size_t tablesz = ARM_LPAE_GRANULE(data);
+   size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+   int i;
+
+   if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+   return 0;
+
+   tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg);
+   if (!tablep)
+   return 0;
+
+   blk_paddr = iopte_to_paddr(blk_pte, data);
+   

Re: [RFC PATCH 01/11] iommu/arm-smmu-v3: Add feature detection for HTTU

2021-02-04 Thread Robin Murphy

On 2021-01-28 15:17, Keqian Zhu wrote:

From: jiangkunkun 

An SMMU that supports HTTU (Hardware Translation Table Update) can
update the access flag and the dirty state of TTDs in hardware, which
is essential for tracking pages dirtied by DMA.

This adds feature detection only; no functional change.

Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  8 
  include/linux/io-pgtable.h  |  1 +
  3 files changed, 25 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 8ca7415d785d..0f0fe71cc10d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -1987,6 +1987,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain 
*domain,
.pgsize_bitmap  = smmu->pgsize_bitmap,
.ias= ias,
.oas= oas,
+   .httu_hd= smmu->features & ARM_SMMU_FEAT_HTTU_HD,
.coherent_walk  = smmu->features & ARM_SMMU_FEAT_COHERENCY,
.tlb= &arm_smmu_flush_ops,
.iommu_dev  = smmu->dev,
@@ -3224,6 +3225,21 @@ static int arm_smmu_device_hw_probe(struct 
arm_smmu_device *smmu)
if (reg & IDR0_HYP)
smmu->features |= ARM_SMMU_FEAT_HYP;
  
+	switch (FIELD_GET(IDR0_HTTU, reg)) {


We need to accommodate the firmware override as well if we need this to 
be meaningful. Jean-Philippe is already carrying a suitable patch in the 
SVA stack[1].



+   case IDR0_HTTU_NONE:
+   break;
+   case IDR0_HTTU_HA:
+   smmu->features |= ARM_SMMU_FEAT_HTTU_HA;
+   break;
+   case IDR0_HTTU_HAD:
+   smmu->features |= ARM_SMMU_FEAT_HTTU_HA;
+   smmu->features |= ARM_SMMU_FEAT_HTTU_HD;
+   break;
+   default:
+   dev_err(smmu->dev, "unknown/unsupported HTTU!\n");
+   return -ENXIO;
+   }
+
/*
 * The coherency feature as set by FW is used in preference to the ID
 * register, but warn on mismatch.
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 96c2e9565e00..e91bea44519e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -33,6 +33,10 @@
  #define IDR0_ASID16   (1 << 12)
  #define IDR0_ATS  (1 << 10)
  #define IDR0_HYP  (1 << 9)
+#define IDR0_HTTU  GENMASK(7, 6)
+#define IDR0_HTTU_NONE 0
+#define IDR0_HTTU_HA   1
+#define IDR0_HTTU_HAD  2
  #define IDR0_COHACC   (1 << 4)
  #define IDR0_TTF  GENMASK(3, 2)
  #define IDR0_TTF_AARCH64  2
@@ -286,6 +290,8 @@
  #define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38)
  
  #define CTXDESC_CD_0_AA64		(1UL << 41)

+#define CTXDESC_CD_0_HD		(1UL << 42)
+#define CTXDESC_CD_0_HA		(1UL << 43)
  #define CTXDESC_CD_0_S(1UL << 44)
  #define CTXDESC_CD_0_R(1UL << 45)
  #define CTXDESC_CD_0_A(1UL << 46)
@@ -604,6 +610,8 @@ struct arm_smmu_device {
  #define ARM_SMMU_FEAT_RANGE_INV   (1 << 15)
  #define ARM_SMMU_FEAT_BTM (1 << 16)
  #define ARM_SMMU_FEAT_SVA (1 << 17)
+#define ARM_SMMU_FEAT_HTTU_HA  (1 << 18)
+#define ARM_SMMU_FEAT_HTTU_HD  (1 << 19)
u32 features;
  
  #define ARM_SMMU_OPT_SKIP_PREFETCH	(1 << 0)

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ea727eb1a1a9..1a00ea8562c7 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -97,6 +97,7 @@ struct io_pgtable_cfg {
unsigned long   pgsize_bitmap;
unsigned int    ias;
unsigned int    oas;
+   bool            httu_hd;


This is very specific to the AArch64 stage 1 format, not a generic 
capability - I think it should be a quirk flag rather than a common field.


Robin.

[1] 
https://jpbrucker.net/git/linux/commit/?h=sva/current&id=1ef7d512fb9082450dfe0d22ca4f7e35625a097b



bool coherent_walk;
const struct iommu_flush_ops *tlb;
struct device   *iommu_dev;
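
A sketch of the quirk-flag shape Robin suggests (the quirk name and bit value
are illustrative only, not an existing definition; io_pgtable_cfg already
carries a 'quirks' bitmap):

#define IO_PGTABLE_QUIRK_ARM_HD		BIT(7)	/* illustrative bit */

static void arm_smmu_set_httu_quirk(struct arm_smmu_device *smmu,
				    struct io_pgtable_cfg *pgtbl_cfg)
{
	/* Advertise dirty-state updates via a quirk rather than a new cfg field */
	if (smmu->features & ARM_SMMU_FEAT_HTTU_HD)
		pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_ARM_HD;
}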




Re: [RFC PATCH v2 26/26] KVM: arm64: Wrap the host with a stage 2

2021-02-04 Thread Quentin Perret
On Wednesday 03 Feb 2021 at 16:11:47 (+), Will Deacon wrote:
> On Fri, Jan 08, 2021 at 12:15:24PM +, Quentin Perret wrote:
> > When KVM runs in protected nVHE mode, make use of a stage 2 page-table
> > to give the hypervisor some control over the host memory accesses. At
> > the moment all memory aborts from the host will be instantly idmapped
> > RWX at stage 2 in a lazy fashion. Later patches will make use of that
> > infrastructure to implement access control restrictions to e.g. protect
> > guest memory from the host.
> > 
> > Signed-off-by: Quentin Perret 
> > ---
> >  arch/arm64/include/asm/kvm_cpufeature.h   |   2 +
> >  arch/arm64/kernel/image-vars.h|   3 +
> >  arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  33 +++
> >  arch/arm64/kvm/hyp/nvhe/Makefile  |   2 +-
> >  arch/arm64/kvm/hyp/nvhe/hyp-init.S|   1 +
> >  arch/arm64/kvm/hyp/nvhe/hyp-main.c|   6 +
> >  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 191 ++
> >  arch/arm64/kvm/hyp/nvhe/setup.c   |   6 +
> >  arch/arm64/kvm/hyp/nvhe/switch.c  |   7 +-
> >  arch/arm64/kvm/hyp/nvhe/tlb.c |   4 +-
> >  10 files changed, 248 insertions(+), 7 deletions(-)
> >  create mode 100644 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
> >  create mode 100644 arch/arm64/kvm/hyp/nvhe/mem_protect.c
> 
> [...]
> 
> > +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
> > +{
> > +   enum kvm_pgtable_prot prot;
> > +   u64 far, hpfar, esr, ipa;
> > +   int ret;
> > +
> > +   esr = read_sysreg_el2(SYS_ESR);
+   if (!__get_fault_info(esr, &far, &hpfar))
> > +   hyp_panic();
> > +
> > +   prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W | KVM_PGTABLE_PROT_X;
> > +   ipa = (hpfar & HPFAR_MASK) << 8;
> > +   ret = host_stage2_map(ipa, PAGE_SIZE, prot);
> 
> Can we try to put down a block mapping if the whole thing falls within
> memory?

Yes we can! And in fact we can do that outside of memory too. It's
queued for v3 already, so stay tuned ... :)

Thanks,
Quentin
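
The gist of that optimisation, as a sketch on top of the quoted
handle_host_mem_abort() (is_in_memory() is a placeholder for whatever range
check v3 ends up using, and PMD_SIZE assumes 4KiB pages with 2MiB blocks):

static int host_stage2_idmap_block(u64 ipa, enum kvm_pgtable_prot prot)
{
	u64 block = ALIGN_DOWN(ipa, PMD_SIZE);

	/* Prefer a block mapping when the whole block is backed by memory */
	if (is_in_memory(block) && is_in_memory(block + PMD_SIZE - 1))
		return host_stage2_map(block, PMD_SIZE, prot);

	return host_stage2_map(ipa, PAGE_SIZE, prot);
}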


Re: [RFC PATCH 05/11] iommu/arm-smmu-v3: Merge a span of page to block descriptor

2021-02-04 Thread Robin Murphy

On 2021-01-28 15:17, Keqian Zhu wrote:

From: jiangkunkun 

When dirty log tracking stops, we need to recover all block descriptors
that were split when dirty log tracking started. This adds a new
interface named merge_page in the IOMMU layer, and arm-smmu-v3 implements
it: it reinstalls the block mappings and unmaps the span of page mappings.

It is the caller's duty to find contiguous physical memory.

While pages are being merged, other interfaces are not expected to be in
use, so no race condition exists. We flush all IOTLBs after the merge
procedure is completed to ease the pressure on the IOMMU, as we will
generally merge a huge range of page mappings.


Again, I think we need better reasoning than "race conditions don't 
exist because we don't expect them to exist".



Co-developed-by: Keqian Zhu 
Signed-off-by: Kunkun Jiang 
---
  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 ++
  drivers/iommu/io-pgtable-arm.c  | 78 +
  drivers/iommu/iommu.c   | 75 
  include/linux/io-pgtable.h  |  2 +
  include/linux/iommu.h   | 10 +++
  5 files changed, 185 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5469f4fca820..2434519e4bb6 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2529,6 +2529,25 @@ static size_t arm_smmu_split_block(struct iommu_domain 
*domain,
return ops->split_block(ops, iova, size);
  }
  
+static size_t arm_smmu_merge_page(struct iommu_domain *domain, unsigned long iova,

+ phys_addr_t paddr, size_t size, int prot)
+{
+   struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+   struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+
+   if (!(smmu->features & (ARM_SMMU_FEAT_BBML1 | ARM_SMMU_FEAT_BBML2))) {
+   dev_err(smmu->dev, "don't support BBML1/2 and merge page\n");
+   return 0;
+   }
+
+   if (!ops || !ops->merge_page) {
+   pr_err("don't support merge page\n");
+   return 0;
+   }
+
+   return ops->merge_page(ops, iova, paddr, size, prot);
+}
+
  static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
  {
return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2629,6 +2648,7 @@ static struct iommu_ops arm_smmu_ops = {
.domain_get_attr= arm_smmu_domain_get_attr,
.domain_set_attr= arm_smmu_domain_set_attr,
.split_block= arm_smmu_split_block,
+   .merge_page = arm_smmu_merge_page,
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
.put_resv_regions   = generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index f3b7f7115e38..17390f258eb1 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -800,6 +800,83 @@ static size_t arm_lpae_split_block(struct io_pgtable_ops 
*ops,
return __arm_lpae_split_block(data, iova, size, lvl, ptep);
  }
  
+static size_t __arm_lpae_merge_page(struct arm_lpae_io_pgtable *data,

+   unsigned long iova, phys_addr_t paddr,
+   size_t size, int lvl, arm_lpae_iopte *ptep,
+   arm_lpae_iopte prot)
+{
+   arm_lpae_iopte pte, *tablep;
+   struct io_pgtable *iop = &data->iop;
+   struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+   if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+   return 0;
+
+   ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
+   pte = READ_ONCE(*ptep);
+   if (WARN_ON(!pte))
+   return 0;
+
+   if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
+   if (iopte_leaf(pte, lvl, iop->fmt))
+   return size;
+
+   /* Race does not exist */
+   if (cfg->bbml == 1) {
+   prot |= ARM_LPAE_PTE_NT;
+   __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+   io_pgtable_tlb_flush_walk(iop, iova, size,
+ ARM_LPAE_GRANULE(data));
+
+   prot &= ~(ARM_LPAE_PTE_NT);
+   __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+   } else {
+   __arm_lpae_init_pte(data, paddr, prot, lvl, ptep);
+   }
+
+   tablep = iopte_deref(pte, data);
+   __arm_lpae_free_pgtable(data, lvl + 1, tablep);
+   return size;
+   } else if (iopte_leaf(pte, lvl, iop->fmt)) {
+   /* The size is too small, already merged */
+   return size;
+   }
+
+   /* Keep on walkin */
+   ptep = iopte_deref(pte, data);
+   return