Re: [PATCH v4 09/16] KVM: arm64: PMU: Do not let AArch32 change the counters' top 32 bits

2022-11-17 Thread Reiji Watanabe
Hi Marc,

On Sun, Nov 13, 2022 at 8:38 AM Marc Zyngier  wrote:
>
> Even when using PMUv3p5 (which implies 64bit counters), there is
> no way for AArch32 to write to the top 32 bits of the counters.
> The only way to influence these bits (other than by counting
> events) is by writing PMCR.P==1.
>
> Make sure we obey the architecture and preserve the top 32 bits
> on a counter update.
>
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kvm/pmu-emul.c | 35 +++
>  1 file changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index ea0c8411641f..419e5e0a13d0 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -119,13 +119,8 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 
> select_idx)
> return counter;
>  }
>
> -/**
> - * kvm_pmu_set_counter_value - set PMU counter value
> - * @vcpu: The vcpu pointer
> - * @select_idx: The counter index
> - * @val: The counter value
> - */
> -void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 
> val)
> +static void kvm_pmu_set_counter(struct kvm_vcpu *vcpu, u64 select_idx, u64 
> val,
> +   bool force)
>  {
> u64 reg;
>
> @@ -135,12 +130,36 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, 
> u64 select_idx, u64 val)
> kvm_pmu_release_perf_event(&vcpu->arch.pmu.pmc[select_idx]);
>
> reg = counter_index_to_reg(select_idx);
> +
> +   if (vcpu_mode_is_32bit(vcpu) && select_idx != ARMV8_PMU_CYCLE_IDX &&
> +   !force) {
> +   /*
> +* Even with PMUv3p5, AArch32 cannot write to the top
> +* 32bit of the counters. The only possible course of
> +* action is to use PMCR.P, which will reset them to
> +* 0 (the only use of the 'force' parameter).
> +*/
> +   val  = lower_32_bits(val);
> +   val |= upper_32_bits(__vcpu_sys_reg(vcpu, reg));

Shouldn't the result of upper_32_bits() be shifted 32 bits left
before ORing (to maintain the upper 32 bits of the current value)?

Thank you,
Reiji

> +   }
> +
> __vcpu_sys_reg(vcpu, reg) = val;
>
> /* Recreate the perf event to reflect the updated sample_period */
> kvm_pmu_create_perf_event(vcpu, select_idx);
>  }
>
> +/**
> + * kvm_pmu_set_counter_value - set PMU counter value
> + * @vcpu: The vcpu pointer
> + * @select_idx: The counter index
> + * @val: The counter value
> + */
> +void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 
> val)
> +{
> +   kvm_pmu_set_counter(vcpu, select_idx, val, false);
> +}
> +
>  /**
>   * kvm_pmu_release_perf_event - remove the perf event
>   * @pmc: The PMU counter pointer
> @@ -533,7 +552,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
> unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
> mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
> for_each_set_bit(i, &mask, 32)
> -   kvm_pmu_set_counter_value(vcpu, i, 0);
> +   kvm_pmu_set_counter(vcpu, i, 0, true);
> }
>  }
>
> --
> 2.34.1
>
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
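
For readers following Reiji's point above: upper_32_bits() returns bits
[63:32] already shifted down to bit 0, so preserving the old top half of
the counter needs an explicit shift back up. A standalone sketch of the
intended combination (plain C illustration, not the kernel helpers
themselves):

  #include <stdint.h>

  /*
   * Merge a 32-bit value written from AArch32 with the preserved top half
   * of the current 64-bit counter, i.e. what the hunk above is trying to
   * do. Illustration only.
   */
  static uint64_t merge_aarch32_write(uint64_t cur, uint64_t val)
  {
  	uint64_t lo = val & 0xffffffffULL;          /* lower_32_bits(val) */
  	uint64_t hi = (cur >> 32) & 0xffffffffULL;  /* upper_32_bits(cur) */

  	return lo | (hi << 32);  /* the shift Reiji is asking about */
  }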


Re: [PATCH v4 07/16] KVM: arm64: PMU: Add counter_index_to_*reg() helpers

2022-11-17 Thread Reiji Watanabe
Hi Marc,

On Sun, Nov 13, 2022 at 8:38 AM Marc Zyngier  wrote:
>
> In order to reduce the boilerplate code, add two helpers returning
> the counter register index (resp. the event register) in the vcpu
> register file from the counter index.
>
> Reviewed-by: Oliver Upton 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kvm/pmu-emul.c | 33 ++---
>  1 file changed, 18 insertions(+), 15 deletions(-)
>
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index 1fab889dbc74..faab0f57a45d 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -77,6 +77,16 @@ static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc 
> *pmc)
> return container_of(vcpu_arch, struct kvm_vcpu, arch);
>  }
>
> +static u32 counter_index_to_reg(u64 idx)
> +{
> +   return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + 
> idx;
> +}
> +
> +static u32 counter_index_to_evtreg(u64 idx)
> +{
> +   return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 
> + idx;
> +}

It looks like we could use this function for access_pmu_evtyper()
in arch/arm64/kvm/sys_regs.c as well.

Thank you,
Reiji


> +
>  /**
>   * kvm_pmu_get_counter_value - get PMU counter value
>   * @vcpu: The vcpu pointer
> @@ -91,8 +101,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 
> select_idx)
> if (!kvm_vcpu_has_pmu(vcpu))
> return 0;
>
> -   reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
> -   ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
> +   reg = counter_index_to_reg(select_idx);
> counter = __vcpu_sys_reg(vcpu, reg);
>
> /*
> @@ -122,8 +131,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 
> select_idx, u64 val)
> if (!kvm_vcpu_has_pmu(vcpu))
> return;
>
> -   reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
> - ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
> +   reg = counter_index_to_reg(select_idx);
> __vcpu_sys_reg(vcpu, reg) += (s64)val - 
> kvm_pmu_get_counter_value(vcpu, select_idx);
>
> /* Recreate the perf event to reflect the updated sample_period */
> @@ -158,10 +166,7 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, 
> struct kvm_pmc *pmc)
>
> val = kvm_pmu_get_counter_value(vcpu, pmc->idx);
>
> -   if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
> -   reg = PMCCNTR_EL0;
> -   else
> -   reg = PMEVCNTR0_EL0 + pmc->idx;
> +   reg = counter_index_to_reg(pmc->idx);
>
> __vcpu_sys_reg(vcpu, reg) = val;
>
> @@ -404,16 +409,16 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu 
> *vcpu,
> u64 type, reg;
>
> /* Filter on event type */
> -   type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
> +   type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i));
> type &= kvm_pmu_event_mask(vcpu->kvm);
> if (type != event)
> continue;
>
> /* Increment this counter */
> -   reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
> +   reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1;
> if (!kvm_pmu_idx_is_64bit(vcpu, i))
> reg = lower_32_bits(reg);
> -   __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
> +   __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg;
>
> /* No overflow? move on */
> if (kvm_pmu_idx_has_64bit_overflow(vcpu, i) ? reg : 
> lower_32_bits(reg))
> @@ -549,8 +554,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu 
> *vcpu, u64 select_idx)
> struct perf_event_attr attr;
> u64 eventsel, counter, reg, data;
>
> -   reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
> - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
> +   reg = counter_index_to_evtreg(select_idx);
> data = __vcpu_sys_reg(vcpu, reg);
>
> kvm_pmu_stop_counter(vcpu, pmc);
> @@ -632,8 +636,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu 
> *vcpu, u64 data,
> mask &= ~ARMV8_PMU_EVTYPE_EVENT;
> mask |= kvm_pmu_event_mask(vcpu->kvm);
>
> -   reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
> - ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
> +   reg = counter_index_to_evtreg(select_idx);
>
> __vcpu_sys_reg(vcpu, reg) = data & mask;
>
> --
> 2.34.1
>
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
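
As a quick illustration of the mapping these helpers encode, here is a
standalone mock-up with placeholder register numbers (the real values live
in the vcpu sysreg enum; only the two helper bodies mirror the patch above):

  #include <stdint.h>
  #include <stdio.h>

  #define ARMV8_PMU_CYCLE_IDX	31
  /* Placeholder values, not the real vcpu sysreg indices. */
  #define PMCCNTR_EL0		100
  #define PMEVCNTR0_EL0		200
  #define PMCCFILTR_EL0		300
  #define PMEVTYPER0_EL0		400

  static uint32_t counter_index_to_reg(uint64_t idx)
  {
  	return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx;
  }

  static uint32_t counter_index_to_evtreg(uint64_t idx)
  {
  	return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx;
  }

  int main(void)
  {
  	/* Event counter 3 maps to PMEVCNTR3_EL0 / PMEVTYPER3_EL0 ... */
  	printf("idx 3  -> reg %u, evtreg %u\n",
  	       counter_index_to_reg(3), counter_index_to_evtreg(3));
  	/* ... while the cycle counter maps to PMCCNTR_EL0 / PMCCFILTR_EL0. */
  	printf("idx 31 -> reg %u, evtreg %u\n",
  	       counter_index_to_reg(ARMV8_PMU_CYCLE_IDX),
  	       counter_index_to_evtreg(ARMV8_PMU_CYCLE_IDX));
  	return 0;
  }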


Re: disabling halt polling broken? (was Re: [PATCH 00/14] KVM: Halt-polling fixes, cleanups and a new stat)

2022-11-17 Thread wangyanan (Y)




On 2022/11/17 1:19, David Matlack wrote:

On Tue, Nov 15, 2022 at 11:28:56AM +0800, wangyanan (Y) wrote:

Hi Sean, Paolo,

I also recently noticed the behavior change of the halt_poll_ns param.
It now loses the ability to:
1) dynamically disable halt polling for all the running VMs
by `echo 0 > /sys`
2) dynamically adjust the halt polling interval for all the
running VMs by `echo * > /sys`

Meanwhile, in our use cases we usually rely on the above two abilities,
and KVM_CAP_HALT_POLL is not used yet.

I think the right path forward is to make KVM_CAP_HALT_POLL a pure
override of halt_poll_ns, and restore the pre-existing behavior of
halt_poll_ns whenever KVM_CAP_HALT_POLL is not used. e.g. see the patch
below.

Agree with this.
kvm.halt_poll_ns serves as a legacy method to control halt polling
globally. Once KVM_CAP_HALT_POLL is used for a VM, it should
take 100% responsibility for controlling that VM, including disabling
the polling. This strategy helps to keep the two mechanisms
decoupled.

That will fix issues (1) and (2) above for any VM not using
KVM_CAP_HALT_POLL. If a VM is using KVM_CAP_HALT_POLL, it will ignore
all changes to halt_poll_ns. If we truly need a mechanism for admins to
disable halt-polling on VMs using KVM_CAP_HALT_POLL, we can introduce a
separate module parameter for that. But IMO, any setup that is
sophisticated enough to use KVM_CAP_HALT_POLL should also be able to use
KVM_CAP_HALT_POLL to disable halt polling.

If everyone is happy with this approach I can test and send a real patch
to the mailing list.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e6e66c5e56f2..253ad055b6ad 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -788,6 +788,7 @@ struct kvm {
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
 	pid_t userspace_pid;
+	bool override_halt_poll_ns;
 	unsigned int max_halt_poll_ns;
 	u32 dirty_ring_size;
 	bool vm_bugged;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 43bbe4fde078..479d0d0da0b5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1198,8 +1198,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 		goto out_err_no_arch_destroy_vm;
 	}
 
-	kvm->max_halt_poll_ns = halt_poll_ns;
-
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_no_arch_destroy_vm;
@@ -3371,7 +3369,7 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
 	sigemptyset(&vcpu->real_blocked);
 }
 
-static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu, unsigned int max)
 {
 	unsigned int old, val, grow, grow_start;
 
@@ -3385,8 +3383,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 	if (val < grow_start)
 		val = grow_start;
 
-	if (val > vcpu->kvm->max_halt_poll_ns)
-		val = vcpu->kvm->max_halt_poll_ns;
+	if (val > max)
+		val = max;
 
 	vcpu->halt_poll_ns = val;
 out:
@@ -3501,10 +3499,17 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
 	bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
+	unsigned int max_halt_poll_ns;
 	ktime_t start, cur, poll_end;
+	struct kvm *kvm = vcpu->kvm;
 	bool waited = false;
 	u64 halt_ns;
 
+	if (kvm->override_halt_poll_ns)
+		max_halt_poll_ns = kvm->max_halt_poll_ns;
+	else
+		max_halt_poll_ns = READ_ONCE(halt_poll_ns);
+
 	start = cur = poll_end = ktime_get();
 	if (do_halt_poll) {
 		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
@@ -3545,17 +3550,16 @@ void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 	if (halt_poll_allowed) {
 		if (!vcpu_valid_wakeup(vcpu)) {
 			shrink_halt_poll_ns(vcpu);
-		} else if (vcpu->kvm->max_halt_poll_ns) {
+		} else if (max_halt_poll_ns) {
 			if (halt_ns <= vcpu->halt_poll_ns)
 				;
 			/* we had a long block, shrink polling */
-			else if (vcpu->halt_poll_ns &&
-				 halt_ns > vcpu->kvm->max_halt_poll_ns)
+			else if (vcpu->halt_poll_ns && halt_ns > max_halt_poll_ns)
 				shrink_halt_poll_ns(vcpu);
 			/* we had a short halt and our poll time is too small */
-			else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
-				 halt_ns < vcpu->kvm->max_halt_poll_ns)
-				grow_halt_poll_ns(vcpu);
+			else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
+				 halt_ns < max_halt_poll_ns)
+				grow_halt_poll_ns(vcpu, max_halt_poll_ns);
 		} else {
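
The hunk above only covers the consumer side; presumably the
KVM_CAP_HALT_POLL handler is what would set the new flag. A rough sketch
of that half (the function name and plumbing here are assumptions; only
override_halt_poll_ns and max_halt_poll_ns come from the diff above):

  /* Sketch only: mark the VM as overriding the halt_poll_ns module param. */
  static int kvm_enable_cap_halt_poll(struct kvm *kvm, u64 max_ns)
  {
  	if (max_ns != (unsigned int)max_ns)
  		return -EINVAL;

  	kvm->max_halt_poll_ns = max_ns;

  	/*
  	 * Once userspace has used KVM_CAP_HALT_POLL, the per-VM value wins
  	 * permanently; later writes to the halt_poll_ns module parameter
  	 * no longer affect this VM.
  	 */
  	kvm->override_halt_poll_ns = true;
  	return 0;
  }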

Re: [External] Re: [v2 0/6] KVM: arm64: implement vcpu_is_preempted check

2022-11-17 Thread Marc Zyngier
On Mon, 07 Nov 2022 12:00:44 +,
Usama Arif  wrote:
> 
> 
> 
> On 06/11/2022 16:35, Marc Zyngier wrote:
> > On Fri, 04 Nov 2022 06:20:59 +,
> > Usama Arif  wrote:
> >> 
> >> This patchset adds support for vcpu_is_preempted in arm64, which
> >> allows the guest to check if a vcpu was scheduled out, which is
> >> useful to know in case it was holding a lock. vcpu_is_preempted can
> >> be used to improve performance in locking (see owner_on_cpu usage in
> >> mutex_spin_on_owner, mutex_can_spin_on_owner, rtmutex_spin_on_owner
> >> and osq_lock) and scheduling (see available_idle_cpu which is used
> >> in several places in kernel/sched/fair.c for e.g. in wake_affine to
> >> determine which CPU can run soonest):
> > 
> > [...]
> > 
> >> pvcy shows a smaller overall improvement (50%) compared to
> >> vcpu_is_preempted (277%).  Host side flamegraph analysis shows that
> >> ~60% of the host time when using pvcy is spent in kvm_handle_wfx,
> >> compared with ~1.5% when using vcpu_is_preempted, hence
> >> vcpu_is_preempted shows a larger improvement.
> > 
> > And have you worked out *why* we spend so much time handling WFE?
> > 
> > M.
> 
> It's from the following change in the pvcy patchset:
> 
> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> index e778eefcf214..915644816a85 100644
> --- a/arch/arm64/kvm/handle_exit.c
> +++ b/arch/arm64/kvm/handle_exit.c
> @@ -118,7 +118,12 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
> }
> 
> if (esr & ESR_ELx_WFx_ISS_WFE) {
> -   kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
> +   int state;
> +   while ((state = kvm_pvcy_check_state(vcpu)) == 0)
> +   schedule();
> +
> +   if (state == -1)
> +   kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
> } else {
> if (esr & ESR_ELx_WFx_ISS_WFxT)
> vcpu_set_flag(vcpu, IN_WFIT);
> 
> 
> If my understanding of the pvcy changes is correct, whenever pvcy
> returns an unchanged vcpu state, we would schedule to another
> vcpu. And it's the constant scheduling where the time is spent. I guess
> the effects are much higher when the lock contention is very
> high. This can be seen from the pvcy host-side flamegraph as well,
> with ~67% of the time spent in the schedule() call in kvm_handle_wfx. For
> reference, I have put the graph at:
> https://uarif1.github.io/pvlock/perf_host_pvcy_nmi.svg

The real issue here is that we don't try to pick the right vcpu to
run, and strictly rely on schedule() to eventually pick something that
can run.

An interesting thing to do would be to try and fit the directed yield
mechanism there. It would be a lot more interesting than the one-off
vcpu_is_preempted hack, as it gives us a low-level primitive on which
to construct things (pvcy is effectively a mwait-like primitive).

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
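
For context on what vcpu_is_preempted() amounts to on the guest side, here
is a generic sketch of such a paravirt check. All names are hypothetical
and not taken from Usama's series; the host is assumed to set 'preempted'
in a shared per-vCPU structure whenever it schedules the vCPU out:

  #include <stdbool.h>
  #include <stdint.h>

  #define NR_CPUS 256

  struct pv_vcpu_state {
  	uint8_t preempted;	/* written by the host, read by the guest */
  	uint8_t pad[63];	/* keep each entry on its own cache line   */
  };

  static struct pv_vcpu_state pv_state[NR_CPUS];

  static inline bool vcpu_is_preempted_sketch(int cpu)
  {
  	/* A volatile read stands in for READ_ONCE() in this sketch. */
  	return *(volatile uint8_t *)&pv_state[cpu].preempted != 0;
  }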


Re: [PATCH 2/2] KVM: arm64: selftests: Disable single-step without relying on ucall()

2022-11-17 Thread Sean Christopherson
On Thu, Nov 17, 2022, Sean Christopherson wrote:
> On Thu, Nov 17, 2022, Sean Christopherson wrote:
> > On Thu, Nov 17, 2022, Oliver Upton wrote:
> > > On Thu, Nov 17, 2022 at 12:23:50AM +, Sean Christopherson wrote:
> > > > Automatically disable single-step when the guest reaches the end of the
> > > > verified section instead of using an explicit ucall() to ask userspace 
> > > > to
> > > > disable single-step.  An upcoming change to implement a pool-based 
> > > > scheme
> > > > for ucall() will add an atomic operation (bit test and set) in the guest
> > > ucall code, and if the compiler generates "old school" atomics, e.g.
> > > 
> > > Off topic, but I didn't ask when we were discussing this issue. What is
> > > the atomic used for in the pool-based ucall implementation?
> > 
> > To avoid having to plumb an "id" into the guest, vCPUs grab a ucall entry 
> > from
> > the pool on a first-come first-serve basis, and then release the entry when 
> > the
> > ucall is complete.  The current implementation is a bitmap, e.g. every 
> > possible
> > entry has a bit in the map, and vCPUs do an atomic bit-test-and-set to 
> > claim an
> > entry.
> > 
> > Ugh.  And there's a bug.  Of course I notice it after sending the pull 
> > request.
> > Despite being defined in atomic.h, and despite clear_bit() being atomic in 
> > the
> > kernel, tools' clear_bit() isn't actually atomic.  Grr.
> > 
> > Doesn't cause problems because there are so few multi-vCPU selftests, but 
> > that
> > needs to be fixed.  Best thing would be to fix clear_bit() itself.
> 
> Ha!  And I bet when clear_bit() is fixed, this test will start failing again
> because the ucall() to activate single-step needs to release the entry _after_
> exiting to the host, i.e. single-step will be enabled across the atomic region
> again.

LOL, yep.  Test gets stuck in __aarch64_ldclr8_sync().
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
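
A standalone sketch of the pool scheme Sean describes (one bit per ucall
slot, claimed with an atomic bit-test-and-set and released when the ucall
completes; this is only the shape of it, not the selftests code):

  #include <stdint.h>

  #define UCALL_POOL_SLOTS 64

  static uint64_t ucall_in_use;	/* bit i set => slot i is claimed */

  static int ucall_claim_slot(void)
  {
  	for (int i = 0; i < UCALL_POOL_SLOTS; i++) {
  		uint64_t bit = 1ULL << i;

  		/* Atomic test-and-set: claim the slot iff it was free. */
  		if (!(__atomic_fetch_or(&ucall_in_use, bit, __ATOMIC_ACQUIRE) & bit))
  			return i;
  	}
  	return -1;	/* pool exhausted */
  }

  static void ucall_release_slot(int i)
  {
  	/*
  	 * The release must be atomic as well; a plain read-modify-write
  	 * here is exactly the non-atomic clear_bit() problem above.
  	 */
  	__atomic_fetch_and(&ucall_in_use, ~(1ULL << i), __ATOMIC_RELEASE);
  }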


Re: [RFC PATCH 04/12] KVM: arm64: Add kvm_pgtable_stage2_split()

2022-11-17 Thread Ricardo Koller
On Tue, Nov 15, 2022 at 11:54:27PM +, Oliver Upton wrote:
> On Tue, Nov 15, 2022 at 03:27:18PM -0800, Ricardo Koller wrote:
> > On Tue, Nov 15, 2022 at 03:03:42PM -0800, Ricardo Koller wrote:
> > > On Mon, Nov 14, 2022 at 08:54:52PM +, Oliver Upton wrote:
> 
> [...]
> 
> > > > > diff --git a/arch/arm64/kvm/hyp/pgtable.c 
> > > > > b/arch/arm64/kvm/hyp/pgtable.c
> > > > > index d1f309128118..9c42eff6d42e 100644
> > > > > --- a/arch/arm64/kvm/hyp/pgtable.c
> > > > > +++ b/arch/arm64/kvm/hyp/pgtable.c
> > > > > @@ -1267,6 +1267,80 @@ static int stage2_create_removed(kvm_pte_t 
> > > > > *ptep, u64 phys, u32 level,
> > > > >   return __kvm_pgtable_visit(, mm_ops, ptep, level);
> > > > >  }
> > > > >  
> > > > > +struct stage2_split_data {
> > > > > + struct kvm_s2_mmu   *mmu;
> > > > > + void*memcache;
> > > > > + struct kvm_pgtable_mm_ops   *mm_ops;
> > > > 
> > > > You can also get at mm_ops through kvm_pgtable_visit_ctx
> > > > 
> > > > > +};
> > > > > +
> > > > > +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx 
> > > > > *ctx,
> > > > > +enum kvm_pgtable_walk_flags visit)
> > > > > +{
> > > > > + struct stage2_split_data *data = ctx->arg;
> > > > > + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
> > > > > + kvm_pte_t pte = ctx->old, attr, new;
> > > > > + enum kvm_pgtable_prot prot;
> > > > > + void *mc = data->memcache;
> > > > > + u32 level = ctx->level;
> > > > > + u64 phys;
> > > > > +
> > > > > + if (WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx)))
> > > > > + return -EINVAL;
> > > > > +
> > > > > + /* Nothing to split at the last level */
> > > > > + if (level == KVM_PGTABLE_MAX_LEVELS - 1)
> > > > > + return 0;
> > > > > +
> > > > > + /* We only split valid block mappings */
> > > > > + if (!kvm_pte_valid(pte) || kvm_pte_table(pte, ctx->level))
> > > > > + return 0;
> > > > > +
> > > > > + phys = kvm_pte_to_phys(pte);
> > > > > + prot = kvm_pgtable_stage2_pte_prot(pte);
> > > > > + stage2_set_prot_attr(data->mmu->pgt, prot, &attr);
> > > > > +
> > > > > + /*
> > > > > +  * Eager page splitting is best-effort, so we can ignore the 
> > > > > error.
> > > > > +  * The returned PTE (new) will be valid even if this call 
> > > > > returns
> > > > > +  * error: new will be a single (big) block PTE.  The only issue 
> > > > > is
> > > > > +  * that it will affect dirty logging performance, as the 
> > > > > huge-pages
> > > > > +  * will have to be split on fault, and so we WARN.
> > > > > +  */
> > > > > + WARN_ON(stage2_create_removed(&new, phys, level, attr, mc, 
> > > > > mm_ops));
> > > > 
> > > > I don't believe we should warn in this case, at least not
> > > > unconditionally. ENOMEM is an expected outcome, for example.
> > > 
> > > Given that "eager page splitting" is best-effort, the error must be
> > > ignored somewhere: either here or by the caller (in mmu.c). It seems
> > > that ignoring the error here is not a very good idea.
> > 
> > Actually, ignoring the error here simplifies the error handling.
> > stage2_create_removed() is best-effort; here's an example.  If
> > stage2_create_removed() was called to split a 1G block PTE, and it
> > wasn't able to split all 2MB blocks, it would return ENOMEM and a valid
> > PTE pointing to a tree like this:
> > 
> > [-1GB-]
> > : :
> > [--2MB--][--2MB--][--2MB--]
> > :   :
> > [ ][ ][ ]
> > 
> > If we returned ENOMEM instead of ignoring the error, we would have to
> > clean all the intermediate state.  But stage2_create_removed() is
> > designed to always return a valid PTE, even if the tree is not fully
> > split (as above).  So, there's really no need to clean it: it's a valid
> > tree. Moreover, this valid tree would result in better dirty logging
> > performance as it already has some 2M blocks split into 4K pages.
> 
> I have no issue with installing a partially-populated table, but
> unconditionally ignoring the return code and marching onwards seems
> dangerous. If you document the behavior of -ENOMEM on
> stage2_create_removed() and return early for anything else it may read a
> bit better.

This got me thinking. These partially-populated tables are complicating
> things too much for no good reason. They should be very rare, as the
caller will ensure there's enough memory in the memcache. So what do you
think of this other option?

https://github.com/ricarkol/linux/commit/54ba44f7d00d93cbaecebd148c102ca9d7c0e711

used here:

https://github.com/ricarkol/linux/commit/ff63a8744c18d5e4589911831e20ccb41712bda2#

It reuses the stage2 mapper to implement create_removed(), and makes
things simpler by only returning success when building a fully populated
tree. The caller doesn't need to clean anything in case of error.
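
For reference, Oliver's suggestion applied to stage2_split_walker() would
look roughly like the fragment below (illustration only, not code from the
series; 'ret' is a local added for the sketch):

  	int ret;

  	/*
  	 * Tolerate -ENOMEM as the expected best-effort case (the partially
  	 * split table is still valid and gets installed), and propagate any
  	 * other error instead of WARNing unconditionally.
  	 */
  	ret = stage2_create_removed(&new, phys, level, attr, mc, mm_ops);
  	if (ret && ret != -ENOMEM)
  		return ret;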

Re: [PATCH v3 2/2] KVM: arm64: Don't acquire RCU read lock for exclusive table walks

2022-11-17 Thread Oliver Upton
Hi Will,

Thanks for having a look.

On Thu, Nov 17, 2022 at 05:49:52PM +, Will Deacon wrote:
> On Wed, Nov 16, 2022 at 04:56:55PM +, Oliver Upton wrote:

[...]

> > -static inline void kvm_pgtable_walk_begin(void) {}
> > -static inline void kvm_pgtable_walk_end(void) {}
> > +static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker 
> > *walker)
> > +{
> > +   /*
> > +* Due to the lack of RCU (or a similar protection scheme), only
> > +* non-shared table walkers are allowed in the hypervisor.
> > +*/
> > +   WARN_ON(walker->flags & KVM_PGTABLE_WALK_SHARED);
> > +}
> 
> I think it would be better to propagate the error to the caller rather
> than WARN here.

I'd really like to warn somewhere though since we're rather fscked at
this point. Keeping that WARN close to the exceptional condition would
help w/ debugging.

Were you envisioning bubbling the error all the way back up (i.e. early
return from kvm_pgtable_walk())?

I had really only intended these to indirect lock acquisition/release,
so the error handling on the caller side feels weird:

  static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
  {
if (WARN_ON(walker->flags & KVM_PGTABLE_WALK_SHARED))
return -EPERM;

return 0;
  }

  r = kvm_pgtable_walk_begin()
  if (r)
return r;

  r = _kvm_pgtable_walk();
  kvm_pgtable_walk_end();

> Since you're rejigging things anyway, can you have this
> function return int?

If having this is a strong motivator I can do a v4.

--
Thanks,
Oliver
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
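
Putting Will's request and Oliver's WARN together, the non-hyp flavour of
these helpers would presumably end up looking something like this (a sketch
of the direction discussed, not the final patch; the helper that checks
walker->flags is assumed):

  static inline bool kvm_pgtable_walker_shared(struct kvm_pgtable_walker *walker)
  {
  	return walker->flags & KVM_PGTABLE_WALK_SHARED;
  }

  static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
  {
  	/* Shared walks are protected by RCU; exclusive walks skip it. */
  	if (kvm_pgtable_walker_shared(walker))
  		rcu_read_lock();

  	return 0;
  }

  static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker)
  {
  	if (kvm_pgtable_walker_shared(walker))
  		rcu_read_unlock();
  }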


Re: [PATCH v3 2/2] KVM: arm64: Don't acquire RCU read lock for exclusive table walks

2022-11-17 Thread Will Deacon
On Wed, Nov 16, 2022 at 04:56:55PM +, Oliver Upton wrote:
> Marek reported a BUG resulting from the recent parallel faults changes,
> as the hyp stage-1 map walker attempted to allocate table memory while
> holding the RCU read lock:
> 
>   BUG: sleeping function called from invalid context at
>   include/linux/sched/mm.h:274
>   in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
>   preempt_count: 0, expected: 0
>   RCU nest depth: 1, expected: 0
>   2 locks held by swapper/0/1:
> #0: 8a8a44d0 (kvm_hyp_pgd_mutex){+.+.}-{3:3}, at:
>   __create_hyp_mappings+0x80/0xc4
> #1: 8a927720 (rcu_read_lock){}-{1:2}, at:
>   kvm_pgtable_walk+0x0/0x1f4
>   CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc3+ #5918
>   Hardware name: Raspberry Pi 3 Model B (DT)
>   Call trace:
> dump_backtrace.part.0+0xe4/0xf0
> show_stack+0x18/0x40
> dump_stack_lvl+0x8c/0xb8
> dump_stack+0x18/0x34
> __might_resched+0x178/0x220
> __might_sleep+0x48/0xa0
> prepare_alloc_pages+0x178/0x1a0
> __alloc_pages+0x9c/0x109c
> alloc_page_interleave+0x1c/0xc4
> alloc_pages+0xec/0x160
> get_zeroed_page+0x1c/0x44
> kvm_hyp_zalloc_page+0x14/0x20
> hyp_map_walker+0xd4/0x134
> kvm_pgtable_visitor_cb.isra.0+0x38/0x5c
> __kvm_pgtable_walk+0x1a4/0x220
> kvm_pgtable_walk+0x104/0x1f4
> kvm_pgtable_hyp_map+0x80/0xc4
> __create_hyp_mappings+0x9c/0xc4
> kvm_mmu_init+0x144/0x1cc
> kvm_arch_init+0xe4/0xef4
> kvm_init+0x3c/0x3d0
> arm_init+0x20/0x30
> do_one_initcall+0x74/0x400
> kernel_init_freeable+0x2e0/0x350
> kernel_init+0x24/0x130
> ret_from_fork+0x10/0x20
> 
> Since the hyp stage-1 table walkers are serialized by kvm_hyp_pgd_mutex,
> RCU protection really doesn't add anything. Don't acquire the RCU read
> lock for an exclusive walk. While at it, add a warning which codifies
> the lack of support for shared walks in the hypervisor code.
> 
> Reported-by: Marek Szyprowski 
> Signed-off-by: Oliver Upton 
> ---
>  arch/arm64/include/asm/kvm_pgtable.h | 22 --
>  arch/arm64/kvm/hyp/pgtable.c |  4 ++--
>  2 files changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> b/arch/arm64/include/asm/kvm_pgtable.h
> index f23af693e3c5..a07fc5e35a8c 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -229,8 +229,16 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct 
> kvm_pgtable_walker *walke
>   return pteref;
>  }
>  
> -static inline void kvm_pgtable_walk_begin(void) {}
> -static inline void kvm_pgtable_walk_end(void) {}
> +static inline void kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
> +{
> + /*
> +  * Due to the lack of RCU (or a similar protection scheme), only
> +  * non-shared table walkers are allowed in the hypervisor.
> +  */
> + WARN_ON(walker->flags & KVM_PGTABLE_WALK_SHARED);
> +}

I think it would be better to propagate the error to the caller rather
than WARN here. Since you're rejigging things anyway, can you have this
function return int?

Will
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/2] KVM: arm64: selftests: Disable single-step without relying on ucall()

2022-11-17 Thread Sean Christopherson
On Thu, Nov 17, 2022, Sean Christopherson wrote:
> On Thu, Nov 17, 2022, Oliver Upton wrote:
> > On Thu, Nov 17, 2022 at 12:23:50AM +, Sean Christopherson wrote:
> > > Automatically disable single-step when the guest reaches the end of the
> > > verified section instead of using an explicit ucall() to ask userspace to
> > > disable single-step.  An upcoming change to implement a pool-based scheme
> > > for ucall() will add an atomic operation (bit test and set) in the guest
> > > ucall code, and if the compiler generates "old school" atomics, e.g.
> > 
> > Off topic, but I didn't ask when we were discussing this issue. What is
> > the atomic used for in the pool-based ucall implementation?
> 
> To avoid having to plumb an "id" into the guest, vCPUs grab a ucall entry from
> the pool on a first-come first-serve basis, and then release the entry when 
> the
> ucall is complete.  The current implementation is a bitmap, e.g. every 
> possible
> entry has a bit in the map, and vCPUs do an atomic bit-test-and-set to claim 
> an
> entry.
> 
> Ugh.  And there's a bug.  Of course I notice it after sending the pull 
> request.
> Despite being defined in atomic.h, and despite clear_bit() being atomic in the
> kernel, tools' clear_bit() isn't actually atomic.  Grr.
> 
> Doesn't cause problems because there are so few multi-vCPU selftests, but that
> needs to be fixed.  Best thing would be to fix clear_bit() itself.

Ha!  And I bet when clear_bit() is fixed, this test will start failing again
because the ucall() to activate single-step needs to release the entry _after_
exiting to the host, i.e. single-step will be enabled across the atomic region
again.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
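
For what it's worth, making tools' clear_bit() actually atomic could look
something like this (a sketch using a compiler builtin; the in-tree fix may
well differ):

  static inline void atomic_clear_bit(int nr, unsigned long *addr)
  {
  	unsigned long mask = 1UL << (nr % (8 * sizeof(unsigned long)));

  	addr += nr / (8 * sizeof(unsigned long));

  	/* Atomic AND-NOT, mirroring the kernel's clear_bit() semantics. */
  	__atomic_fetch_and(addr, ~mask, __ATOMIC_RELAXED);
  }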


Re: [PATCH 38/44] KVM: Disable CPU hotplug during hardware enabling

2022-11-17 Thread Sean Christopherson
On Thu, Nov 17, 2022, Huang, Kai wrote:
> On Wed, 2022-11-16 at 17:11 +, Sean Christopherson wrote:
> > static int kvm_x86_check_processor_compatibility(void)
> > {
> > int cpu = smp_processor_id();
> > struct cpuinfo_x86 *c = &cpu_data(cpu);
> > 
> > /*
> >  * Compatibility checks are done when loading KVM and when enabling
> >  * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
> >  * compatible, i.e. KVM should never perform a compatibility check 
> > on
> >  * an offline CPU.
> >  */
> > WARN_ON(!cpu_online(cpu));
> 
> Looks good to me.  Perhaps this also can be removed, though.

Hmm, it's a bit superfluous, but I think it could fire if KVM messed up CPU
hotplug again, e.g. if the for_each_online_cpu() => IPI raced with CPU unplug.

> And IMHO the removal of WARN_ON(!irq_disabled()) should be folded into the 
> patch
> "[PATCH 37/44] KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section". 
> Because moving from STARTING section to ONLINE section changes the IRQ status
> when the compatibility check is called.

Yep, that's what I have coded up, just smushed it all together here.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
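
For readers without the series in front of them, "disable CPU hotplug
during hardware enabling" boils down to holding cpus_read_lock() around the
cross-CPU enable, roughly as below (a sketch of the pattern under
discussion, not the patch itself; hardware_enable_nolock() and its
signature are assumptions):

  static int hardware_enable_all_sketch(void)
  {
  	int failed = 0;

  	/* No CPU can come or go while we IPI every online CPU. */
  	cpus_read_lock();
  	on_each_cpu(hardware_enable_nolock, &failed, 1);
  	cpus_read_unlock();

  	return failed ? -EBUSY : 0;
  }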


Re: [PATCH v3 0/8] perf: Arm SPEv1.2 support

2022-11-17 Thread Rob Herring
On Fri, Nov 4, 2022 at 10:55 AM Rob Herring  wrote:
>
> This series adds support for Arm SPEv1.2 which is part of the
> Armv8.7/Armv9.2 architecture. There are 2 new features that affect the
> kernel: a new event filter bit, branch 'not taken', and an inverted
> event filter register.
>
> Since this support adds new registers and fields, first the SPE register
> defines are converted to automatic generation.
>
> Note that the 'config3' addition in sysfs format files causes SPE to
> break. A stable fix e552b7be12ed ("perf: Skip and warn on unknown format
> 'configN' attrs") landed in v6.1-rc1.
>
> The perf tool side changes are available here[1].
>
> Tested on FVP.
>
> [1] 
> https://lore.kernel.org/all/20220914-arm-perf-tool-spe1-2-v2-v4-0-83c098e62...@kernel.org/
>
> Signed-off-by: Rob Herring 
> ---
> Changes in v3:
> - Add some more missing SPE register fields and use Enums for some
>   fields
> - Use the new PMSIDR_EL1 register Enum defines in the SPE driver
> - Link to v2: 
> https://lore.kernel.org/r/20220825-arm-spe-v8-7-v2-0-e37322d68...@kernel.org
>
> Changes in v2:
> - Convert the SPE register defines to automatic generation
> - Fixed access to SYS_PMSNEVFR_EL1 when not present
> - Rebase on v6.1-rc1
> - Link to v1: 
> https://lore.kernel.org/r/20220825-arm-spe-v8-7-v1-0-c75b8d92e...@kernel.org
>
> ---
> Rob Herring (8):
>   perf: arm_spe: Use feature numbering for PMSEVFR_EL1 defines
>   arm64: Drop SYS_ from SPE register defines
>   arm64/sysreg: Convert SPE registers to automatic generation
>   perf: arm_spe: Drop BIT() and use FIELD_GET/PREP accessors
>   perf: arm_spe: Use new PMSIDR_EL1 register enums
>   perf: arm_spe: Support new SPEv1.2/v8.7 'not taken' event
>   perf: Add perf_event_attr::config3
>   perf: arm_spe: Add support for SPEv1.2 inverted event filtering
>
>  arch/arm64/include/asm/el2_setup.h |   6 +-
>  arch/arm64/include/asm/sysreg.h|  99 +++
>  arch/arm64/kvm/debug.c |   2 +-
>  arch/arm64/kvm/hyp/nvhe/debug-sr.c |   2 +-
>  arch/arm64/tools/sysreg| 139 +
>  drivers/perf/arm_spe_pmu.c | 156 
> -
>  include/uapi/linux/perf_event.h|   3 +
>  7 files changed, 257 insertions(+), 150 deletions(-)

Will, any comments on this series?
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 0/2] KVM: arm64: selftests: Fixes for single-step test

2022-11-17 Thread Marc Zyngier
On Thu, 17 Nov 2022 00:23:48 +,
Sean Christopherson  wrote:
> 
> Marc,
> 
> I would like to route this through Paolo's tree/queue for 6.2 along with
> a big pile of other selftests updates.  I am hoping to get the selftests
> pile queued sooner than later as there is a lot of active development in
> that area, and don't want to have the selftests be in a broken state.
> I'm going to send Paolo a pull request shortly, I'll Cc you (and others)
> to keep everyone in the loop and give a chance for objections.
> 
> 
> 
> Fix a typo and an imminent not-technically-a-bug bug in the single-step
> test where executing an atomic sequence in the guest with single-step
> enabled will hang the guest due to eret clearing the local exclusive
> monitor.
> 
> 
> Sean Christopherson (2):
>   KVM: arm64: selftests: Disable single-step with correct KVM define
>   KVM: arm64: selftests: Disable single-step without relying on ucall()

I'm obviously late to the party, but hey... For the record:

Acked-by: Marc Zyngier 

M.

-- 
Without deviation from the norm, progress is not possible.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
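
For anyone wondering how single-stepping hangs an atomic sequence: without
LSE atomics the guest spins on a load-exclusive/store-exclusive pair, and
the eret back into the guest after each single-step exception clears the
local exclusive monitor, so the store-exclusive never succeeds. A sketch of
such a sequence (illustration only, not the selftests code):

  static inline void atomic_or_llsc(unsigned long *p, unsigned long val)
  {
  	unsigned long newval, status;

  	asm volatile(
  	"1:	ldaxr	%0, [%2]\n"		/* load-exclusive sets the monitor */
  	"	orr	%0, %0, %3\n"
  	"	stlxr	%w1, %0, [%2]\n"	/* fails if the monitor was cleared */
  	"	cbnz	%w1, 1b\n"		/* ...so under single-step we loop forever */
  	: "=&r" (newval), "=&r" (status)
  	: "r" (p), "r" (val)
  	: "memory");
  }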