Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On Mon, Feb 04, 2019 at 12:19:07PM +0100, Cédric Le Goater wrote: > On 2/4/19 5:25 AM, David Gibson wrote: > > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: > >> This is the basic framework for the new KVM device supporting the XIVE > >> native exploitation mode. The user interface exposes a new capability > >> and a new KVM device to be used by QEMU. > >> > >> Internally, the interface to the new KVM device is protected with a > >> new interrupt mode: KVMPPC_IRQ_XIVE. > >> > >> Signed-off-by: Cédric Le Goater > >> --- > >> arch/powerpc/include/asm/kvm_host.h | 2 + > >> arch/powerpc/include/asm/kvm_ppc.h| 21 ++ > >> arch/powerpc/kvm/book3s_xive.h| 3 + > >> include/uapi/linux/kvm.h | 3 + > >> arch/powerpc/kvm/book3s.c | 7 +- > >> arch/powerpc/kvm/book3s_xive_native.c | 332 ++ > >> arch/powerpc/kvm/powerpc.c| 30 +++ > >> arch/powerpc/kvm/Makefile | 2 +- > >> 8 files changed, 398 insertions(+), 2 deletions(-) > >> create mode 100644 arch/powerpc/kvm/book3s_xive_native.c > >> > >> diff --git a/arch/powerpc/include/asm/kvm_host.h > >> b/arch/powerpc/include/asm/kvm_host.h > >> index 0f98f00da2ea..c522e8274ad9 100644 > >> --- a/arch/powerpc/include/asm/kvm_host.h > >> +++ b/arch/powerpc/include/asm/kvm_host.h > >> @@ -220,6 +220,7 @@ extern struct kvm_device_ops kvm_xics_ops; > >> struct kvmppc_xive; > >> struct kvmppc_xive_vcpu; > >> extern struct kvm_device_ops kvm_xive_ops; > >> +extern struct kvm_device_ops kvm_xive_native_ops; > >> > >> struct kvmppc_passthru_irqmap; > >> > >> @@ -446,6 +447,7 @@ struct kvmppc_passthru_irqmap { > >> #define KVMPPC_IRQ_DEFAULT0 > >> #define KVMPPC_IRQ_MPIC 1 > >> #define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ > >> +#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */ > >> > >> #define MMIO_HPTE_CACHE_SIZE 4 > >> > >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h > >> b/arch/powerpc/include/asm/kvm_ppc.h > >> index eb0d79f0ca45..1bb313f238fe 100644 > >> --- a/arch/powerpc/include/asm/kvm_ppc.h > 
>> +++ b/arch/powerpc/include/asm/kvm_ppc.h > >> @@ -591,6 +591,18 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, > >> u64 icpval); > >> extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 > >> irq, > >> int level, bool line_status); > >> extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); > >> + > >> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > >> +{ > >> + return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE; > >> +} > >> + > >> +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > >> + struct kvm_vcpu *vcpu, u32 cpu); > >> +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); > >> +extern void kvmppc_xive_native_init_module(void); > >> +extern void kvmppc_xive_native_exit_module(void); > >> + > >> #else > >> static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 > >> server, > >> u32 priority) { return -1; } > >> @@ -614,6 +626,15 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu > >> *vcpu, u64 icpval) { retur > >> static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, > >> u32 irq, > >> int level, bool line_status) { return > >> -ENODEV; } > >> static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } > >> + > >> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > >> + { return 0; } > >> +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > >> +struct kvm_vcpu *vcpu, u32 > >> cpu) { return -EBUSY; } > >> +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) > >> { } > >> +static inline void kvmppc_xive_native_init_module(void) { } > >> +static inline void kvmppc_xive_native_exit_module(void) { } > >> + > >> #endif /* CONFIG_KVM_XIVE */ > >> > >> /* > >> diff --git a/arch/powerpc/kvm/book3s_xive.h > >> b/arch/powerpc/kvm/book3s_xive.h > >> index 10c4aa5cd010..5f22415520b4 100644 > >> --- a/arch/powerpc/kvm/book3s_xive.h > >> +++ b/arch/powerpc/kvm/book3s_xive.h > >> @@ -12,6 +12,9 
@@ > >> #ifdef CONFIG_KVM_XICS > >> #include "book3s_xics.h" > >> > >> +#define KVMPPC_XIVE_FIRST_IRQ 0 > >> +#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS > >> + > >> /* > >> * State for one guest irq source. > >> * > >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > >> index 6d4ea4b6c922..52bf74a1616e 100644 > >> --- a/include/uapi/linux/kvm.h > >> +++ b/include/uapi/linux/kvm.h > >> @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { > >> #define KVM_CAP_ARM_VM_IPA_SIZE 165 > >> #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 > >> #define KVM_CAP_HYPERV_CPUID 167 > >> +#define
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On 2/4/19 5:25 AM, David Gibson wrote: > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: >> This is the basic framework for the new KVM device supporting the XIVE >> native exploitation mode. The user interface exposes a new capability >> and a new KVM device to be used by QEMU. >> >> Internally, the interface to the new KVM device is protected with a >> new interrupt mode: KVMPPC_IRQ_XIVE. >> >> Signed-off-by: Cédric Le Goater >> --- >> arch/powerpc/include/asm/kvm_host.h | 2 + >> arch/powerpc/include/asm/kvm_ppc.h| 21 ++ >> arch/powerpc/kvm/book3s_xive.h| 3 + >> include/uapi/linux/kvm.h | 3 + >> arch/powerpc/kvm/book3s.c | 7 +- >> arch/powerpc/kvm/book3s_xive_native.c | 332 ++ >> arch/powerpc/kvm/powerpc.c| 30 +++ >> arch/powerpc/kvm/Makefile | 2 +- >> 8 files changed, 398 insertions(+), 2 deletions(-) >> create mode 100644 arch/powerpc/kvm/book3s_xive_native.c >> >> diff --git a/arch/powerpc/include/asm/kvm_host.h >> b/arch/powerpc/include/asm/kvm_host.h >> index 0f98f00da2ea..c522e8274ad9 100644 >> --- a/arch/powerpc/include/asm/kvm_host.h >> +++ b/arch/powerpc/include/asm/kvm_host.h >> @@ -220,6 +220,7 @@ extern struct kvm_device_ops kvm_xics_ops; >> struct kvmppc_xive; >> struct kvmppc_xive_vcpu; >> extern struct kvm_device_ops kvm_xive_ops; >> +extern struct kvm_device_ops kvm_xive_native_ops; >> >> struct kvmppc_passthru_irqmap; >> >> @@ -446,6 +447,7 @@ struct kvmppc_passthru_irqmap { >> #define KVMPPC_IRQ_DEFAULT 0 >> #define KVMPPC_IRQ_MPIC 1 >> #define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ >> +#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */ >> >> #define MMIO_HPTE_CACHE_SIZE4 >> >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h >> b/arch/powerpc/include/asm/kvm_ppc.h >> index eb0d79f0ca45..1bb313f238fe 100644 >> --- a/arch/powerpc/include/asm/kvm_ppc.h >> +++ b/arch/powerpc/include/asm/kvm_ppc.h >> @@ -591,6 +591,18 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, >> u64 icpval); >> extern int 
kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, >> int level, bool line_status); >> extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); >> + >> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) >> +{ >> +return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE; >> +} >> + >> +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, >> +struct kvm_vcpu *vcpu, u32 cpu); >> +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); >> +extern void kvmppc_xive_native_init_module(void); >> +extern void kvmppc_xive_native_exit_module(void); >> + >> #else >> static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, >> u32 priority) { return -1; } >> @@ -614,6 +626,15 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu >> *vcpu, u64 icpval) { retur >> static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, >> u32 irq, >>int level, bool line_status) { return >> -ENODEV; } >> static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } >> + >> +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) >> +{ return 0; } >> +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, >> + struct kvm_vcpu *vcpu, u32 >> cpu) { return -EBUSY; } >> +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { >> } >> +static inline void kvmppc_xive_native_init_module(void) { } >> +static inline void kvmppc_xive_native_exit_module(void) { } >> + >> #endif /* CONFIG_KVM_XIVE */ >> >> /* >> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h >> index 10c4aa5cd010..5f22415520b4 100644 >> --- a/arch/powerpc/kvm/book3s_xive.h >> +++ b/arch/powerpc/kvm/book3s_xive.h >> @@ -12,6 +12,9 @@ >> #ifdef CONFIG_KVM_XICS >> #include "book3s_xics.h" >> >> +#define KVMPPC_XIVE_FIRST_IRQ 0 >> +#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS >> + >> /* >> * State for one guest irq source. 
>> * >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h >> index 6d4ea4b6c922..52bf74a1616e 100644 >> --- a/include/uapi/linux/kvm.h >> +++ b/include/uapi/linux/kvm.h >> @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { >> #define KVM_CAP_ARM_VM_IPA_SIZE 165 >> #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 >> #define KVM_CAP_HYPERV_CPUID 167 >> +#define KVM_CAP_PPC_IRQ_XIVE 168 >> >> #ifdef KVM_CAP_IRQ_ROUTING >> >> @@ -1211,6 +1212,8 @@ enum kvm_device_type { >> #define KVM_DEV_TYPE_ARM_VGIC_V3KVM_DEV_TYPE_ARM_VGIC_V3 >> KVM_DEV_TYPE_ARM_VGIC_ITS, >> #define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS >> +KVM_DEV_TYPE_XIVE, >>
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: > This is the basic framework for the new KVM device supporting the XIVE > native exploitation mode. The user interface exposes a new capability > and a new KVM device to be used by QEMU. > > Internally, the interface to the new KVM device is protected with a > new interrupt mode: KVMPPC_IRQ_XIVE. > > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/include/asm/kvm_host.h | 2 + > arch/powerpc/include/asm/kvm_ppc.h| 21 ++ > arch/powerpc/kvm/book3s_xive.h| 3 + > include/uapi/linux/kvm.h | 3 + > arch/powerpc/kvm/book3s.c | 7 +- > arch/powerpc/kvm/book3s_xive_native.c | 332 ++ > arch/powerpc/kvm/powerpc.c| 30 +++ > arch/powerpc/kvm/Makefile | 2 +- > 8 files changed, 398 insertions(+), 2 deletions(-) > create mode 100644 arch/powerpc/kvm/book3s_xive_native.c > > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > index 0f98f00da2ea..c522e8274ad9 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -220,6 +220,7 @@ extern struct kvm_device_ops kvm_xics_ops; > struct kvmppc_xive; > struct kvmppc_xive_vcpu; > extern struct kvm_device_ops kvm_xive_ops; > +extern struct kvm_device_ops kvm_xive_native_ops; > > struct kvmppc_passthru_irqmap; > > @@ -446,6 +447,7 @@ struct kvmppc_passthru_irqmap { > #define KVMPPC_IRQ_DEFAULT 0 > #define KVMPPC_IRQ_MPIC 1 > #define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ > +#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */ > > #define MMIO_HPTE_CACHE_SIZE 4 > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h > b/arch/powerpc/include/asm/kvm_ppc.h > index eb0d79f0ca45..1bb313f238fe 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -591,6 +591,18 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, > u64 icpval); > extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, > int level, bool line_status); > 
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); > + > +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > +{ > + return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE; > +} > + > +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > + struct kvm_vcpu *vcpu, u32 cpu); > +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); > +extern void kvmppc_xive_native_init_module(void); > +extern void kvmppc_xive_native_exit_module(void); > + > #else > static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, > u32 priority) { return -1; } > @@ -614,6 +626,15 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu > *vcpu, u64 icpval) { retur > static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, > u32 irq, > int level, bool line_status) { return > -ENODEV; } > static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } > + > +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > + { return 0; } > +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, > + struct kvm_vcpu *vcpu, u32 > cpu) { return -EBUSY; } > +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { } > +static inline void kvmppc_xive_native_init_module(void) { } > +static inline void kvmppc_xive_native_exit_module(void) { } > + > #endif /* CONFIG_KVM_XIVE */ > > /* > diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h > index 10c4aa5cd010..5f22415520b4 100644 > --- a/arch/powerpc/kvm/book3s_xive.h > +++ b/arch/powerpc/kvm/book3s_xive.h > @@ -12,6 +12,9 @@ > #ifdef CONFIG_KVM_XICS > #include "book3s_xics.h" > > +#define KVMPPC_XIVE_FIRST_IRQ0 > +#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS > + > /* > * State for one guest irq source. 
> * > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 6d4ea4b6c922..52bf74a1616e 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_ARM_VM_IPA_SIZE 165 > #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 > #define KVM_CAP_HYPERV_CPUID 167 > +#define KVM_CAP_PPC_IRQ_XIVE 168 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1211,6 +1212,8 @@ enum kvm_device_type { > #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 > KVM_DEV_TYPE_ARM_VGIC_ITS, > #define KVM_DEV_TYPE_ARM_VGIC_ITSKVM_DEV_TYPE_ARM_VGIC_ITS > + KVM_DEV_TYPE_XIVE, > +#define KVM_DEV_TYPE_XIVEKVM_DEV_TYPE_XIVE > KVM_DEV_TYPE_MAX, > }; > > diff --git a/arch/powerpc/kvm/book3s.c
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On 1/31/19 4:01 AM, Paul Mackerras wrote: > On Wed, Jan 30, 2019 at 08:01:22AM +0100, Cédric Le Goater wrote: >> On 1/30/19 5:29 AM, Paul Mackerras wrote: >>> On Mon, Jan 28, 2019 at 06:35:34PM +0100, Cédric Le Goater wrote: On 1/22/19 6:05 AM, Paul Mackerras wrote: > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: >> This is the basic framework for the new KVM device supporting the XIVE >> native exploitation mode. The user interface exposes a new capability >> and a new KVM device to be used by QEMU. > > [snip] >> @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) >> #ifdef CONFIG_KVM_XIVE >> if (xive_enabled()) { >> kvmppc_xive_init_module(); >> +kvmppc_xive_native_init_module(); >> kvm_register_device_ops(_xive_ops, >> KVM_DEV_TYPE_XICS); >> +kvm_register_device_ops(_xive_native_ops, >> +KVM_DEV_TYPE_XIVE); > > I think we want tighter conditions on initializing the xive_native > stuff and creating the xive device class. We could have > xive_enabled() returning true in a guest, and this code will get > called both by PR KVM and HV KVM (and HV KVM no longer implies that we > are running bare metal). So yes, I gave nested a try with kernel_irqchip=on and the nested hypervisor (L1) obviously crashes trying to call OPAL. I have tighten the test with : if (xive_enabled() && !kvmhv_on_pseries()) { for now. As this is a problem today in 5.0.x, I will send a patch for it if you think >>> >>> How do you mean this is a problem today in 5.0? I just tried 5.0-rc1 >>> with kernel_irqchip=on in a nested guest and it works just fine. What >>> exactly did you test? >> >> L0: Linux 5.0.0-rc3 (+ KVM HV) >> L1: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 (+ KVM HV) >> L2: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 >> >> L1 crashes when L2 starts and tries to initialize the KVM IRQ device as >> it does an OPAL call and its running under SLOF. See below. > > OK, you must have a QEMU that advertises XIVE to the guest (L1). 
XIVE is not advertised if QEMU is started with 'ic-mode=xics'. > In > that case I can see that L1 would try to do XICS-on-XIVE, which won't > work. We need to fix that. Unfortunately the XICS-on-XICS emulation > won't work as is in L1 either, but I think we can fix that by > disabling the real-mode XICS hcall handling. I have added some tests on kvm-hv, using kvmhv_on_pseries(), to disable the KVM XICS-on-XIVE device in an L1 guest running as hypervisor and to instead register the old KVM XICS device. If the L1 is started in KVM XICS mode, L2 can now run with KVM XICS. All seems fine. I booted two guests with disk and network. But I am still "a bit" confused about what is being done at each hypervisor level. It's not obvious to follow at all, even with traces. >> I don't understand how L2 can work with kernel_irqchip=on. Could you >> please explain ? > > If QEMU decides to advertise XIVE to the L2 guest and the L2 guest can > do XIVE, then the only possibility is to use the XIVE software > emulation in QEMU, and if kernel_irqchip=on has been specified > explicitly, maybe QEMU decides to terminate the guest rather than > implicitly turning off kernel_irqchip. We can do that by disabling the KVM XIVE device when under kvmhv_on_pseries(). > If QEMU decides not to advertise XIVE to the L2 guest, or the L2 guest > can't do XIVE, then we could use the XICS-on-XICS emulation in L1 as > long as either (a) L1 is not using XIVE, or (b) we modify the > XICS-on-XICS code to avoid using any XICS or XIVE access (i.e. just > using calls to generic kernel facilities). (a) is what I did above, I think. Maybe we should consider having a nested version of the KVM devices when under kvmhv_on_pseries(), with some sort of backend ops to modify the relation with the parent hypervisor: PowerNV/Linux or pseries/Linux.
> Ultimately, if the spapr xive backend code in the kernel could be > extended to provide all the low-level functions that the XICS-on-XIVE > code needs, then we could do XICS-on-XIVE in a guest. What about XIVE-on-XIVE? Propagating the ESB pages to a nested guest seems feasible, if not already done. The hcalls could be forwarded to the L1 QEMU? The problematic part is handling the XIVE VP block. C.
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On Wed, Jan 30, 2019 at 08:01:22AM +0100, Cédric Le Goater wrote: > On 1/30/19 5:29 AM, Paul Mackerras wrote: > > On Mon, Jan 28, 2019 at 06:35:34PM +0100, Cédric Le Goater wrote: > >> On 1/22/19 6:05 AM, Paul Mackerras wrote: > >>> On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: > This is the basic framework for the new KVM device supporting the XIVE > native exploitation mode. The user interface exposes a new capability > and a new KVM device to be used by QEMU. > >>> > >>> [snip] > @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) > #ifdef CONFIG_KVM_XIVE > if (xive_enabled()) { > kvmppc_xive_init_module(); > +kvmppc_xive_native_init_module(); > kvm_register_device_ops(_xive_ops, > KVM_DEV_TYPE_XICS); > +kvm_register_device_ops(_xive_native_ops, > +KVM_DEV_TYPE_XIVE); > >>> > >>> I think we want tighter conditions on initializing the xive_native > >>> stuff and creating the xive device class. We could have > >>> xive_enabled() returning true in a guest, and this code will get > >>> called both by PR KVM and HV KVM (and HV KVM no longer implies that we > >>> are running bare metal). > >> > >> So yes, I gave nested a try with kernel_irqchip=on and the nested > >> hypervisor > >> (L1) obviously crashes trying to call OPAL. I have tighten the test with : > >> > >>if (xive_enabled() && !kvmhv_on_pseries()) { > >> > >> for now. > >> > >> As this is a problem today in 5.0.x, I will send a patch for it if you > >> think > > > > How do you mean this is a problem today in 5.0? I just tried 5.0-rc1 > > with kernel_irqchip=on in a nested guest and it works just fine. What > > exactly did you test? > > L0: Linux 5.0.0-rc3 (+ KVM HV) > L1: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 (+ KVM HV) > L2: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 > > L1 crashes when L2 starts and tries to initialize the KVM IRQ device as > it does an OPAL call and its running under SLOF. See below. 
OK, you must have a QEMU that advertises XIVE to the guest (L1). In that case I can see that L1 would try to do XICS-on-XIVE, which won't work. We need to fix that. Unfortunately the XICS-on-XICS emulation won't work as is in L1 either, but I think we can fix that by disabling the real-mode XICS hcall handling. > I don't understand how L2 can work with kernel_irqchip=on. Could you > please explain ? If QEMU decides to advertise XIVE to the L2 guest and the L2 guest can do XIVE, then the only possibility is to use the XIVE software emulation in QEMU, and if kernel_irqchip=on has been specified explicitly, maybe QEMU decides to terminate the guest rather than implicitly turning off kernel_irqchip. If QEMU decides not to advertise XIVE to the L2 guest, or the L2 guest can't do XIVE, then we could use the XICS-on-XICS emulation in L1 as long as either (a) L1 is not using XIVE, or (b) we modify the XICS-on-XICS code to avoid using any XICS or XIVE access (i.e. just using calls to generic kernel facilities). Ultimately, if the spapr xive backend code in the kernel could be extended to provide all the low-level functions that the XICS-on-XIVE code needs, then we could do XICS-on-XIVE in a guest. Paul.
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On 1/30/19 5:29 AM, Paul Mackerras wrote: > On Mon, Jan 28, 2019 at 06:35:34PM +0100, Cédric Le Goater wrote: >> On 1/22/19 6:05 AM, Paul Mackerras wrote: >>> On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: This is the basic framework for the new KVM device supporting the XIVE native exploitation mode. The user interface exposes a new capability and a new KVM device to be used by QEMU. >>> >>> [snip] @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_XIVE if (xive_enabled()) { kvmppc_xive_init_module(); + kvmppc_xive_native_init_module(); kvm_register_device_ops(_xive_ops, KVM_DEV_TYPE_XICS); + kvm_register_device_ops(_xive_native_ops, + KVM_DEV_TYPE_XIVE); >>> >>> I think we want tighter conditions on initializing the xive_native >>> stuff and creating the xive device class. We could have >>> xive_enabled() returning true in a guest, and this code will get >>> called both by PR KVM and HV KVM (and HV KVM no longer implies that we >>> are running bare metal). >> >> So yes, I gave nested a try with kernel_irqchip=on and the nested hypervisor >> (L1) obviously crashes trying to call OPAL. I have tighten the test with : >> >> if (xive_enabled() && !kvmhv_on_pseries()) { >> >> for now. >> >> As this is a problem today in 5.0.x, I will send a patch for it if you think > > How do you mean this is a problem today in 5.0? I just tried 5.0-rc1 > with kernel_irqchip=on in a nested guest and it works just fine. What > exactly did you test? L0: Linux 5.0.0-rc3 (+ KVM HV) L1: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 (+ KVM HV) L2: QEMU pseries-4.0 (kernel_irqchip=on) - Linux 5.0.0-rc3 L1 crashes when L2 starts and tries to initialize the KVM IRQ device as it does an OPAL call and its running under SLOF. See below. I don't understand how L2 can work with kernel_irqchip=on. Could you please explain ? >> it is correct. I don't think we should bother taking care of the PR case >> on P9. Should we ? 
> > We do need to take care of PR KVM on P9, since it is the only form of > nested KVM that works inside a host in HPT mode. ok. That is the test case. There are quite a few combinations now. Thanks, C. [ 49.547056] Oops: Exception in kernel mode, sig: 4 [#1] [ 49.555101] LE SMP NR_CPUS=2048 NUMA pSeries [ 49.555132] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter vmx_crypto crct10dif_vpmsum crc32c_vpmsum kvm_hv kvm sch_fq_codel ip_tables x_tables autofs4 virtio_net net_failover failover virtio_scsi [ 49.555335] CPU: 9 PID: 2162 Comm: qemu-system-ppc Kdump: loaded Not tainted 5.0.0-rc3+ #53 [ 49.555378] NIP: c00a7548 LR: c00a4044 CTR: c00a24b0 [ 49.555421] REGS: c003ad71f8a0 TRAP: 0700 Not tainted (5.0.0-rc3+) [ 49.555456] MSR: 80041033 CR: 44222822 XER: 2004 [ 49.01] CFAR: c00a2508 IRQMASK: 0 [ 49.01] GPR00: 0087 c003ad71fb30 c175f700 000b [ 49.01] GPR04: c003f88d4000 000b [ 49.01] GPR08: 0003fd80 000b 0800 0031 [ 49.01] GPR12: 80001002 c7ff3280 [ 49.01] GPR16: 78d2bd60 02c9896d7800 78d2b970 [ 49.01] GPR20: 02c95c876f90 02c95c876fa0 02c95c876f80 02c95c876f70 [ 49.01] GPR24: 02c95cf4f648 c003ab3e4058 006000c0 [ 49.01] GPR28: 000b c003ab3e c003f88d [ 49.555883] NIP [c00a7548] opal_xive_alloc_vp_block+0x50/0x68 [ 49.555919] LR [c00a4044] opal_return+0x0/0x48 [ 49.555947] Call Trace: [ 49.555964] [c003ad71fb30] [c00a250c] xive_native_alloc_vp_block+0x5c/0x1c0 (unreliable) [ 49.556019] [c003ad71fbc0] [c0080430c0c0] kvmppc_xive_create+0x98/0x168 [kvm] [ 49.556065] [c003ad71fc00] [c008042f9fcc] kvm_vm_ioctl+0x474/0xa00 [kvm] [ 49.556113] [c003ad71fd10] [c0423a64] do_vfs_ioctl+0xd4/0x8e0 [ 49.556153] [c003ad71fdb0] [c0424334] ksys_ioctl+0xc4/0x110 [ 49.556190] [c003ad71fe00] [c04243a8] sys_ioctl+0x28/0x80 [ 49.556230] [c003ad71fe20] [c000b288] 
system_call+0x5c/0x70 [ 49.556265] Instruction dump: [ 49.556288] 6000 7d600026 91610008 3960 616b8000 f98d0980 7d8c5878 7d810164 [ 49.556332] e9628098 7d6803a6 39600031 7d8c5878 <7d9b4ba6> e96280b0 e98b0008 e84b [ 49.556378] ---[ end trace ac7420a6784de93b ]---
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On Mon, Jan 28, 2019 at 06:35:34PM +0100, Cédric Le Goater wrote: > On 1/22/19 6:05 AM, Paul Mackerras wrote: > > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: > >> This is the basic framework for the new KVM device supporting the XIVE > >> native exploitation mode. The user interface exposes a new capability > >> and a new KVM device to be used by QEMU. > > > > [snip] > >> @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) > >> #ifdef CONFIG_KVM_XIVE > >>if (xive_enabled()) { > >>kvmppc_xive_init_module(); > >> + kvmppc_xive_native_init_module(); > >>kvm_register_device_ops(_xive_ops, KVM_DEV_TYPE_XICS); > >> + kvm_register_device_ops(_xive_native_ops, > >> + KVM_DEV_TYPE_XIVE); > > > > I think we want tighter conditions on initializing the xive_native > > stuff and creating the xive device class. We could have > > xive_enabled() returning true in a guest, and this code will get > > called both by PR KVM and HV KVM (and HV KVM no longer implies that we > > are running bare metal). > > So yes, I gave nested a try with kernel_irqchip=on and the nested hypervisor > (L1) obviously crashes trying to call OPAL. I have tighten the test with : > > if (xive_enabled() && !kvmhv_on_pseries()) { > > for now. > > As this is a problem today in 5.0.x, I will send a patch for it if you think How do you mean this is a problem today in 5.0? I just tried 5.0-rc1 with kernel_irqchip=on in a nested guest and it works just fine. What exactly did you test? > it is correct. I don't think we should bother taking care of the PR case > on P9. Should we ? We do need to take care of PR KVM on P9, since it is the only form of nested KVM that works inside a host in HPT mode. Paul.
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On 1/22/19 6:05 AM, Paul Mackerras wrote: > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: >> This is the basic framework for the new KVM device supporting the XIVE >> native exploitation mode. The user interface exposes a new capability >> and a new KVM device to be used by QEMU. > > [snip] >> @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) >> #ifdef CONFIG_KVM_XIVE >> if (xive_enabled()) { >> kvmppc_xive_init_module(); >> +kvmppc_xive_native_init_module(); >> kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); >> +kvm_register_device_ops(&kvm_xive_native_ops, >> +KVM_DEV_TYPE_XIVE); > > I think we want tighter conditions on initializing the xive_native > stuff and creating the xive device class. We could have > xive_enabled() returning true in a guest, and this code will get > called both by PR KVM and HV KVM (and HV KVM no longer implies that we > are running bare metal). So yes, I gave nested a try with kernel_irqchip=on and the nested hypervisor (L1) obviously crashes trying to call OPAL. I have tightened the test with: if (xive_enabled() && !kvmhv_on_pseries()) { for now. As this is a problem today in 5.0.x, I will send a patch for it if you think it is correct. I don't think we should bother taking care of the PR case on P9. Should we? Thanks, C. >> @@ -1050,8 +1053,10 @@ static int kvmppc_book3s_init(void) >> static void kvmppc_book3s_exit(void) >> { >> #ifdef CONFIG_KVM_XICS >> -if (xive_enabled()) >> +if (xive_enabled()) { >> kvmppc_xive_exit_module(); >> +kvmppc_xive_native_exit_module(); > > Same comment here. > > Paul. >
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On 1/22/19 6:05 AM, Paul Mackerras wrote: > On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: >> This is the basic framework for the new KVM device supporting the XIVE >> native exploitation mode. The user interface exposes a new capability >> and a new KVM device to be used by QEMU. > > [snip] >> @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) >> #ifdef CONFIG_KVM_XIVE >> if (xive_enabled()) { >> kvmppc_xive_init_module(); >> +kvmppc_xive_native_init_module(); >> kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); >> +kvm_register_device_ops(&kvm_xive_native_ops, >> +KVM_DEV_TYPE_XIVE); > > I think we want tighter conditions on initializing the xive_native > stuff and creating the xive device class. We could have > xive_enabled() returning true in a guest, and this code will get > called both by PR KVM and HV KVM (and HV KVM no longer implies that we > are running bare metal). Ah yes, I agree. I haven't addressed the nested flavor at all. I have some questions about this that I will ask in reply to the summary email you sent. Thanks, C. > >> @@ -1050,8 +1053,10 @@ static int kvmppc_book3s_init(void) >> static void kvmppc_book3s_exit(void) >> { >> #ifdef CONFIG_KVM_XICS >> -if (xive_enabled()) >> +if (xive_enabled()) { >> kvmppc_xive_exit_module(); >> +kvmppc_xive_native_exit_module(); > > Same comment here. > > Paul. >
Re: [PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
On Mon, Jan 07, 2019 at 07:43:17PM +0100, Cédric Le Goater wrote: > This is the basic framework for the new KVM device supporting the XIVE > native exploitation mode. The user interface exposes a new capability > and a new KVM device to be used by QEMU. [snip] > @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) > #ifdef CONFIG_KVM_XIVE > if (xive_enabled()) { > kvmppc_xive_init_module(); > + kvmppc_xive_native_init_module(); > kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); > + kvm_register_device_ops(&kvm_xive_native_ops, > + KVM_DEV_TYPE_XIVE); I think we want tighter conditions on initializing the xive_native stuff and creating the xive device class. We could have xive_enabled() returning true in a guest, and this code will get called both by PR KVM and HV KVM (and HV KVM no longer implies that we are running bare metal). > @@ -1050,8 +1053,10 @@ static int kvmppc_book3s_init(void) > static void kvmppc_book3s_exit(void) > { > #ifdef CONFIG_KVM_XICS > - if (xive_enabled()) > + if (xive_enabled()) { > kvmppc_xive_exit_module(); > + kvmppc_xive_native_exit_module(); Same comment here. Paul.
[PATCH 05/19] KVM: PPC: Book3S HV: add a new KVM device for the XIVE native exploitation mode
This is the basic framework for the new KVM device supporting the XIVE native exploitation mode. The user interface exposes a new capability and a new KVM device to be used by QEMU. Internally, the interface to the new KVM device is protected with a new interrupt mode: KVMPPC_IRQ_XIVE. Signed-off-by: Cédric Le Goater --- arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/include/asm/kvm_ppc.h | 21 ++ arch/powerpc/kvm/book3s_xive.h | 3 + include/uapi/linux/kvm.h | 3 + arch/powerpc/kvm/book3s.c | 7 +- arch/powerpc/kvm/book3s_xive_native.c | 332 ++ arch/powerpc/kvm/powerpc.c | 30 +++ arch/powerpc/kvm/Makefile | 2 +- 8 files changed, 398 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_xive_native.c diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f98f00da2ea..c522e8274ad9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -220,6 +220,7 @@ extern struct kvm_device_ops kvm_xics_ops; struct kvmppc_xive; struct kvmppc_xive_vcpu; extern struct kvm_device_ops kvm_xive_ops; +extern struct kvm_device_ops kvm_xive_native_ops; struct kvmppc_passthru_irqmap; @@ -446,6 +447,7 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ +#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */ #define MMIO_HPTE_CACHE_SIZE 4 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eb0d79f0ca45..1bb313f238fe 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -591,6 +591,18 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); + +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.irq_type == 
KVMPPC_IRQ_XIVE; +} + +extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu); +extern void kvmppc_xive_native_init_module(void); +extern void kvmppc_xive_native_exit_module(void); + #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -614,6 +626,15 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } + +static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) + { return 0; } +static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; } +static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { } +static inline void kvmppc_xive_native_init_module(void) { } +static inline void kvmppc_xive_native_exit_module(void) { } + #endif /* CONFIG_KVM_XIVE */ /* diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 10c4aa5cd010..5f22415520b4 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -12,6 +12,9 @@ #ifdef CONFIG_KVM_XICS #include "book3s_xics.h" +#define KVMPPC_XIVE_FIRST_IRQ 0 +#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS + /* * State for one guest irq source. 
* diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6d4ea4b6c922..52bf74a1616e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -988,6 +988,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_VM_IPA_SIZE 165 #define KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 166 #define KVM_CAP_HYPERV_CPUID 167 +#define KVM_CAP_PPC_IRQ_XIVE 168 #ifdef KVM_CAP_IRQ_ROUTING @@ -1211,6 +1212,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_ITS, #define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS + KVM_DEV_TYPE_XIVE, +#define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_MAX, }; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index bd1a677dd9e4..de7eed191107 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1039,7 +1039,10 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_XIVE if (xive_enabled()) { kvmppc_xive_init_module(); +