Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. 
return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. So what do you mean setting it from userspace? 
return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. 
So what do you mean setting it from userspace? It can be reset by power cycling CPU obviously, so when userspace emulates power cycle reset it has to be able to zero the MSR. BTW you need to write corespondent QEMU patch to handle this MSR during reset and migration. return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts
On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote: On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote: On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote: On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin wrote: On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote: Enable support for MSI interrupts if the device supports it. Since MSI interrupts are edge triggered, it is no longer necessary to disable interrupts in the kernel and re-enable them from user-space. Instead, clearing the interrupt condition in the user space application automatically re-enables the interrupt. Signed-off-by: Guenter Roeck li...@roeck-us.net --- An open question is if we can just do this unconditionally or if there should be some flag to enable it. A module parameter, maybe ? NACK UIO is for devices that don't do memory writes. Anything that can do writes must be protected by an IOMMU and/or have a secure kernel driver, not a UIO stub. MSI is done by memory writes so if userspace controls the device it can trick it to write anywhere in memory. Just out of curiosity: Since MSI support is mandatory for all PCIE devices, isn't that possible anyway, even if MSI is not enabled by the kernel ? All one would need to do is to enable MSI from user space; after all, the chip configuration space is writable. Thanks, Guenter If a device has capability to do writes, sure. So don't do this then :) Not an option. I need to use MSI. Not that it matters anymore - turns out it was better writing a specific driver for my devices anyway; I needed to be able to disable chip interrupts before unloading the driver. But why is it then a reason to NACK this patch ? There seem to be two cases - either you can't access the device - and the uio driver is not useful - or you can, and it's not safe. 
In both cases the patch does not seem to bring about anything except user confusion ... Sounds like claiming that supporting MSI would cause some kind of confusion. Not sure if I can follow that logic. Actually, it simplifies the user space code a lot. Since MSI interrupts are edge triggered and queued, it is not necessary to disable the interrupts, and all user space has to do is to remove the interrupt reason. Works quite nicely for our devices. As I said, I don't really care too much anymore if this patch is rejected, but the reasons for the rejection are kind of weak. Besides, doesn't one have to be root anyway to perform such activities, which could then be more easily accomplished by writing into /dev/mem ? Thanks, Guenter root might not be able to write into /dev/mem. I don't really want to try, but at least it is marked rw for root. Guenter Yes but mechanisms such as selinux can still block it. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 60505] Heavy network traffic triggers vhost_net lockup
https://bugzilla.kernel.org/show_bug.cgi?id=60505 Michael S. Tsirkin m.s.tsir...@gmail.com changed: What|Removed |Added CC||m.s.tsir...@gmail.com --- Comment #2 from Michael S. Tsirkin m.s.tsir...@gmail.com --- does this still trigger if you disable zerocopy tx? -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. 
As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. So what do you mean setting it from userspace? It can be reset by power cycling CPU obviously, so when userspace emulates power cycle reset it has to be able to zero the MSR. BTW you need to write corespondent QEMU patch to handle this MSR during reset and migration. What is the current call trace when QEMU do system_reset? Will it trigger vmx_vcpu_reset() in vmx.c? return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- Gleb. -- Arthur Chunqi Li Department of Computer Science School of EECS Peking University Beijing, China -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. 
*/ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. So what do you mean setting it from userspace? It can be reset by power cycling CPU obviously, so when userspace emulates power cycle reset it has to be able to zero the MSR. BTW you need to write corespondent QEMU patch to handle this MSR during reset and migration. What is the current call trace when QEMU do system_reset? Will it trigger vmx_vcpu_reset() in vmx.c? It will not. QEMU inits vcpu state in userspace and write it into the kernel with set_regs/set_srges ioctls. return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- Gleb. 
-- Arthur Chunqi Li Department of Computer Science School of EECS Peking University Beijing, China -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 7, 2013 at 4:50 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. 
*/ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)-nested.msr_ia32_feature_control + FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. So what do you mean setting it from userspace? It can be reset by power cycling CPU obviously, so when userspace emulates power cycle reset it has to be able to zero the MSR. BTW you need to write corespondent QEMU patch to handle this MSR during reset and migration. What is the current call trace when QEMU do system_reset? Will it trigger vmx_vcpu_reset() in vmx.c? It will not. QEMU inits vcpu state in userspace and write it into the kernel with set_regs/set_srges ioctls. So how will this achieve isolation and protection? I mean if QEMU can reset this MSR when reboot, can user in OS also reset it? Actually, after setting to lock bit, it cannot be set until next reboot. return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + +#define VMXON_NEEDED_FEATURES \ Avi asked to use const u64 here. 
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e8ba99c..2d4eb8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- Gleb. -- Gleb. -- Arthur Chunqi Li Department of Computer Science School of EECS Peking University Beijing, China -- Gleb. --
Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
On Sun, Jul 07, 2013 at 05:17:00PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 4:50 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote: On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote: On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote: On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote: Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Signed-off-by: Arthur Chunqi Li yzt...@gmail.com The patch is heavily based on Nadav's one. You need to preserve authorship. Add From: and Nadav's Sign-off-by. --- arch/x86/kvm/vmx.c | 25 + arch/x86/kvm/x86.c |3 ++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260a919..5e3d44e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. 
*/ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (to_vmx(vcpu)->nested.msr_ia32_feature_control & FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)->nested.msr_ia32_feature_control = data; You need to allow setting it from userspace, otherwise reset will not clear it. As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this MSR cannot be set or reset. So what do you mean by setting it from userspace? It can be reset by power cycling the CPU obviously, so when userspace emulates a power-cycle reset it has to be able to zero the MSR. BTW you need to write a corresponding QEMU patch to handle this MSR during reset and migration. What is the current call trace when QEMU does system_reset? Will it trigger vmx_vcpu_reset() in vmx.c? It will not. QEMU inits vcpu state in userspace and writes it into the kernel with set_regs/set_sregs ioctls. So how will this achieve isolation and protection? I mean if QEMU can reset this MSR on reboot, can a user in the OS also reset it? Actually, after setting the lock bit, it cannot be set until the next reboot. There is a way to tell if an MSR write is done by QEMU or a guest OS. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4] PF: Move architecture specifics to the backends
On Fri, Jul 05, 2013 at 10:55:52PM +0200, Dominik Dingel wrote: Current common code uses PAGE_OFFSET to indicate a bad host virtual address. As this check won't work on architectures that don't map kernel and user memory into the same address space (e.g. s390), it is moved into architecture-specific code. Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com --- arch/arm/include/asm/kvm_host.h | 8 arch/ia64/include/asm/kvm_host.h| 3 +++ arch/mips/include/asm/kvm_host.h| 6 ++ arch/powerpc/include/asm/kvm_host.h | 8 arch/s390/include/asm/kvm_host.h| 12 arch/x86/include/asm/kvm_host.h | 8 include/linux/kvm_host.h| 8 7 files changed, 45 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 7d22517..557c2a1 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -74,6 +74,14 @@ struct kvm_arch { struct vgic_dist vgic; }; +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return addr >= PAGE_OFFSET; +} + Instead of changing every arch I prefer to add #ifndef KVM_HVA_ERR_BAD #endif around this code in the common header and define a different version for s390 only. A question below.
#define KVM_NR_MEM_OBJS 40 /* diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 989dd3f..d3afa6f 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -486,6 +486,9 @@ struct kvm_arch { unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; }; +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) + union cpuid3_t { u64 value; struct { diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4d6fa0b..3a0a3f7 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -34,7 +34,13 @@ #define KVM_NR_PAGE_SIZES1 #define KVM_PAGES_PER_HPAGE(x) 1 +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return addr = PAGE_OFFSET; +} /* Special address that contains the comm page, used for reducing # of traps */ #define KVM_GUEST_COMMPAGE_ADDR 0x0 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index af326cd..be5d7f4 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -273,6 +273,14 @@ struct kvm_arch { #endif }; +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return addr = PAGE_OFFSET; +} + /* * Struct for a virtual core. 
* Note: entry_exit_count combines an entry count in the bottom 8 bits diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3238d40..152 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -274,6 +274,18 @@ struct kvm_arch{ int css_support; }; +#define KVM_HVA_ERR_BAD (-1UL) +#define KVM_HVA_ERR_RO_BAD (-1UL) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + /* + * on s390, this check is not needed as kernel and user memory + * is not mapped into the same address space + */ + return false; +} + Can gfn_to_hva() ever return error hva on S390? Currently error hva is returned if gfn is outside of any slot or slot is invalid. extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f87f7fc..07e8570 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -96,6 +96,14 @@ #define ASYNC_PF_PER_VCPU 64 +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return addr = PAGE_OFFSET; +} + struct kvm_vcpu; struct kvm; struct kvm_async_pf; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63d83e..210f493 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -85,14 +85,6 @@ static inline bool is_noslot_pfn(pfn_t pfn) return pfn == KVM_PFN_NOSLOT; } -#define KVM_HVA_ERR_BAD (PAGE_OFFSET) -#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) - -static inline bool kvm_is_error_hva(unsigned long addr) -{ - return addr = PAGE_OFFSET; -} -
Re: [PATCH 3/4] PF: Provide additional direct page notification
On Fri, Jul 05, 2013 at 10:55:53PM +0200, Dominik Dingel wrote: By setting a Kconfig option, the architecture can control when guest notifications will be presented by the apf backend. So there is the default batch mechanism, working as before, where the vcpu thread should pull in this information. On the other hand there is now the direct mechanism, this will directly push the information to the guest. Still the vcpu thread should call check_completion to cleanup leftovers, that leaves most of the common code untouched. Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 4 virt/kvm/async_pf.c | 22 +++--- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d094da..b8632e9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3343,7 +3343,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu-arch.mmu.direct_map; arch.cr3 = vcpu-arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu-kvm, gfn), arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 210f493..969d575 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -175,7 +175,7 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 779262f..715e6b5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -22,6 +22,10 @@ config KVM_MMIO config KVM_ASYNC_PF bool +# Toggle 
to switch between direct notification and batch job +config KVM_ASYNC_PF_DIRECT + bool + config HAVE_KVM_MSI bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index ea475cd..b8df37a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -28,6 +28,21 @@ #include async_pf.h #include trace/events/kvm.h +static inline void kvm_async_page_direct_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifdef CONFIG_KVM_ASYNC_PF_DIRECT + kvm_arch_async_page_present(vcpu, work); +#endif +} +static inline void kvm_async_page_batch_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ +#ifndef CONFIG_KVM_ASYNC_PF_DIRECT + kvm_arch_async_page_present(vcpu, work); +#endif +} + I would call them kvm_async_page_present_(async|sync)(). Hmm, to much sync in each function name, but I still think it is better. static struct kmem_cache *async_pf_cache; int kvm_async_pf_init(void) @@ -70,6 +85,7 @@ static void async_pf_execute(struct work_struct *work) down_read(mm-mmap_sem); get_user_pages(current, mm, addr, 1, 1, 0, page, NULL); up_read(mm-mmap_sem); + kvm_async_page_direct_present(vcpu, apf); unuse_mm(mm); spin_lock(vcpu-async_pf.lock); @@ -134,7 +150,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) if (work-page) kvm_arch_async_page_ready(vcpu, work); - kvm_arch_async_page_present(vcpu, work); + kvm_async_page_batch_present(vcpu, work); list_del(work-queue); vcpu-async_pf.queued--; @@ -144,7 +160,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; @@ -166,7 +182,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, work-done = false; work-vcpu = vcpu; work-gva = gva; - work-addr = gfn_to_hva(vcpu-kvm, gfn); + work-addr = hva; work-arch = *arch; work-mm = current-mm; atomic_inc(work-mm-mm_count); 
-- 1.8.2.2 -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 60518] Heavy network traffic between guest and host triggers kernel oops
https://bugzilla.kernel.org/show_bug.cgi?id=60518 --- Comment #2 from Bart Van Assche bvanass...@acm.org --- Note: so far I haven't been able to trigger this issue with kernel 3.8.12 nor with kernel 3.10. -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHv3] vhost-net: fix use-after-free in vhost_net_flush
vhost_net_ubuf_put_and_wait has a confusing name: it will actually also free its argument. Thus since commit 1280c27f8e29acf4af2da914e80ec27c3dbd5c01 vhost-net: flush outstanding DMAs on memory change vhost_net_flush tries to use the argument after passing it to vhost_net_ubuf_put_and_wait, this results in use after free. To fix, don't free the argument in vhost_net_ubuf_put_and_wait, add a new API for callers that want to free ubufs. Acked-by: Asias He as...@redhat.com Acked-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- Please review, and queue for 3.11 and stable. Changes from v2: - rebased to net/master Changes from v1: - tweak commit message drivers/vhost/net.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f80d3dd..8ca5ac7 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -150,6 +150,11 @@ static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal); wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); +} + +static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) +{ + vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } @@ -948,7 +953,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) mutex_unlock(&vq->mutex); if (oldubufs) { - vhost_net_ubuf_put_and_wait(oldubufs); + vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); @@ -966,7 +971,7 @@ err_used: rcu_assign_pointer(vq->private_data, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) - vhost_net_ubuf_put_and_wait(ubufs); + vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: fput(sock->file); err_vq: -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Bug 60505] Heavy network traffic triggers vhost_net lockup
https://bugzilla.kernel.org/show_bug.cgi?id=60505 --- Comment #3 from Michael S. Tsirkin m.s.tsir...@gmail.com --- Also. I just posted a patch fixing a bug in this function. [PATCHv3] vhost-net: fix use-after-free in vhost_net_flush could you please try with this patch applied? -- You are receiving this mail because: You are watching the assignee of the bug. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: Avoid that vhost_work_flush() locks up
On Fri, Jul 05, 2013 at 10:36:46AM +0200, Bart Van Assche wrote: Wake up work->done waiters even if the TIF_NEED_RESCHED task flag has been set. This patch fixes a regression introduced in commit d550dda (kernel v3.4). Signed-off-by: Bart Van Assche bvanass...@acm.org Reference: https://bugzilla.kernel.org/show_bug.cgi?id=60505 Cc: Michael S. Tsirkin m...@redhat.com Cc: Asias He as...@redhat.com Cc: Nadav Har'El n...@math.technion.ac.il Cc: Abel Gordon ab...@il.ibm.com Cc: sta...@vger.kernel.org # v3.4+ I just posted a patch fixing a bug in this function. [PATCHv3] vhost-net: fix use-after-free in vhost_net_flush could you please try with this patch applied? --- drivers/vhost/vhost.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 60aa5ad..cd544ae 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -227,8 +227,16 @@ static int vhost_worker(void *data) if (work) { __set_current_state(TASK_RUNNING); work->fn(work); - if (need_resched()) + if (need_resched()) { + spin_lock_irq(&dev->work_lock); + work->done_seq = seq; + if (work->flushing) + wake_up_all(&work->done); + spin_unlock_irq(&dev->work_lock); + + work = NULL; schedule(); + } } else schedule(); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 03/11] vhost: Make vhost a separate module
On Mon, May 06, 2013 at 08:10:03PM +0800, Asias He wrote: On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote: Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com Also this will break test.c, right? Let's fix it in the same commit too. I will fix it up and remove the useless 'return'. Don't see v3 anywhere? --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) { unsigned seq; int flushing; @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) spin_unlock_irq(dev-work_lock); BUG_ON(flushing 0); } +EXPORT_SYMBOL_GPL(vhost_work_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ @@ -145,6 +151,7 @@ void vhost_poll_flush(struct
[PATCH] virtio: include asm/barrier explicitly
virtio_ring.h uses mb() and friends, make it pull in asm/barrier.h itself, not rely on other headers to do it. Signed-off-by: Michael S. Tsirkin m...@redhat.com --- include/linux/virtio_ring.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index ca3ad41..b300787 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -1,6 +1,7 @@ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H +#include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio: include asm/barrier explicitly
On Sun, Jul 07, 2013 at 05:20:19PM +0300, Michael S. Tsirkin wrote: virtio_ring.h uses mb() and friends, make it pull in asm/barrier.h itself, not rely on other headers to do it. Signed-off-by: Michael S. Tsirkin m...@redhat.com In particular this fixes warnings during test build: [linux]$ make -C tools/virtio/ make: Entering directory `/home/mst/scm/linux/tools/virtio' cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -c -o virtio_test.o virtio_test.c In file included from ./linux/virtio_ring.h:1:0, from ../../usr/include/linux/vhost.h:17, from virtio_test.c:14: ./linux/../../../include/linux/virtio_ring.h: In function ‘virtio_mb’: ./linux/../../../include/linux/virtio_ring.h:50:2: warning: implicit declaration of function ‘mb’ [-Wimplicit-function-declaration] --- include/linux/virtio_ring.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index ca3ad41..b300787 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -1,6 +1,7 @@ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H +#include asm/barrier.h #include linux/irqreturn.h #include uapi/linux/virtio_ring.h -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] tools/virtio: move module license stub to module.h
This fixes build for the vringh test: [linux]$ make -C tools/virtio/ make: Entering directory `/home/mst/scm/linux/tools/virtio' cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -c -o vringh.o ../../drivers/vhost/vringh.c ../../drivers/vhost/vringh.c:1010:16: error: expected declaration specifiers or ‘...’ before string constant Signed-off-by: Michael S. Tsirkin m...@redhat.com --- tools/virtio/linux/module.h | 5 + tools/virtio/linux/virtio.h | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index 3039a7e..28ce95a 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h @@ -1 +1,6 @@ #include <linux/export.h> + +#define MODULE_LICENSE(__MODULE_LICENSE_value) \ + static __attribute__((unused)) const char *__MODULE_LICENSE_name = \ + __MODULE_LICENSE_value + diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index cd80183..8447830 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -45,9 +45,6 @@ struct virtqueue { void *priv; }; -#define MODULE_LICENSE(__MODULE_LICENSE_value) \ - const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value - /* Interfaces exported by virtio_ring. */ int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts
On Sun, Jul 07, 2013 at 11:15:11AM +0300, Michael S. Tsirkin wrote: On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote: On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote: On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote: On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin wrote: On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote: Enable support for MSI interrupts if the device supports it. Since MSI interrupts are edge triggered, it is no longer necessary to disable interrupts in the kernel and re-enable them from user-space. Instead, clearing the interrupt condition in the user space application automatically re-enables the interrupt. Signed-off-by: Guenter Roeck li...@roeck-us.net --- An open question is if we can just do this unconditionally or if there should be some flag to enable it. A module parameter, maybe ? NACK UIO is for devices that don't do memory writes. Anything that can do writes must be protected by an IOMMU and/or have a secure kernel driver, not a UIO stub. MSI is done by memory writes so if userspace controls the device it can trick it to write anywhere in memory. Just out of curiosity: Since MSI support is mandatory for all PCIE devices, isn't that possible anyway, even if MSI is not enabled by the kernel ? All one would need to do is to enable MSI from user space; after all, the chip configuration space is writable. Thanks, Guenter If a device has capability to do writes, sure. So don't do this then :) Not an option. I need to use MSI. Not that it matters anymore - turns out it was better writing a specific driver for my devices anyway; I needed to be able to disable chip interrupts before unloading the driver. But why is it then a reason to NACK this patch ? 
There seem to be two cases - either you can't access the device - and the uio driver is not useful - or you can, and it's not safe. In both cases the patch does not seem to bring about anything except user confusion ... Sounds like claiming that supporting MSI would cause some kind of confusion. Not sure if I can follow that logic. Actually, it simplifies the user space code a lot. Since MSI interrupts are edge triggered and queued, it is not necessary to disable the interrupts, and all user space has to do is to remove the interrupt reason. Works quite nicely for our devices. As I said, I don't really care too much anymore if this patch is rejected, but the reasons for the rejection are kind of weak. Besides, doesn't one have to be root anyway to perform such activities, which could then be more easily accomplished by writing into /dev/mem ? Thanks, Guenter root might not be able to write into /dev/mem. I don't really want to try, but at least it is marked rw for root. Guenter Yes but mechanisms such as selinux can still block it. I hope it can also prevent the use of the the MSI trick on PCIe devices. After all, the user space mechanisms to write into configuration space are available for all devices, not just for those with uio drivers. Guenter -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 03/11] vhost: Make vhost a separate module
On Sun, Jul 07, 2013 at 02:37:10PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 08:10:03PM +0800, Asias He wrote: On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote: Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com Also this will break test.c, right? Let's fix it in the same commit too. I will fix it up and remove the useless 'return'. Don't see v3 anywhere? I did these tweaks, you can see the result on the vhost branch in my tree. --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST)+= vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts
On Sun, Jul 07, 2013 at 07:39:01AM -0700, Guenter Roeck wrote: On Sun, Jul 07, 2013 at 11:15:11AM +0300, Michael S. Tsirkin wrote: On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote: On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote: On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote: On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote: On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin wrote: On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote: Enable support for MSI interrupts if the device supports it. Since MSI interrupts are edge triggered, it is no longer necessary to disable interrupts in the kernel and re-enable them from user-space. Instead, clearing the interrupt condition in the user space application automatically re-enables the interrupt. Signed-off-by: Guenter Roeck li...@roeck-us.net --- An open question is if we can just do this unconditionally or if there should be some flag to enable it. A module parameter, maybe ? NACK UIO is for devices that don't do memory writes. Anything that can do writes must be protected by an IOMMU and/or have a secure kernel driver, not a UIO stub. MSI is done by memory writes so if userspace controls the device it can trick it to write anywhere in memory. Just out of curiosity: Since MSI support is mandatory for all PCIE devices, isn't that possible anyway, even if MSI is not enabled by the kernel ? All one would need to do is to enable MSI from user space; after all, the chip configuration space is writable. Thanks, Guenter If a device has capability to do writes, sure. So don't do this then :) Not an option. I need to use MSI. Not that it matters anymore - turns out it was better writing a specific driver for my devices anyway; I needed to be able to disable chip interrupts before unloading the driver. But why is it then a reason to NACK this patch ? 
There seem to be two cases - either you can't access the device - and the uio driver is not useful - or you can, and it's not safe. In both cases the patch does not seem to bring about anything except user confusion ... Sounds like claiming that supporting MSI would cause some kind of confusion. Not sure if I can follow that logic. Actually, it simplifies the user space code a lot. Since MSI interrupts are edge triggered and queued, it is not necessary to disable the interrupts, and all user space has to do is to remove the interrupt reason. Works quite nicely for our devices. As I said, I don't really care too much anymore if this patch is rejected, but the reasons for the rejection are kind of weak. Besides, doesn't one have to be root anyway to perform such activities, which could then be more easily accomplished by writing into /dev/mem ? Thanks, Guenter root might not be able to write into /dev/mem. I don't really want to try, but at least it is marked rw for root. Guenter Yes but mechanisms such as selinux can still block it. I hope it can also prevent the use of the the MSI trick on PCIe devices. After all, the user space mechanisms to write into configuration space are available for all devices, not just for those with uio drivers. Guenter Yes, selinux can also prevent access to sysfs and procfs for any process. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL
Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment. This patch simulate this MSR in nested_vmx and the default value is 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock bit, write to it will cause #GP(0). Another QEMU patch is also needed to handle emulation of reset and migration. Reset to vCPU should clear this MSR and migration should reserve value of it. This patch is based on Nadav's previous commit. http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/88478 Signed-off-by: Nadav Har'El nyh at il.ibm.com Signed-off-by: Arthur Chunqi Li yzt...@gmail.com --- arch/x86/kvm/vmx.c | 32 ++-- arch/x86/kvm/x86.c |3 ++- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7e1855..a64efd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -2282,8 +2283,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)){ + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2356,14 +2360,21 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + u32 msr_index = msr_info-index; + u64 data = msr_info-data; + bool host_initialized = msr_info-host_initiated; if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. 
*/ + if (msr_index == MSR_IA32_FEATURE_CONTROL){ + if (!host_initialized to_vmx(vcpu)-nested.msr_ia32_feature_control +FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)-nested.msr_ia32_feature_control = data; return 1; + } /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -2494,7 +2505,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) + if (vmx_set_vmx_msr(vcpu, msr_info)) break; msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -5576,6 +5587,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5604,6 +5617,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + + if ((vmx-nested.msr_ia32_feature_control VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21bce5..cff77c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[Qemu-devel] [PATCH] nVMX: Initialize IA32_FEATURE_CONTROL MSR in reset and migration
The recent KVM patch adds IA32_FEATURE_CONTROL support. QEMU needs to clear this MSR when reset vCPU and keep the value of it when migration. This patch add this feature. Signed-off-by: Arthur Chunqi Li yzt...@gmail.com --- target-i386/cpu.h |2 ++ target-i386/kvm.c |4 target-i386/machine.c | 22 ++ 3 files changed, 28 insertions(+) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 62e3547..a418e17 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -301,6 +301,7 @@ #define MSR_IA32_APICBASE_BSP (18) #define MSR_IA32_APICBASE_ENABLE(111) #define MSR_IA32_APICBASE_BASE (0xf12) +#define MSR_IA32_FEATURE_CONTROL0x003a #define MSR_TSC_ADJUST 0x003b #define MSR_IA32_TSCDEADLINE0x6e0 @@ -813,6 +814,7 @@ typedef struct CPUX86State { uint64_t mcg_status; uint64_t msr_ia32_misc_enable; +uint64_t msr_ia32_feature_control; /* exception/interrupt handling */ int error_code; diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 39f4fbb..3cb2161 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1122,6 +1122,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level) if (hyperv_vapic_recommended()) { kvm_msr_entry_set(msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0); } +kvm_msr_entry_set(msrs[n++], MSR_IA32_FEATURE_CONTROL, env-msr_ia32_feature_control); } if (env-mcg_cap) { int i; @@ -1346,6 +1347,7 @@ static int kvm_get_msrs(X86CPU *cpu) if (has_msr_misc_enable) { msrs[n++].index = MSR_IA32_MISC_ENABLE; } +msrs[n++].index = MSR_IA32_FEATURE_CONTROL; if (!env-tsc_valid) { msrs[n++].index = MSR_IA32_TSC; @@ -1444,6 +1446,8 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_IA32_MISC_ENABLE: env-msr_ia32_misc_enable = msrs[i].data; break; +case MSR_IA32_FEATURE_CONTROL: +env-msr_ia32_feature_control = msrs[i].data; default: if (msrs[i].index = MSR_MC0_CTL msrs[i].index MSR_MC0_CTL + (env-mcg_cap 0xff) * 4) { diff --git a/target-i386/machine.c b/target-i386/machine.c index 3659db9..94ca914 100644 --- a/target-i386/machine.c +++ b/target-i386/machine.c @@ -399,6 +399,14 @@ 
static bool misc_enable_needed(void *opaque) return env-msr_ia32_misc_enable != MSR_IA32_MISC_ENABLE_DEFAULT; } +static bool feature_control_needed(void *opaque) +{ +X86CPU *cpu = opaque; +CPUX86State *env = cpu-env; + +return env-msr_ia32_feature_control != 0; +} + static const VMStateDescription vmstate_msr_ia32_misc_enable = { .name = cpu/msr_ia32_misc_enable, .version_id = 1, @@ -410,6 +418,17 @@ static const VMStateDescription vmstate_msr_ia32_misc_enable = { } }; +static const VMStateDescription vmstate_msr_ia32_feature_control = { +.name = cpu/msr_ia32_feature_control, +.version_id = 1, +.minimum_version_id = 1, +.minimum_version_id_old = 1, +.fields = (VMStateField []) { +VMSTATE_UINT64(env.msr_ia32_feature_control, X86CPU), +VMSTATE_END_OF_LIST() +} +}; + const VMStateDescription vmstate_x86_cpu = { .name = cpu, .version_id = 12, @@ -535,6 +554,9 @@ const VMStateDescription vmstate_x86_cpu = { }, { .vmsd = vmstate_msr_ia32_misc_enable, .needed = misc_enable_needed, +}, { +.vmsd = vmstate_msr_ia32_feature_control, +.needed = feature_control_needed, } , { /* empty */ } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm tools: fix boot of guests with more than 4gb of ram
On Mon, Jun 24, 2013 at 1:06 PM, Michael Tokarev m...@tls.msk.ru wrote: 24.06.2013 05:23, Sasha Levin wrote: queue = p9dev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = guest_flat_to_host(kvm, (u64)queue-pfn * page_size); Maybe it's worth to use a common function for this, something like guest_queue_to_host(kvm, queue) ? Sasha, care to do something like this and resend the patch? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] kvm tools: fix boot of guests with more than 4gb of ram
Commit kvm tools: virtio: remove hardcoded assumptions about guest page size has introduced a bug that prevented guests with more than 4gb of ram from booting. The issue is that 'pfn' is a 32bit integer, so when multiplying it by page size to get the actual page will cause an overflow if the pfn referred to a memory area above 4gb. Signed-off-by: Sasha Levin sasha.le...@oracle.com --- tools/kvm/include/kvm/virtio.h | 6 ++ tools/kvm/virtio/9p.c | 2 +- tools/kvm/virtio/balloon.c | 2 +- tools/kvm/virtio/blk.c | 2 +- tools/kvm/virtio/console.c | 2 +- tools/kvm/virtio/net.c | 2 +- tools/kvm/virtio/rng.c | 2 +- tools/kvm/virtio/scsi.c| 2 +- 8 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h index 269ea4a..820b94a 100644 --- a/tools/kvm/include/kvm/virtio.h +++ b/tools/kvm/include/kvm/virtio.h @@ -90,4 +90,10 @@ int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, struct virtio_ops *ops, enum virtio_trans trans, int device_id, int subsys_id, int class); int virtio_compat_add_message(const char *device, const char *config); + +static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size) +{ + return guest_flat_to_host(kvm, (u64)pfn * page_size); +} + #endif /* KVM__VIRTIO_H */ diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c index b7a5723..127b13a 100644 --- a/tools/kvm/virtio/9p.c +++ b/tools/kvm/virtio/9p.c @@ -1267,7 +1267,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue = p9dev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); job = p9dev-jobs[vq]; vring_init(queue-vring, VIRTQUEUE_NUM, p, align); diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c index d1b64fa..4863535 100644 --- a/tools/kvm/virtio/balloon.c +++ b/tools/kvm/virtio/balloon.c @@ -204,7 +204,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, 
u32 page_size, u32 align, queue = bdev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); thread_pool__init_job(bdev-jobs[vq], kvm, virtio_bln_do_io, queue); vring_init(queue-vring, VIRTIO_BLN_QUEUE_SIZE, p, align); diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c index 44ac44b..ab18716 100644 --- a/tools/kvm/virtio/blk.c +++ b/tools/kvm/virtio/blk.c @@ -167,7 +167,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue = bdev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); vring_init(queue-vring, VIRTIO_BLK_QUEUE_SIZE, p, align); diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c index 1f88a4b..83c58bf 100644 --- a/tools/kvm/virtio/console.c +++ b/tools/kvm/virtio/console.c @@ -137,7 +137,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue = cdev.vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); vring_init(queue-vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align); diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c index 7855cfc..298903c 100644 --- a/tools/kvm/virtio/net.c +++ b/tools/kvm/virtio/net.c @@ -410,7 +410,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue = ndev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); vring_init(queue-vring, VIRTIO_NET_QUEUE_SIZE, p, align); diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c index 2ce8afd..394de71 100644 --- a/tools/kvm/virtio/rng.c +++ b/tools/kvm/virtio/rng.c @@ -98,7 +98,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue = rdev-vqs[vq]; queue-pfn = pfn; - p = guest_flat_to_host(kvm, queue-pfn * 
page_size); + p = virtio_get_vq(kvm, queue-pfn, page_size); job = rdev-jobs[vq]; diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c index 05b2dc6..8da2420 100644 --- a/tools/kvm/virtio/scsi.c +++ b/tools/kvm/virtio/scsi.c @@ -63,7 +63,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, queue =
[PATCH qom-cpu v2 10/42] kvm: Change kvm_{insert,remove}_breakpoint() argument to CPUState
CPUArchState is no longer directly used since converting CPU loops to CPUState. Prepares for changing GDBState::c_cpu to CPUState. Signed-off-by: Andreas Färber afaer...@suse.de --- gdbstub.c| 12 include/sysemu/kvm.h | 4 ++-- kvm-all.c| 10 -- kvm-stub.c | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/gdbstub.c b/gdbstub.c index 17da380..b77cd3e 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -1954,8 +1954,10 @@ static int gdb_breakpoint_insert(target_ulong addr, target_ulong len, int type) CPUArchState *env; int err = 0; -if (kvm_enabled()) -return kvm_insert_breakpoint(gdbserver_state-c_cpu, addr, len, type); +if (kvm_enabled()) { +return kvm_insert_breakpoint(ENV_GET_CPU(gdbserver_state-c_cpu), + addr, len, type); +} switch (type) { case GDB_BREAKPOINT_SW: @@ -1991,8 +1993,10 @@ static int gdb_breakpoint_remove(target_ulong addr, target_ulong len, int type) CPUArchState *env; int err = 0; -if (kvm_enabled()) -return kvm_remove_breakpoint(gdbserver_state-c_cpu, addr, len, type); +if (kvm_enabled()) { +return kvm_remove_breakpoint(ENV_GET_CPU(gdbserver_state-c_cpu), + addr, len, type); +} switch (type) { case GDB_BREAKPOINT_SW: diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 1e08a85..f8ac448 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -169,9 +169,9 @@ void *kvm_arch_ram_alloc(ram_addr_t size); void kvm_setup_guest_memory(void *start, size_t size); void kvm_flush_coalesced_mmio_buffer(void); -int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); -int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap); diff --git a/kvm-all.c b/kvm-all.c index 0c16a44..2263c48 100644 --- a/kvm-all.c +++ b/kvm-all.c 
@@ -1903,10 +1903,9 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap) return data.err; } -int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { -CPUState *cpu = ENV_GET_CPU(env); struct kvm_sw_breakpoint *bp; int err; @@ -1949,10 +1948,9 @@ int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, return 0; } -int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { -CPUState *cpu = ENV_GET_CPU(env); struct kvm_sw_breakpoint *bp; int err; @@ -2025,13 +2023,13 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap) return -EINVAL; } -int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { return -EINVAL; } -int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { return -EINVAL; diff --git a/kvm-stub.c b/kvm-stub.c index 370c837..7b2233a 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -83,13 +83,13 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap) return -ENOSYS; } -int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { return -EINVAL; } -int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr, +int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type) { return -EINVAL; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH qom-cpu v2 16/42] exec: Change cpu_memory_rw_debug() argument to CPUState
Propagate X86CPU in kvmvapic for simplicity. Signed-off-by: Andreas Färber afaer...@suse.de --- cpus.c | 4 +-- disas.c | 4 +-- exec.c | 6 ++-- gdbstub.c | 2 +- hw/i386/kvmvapic.c | 72 +++-- include/exec/cpu-all.h | 3 +- include/exec/softmmu-semi.h | 18 +++- monitor.c | 2 +- target-arm/arm-semi.c | 2 +- target-i386/helper.c| 8 +++-- target-i386/kvm.c | 14 - target-sparc/mmu_helper.c | 5 ++-- target-xtensa/xtensa-semi.c | 10 +++ 13 files changed, 77 insertions(+), 73 deletions(-) diff --git a/cpus.c b/cpus.c index 8a4f395..1ce6816 100644 --- a/cpus.c +++ b/cpus.c @@ -1247,7 +1247,6 @@ void qmp_memsave(int64_t addr, int64_t size, const char *filename, { FILE *f; uint32_t l; -CPUArchState *env; CPUState *cpu; uint8_t buf[1024]; @@ -1261,7 +1260,6 @@ void qmp_memsave(int64_t addr, int64_t size, const char *filename, a CPU number); return; } -env = cpu-env_ptr; f = fopen(filename, wb); if (!f) { @@ -1273,7 +1271,7 @@ void qmp_memsave(int64_t addr, int64_t size, const char *filename, l = sizeof(buf); if (l size) l = size; -cpu_memory_rw_debug(env, addr, buf, l, 0); +cpu_memory_rw_debug(cpu, addr, buf, l, 0); if (fwrite(buf, 1, l, f) != l) { error_set(errp, QERR_IO_ERROR); goto exit; diff --git a/disas.c b/disas.c index e51127e..71007fb 100644 --- a/disas.c +++ b/disas.c @@ -39,7 +39,7 @@ target_read_memory (bfd_vma memaddr, { CPUDebug *s = container_of(info, CPUDebug, info); -cpu_memory_rw_debug(s-env, memaddr, myaddr, length, 0); +cpu_memory_rw_debug(ENV_GET_CPU(s-env), memaddr, myaddr, length, 0); return 0; } @@ -392,7 +392,7 @@ monitor_read_memory (bfd_vma memaddr, bfd_byte *myaddr, int length, if (monitor_disas_is_physical) { cpu_physical_memory_read(memaddr, myaddr, length); } else { -cpu_memory_rw_debug(s-env, memaddr,myaddr, length, 0); +cpu_memory_rw_debug(ENV_GET_CPU(s-env), memaddr, myaddr, length, 0); } return 0; } diff --git a/exec.c b/exec.c index a768aea..f3fe6e6 100644 --- a/exec.c +++ b/exec.c @@ -1840,7 +1840,7 @@ MemoryRegion *get_system_io(void) /* 
physical memory access (slow version, mainly for debug) */ #if defined(CONFIG_USER_ONLY) -int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr, +int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, uint8_t *buf, int len, int is_write) { int l, flags; @@ -2574,7 +2574,7 @@ void stq_be_phys(hwaddr addr, uint64_t val) } /* virtual memory access for debug (includes writing to ROM) */ -int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr, +int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, uint8_t *buf, int len, int is_write) { int l; @@ -2583,7 +2583,7 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr, while (len 0) { page = addr TARGET_PAGE_MASK; -phys_addr = cpu_get_phys_page_debug(ENV_GET_CPU(env), page); +phys_addr = cpu_get_phys_page_debug(cpu, page); /* if no physical page mapped, return an error */ if (phys_addr == -1) return -1; diff --git a/gdbstub.c b/gdbstub.c index cee9c13..43ecc0d 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -46,7 +46,7 @@ static inline int target_memory_rw_debug(CPUArchState *env, target_ulong addr, uint8_t *buf, int len, int is_write) { -return cpu_memory_rw_debug(env, addr, buf, len, is_write); +return cpu_memory_rw_debug(ENV_GET_CPU(env), addr, buf, len, is_write); } #else /* target_memory_rw_debug() defined in cpu.h */ diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 224601f..035d0fe 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -188,9 +188,10 @@ static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) modrm_reg(opcode[1]) == instr-modrm_reg); } -static int evaluate_tpr_instruction(VAPICROMState *s, CPUX86State *env, +static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong *pip, TPRAccess access) { +CPUState *cs = CPU(cpu); const TPRInstruction *instr; target_ulong ip = *pip; uint8_t opcode[2]; @@ -211,7 +212,7 @@ static int evaluate_tpr_instruction(VAPICROMState *s, CPUX86State *env, * RSP, used by the patched instruction, is zero, 
so the guest gets a * double fault and dies. */ -if (env-regs[R_ESP] == 0) { +if (cpu-env.regs[R_ESP] == 0) { return -1; } @@ -226,7 +227,7 @@ static int evaluate_tpr_instruction(VAPICROMState *s, CPUX86State *env, if
[PATCH qom-cpu v2 07/42] cpu: Move singlestep_enabled field from CPU_COMMON to CPUState
Prepares for changing cpu_single_step() argument to CPUState. Acked-by: Michael Walle mich...@walle.cc (for lm32) Signed-off-by: Andreas Färber afaer...@suse.de --- cpu-exec.c| 2 +- cpus.c| 2 +- exec.c| 10 ++ include/exec/cpu-defs.h | 1 - include/qom/cpu.h | 2 ++ kvm-all.c | 2 +- target-alpha/translate.c | 3 ++- target-arm/translate.c| 7 --- target-cris/translate.c | 7 --- target-i386/kvm.c | 6 -- target-i386/translate.c | 5 +++-- target-lm32/translate.c | 7 --- target-m68k/translate.c | 7 --- target-microblaze/translate.c | 8 +--- target-mips/translate.c | 11 +++ target-moxie/translate.c | 5 +++-- target-openrisc/translate.c | 7 --- target-ppc/translate.c| 8 +--- target-s390x/translate.c | 5 +++-- target-sh4/translate.c| 8 +--- target-sparc/translate.c | 3 ++- target-unicore32/translate.c | 7 --- target-xtensa/translate.c | 7 --- 23 files changed, 78 insertions(+), 52 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 3fccb86..301be28 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -297,7 +297,7 @@ int cpu_exec(CPUArchState *env) for(;;) { interrupt_request = cpu-interrupt_request; if (unlikely(interrupt_request)) { -if (unlikely(env-singlestep_enabled SSTEP_NOIRQ)) { +if (unlikely(cpu-singlestep_enabled SSTEP_NOIRQ)) { /* Mask out external interrupts for this step. 
*/ interrupt_request = ~CPU_INTERRUPT_SSTEP_MASK; } diff --git a/cpus.c b/cpus.c index f141428..8a4f395 100644 --- a/cpus.c +++ b/cpus.c @@ -1148,7 +1148,7 @@ static void tcg_exec_all(void) CPUArchState *env = cpu-env_ptr; qemu_clock_enable(vm_clock, - (env-singlestep_enabled SSTEP_NOTIMER) == 0); + (cpu-singlestep_enabled SSTEP_NOTIMER) == 0); if (cpu_can_run(cpu)) { r = tcg_cpu_exec(env); diff --git a/exec.c b/exec.c index f01e3b6..ae6eb24 100644 --- a/exec.c +++ b/exec.c @@ -588,11 +588,13 @@ void cpu_breakpoint_remove_all(CPUArchState *env, int mask) void cpu_single_step(CPUArchState *env, int enabled) { #if defined(TARGET_HAS_ICE) -if (env-singlestep_enabled != enabled) { -env-singlestep_enabled = enabled; -if (kvm_enabled()) +CPUState *cpu = ENV_GET_CPU(env); + +if (cpu-singlestep_enabled != enabled) { +cpu-singlestep_enabled = enabled; +if (kvm_enabled()) { kvm_update_guest_debug(env, 0); -else { +} else { /* must flush all the translated code to avoid inconsistencies */ /* XXX: only flush what is necessary */ tb_flush(env); diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index 39094b3..12b1ca7 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -170,7 +170,6 @@ typedef struct CPUWatchpoint { /* from this point: preserved by CPU reset */ \ /* ice debug support */ \ QTAILQ_HEAD(breakpoints_head, CPUBreakpoint) breakpoints;\ -int singlestep_enabled; \ \ QTAILQ_HEAD(watchpoints_head, CPUWatchpoint) watchpoints;\ CPUWatchpoint *watchpoint_hit; \ diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 152dad5..136482c 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -129,6 +129,7 @@ struct kvm_run; * @stopped: Indicates the CPU has been artificially stopped. * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this * CPU and return to its top level loop. + * @singlestep_enabled: Flags for single-stepping. * @env_ptr: Pointer to subclass-specific CPUArchState field. * @current_tb: Currently executing TB. 
* @next_cpu: Next CPU sharing TB cache. @@ -161,6 +162,7 @@ struct CPUState { volatile sig_atomic_t exit_request; volatile sig_atomic_t tcg_exit_req; uint32_t interrupt_request; +int singlestep_enabled; void *env_ptr; /* CPUArchState */ struct TranslationBlock *current_tb; diff --git a/kvm-all.c b/kvm-all.c index c130705..0c16a44 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1893,7 +1893,7 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap) data.dbg.control = reinject_trap; -if (env-singlestep_enabled) { +if (cpu-singlestep_enabled) { data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; } kvm_arch_update_guest_debug(cpu,
KVM call agenda for 2013-07-09
Hi, Please send any topic that you are interested in covering. Thanks, Juan. PD. If you want to attend and you don't have the call details, contact me. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call agenda for 2013-07-09
Am 08.07.2013 01:10, schrieb Juan Quintela: Please, send any topic that you are interested in covering. Static qdev vs. dynamic QOM properties and -global (Igor's topic from two weeks ago that we couldn't cover any more) = needed for X86CPU subclasses (hot-add, topology, NUMA, ...) Thanks, Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH qom-cpu v9] target-i386: Move hyperv_* static globals to X86CPU
From: Igor Mammedov imamm...@redhat.com - since hyperv_* helper functions are used only in target-i386/kvm.c move them there as static helpers Requested-by: Eduardo Habkost ehabk...@redhat.com Signed-off-by: Igor Mammedov imamm...@redhat.com Signed-off-by: Andreas Färber afaer...@suse.de --- v8 (imammedo) - v9: * Use X86CPU instead of CPUX86State (only used in KVM) * Changed helper functions to X86CPU argument * Moved field initialization to QOM instance_init * Fixed subject (not today's CPUState) target-i386/Makefile.objs | 2 +- target-i386/cpu-qom.h | 4 +++ target-i386/cpu.c | 16 target-i386/cpu.h | 4 +++ target-i386/hyperv.c | 64 --- target-i386/hyperv.h | 45 - target-i386/kvm.c | 36 ++ 7 files changed, 46 insertions(+), 125 deletions(-) delete mode 100644 target-i386/hyperv.c delete mode 100644 target-i386/hyperv.h diff --git a/target-i386/Makefile.objs b/target-i386/Makefile.objs index c1d4f05..887dca7 100644 --- a/target-i386/Makefile.objs +++ b/target-i386/Makefile.objs @@ -2,7 +2,7 @@ obj-y += translate.o helper.o cpu.o obj-y += excp_helper.o fpu_helper.o cc_helper.o int_helper.o svm_helper.o obj-y += smm_helper.o misc_helper.o mem_helper.o seg_helper.o obj-$(CONFIG_SOFTMMU) += machine.o arch_memory_mapping.o arch_dump.o -obj-$(CONFIG_KVM) += kvm.o hyperv.o +obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_NO_KVM) += kvm-stub.o obj-$(CONFIG_LINUX_USER) += ioport-user.o obj-$(CONFIG_BSD_USER) += ioport-user.o diff --git a/target-i386/cpu-qom.h b/target-i386/cpu-qom.h index 7e55e5f..18f08b8 100644 --- a/target-i386/cpu-qom.h +++ b/target-i386/cpu-qom.h @@ -66,6 +66,10 @@ typedef struct X86CPU { CPUX86State env; +bool hyperv_vapic; +bool hyperv_relaxed_timing; +int hyperv_spinlock_attempts; + /* Features that were filtered out because of missing host capabilities */ uint32_t filtered_features[FEATURE_WORDS]; } X86CPU; diff --git a/target-i386/cpu.c b/target-i386/cpu.c index e3f75a8..14e9c7e 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -35,8 +35,6 @@ 
#include qapi/visitor.h #include sysemu/arch_init.h -#include hyperv.h - #include hw/hw.h #if defined(CONFIG_KVM) #include linux/kvm_para.h @@ -1587,12 +1585,19 @@ static void cpu_x86_parse_featurestr(X86CPU *cpu, char *features, Error **errp) object_property_parse(OBJECT(cpu), num, tsc-frequency, errp); } else if (!strcmp(featurestr, hv-spinlocks)) { char *err; +const int min = 0xFFF; numvalue = strtoul(val, err, 0); if (!*val || *err) { error_setg(errp, bad numerical value %s, val); goto out; } -hyperv_set_spinlock_retries(numvalue); +if (numvalue min) { +fprintf(stderr, hv-spinlocks value shall always be = 0x%x +, fixup will be removed in future versions\n, +min); +numvalue = min; +} +cpu-hyperv_spinlock_attempts = numvalue; } else { error_setg(errp, unrecognized feature %s, featurestr); goto out; @@ -1602,9 +1607,9 @@ static void cpu_x86_parse_featurestr(X86CPU *cpu, char *features, Error **errp) } else if (!strcmp(featurestr, enforce)) { check_cpuid = enforce_cpuid = 1; } else if (!strcmp(featurestr, hv_relaxed)) { -hyperv_enable_relaxed_timing(true); +cpu-hyperv_relaxed_timing = true; } else if (!strcmp(featurestr, hv_vapic)) { -hyperv_enable_vapic_recommended(true); +cpu-hyperv_vapic = true; } else { error_setg(errp, feature string `%s' not in format (+feature| -feature|feature=xyz), featurestr); @@ -2479,6 +2484,7 @@ static void x86_cpu_initfn(Object *obj) x86_cpu_get_feature_words, NULL, NULL, (void *)cpu-filtered_features, NULL); +cpu-hyperv_spinlock_attempts = HYPERV_SPINLOCK_NEVER_RETRY; env-cpuid_apic_id = x86_cpu_apic_id_from_index(cs-cpu_index); /* init various static tables used in TCG mode */ diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 2d005b3..6c3eb86 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -549,6 +549,10 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_MWAIT_IBE (1 1) /* Interrupts can exit capability */ #define CPUID_MWAIT_EMX (1 0) /* enumeration supported */ +#ifndef HYPERV_SPINLOCK_NEVER_RETRY 
+#define HYPERV_SPINLOCK_NEVER_RETRY 0xFFFFFFFF +#endif + #define EXCP00_DIVZ 0 #define EXCP01_DB 1 #define EXCP02_NMI 2 diff --git a/target-i386/hyperv.c
Re: [PATCH 4/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote: The current VFIO-on-POWER implementation supports only user mode driven mapping, i.e. QEMU is sending requests to map/unmap pages. However this approach is really slow, so we want to move that to KVM. Since H_PUT_TCE can be extremely performance sensitive (especially with network adapters where each packet needs to be mapped/unmapped) we chose to implement that as a fast hypercall directly in real mode (processor still in the guest context but MMU off). To be able to do that, we need to provide some facilities to access the struct page count within that real mode environment as things like the sparsemem vmemmap mappings aren't accessible. This adds an API to increment/decrement page counter as get_user_pages API used for user mode mapping does not work in the real mode. CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported. This patch will need an ack from mm people to make sure they are ok with our approach and ack the change to the generic header. (Added linux-mm). Cheers, Ben. Reviewed-by: Paul Mackerras pau...@samba.org Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * realmode_get_page() fixed to use get_page_unless_zero(). If failed, the call will be passed from real to virtual mode and safely handled. * added comment to PageCompound() in include/linux/page-flags.h. 
2013/05/20: * PageTail() is replaced by PageCompound() in order to have the same checks for whether the page is huge in realmode_get_page() and realmode_put_page() Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/pgtable-ppc64.h | 4 ++ arch/powerpc/mm/init_64.c| 78 +++- include/linux/page-flags.h | 4 +- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f..7b46e5f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +struct page *realmode_pfn_to_page(unsigned long pfn); +int realmode_get_page(struct page *page); +int realmode_put_page(struct page *page); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a90b9c4..7031be3 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end) { } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * Any of realmode_ functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. 
+ * When 1) or 2) takes place, the API returns an error code to cause + * an exit to kernel virtual mode where the operation will be completed. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { + if (pg_va < vmem_back->virt_addr) + continue; + + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) > + (vmem_back->virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back->phys + pg_va - + vmem_back->virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM) +int realmode_get_page(struct page *page) +{ +
Re: [PATCH] virtio: include asm/barrier explicitly
Michael S. Tsirkin m...@redhat.com writes: virtio_ring.h uses mb() and friends, make it pull in asm/barrier.h itself, not rely on other headers to do it. Signed-off-by: Michael S. Tsirkin m...@redhat.com Applied. Thanks, Rusty. PS. I'll squeeze these into this merge window, since they're trivial. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] tools/virtio: move module license stub to module.h
Michael S. Tsirkin m...@redhat.com writes: This fixes build for the vringh test: [linux]$ make -C tools/virtio/ make: Entering directory `/home/mst/scm/linux/tools/virtio' cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -c -o vringh.o ../../drivers/vhost/vringh.c ../../drivers/vhost/vringh.c:1010:16: error: expected declaration specifiers or ‘...’ before string constant Signed-off-by: Michael S. Tsirkin m...@redhat.com --- tools/virtio/linux/module.h | 5 + tools/virtio/linux/virtio.h | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) Applied, thanks. Cheers, Rusty. diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index 3039a7e..28ce95a 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h @@ -1 +1,6 @@ #include linux/export.h + +#define MODULE_LICENSE(__MODULE_LICENSE_value) \ + static __attribute__((unused)) const char *__MODULE_LICENSE_name = \ + __MODULE_LICENSE_value + diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index cd80183..8447830 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -45,9 +45,6 @@ struct virtqueue { void *priv; }; -#define MODULE_LICENSE(__MODULE_LICENSE_value) \ - const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value - /* Interfaces exported by virtio_ring. */ int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR
I only recently saw this email. On Thu, Jun 06, 2013 at 06:10:12PM +0300, Gleb Natapov wrote: On Thu, Jun 06, 2013 at 05:06:32PM +0200, Gerd Hoffmann wrote: For seabios itself this isn't a big issue, see pci_{readl,writel} in src/pci.c. When called in 16bit mode it goes into 32bit mode temporarily, just for accessing the mmio register. ahci driver uses it, xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers in seabios can do the same. Isn't this approach broken? How can SeaBIOS be sure it restores real mode registers to exactly same state they were before entering 32bit mode? You are correct - SeaBIOS can't fully restore the hidden segment registers. So, in a way it is broken. In practice, it seems to work on modern bootloaders (eg, ntldr, grub, lilo). It definitely doesn't work with EMM386 (old DOS stuff), but does seem to work okay with FreeDOS as long as one doesn't run EMM386. The AHCI code uses this 32bit/16bit trampoline because it would not be possible to support AHCI otherwise. I haven't seen any complaints of failures with the AHCI code - probably because people using AHCI are using modern guests. I explored this a bit some time back and the only way I could think of to reliably restore the 16bit registers would be via SMM. Unfortunately, using SMM introduces a whole host of complexity and problems. -Kevin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio: include asm/barrier explicitly
Hi Rusty, On Mon, 08 Jul 2013 11:31:45 +0930 Rusty Russell ru...@rustcorp.com.au wrote: PS. I'll squeeze these into this merge window, since they're trivial. s/'re trivial/fix bugs/ ;-) -- Cheers, Stephen Rothwells...@canb.auug.org.au pgpQCBOOsFabg.pgp Description: PGP signature
Re: [PATCH 4/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote: The current VFIO-on-POWER implementation supports only user mode driven mapping, i.e. QEMU is sending requests to map/unmap pages. However this approach is really slow, so we want to move that to KVM. Since H_PUT_TCE can be extremely performance sensitive (especially with network adapters where each packet needs to be mapped/unmapped) we chose to implement that as a fast hypercall directly in real mode (processor still in the guest context but MMU off). To be able to do that, we need to provide some facilities to access the struct page count within that real mode environment as things like the sparsemem vmemmap mappings aren't accessible. This adds an API to increment/decrement page counter as get_user_pages API used for user mode mapping does not work in the real mode. CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported. This patch will need an ack from mm people to make sure they are ok with our approach and ack the change to the generic header. (Added linux-mm). Cheers, Ben. Reviewed-by: Paul Mackerras pau...@samba.org Signed-off-by: Paul Mackerras pau...@samba.org Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: 2013/06/27: * realmode_get_page() fixed to use get_page_unless_zero(). If failed, the call will be passed from real to virtual mode and safely handled. * added comment to PageCompound() in include/linux/page-flags.h. 
2013/05/20: * PageTail() is replaced by PageCompound() in order to have the same checks for whether the page is huge in realmode_get_page() and realmode_put_page() Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/pgtable-ppc64.h | 4 ++ arch/powerpc/mm/init_64.c| 78 +++- include/linux/page-flags.h | 4 +- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index e3d55f6f..7b46e5f 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +struct page *realmode_pfn_to_page(unsigned long pfn); +int realmode_get_page(struct page *page); +int realmode_put_page(struct page *page); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a90b9c4..7031be3 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end) { } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * Any of realmode_ functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. 
+ * When 1) or 2) takes place, the API returns an error code to cause + * an exit to kernel virtual mode where the operation will be completed. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { + if (pg_va < vmem_back->virt_addr) + continue; + + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) > + (vmem_back->virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back->phys + pg_va - + vmem_back->virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM) +int realmode_get_page(struct page *page) +{ +