Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On Fri, Oct 27, 2023 at 7:22 PM Sean Christopherson wrote: > > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional > information can be supplied without setting userspace up to fail. The > padding in the new kvm_userspace_memory_region2 structure will be used to > pass a file descriptor in addition to the userspace_addr, i.e. allow > userspace to point at a file descriptor and map memory into a guest that > is NOT mapped into host userspace. > > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" > without a new ioctl(), but as Paolo pointed out, adding a new ioctl() > makes detection of bad flags a bit more robust, e.g. if the new fd field > is guarded only by a flag and not a new ioctl(), then a userspace bug > (setting a "bad" flag) would generate out-of-bounds access instead of an > -EINVAL error. > > Cc: Jarkko Sakkinen > Reviewed-by: Paolo Bonzini > Reviewed-by: Xiaoyao Li > Signed-off-by: Sean Christopherson > --- With the missing pad in api.rst fixed: Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Cheers, /fuad > Documentation/virt/kvm/api.rst | 21 +++ > arch/x86/kvm/x86.c | 2 +- > include/linux/kvm_host.h | 4 ++-- > include/uapi/linux/kvm.h | 13 > virt/kvm/kvm_main.c| 38 +++--- > 5 files changed, 67 insertions(+), 11 deletions(-) > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > index 21a7578142a1..ace984acc125 100644 > --- a/Documentation/virt/kvm/api.rst > +++ b/Documentation/virt/kvm/api.rst > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers > using the SET_ONE_REG > interface. No error will be returned, but the resulting offset will not be > applied. > > +4.139 KVM_SET_USER_MEMORY_REGION2 > +- > + > +:Capability: KVM_CAP_USER_MEMORY2 > +:Architectures: all > +:Type: vm ioctl > +:Parameters: struct kvm_userspace_memory_region2 (in) > +:Returns: 0 on success, -1 on error > + > +:: > + > + struct kvm_userspace_memory_region2 { > + __u32 slot; > + __u32 flags; > + __u64 guest_phys_addr; > + __u64 memory_size; /* bytes */ > + __u64 userspace_addr; /* start of the userspace allocated memory */ > + }; > + > +See KVM_SET_USER_MEMORY_REGION. > + > 5. The kvm_run structure > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 41cce5031126..6409914428ca 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm > *kvm, int id, gpa_t gpa, > } > > for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { > - struct kvm_userspace_memory_region m; > + struct kvm_userspace_memory_region2 m; > > m.slot = id | (i << 16); > m.flags = 0; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 5faba69403ac..4e741ff27af3 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1146,9 +1146,9 @@ enum kvm_mr_change { > }; > > int kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem); > + const struct kvm_userspace_memory_region2 *mem); > int __kvm_set_memory_region(struct kvm *kvm, > - const struct kvm_userspace_memory_region *mem); > + const struct kvm_userspace_memory_region2 *mem); > void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); > void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); > int kvm_arch_prepare_memory_region(struct kvm *kvm, > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 13065dd96132..bd1abe067f28 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { > __u64 userspace_addr; /* start of the userspace allocated memory */ > }; > > +/* for KVM_SET_USER_MEMORY_REGION2 */ > +struct kvm_userspace_memory_region2 { > + __u32 slot; > + __u32 flags; > + __u64 guest_phys_addr; > + __u64 memory_size; > + __u64 userspace_addr; > + __u64 pad[16]; > +}; > + > /* > * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for > * userspace, other bits are reserved for kvm internal use which are defined > @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt { > #define KVM_CAP_COUNTER_OFFSET 227 > #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 > #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 > +#define KVM_CAP_USER_MEMORY2 230 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce { > struct kvm_userspace_memory_region) > #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) > #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On Tue, Oct 31, 2023, Xiaoyao Li wrote: > On 10/28/2023 2:21 AM, Sean Christopherson wrote: > > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional > > information can be supplied without setting userspace up to fail. The > > padding in the new kvm_userspace_memory_region2 structure will be used to > > pass a file descriptor in addition to the userspace_addr, i.e. allow > > userspace to point at a file descriptor and map memory into a guest that > > is NOT mapped into host userspace. > > > > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" > > without a new ioctl(), but as Paolo pointed out, adding a new ioctl() > > makes detection of bad flags a bit more robust, e.g. if the new fd field > > is guarded only by a flag and not a new ioctl(), then a userspace bug > > (setting a "bad" flag) would generate out-of-bounds access instead of an > > -EINVAL error. > > > > Cc: Jarkko Sakkinen > > Reviewed-by: Paolo Bonzini > > Reviewed-by: Xiaoyao Li > > Signed-off-by: Sean Christopherson > > --- > > Documentation/virt/kvm/api.rst | 21 +++ > > arch/x86/kvm/x86.c | 2 +- > > include/linux/kvm_host.h | 4 ++-- > > include/uapi/linux/kvm.h | 13 > > virt/kvm/kvm_main.c| 38 +++--- > > 5 files changed, 67 insertions(+), 11 deletions(-) > > > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst > > index 21a7578142a1..ace984acc125 100644 > > --- a/Documentation/virt/kvm/api.rst > > +++ b/Documentation/virt/kvm/api.rst > > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers > > using the SET_ONE_REG > > interface. No error will be returned, but the resulting offset will not be > > applied. > > +4.139 KVM_SET_USER_MEMORY_REGION2 > > +- > > + > > +:Capability: KVM_CAP_USER_MEMORY2 > > +:Architectures: all > > +:Type: vm ioctl > > +:Parameters: struct kvm_userspace_memory_region2 (in) > > +:Returns: 0 on success, -1 on error > > + > > +:: > > + > > + struct kvm_userspace_memory_region2 { > > + __u32 slot; > > + __u32 flags; > > + __u64 guest_phys_addr; > > + __u64 memory_size; /* bytes */ > > + __u64 userspace_addr; /* start of the userspace allocated memory */ > > missing > > __u64 pad[16]; I can't even copy+paste correctly :-(
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On 10/28/2023 2:21 AM, Sean Christopherson wrote: Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 21 +++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 ++-- include/uapi/linux/kvm.h | 13 virt/kvm/kvm_main.c| 38 +++--- 5 files changed, 67 insertions(+), 11 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 21a7578142a1..ace984acc125 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG interface. No error will be returned, but the resulting offset will not be applied. +4.139 KVM_SET_USER_MEMORY_REGION2 +- + +:Capability: KVM_CAP_USER_MEMORY2 +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region2 (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ missing __u64 pad[16]; + }; + +See KVM_SET_USER_MEMORY_REGION. + 5. The kvm_run structure diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41cce5031126..6409914428ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5faba69403ac..4e741ff27af3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1146,9 +1146,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 13065dd96132..bd1abe067f28 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_USER_MEMORY2 230 #ifdef KVM_CAP_IRQ_ROUTING @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ +struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On Tue, Oct 31, 2023, Paolo Bonzini wrote: > On 10/30/23 21:25, Sean Christopherson wrote: > > > Probably worth adding a check on valid flags here. > > > > Definitely needed. There's a very real bug here. But rather than > > duplicate flags > > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that > > we > > have the fancy guard(mutex) and there are no internal calls to > > kvm_set_memory_region(), > > what if we: > > > >1. Acquire/release slots_lock in __kvm_set_memory_region() > >2. Call kvm_set_memory_region() from x86 code for the internal memslots > >3. Disallow *any* flags for internal memslots > >4. Open code check_memory_region_flags in > > kvm_vm_ioctl_set_memory_region() > > I dislike this step, there is a clear point where all paths meet > (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region(). > I think that's the place where flags should be checked. (I don't mind > the restriction on internal memslots; it's just that to me it's not a > particularly natural way to structure the checks). Yeah, I just don't like the discrepancy it causes where some flags are explicitly checked and allowed, allowed and then later disallowed. > On the other hand, the place where to protect from out-of-bounds > accesses, is the place where you stop caring about struct > kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and > your code gets it right, by dropping "ioctl" as soon as possible). > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 87f45aa91ced..fe5a2af14fff 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm > *kvm) > return true; > } > +/* > + * Flags that do not access any of the extra space of struct > + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS > + * only allows these. > + */ > +#define KVM_SET_USER_MEMORY_REGION_FLAGS \ Can we name this KVM_SET_USER_MEMORY_REGION_LEGACY_FLAGS, or something equally horrific? As is, this sounds way too much like a generic "allowed flags for any memory region". Or maybe invert the macro? I.e. something to make it more obvious that it's effectively a versioning check, not a generic "what's supported?" check. #define KVM_SET_USER_MEMORY_FLAGS_V2_ONLY \ (~(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) > + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) > + > static int check_memory_region_flags(struct kvm *kvm, >const struct kvm_userspace_memory_region2 > *mem) > { > @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp, > struct kvm_userspace_memory_region2 mem; > unsigned long size; > - if (ioctl == KVM_SET_USER_MEMORY_REGION) > + if (ioctl == KVM_SET_USER_MEMORY_REGION) { > + /* > + * Fields beyond struct kvm_userspace_memory_region > shouldn't be > + * accessed, but avoid leaking kernel memory in case of > a bug. > + */ > + memset(, 0, sizeof(mem)); > size = sizeof(struct kvm_userspace_memory_region); > - else > + } else { > size = sizeof(struct kvm_userspace_memory_region2); > + } > /* Ensure the common parts of the two structs are identical. */ > SANITY_CHECK_MEM_REGION_FIELD(slot); > @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp, > if (copy_from_user(, argp, size)) > goto out; > + r = -EINVAL; > + if (ioctl == KVM_SET_USER_MEMORY_REGION && > + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS)) > + goto out; > + > r = kvm_vm_ioctl_set_memory_region(kvm, ); > break; > } > > > That's a kind of patch that you can't really get wrong (though I have > the brown paper bag ready). > > Maintainance-wise it's fine, since flags are being added at a pace of > roughly one every five years, Heh, true. > and anyway it's also future proof: I placed the #define near > check_memory_region_flags so that in five years we remember to keep it up to > date. But worst case, the new flags will only be allowed by > KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues > waiting to bite us. > > In sum, this is exactly the only kind of fix that should be in the v13->v14 > delta. Boiling the ocean can be fun too ;-)
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On 10/30/23 21:25, Sean Christopherson wrote: On Mon, Oct 30, 2023, Paolo Bonzini wrote: On 10/27/23 20:21, Sean Christopherson wrote: + if (ioctl == KVM_SET_USER_MEMORY_REGION) + size = sizeof(struct kvm_userspace_memory_region); This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds access of the commit message becomes a kernel stack read. Ouch. There's some irony. Might be worth doing memset(, -1, sizeof(mem)) though as '0' is a valid file descriptor and a valid file offset. Either is okay, because unless the flags check is screwed up it should not matter. The memset is actually unnecessary, though it may be a good idea anyway to keep it, aka belt-and-suspenders. Probably worth adding a check on valid flags here. Definitely needed. There's a very real bug here. But rather than duplicate flags checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), what if we: 1. Acquire/release slots_lock in __kvm_set_memory_region() 2. Call kvm_set_memory_region() from x86 code for the internal memslots 3. Disallow *any* flags for internal memslots 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() I dislike this step, there is a clear point where all paths meet (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region(). I think that's the place where flags should be checked. (I don't mind the restriction on internal memslots; it's just that to me it's not a particularly natural way to structure the checks). On the other hand, the place where to protect from out-of-bounds accesses, is the place where you stop caring about struct kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and your code gets it right, by dropping "ioctl" as soon as possible). diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 87f45aa91ced..fe5a2af14fff 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm) return true; } +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + static int check_memory_region_flags(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem) { @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp, struct kvm_userspace_memory_region2 mem; unsigned long size; - if (ioctl == KVM_SET_USER_MEMORY_REGION) + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* +* Fields beyond struct kvm_userspace_memory_region shouldn't be +* accessed, but avoid leaking kernel memory in case of a bug. +*/ + memset(, 0, sizeof(mem)); size = sizeof(struct kvm_userspace_memory_region); - else + } else { size = sizeof(struct kvm_userspace_memory_region2); + } /* Ensure the common parts of the two structs are identical. */ SANITY_CHECK_MEM_REGION_FIELD(slot); @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(, argp, size)) goto out; + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS)) + goto out; + r = kvm_vm_ioctl_set_memory_region(kvm, ); break; } That's a kind of patch that you can't really get wrong (though I have the brown paper bag ready). Maintainance-wise it's fine, since flags are being added at a pace of roughly one every five years, and anyway it's also future proof: I placed the #define near check_memory_region_flags so that in five years we remember to keep it up to date. But worst case, the new flags will only be allowed by KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues waiting to bite us. In sum, this is exactly the only kind of fix that should be in the v13->v14 delta. Paolo
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On Mon, Oct 30, 2023, Sean Christopherson wrote: > On Mon, Oct 30, 2023, Paolo Bonzini wrote: > > On 10/27/23 20:21, Sean Christopherson wrote: > > > > > > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > > > + size = sizeof(struct kvm_userspace_memory_region); > > > > This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds > > access of the commit message becomes a kernel stack read. > > Ouch. There's some irony. Might be worth doing memset(, -1, sizeof(mem)) > though as '0' is a valid file descriptor and a valid file offset. > > > Probably worth adding a check on valid flags here. > > Definitely needed. There's a very real bug here. But rather than duplicate > flags > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we > have the fancy guard(mutex) and there are no internal calls to > kvm_set_memory_region(), > what if we: > > 1. Acquire/release slots_lock in __kvm_set_memory_region() Gah, this won't work with x86's usage, which acquire slots_lock quite far away from __kvm_set_memory_region() in several places. The rest of the idea still works, kvm_vm_ioctl_set_memory_region() will just need to take slots_lock manually. > 2. Call kvm_set_memory_region() from x86 code for the internal memslots > 3. Disallow *any* flags for internal memslots > 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() > 5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE > only for KVM_SET_USER_MEMORY_REGION2
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On Mon, Oct 30, 2023, Paolo Bonzini wrote: > On 10/27/23 20:21, Sean Christopherson wrote: > > > > + if (ioctl == KVM_SET_USER_MEMORY_REGION) > > + size = sizeof(struct kvm_userspace_memory_region); > > This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds > access of the commit message becomes a kernel stack read. Ouch. There's some irony. Might be worth doing memset(, -1, sizeof(mem)) though as '0' is a valid file descriptor and a valid file offset. > Probably worth adding a check on valid flags here. Definitely needed. There's a very real bug here. But rather than duplicate flags checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(), what if we: 1. Acquire/release slots_lock in __kvm_set_memory_region() 2. Call kvm_set_memory_region() from x86 code for the internal memslots 3. Disallow *any* flags for internal memslots 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region() 5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE only for KVM_SET_USER_MEMORY_REGION2 E.g. this over ~5 patches --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 +-- virt/kvm/kvm_main.c | 65 +--- 3 files changed, 29 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3eb608b6692..dd3e2017366c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12478,7 +12478,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, ); + r = kvm_set_memory_region(kvm, ); if (r < 0) return ERR_PTR_USR(r); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 687589ce9f63..fbb98efe8200 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1170,7 +1170,7 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_ * -- just change its flags * * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): + * differentiation is the best we can do for __kvm_set_memory_region(). */ enum kvm_mr_change { KVM_MR_CREATE, @@ -1181,8 +1181,6 @@ enum kvm_mr_change { int kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 23633984142f..39ceee2f67f2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1608,28 +1608,6 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(struct kvm *kvm, -const struct kvm_userspace_memory_region2 *mem) -{ - u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; - - if (kvm_arch_has_private_mem(kvm)) - valid_flags |= KVM_MEM_PRIVATE; - - /* Dirty logging private memory is not currently supported. */ - if (mem->flags & KVM_MEM_PRIVATE) - valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; - -#ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; -#endif - - if (mem->flags & ~valid_flags) - return -EINVAL; - - return 0; -} - static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); @@ -2014,11 +1992,9 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * space. * * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding kvm->slots_lock for write. */ -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem) +static int __kvm_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2028,9 +2004,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; - r = check_memory_region_flags(kvm, mem); - if (r) - return r; + guard(mutex)(>slots_lock); as_id = mem->slot >> 16; id = (u16)mem->slot; @@ -2139,27 +2113,42 @@ int __kvm_set_memory_region(struct kvm *kvm,
Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
On 10/27/23 20:21, Sean Christopherson wrote: + if (ioctl == KVM_SET_USER_MEMORY_REGION) + size = sizeof(struct kvm_userspace_memory_region); This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds access of the commit message becomes a kernel stack read. Probably worth adding a check on valid flags here. Paolo + else + size = sizeof(struct kvm_userspace_memory_region2); + + /* Ensure the common parts of the two structs are identical. */ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
[PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2
Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 21 +++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 ++-- include/uapi/linux/kvm.h | 13 virt/kvm/kvm_main.c| 38 +++--- 5 files changed, 67 insertions(+), 11 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 21a7578142a1..ace984acc125 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG interface. No error will be returned, but the resulting offset will not be applied. +4.139 KVM_SET_USER_MEMORY_REGION2 +- + +:Capability: KVM_CAP_USER_MEMORY2 +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region2 (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ + }; + +See KVM_SET_USER_MEMORY_REGION. + 5. The kvm_run structure diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41cce5031126..6409914428ca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5faba69403ac..4e741ff27af3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1146,9 +1146,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 13065dd96132..bd1abe067f28 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_USER_MEMORY2 230 #ifdef KVM_CAP_IRQ_ROUTING @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ +struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e708017064d..3f5b7c2c5327 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1578,7 +1578,7 @@ static