Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-11-01 Thread Fuad Tabba
On Fri, Oct 27, 2023 at 7:22 PM Sean Christopherson  wrote:
>
> Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
> information can be supplied without setting userspace up to fail.  The
> padding in the new kvm_userspace_memory_region2 structure will be used to
> pass a file descriptor in addition to the userspace_addr, i.e. allow
> userspace to point at a file descriptor and map memory into a guest that
> is NOT mapped into host userspace.
>
> Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
> without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
> makes detection of bad flags a bit more robust, e.g. if the new fd field
> is guarded only by a flag and not a new ioctl(), then a userspace bug
> (setting a "bad" flag) would generate out-of-bounds access instead of an
> -EINVAL error.
>
> Cc: Jarkko Sakkinen 
> Reviewed-by: Paolo Bonzini 
> Reviewed-by: Xiaoyao Li 
> Signed-off-by: Sean Christopherson 
> ---

With the missing pad in api.rst fixed:
Reviewed-by: Fuad Tabba 
Tested-by: Fuad Tabba 

Cheers,
/fuad

>  Documentation/virt/kvm/api.rst | 21 +++
>  arch/x86/kvm/x86.c |  2 +-
>  include/linux/kvm_host.h   |  4 ++--
>  include/uapi/linux/kvm.h   | 13 
>  virt/kvm/kvm_main.c| 38 +++---
>  5 files changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 21a7578142a1..ace984acc125 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers 
> using the SET_ONE_REG
>  interface. No error will be returned, but the resulting offset will not be
>  applied.
>
> +4.139 KVM_SET_USER_MEMORY_REGION2
> +-
> +
> +:Capability: KVM_CAP_USER_MEMORY2
> +:Architectures: all
> +:Type: vm ioctl
> +:Parameters: struct kvm_userspace_memory_region2 (in)
> +:Returns: 0 on success, -1 on error
> +
> +::
> +
> +  struct kvm_userspace_memory_region2 {
> +   __u32 slot;
> +   __u32 flags;
> +   __u64 guest_phys_addr;
> +   __u64 memory_size; /* bytes */
> +   __u64 userspace_addr; /* start of the userspace allocated memory */
> +  };
> +
> +See KVM_SET_USER_MEMORY_REGION.
> +
>  5. The kvm_run structure
>  
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 41cce5031126..6409914428ca 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm 
> *kvm, int id, gpa_t gpa,
> }
>
> for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> -   struct kvm_userspace_memory_region m;
> +   struct kvm_userspace_memory_region2 m;
>
> m.slot = id | (i << 16);
> m.flags = 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5faba69403ac..4e741ff27af3 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1146,9 +1146,9 @@ enum kvm_mr_change {
>  };
>
>  int kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem);
> + const struct kvm_userspace_memory_region2 *mem);
>  int __kvm_set_memory_region(struct kvm *kvm,
> -   const struct kvm_userspace_memory_region *mem);
> +   const struct kvm_userspace_memory_region2 *mem);
>  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
>  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
>  int kvm_arch_prepare_memory_region(struct kvm *kvm,
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 13065dd96132..bd1abe067f28 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
> __u64 userspace_addr; /* start of the userspace allocated memory */
>  };
>
> +/* for KVM_SET_USER_MEMORY_REGION2 */
> +struct kvm_userspace_memory_region2 {
> +   __u32 slot;
> +   __u32 flags;
> +   __u64 guest_phys_addr;
> +   __u64 memory_size;
> +   __u64 userspace_addr;
> +   __u64 pad[16];
> +};
> +
>  /*
>   * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
>   * userspace, other bits are reserved for kvm internal use which are defined
> @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_COUNTER_OFFSET 227
>  #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
>  #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
> +#define KVM_CAP_USER_MEMORY2 230
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
> struct kvm_userspace_memory_region)
>  #define KVM_SET_TSS_ADDR  _IO(KVMIO,   0x47)
>  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  

Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-31 Thread Sean Christopherson
On Tue, Oct 31, 2023, Xiaoyao Li wrote:
> On 10/28/2023 2:21 AM, Sean Christopherson wrote:
> > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
> > information can be supplied without setting userspace up to fail.  The
> > padding in the new kvm_userspace_memory_region2 structure will be used to
> > pass a file descriptor in addition to the userspace_addr, i.e. allow
> > userspace to point at a file descriptor and map memory into a guest that
> > is NOT mapped into host userspace.
> > 
> > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
> > without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
> > makes detection of bad flags a bit more robust, e.g. if the new fd field
> > is guarded only by a flag and not a new ioctl(), then a userspace bug
> > (setting a "bad" flag) would generate out-of-bounds access instead of an
> > -EINVAL error.
> > 
> > Cc: Jarkko Sakkinen 
> > Reviewed-by: Paolo Bonzini 
> > Reviewed-by: Xiaoyao Li 
> > Signed-off-by: Sean Christopherson 
> > ---
> >   Documentation/virt/kvm/api.rst | 21 +++
> >   arch/x86/kvm/x86.c |  2 +-
> >   include/linux/kvm_host.h   |  4 ++--
> >   include/uapi/linux/kvm.h   | 13 
> >   virt/kvm/kvm_main.c| 38 +++---
> >   5 files changed, 67 insertions(+), 11 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 21a7578142a1..ace984acc125 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers 
> > using the SET_ONE_REG
> >   interface. No error will be returned, but the resulting offset will not be
> >   applied.
> > +4.139 KVM_SET_USER_MEMORY_REGION2
> > +-
> > +
> > +:Capability: KVM_CAP_USER_MEMORY2
> > +:Architectures: all
> > +:Type: vm ioctl
> > +:Parameters: struct kvm_userspace_memory_region2 (in)
> > +:Returns: 0 on success, -1 on error
> > +
> > +::
> > +
> > +  struct kvm_userspace_memory_region2 {
> > +   __u32 slot;
> > +   __u32 flags;
> > +   __u64 guest_phys_addr;
> > +   __u64 memory_size; /* bytes */
> > +   __u64 userspace_addr; /* start of the userspace allocated memory */
> 
> missing
> 
>   __u64 pad[16];

I can't even copy+paste correctly :-(


Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Xiaoyao Li

On 10/28/2023 2:21 AM, Sean Christopherson wrote:

Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail.  The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.

Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.

Cc: Jarkko Sakkinen 
Reviewed-by: Paolo Bonzini 
Reviewed-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
  Documentation/virt/kvm/api.rst | 21 +++
  arch/x86/kvm/x86.c |  2 +-
  include/linux/kvm_host.h   |  4 ++--
  include/uapi/linux/kvm.h   | 13 
  virt/kvm/kvm_main.c| 38 +++---
  5 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..ace984acc125 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using 
the SET_ONE_REG
  interface. No error will be returned, but the resulting offset will not be
  applied.
  
+4.139 KVM_SET_USER_MEMORY_REGION2

+-
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+::
+
+  struct kvm_userspace_memory_region2 {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size; /* bytes */
+   __u64 userspace_addr; /* start of the userspace allocated memory */


missing

__u64 pad[16];


+  };
+
+See KVM_SET_USER_MEMORY_REGION.
+
  5. The kvm_run structure
  
  
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 41cce5031126..6409914428ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, 
int id, gpa_t gpa,
}
  
  	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {

-   struct kvm_userspace_memory_region m;
+   struct kvm_userspace_memory_region2 m;
  
  		m.slot = id | (i << 16);

m.flags = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
  };
  
  int kvm_set_memory_region(struct kvm *kvm,

- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
  int __kvm_set_memory_region(struct kvm *kvm,
-   const struct kvm_userspace_memory_region *mem);
+   const struct kvm_userspace_memory_region2 *mem);
  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..bd1abe067f28 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
  };
  
+/* for KVM_SET_USER_MEMORY_REGION2 */

+struct kvm_userspace_memory_region2 {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size;
+   __u64 userspace_addr;
+   __u64 pad[16];
+};
+
  /*
   * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
   * userspace, other bits are reserved for kvm internal use which are defined
@@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
  #define KVM_CAP_COUNTER_OFFSET 227
  #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
  #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
+#define KVM_CAP_USER_MEMORY2 230
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
@@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {

struct kvm_userspace_memory_region)
  #define KVM_SET_TSS_ADDR  _IO(KVMIO,   0x47)
  #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+struct kvm_userspace_memory_region2)
  
  /* enable ucontrol for s390 */

  struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 

Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Sean Christopherson
On Tue, Oct 31, 2023, Paolo Bonzini wrote:
> On 10/30/23 21:25, Sean Christopherson wrote:
> > > Probably worth adding a check on valid flags here.
> > 
> > Definitely needed.  There's a very real bug here.  But rather than 
> > duplicate flags
> > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that 
> > we
> > have the fancy guard(mutex) and there are no internal calls to 
> > kvm_set_memory_region(),
> > what if we:
> > 
> >1. Acquire/release slots_lock in __kvm_set_memory_region()
> >2. Call kvm_set_memory_region() from x86 code for the internal memslots
> >3. Disallow *any* flags for internal memslots
> >4. Open code check_memory_region_flags in 
> > kvm_vm_ioctl_set_memory_region()
> 
> I dislike this step, there is a clear point where all paths meet
> (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region().
> I think that's the place where flags should be checked.  (I don't mind
> the restriction on internal memslots; it's just that to me it's not a
> particularly natural way to structure the checks).

Yeah, I just don't like the discrepancy it causes where some flags are 
explicitly
checked and allowed, allowed and then later disallowed.

> On the other hand, the place where to protect from out-of-bounds
> accesses, is the place where you stop caring about struct
> kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and
> your code gets it right, by dropping "ioctl" as soon as possible).
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 87f45aa91ced..fe5a2af14fff 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm 
> *kvm)
>   return true;
>  }
> +/*
> + * Flags that do not access any of the extra space of struct
> + * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_FLAGS
> + * only allows these.
> + */
> +#define KVM_SET_USER_MEMORY_REGION_FLAGS \

Can we name this KVM_SET_USER_MEMORY_REGION_LEGACY_FLAGS, or something equally
horrific?  As is, this sounds way too much like a generic "allowed flags for any
memory region".

Or maybe invert the macro?  I.e. something to make it more obvious that it's
effectively a versioning check, not a generic "what's supported?" check.

#define KVM_SET_USER_MEMORY_FLAGS_V2_ONLY \
(~(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY))


> + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
> +
>  static int check_memory_region_flags(struct kvm *kvm,
>const struct kvm_userspace_memory_region2 
> *mem)
>  {
> @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp,
>   struct kvm_userspace_memory_region2 mem;
>   unsigned long size;
> - if (ioctl == KVM_SET_USER_MEMORY_REGION)
> + if (ioctl == KVM_SET_USER_MEMORY_REGION) {
> + /*
> +  * Fields beyond struct kvm_userspace_memory_region 
> shouldn't be
> +  * accessed, but avoid leaking kernel memory in case of 
> a bug.
> +  */
> + memset(, 0, sizeof(mem));
>   size = sizeof(struct kvm_userspace_memory_region);
> - else
> + } else {
>   size = sizeof(struct kvm_userspace_memory_region2);
> + }
>   /* Ensure the common parts of the two structs are identical. */
>   SANITY_CHECK_MEM_REGION_FIELD(slot);
> @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp,
>   if (copy_from_user(, argp, size))
>   goto out;
> + r = -EINVAL;
> + if (ioctl == KVM_SET_USER_MEMORY_REGION &&
> + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS))
> + goto out;
> +
>   r = kvm_vm_ioctl_set_memory_region(kvm, );
>   break;
>   }
> 
> 
> That's a kind of patch that you can't really get wrong (though I have
> the brown paper bag ready).
> 
> Maintainance-wise it's fine, since flags are being added at a pace of
> roughly one every five years,

Heh, true.

> and anyway it's also future proof: I placed the #define near
> check_memory_region_flags so that in five years we remember to keep it up to
> date.  But worst case, the new flags will only be allowed by
> KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues
> waiting to bite us.
>
> In sum, this is exactly the only kind of fix that should be in the v13->v14
> delta.

Boiling the ocean can be fun too ;-)


Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Paolo Bonzini

On 10/30/23 21:25, Sean Christopherson wrote:

On Mon, Oct 30, 2023, Paolo Bonzini wrote:

On 10/27/23 20:21, Sean Christopherson wrote:


+   if (ioctl == KVM_SET_USER_MEMORY_REGION)
+   size = sizeof(struct kvm_userspace_memory_region);


This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds
access of the commit message becomes a kernel stack read.


Ouch.  There's some irony.  Might be worth doing memset(, -1, sizeof(mem))
though as '0' is a valid file descriptor and a valid file offset.


Either is okay, because unless the flags check is screwed up it should
not matter.  The memset is actually unnecessary, though it may be a good
idea anyway to keep it, aka belt-and-suspenders.


Probably worth adding a check on valid flags here.


Definitely needed.  There's a very real bug here.  But rather than duplicate 
flags
checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
have the fancy guard(mutex) and there are no internal calls to 
kvm_set_memory_region(),
what if we:

   1. Acquire/release slots_lock in __kvm_set_memory_region()
   2. Call kvm_set_memory_region() from x86 code for the internal memslots
   3. Disallow *any* flags for internal memslots
   4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()


I dislike this step, there is a clear point where all paths meet
(ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region().
I think that's the place where flags should be checked.  (I don't mind
the restriction on internal memslots; it's just that to me it's not a
particularly natural way to structure the checks).

On the other hand, the place where to protect from out-of-bounds
accesses, is the place where you stop caring about struct
kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and
your code gets it right, by dropping "ioctl" as soon as possible).

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 87f45aa91ced..fe5a2af14fff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm)
return true;
 }
 
+/*

+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_FLAGS \
+   (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
 static int check_memory_region_flags(struct kvm *kvm,
 const struct kvm_userspace_memory_region2 
*mem)
 {
@@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp,
struct kvm_userspace_memory_region2 mem;
unsigned long size;
 
-		if (ioctl == KVM_SET_USER_MEMORY_REGION)

+   if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+   /*
+* Fields beyond struct kvm_userspace_memory_region 
shouldn't be
+* accessed, but avoid leaking kernel memory in case of 
a bug.
+*/
+   memset(, 0, sizeof(mem));
size = sizeof(struct kvm_userspace_memory_region);
-   else
+   } else {
size = sizeof(struct kvm_userspace_memory_region2);
+   }
 
 		/* Ensure the common parts of the two structs are identical. */

SANITY_CHECK_MEM_REGION_FIELD(slot);
@@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(, argp, size))
goto out;
 
+		r = -EINVAL;

+   if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+   (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS))
+   goto out;
+
r = kvm_vm_ioctl_set_memory_region(kvm, );
break;
}


That's a kind of patch that you can't really get wrong (though I have
the brown paper bag ready).

Maintainance-wise it's fine, since flags are being added at a pace of
roughly one every five years, and anyway it's also future proof: I placed
the #define near check_memory_region_flags so that in five years we remember
to keep it up to date.  But worst case, the new flags will only be allowed
by KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues
waiting to bite us.

In sum, this is exactly the only kind of fix that should be in the v13->v14
delta.

Paolo



Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Sean Christopherson
On Mon, Oct 30, 2023, Sean Christopherson wrote:
> On Mon, Oct 30, 2023, Paolo Bonzini wrote:
> > On 10/27/23 20:21, Sean Christopherson wrote:
> > > 
> > > + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> > > + size = sizeof(struct kvm_userspace_memory_region);
> > 
> > This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds
> > access of the commit message becomes a kernel stack read.
> 
> Ouch.  There's some irony.  Might be worth doing memset(, -1, sizeof(mem))
> though as '0' is a valid file descriptor and a valid file offset.
> 
> > Probably worth adding a check on valid flags here.
> 
> Definitely needed.  There's a very real bug here.  But rather than duplicate 
> flags
> checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
> have the fancy guard(mutex) and there are no internal calls to 
> kvm_set_memory_region(),
> what if we:
> 
>   1. Acquire/release slots_lock in __kvm_set_memory_region()

Gah, this won't work with x86's usage, which acquire slots_lock quite far away
from __kvm_set_memory_region() in several places.  The rest of the idea still
works, kvm_vm_ioctl_set_memory_region() will just need to take slots_lock 
manually.

>   2. Call kvm_set_memory_region() from x86 code for the internal memslots
>   3. Disallow *any* flags for internal memslots
>   4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()
>   5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE
>  only for KVM_SET_USER_MEMORY_REGION2


Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Sean Christopherson
On Mon, Oct 30, 2023, Paolo Bonzini wrote:
> On 10/27/23 20:21, Sean Christopherson wrote:
> > 
> > +   if (ioctl == KVM_SET_USER_MEMORY_REGION)
> > +   size = sizeof(struct kvm_userspace_memory_region);
> 
> This also needs a memset(, 0, sizeof(mem)), otherwise the out-of-bounds
> access of the commit message becomes a kernel stack read.

Ouch.  There's some irony.  Might be worth doing memset(, -1, sizeof(mem))
though as '0' is a valid file descriptor and a valid file offset.

> Probably worth adding a check on valid flags here.

Definitely needed.  There's a very real bug here.  But rather than duplicate 
flags
checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
have the fancy guard(mutex) and there are no internal calls to 
kvm_set_memory_region(),
what if we:

  1. Acquire/release slots_lock in __kvm_set_memory_region()
  2. Call kvm_set_memory_region() from x86 code for the internal memslots
  3. Disallow *any* flags for internal memslots
  4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()
  5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE
 only for KVM_SET_USER_MEMORY_REGION2

E.g. this over ~5 patches

---
 arch/x86/kvm/x86.c   |  2 +-
 include/linux/kvm_host.h |  4 +--
 virt/kvm/kvm_main.c  | 65 +---
 3 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3eb608b6692..dd3e2017366c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12478,7 +12478,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, 
int id, gpa_t gpa,
m.guest_phys_addr = gpa;
m.userspace_addr = hva;
m.memory_size = size;
-   r = __kvm_set_memory_region(kvm, );
+   r = kvm_set_memory_region(kvm, );
if (r < 0)
return ERR_PTR_USR(r);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 687589ce9f63..fbb98efe8200 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1170,7 +1170,7 @@ static inline bool kvm_memslot_iter_is_valid(struct 
kvm_memslot_iter *iter, gfn_
  *   -- just change its flags
  *
  * Since flags can be changed by some of these operations, the following
- * differentiation is the best we can do for __kvm_set_memory_region():
+ * differentiation is the best we can do for __kvm_set_memory_region().
  */
 enum kvm_mr_change {
KVM_MR_CREATE,
@@ -1181,8 +1181,6 @@ enum kvm_mr_change {
 
 int kvm_set_memory_region(struct kvm *kvm,
  const struct kvm_userspace_memory_region2 *mem);
-int __kvm_set_memory_region(struct kvm *kvm,
-   const struct kvm_userspace_memory_region2 *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 23633984142f..39ceee2f67f2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1608,28 +1608,6 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
 }
 
-static int check_memory_region_flags(struct kvm *kvm,
-const struct kvm_userspace_memory_region2 
*mem)
-{
-   u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
-
-   if (kvm_arch_has_private_mem(kvm))
-   valid_flags |= KVM_MEM_PRIVATE;
-
-   /* Dirty logging private memory is not currently supported. */
-   if (mem->flags & KVM_MEM_PRIVATE)
-   valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
-#ifdef __KVM_HAVE_READONLY_MEM
-   valid_flags |= KVM_MEM_READONLY;
-#endif
-
-   if (mem->flags & ~valid_flags)
-   return -EINVAL;
-
-   return 0;
-}
-
 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
 {
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
@@ -2014,11 +1992,9 @@ static bool kvm_check_memslot_overlap(struct 
kvm_memslots *slots, int id,
  * space.
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
- *
- * Must be called holding kvm->slots_lock for write.
  */
-int __kvm_set_memory_region(struct kvm *kvm,
-   const struct kvm_userspace_memory_region2 *mem)
+static int __kvm_set_memory_region(struct kvm *kvm,
+  const struct kvm_userspace_memory_region2 
*mem)
 {
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
@@ -2028,9 +2004,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
int as_id, id;
int r;
 
-   r = check_memory_region_flags(kvm, mem);
-   if (r)
-   return r;
+   guard(mutex)(>slots_lock);
 
as_id = mem->slot >> 16;
id = (u16)mem->slot;
@@ -2139,27 +2113,42 @@ int __kvm_set_memory_region(struct kvm *kvm,
   

Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-30 Thread Paolo Bonzini

On 10/27/23 20:21, Sean Christopherson wrote:


+   if (ioctl == KVM_SET_USER_MEMORY_REGION)
+   size = sizeof(struct kvm_userspace_memory_region);


This also needs a memset(, 0, sizeof(mem)), otherwise the 
out-of-bounds access of the commit message becomes a kernel stack read.


Probably worth adding a check on valid flags here.

Paolo


+   else
+   size = sizeof(struct kvm_userspace_memory_region2);
+
+   /* Ensure the common parts of the two structs are identical. */
+   SANITY_CHECK_MEM_REGION_FIELD(slot);
+   SANITY_CHECK_MEM_REGION_FIELD(flags);
+   SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+   SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+   SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
 





[PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

2023-10-27 Thread Sean Christopherson
Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail.  The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.

Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.

Cc: Jarkko Sakkinen 
Reviewed-by: Paolo Bonzini 
Reviewed-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 Documentation/virt/kvm/api.rst | 21 +++
 arch/x86/kvm/x86.c |  2 +-
 include/linux/kvm_host.h   |  4 ++--
 include/uapi/linux/kvm.h   | 13 
 virt/kvm/kvm_main.c| 38 +++---
 5 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..ace984acc125 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using 
the SET_ONE_REG
 interface. No error will be returned, but the resulting offset will not be
 applied.
 
+4.139 KVM_SET_USER_MEMORY_REGION2
+-
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+::
+
+  struct kvm_userspace_memory_region2 {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size; /* bytes */
+   __u64 userspace_addr; /* start of the userspace allocated memory */
+  };
+
+See KVM_SET_USER_MEMORY_REGION.
+
 5. The kvm_run structure
 
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..6409914428ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, 
int id, gpa_t gpa,
}
 
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-   struct kvm_userspace_memory_region m;
+   struct kvm_userspace_memory_region2 m;
 
m.slot = id | (i << 16);
m.flags = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
 };
 
 int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-   const struct kvm_userspace_memory_region *mem);
+   const struct kvm_userspace_memory_region2 *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..bd1abe067f28 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for KVM_SET_USER_MEMORY_REGION2 */
+struct kvm_userspace_memory_region2 {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size;
+   __u64 userspace_addr;
+   __u64 pad[16];
+};
+
 /*
  * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
  * userspace, other bits are reserved for kvm internal use which are defined
@@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_COUNTER_OFFSET 227
 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
+#define KVM_CAP_USER_MEMORY2 230
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR  _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+struct kvm_userspace_memory_region2)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e708017064d..3f5b7c2c5327 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1578,7 +1578,7 @@ static