Re: [RFC PATCH v11 08/29] KVM: Introduce per-page memory attributes

2023-08-02 Thread Isaku Yamahata
> @@ ... @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
>  
> +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> +static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> +{
> + return 0;
> +}
> +
> +static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
> +  struct kvm_mmu_notifier_range 
> *range)
> +{
> + struct kvm_gfn_range gfn_range;
> + struct kvm_memory_slot *slot;
> + struct kvm_memslots *slots;
> + struct kvm_memslot_iter iter;
> + bool locked = false;
> + bool ret = false;
> + int i;
> +
> + gfn_range.arg.raw = range->arg.raw;
> + gfn_range.may_block = range->may_block;
> +
> + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> + slots = __kvm_memslots(kvm, i);
> +
> + kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, 
> range->end) {
> + slot = iter.slot;
> + gfn_range.slot = slot;
> +
> + gfn_range.start = max(range->start, slot->base_gfn);
> + gfn_range.end = min(range->end, slot->base_gfn + 
> slot->npages);
> + if (gfn_range.start >= gfn_range.end)
> + continue;
> +
> + if (!locked) {
> + locked = true;
> + KVM_MMU_LOCK(kvm);
> + if (!IS_KVM_NULL_FN(range->on_lock))
> + range->on_lock(kvm);
> + }
> +
> + ret |= range->handler(kvm, &gfn_range);
> + }
> + }
> +
> + if (range->flush_on_ret && ret)
> + kvm_flush_remote_tlbs(kvm);
> +
> + if (locked) {
> + KVM_MMU_UNLOCK(kvm);
> + if (!IS_KVM_NULL_FN(range->on_unlock))
> + range->on_unlock(kvm);
> + }
> +}
> +
> +static int kvm_vm_set_mem_attributes(struct kvm *kvm, unsigned long 
> attributes,
> +  gfn_t start, gfn_t end)
> +{
> + struct kvm_mmu_notifier_range unmap_range = {
> + .start = start,
> + .end = end,
> + .handler = kvm_mmu_unmap_gfn_range,
> + .on_lock = kvm_mmu_invalidate_begin,
> +     .on_unlock = (void *)kvm_null_fn,
> + .flush_on_ret = true,
> + .may_block = true,
> + };
> + struct kvm_mmu_notifier_range post_set_range = {
> + .start = start,
> + .end = end,
> + .arg.attributes = attributes,
> + .handler = kvm_arch_post_set_memory_attributes,
> + .on_lock = (void *)kvm_null_fn,
> + .on_unlock = kvm_mmu_invalidate_end,


on_unlock is called after mmu_lock is unlocked, so kvm::mmu_invalidate_in_progress
is touched outside of the lock.  Here is a quick fix.

 WARNING: CPU: 108 PID: 62218 at arch/x86/kvm/../../../virt/kvm/kvm_main.c:757 
kvm_mmu_unmap_gfn_range+0x32/0x70 [kvm]
  ...
 RIP: 0010:kvm_mmu_unmap_gfn_range+0x32/0x70 [kvm]
  ...
 Call Trace:
  
  kvm_gmem_invalidate_begin+0xd0/0x130 [kvm]
  kvm_gmem_fallocate+0x134/0x290 [kvm]
  vfs_fallocate+0x151/0x380
  __x64_sys_fallocate+0x3c/0x70
  do_syscall_64+0x40/0x90
  entry_SYSCALL_64_after_hwframe+0x6e/0xd8


From c06084048271278d3508f534479b356f49f619ce Mon Sep 17 00:00:00 2001
Message-Id: 

From: Isaku Yamahata 
Date: Mon, 31 Jul 2023 22:58:15 -0700
Subject: [PATCH] KVM: guest_memfd(): protect kvm_mmu_invalidate_end()

kvm_mmu_invalidate_end() updates struct kvm::mmu_invalidate_in_progress,
which is protected by kvm::mmu_lock.  Call kvm_mmu_invalidate_end() before
unlocking mmu_lock, not after the unlock.

Fixes: edd048ffeaf6 ("KVM: Introduce per-page memory attributes")
Signed-off-by: Isaku Yamahata 
---
 virt/kvm/kvm_main.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9b4759b6dd87..6947f776851b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -548,6 +548,7 @@ struct kvm_mmu_notifier_range {
} arg;
gfn_handler_t handler;
on_lock_fn_t on_lock;
+   on_unlock_fn_t before_unlock;
on_unlock_fn_t on_unlock;
bool flush_on_ret;
bool may_block;
@@ -644,6 +645,8 @@ static __always_inline int __kvm_handle_hva_range(struct 
kvm *kvm,
kvm_flush_remote_tlbs(kvm);
 
if (locked) {
+   if (!IS_KVM_NULL_FN(range->before_unlock))
+   range->before_unlock(kvm);
KVM_MMU_UNLOCK(kvm);
if (!IS_KV

Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-07-21 Thread Isaku Yamahata
On Fri, Jul 21, 2023 at 02:13:14PM +0800,
Yuan Yao  wrote:

> On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> > TODO
> >
> > Cc: Fuad Tabba 
> > Cc: Vishal Annapurve 
> > Cc: Ackerley Tng 
> > Cc: Jarkko Sakkinen 
> > Cc: Maciej Szmigiero 
> > Cc: Vlastimil Babka 
> > Cc: David Hildenbrand 
> > Cc: Quentin Perret 
> > Cc: Michael Roth 
> > Cc: Wang 
> > Cc: Liam Merwick 
> > Cc: Isaku Yamahata 
> > Co-developed-by: Kirill A. Shutemov 
> > Signed-off-by: Kirill A. Shutemov 
> > Co-developed-by: Yu Zhang 
> > Signed-off-by: Yu Zhang 
> > Co-developed-by: Chao Peng 
> > Signed-off-by: Chao Peng 
> > Co-developed-by: Ackerley Tng 
> > Signed-off-by: Ackerley Tng 
> > Signed-off-by: Sean Christopherson 
> > ---
> >  include/linux/kvm_host.h   |  48 +++
> >  include/uapi/linux/kvm.h   |  14 +-
> >  include/uapi/linux/magic.h |   1 +
> >  virt/kvm/Kconfig   |   4 +
> >  virt/kvm/Makefile.kvm  |   1 +
> >  virt/kvm/guest_mem.c   | 591 +
> >  virt/kvm/kvm_main.c|  58 +++-
> >  virt/kvm/kvm_mm.h  |  38 +++
> >  8 files changed, 750 insertions(+), 5 deletions(-)
> >  create mode 100644 virt/kvm/guest_mem.c
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 97db63da6227..0d1e2ee8ae7a 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -592,8 +592,20 @@ struct kvm_memory_slot {
> > u32 flags;
> > short id;
> > u16 as_id;
> > +
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +   struct {
> > +   struct file __rcu *file;
> > +   pgoff_t pgoff;
> > +   } gmem;
> > +#endif
> >  };
> >
> > +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot 
> > *slot)
> > +{
> > +   return slot && (slot->flags & KVM_MEM_PRIVATE);
> > +}
> > +
> >  static inline bool kvm_slot_dirty_track_enabled(const struct 
> > kvm_memory_slot *slot)
> >  {
> > return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct 
> > kvm_vcpu *vcpu)
> >  }
> >  #endif
> >
> > +/*
> > + * Arch code must define kvm_arch_has_private_mem if support for private 
> > memory
> > + * is enabled.
> > + */
> > +#if !defined(kvm_arch_has_private_mem) && 
> > !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> > +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > +   return false;
> > +}
> > +#endif
> > +
> >  struct kvm_memslots {
> > u64 generation;
> > atomic_long_t last_used_slot;
> > @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct 
> > kvm_mmu_memory_cache *mc);
> >  void kvm_mmu_invalidate_begin(struct kvm *kvm);
> >  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> >  void kvm_mmu_invalidate_end(struct kvm *kvm);
> > +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> >  long kvm_arch_dev_ioctl(struct file *filp,
> > unsigned int ioctl, unsigned long arg);
> > @@ -2313,6 +2337,30 @@ static inline unsigned long 
> > kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
> >
> >  bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> >  struct kvm_gfn_range *range);
> > +
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > +   return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > +  kvm_get_memory_attributes(kvm, gfn) & 
> > KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > +}
> > +#else
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > +   return false;
> > +}
> >  #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> >
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> > +#else
> > +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> > +  struct kvm_memory_slot *slot, gfn_t gfn,
> > +  kvm_pfn_t *pfn, int *max_order)
> > +{
> > +   KVM_BUG_ON(1, kvm);
> > +   return -EIO;
> > +}
> > +#endif /* CONFIG_KVM_PRIVATE_MEM */
> > +
> >  #endif

Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-07-20 Thread Isaku Yamahata
On Tue, Jul 18, 2023 at 04:44:55PM -0700,
Sean Christopherson  wrote:

> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> + struct kvm_gmem *gmem = file->private_data;
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + /*
> +  * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> +  * reference to the file and thus no new bindings can be created, but
> +  * dereferencing the slot for existing bindings needs to be protected
> +  * against memslot updates, specifically so that unbind doesn't race
> +  * and free the memslot (kvm_gmem_get_file() will return NULL).
> +  */
> + mutex_lock(&kvm->slots_lock);
> +
> + xa_for_each(&gmem->bindings, index, slot)
> + rcu_assign_pointer(slot->gmem.file, NULL);
> +
> + synchronize_rcu();
> +
> + /*
> +  * All in-flight operations are gone and new bindings can be created.
> +  * Zap all SPTEs pointed at by this file.  Do not free the backing
> +  * memory, as its lifetime is associated with the inode, not the file.
> +  */
> + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> + mutex_unlock(&kvm->slots_lock);
> +
> + list_del(&gmem->entry);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + xa_destroy(&gmem->bindings);
> + kfree(gmem);
> +
> + kvm_put_kvm(kvm);
> +
> + return 0;
> +}

Lockdep complains about the ordering of the filemap invalidate lock and the
kvm slots lock.


From bc45eb084a761f93a87ba1f6d3a9949c17adeb31 Mon Sep 17 00:00:00 2001
Message-Id: 

From: Isaku Yamahata 
Date: Thu, 20 Jul 2023 14:16:21 -0700
Subject: [PATCH] KVM/gmem: Fix locking ordering in kvm_gmem_release()

Lockdep complains about the locking order.  Fix kvm_gmem_release() to take
kvm->slots_lock before the filemap invalidate lock.

VM destruction:
- fput()
   ...
   \-kvm_gmem_release()
 \-filemap_invalidate_lock(inode->i_mapping);
   lock(&kvm->slots_lock);

slot creation:
kvm_set_memory_region()
   mutex_lock(&kvm->slots_lock);
   __kvm_set_memory_region(kvm, mem);
\-kvm_gmem_bind()
  \-filemap_invalidate_lock(inode->i_mapping);

======================================================
WARNING: possible circular locking dependency detected
------------------------------------------------------
...

the existing dependency chain (in reverse order) is:

-> #1 (mapping.invalidate_lock#4){+.+.}-{4:4}:
   ...
   down_write+0x40/0xe0
   kvm_gmem_bind+0xd9/0x1b0 [kvm]
   __kvm_set_memory_region.part.0+0x4fc/0x620 [kvm]
   __kvm_set_memory_region+0x6b/0x90 [kvm]
   kvm_vm_ioctl+0x350/0xa00 [kvm]
   __x64_sys_ioctl+0x95/0xd0
   do_syscall_64+0x39/0x90
   entry_SYSCALL_64_after_hwframe+0x6e/0xd8

-> #0 (&kvm->slots_lock){+.+.}-{4:4}:
   ...
   mutex_lock_nested+0x1b/0x30
   kvm_gmem_release+0x56/0x1b0 [kvm]
   __fput+0x115/0x2e0
   fput+0xe/0x20
   task_work_run+0x5e/0xb0
   do_exit+0x2dd/0x5b0
   do_group_exit+0x3b/0xb0
   __x64_sys_exit_group+0x18/0x20
   do_syscall_64+0x39/0x90
   entry_SYSCALL_64_after_hwframe+0x6e/0xd8

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(mapping.invalidate_lock#4);
                               lock(&kvm->slots_lock);
                               lock(mapping.invalidate_lock#4);
  lock(&kvm->slots_lock);

Signed-off-by: Isaku Yamahata 
---
 virt/kvm/guest_mem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index ab91e972e699..772e4631fcd9 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -274,8 +274,6 @@ static int kvm_gmem_release(struct inode *inode, struct 
file *file)
struct kvm *kvm = gmem->kvm;
unsigned long index;
 
-   filemap_invalidate_lock(inode->i_mapping);
-
/*
 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
 * reference to the file and thus no new bindings can be created, but
@@ -285,6 +283,8 @@ static int kvm_gmem_release(struct inode *inode, struct 
file *file)
 */
	mutex_lock(&kvm->slots_lock);
 
+   filemap_invalidate_lock(inode->i_mapping);
+
	xa_for_each(&gmem->bindings, index, slot)
rcu_assign_pointer(slot->gmem.file, NULL);
 
@@ -299,12 +299,12 @@ static int kvm_gmem_release(struct inode *inode, struct 
file *file)
kvm_gmem_issue_arch_invalidate(gmem->kvm, file_inode(file), 0, -1ul);
kvm_gmem_invalidate_end(gmem, 0, -1ul);
 
-   mutex_unlock(&kvm->slots_lock);
-
	list_del(&gmem->entry);
 
filemap_invalidate_unlock(inode->i_mapping);
 
+   mutex_unlock(&kvm->slots_lock);
+
	xa_destroy(&gmem->bindings);
kfree(gmem);
 
-- 
2.25.1



-- 
Isaku Yamahata 


Re: [RFC PATCH v11 08/29] KVM: Introduce per-page memory attributes

2023-07-20 Thread Isaku Yamahata
The user should check the return value as well as the size to
> > +decide if the operation succeeded for the whole range or not. The user may 
> > want
> > +to retry the operation with the returned address/size if the previous 
> > range was
> > +partially successful.
> > +
> > +Both address and size should be page aligned and the supported attributes 
> > can be
> > +retrieved with KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES.
> > +
> > +The "flags" field may be used for future extensions and should be set to 
> > 0s.
> > +
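For illustration, a minimal userspace sketch of the retry loop described
above, assuming the kvm_memory_attributes struct and KVM_SET_MEMORY_ATTRIBUTES
ioctl from this series are available via <linux/kvm.h>:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch only: on partial success the kernel is described above as
 * returning the remaining address/size, which this loop feeds back in. */
static int set_memory_attributes(int vm_fd, __u64 address, __u64 size,
				 __u64 attributes)
{
	struct kvm_memory_attributes attr = {
		.address = address,
		.size = size,
		.attributes = attributes,
		.flags = 0,	/* must be zero, see above */
	};

	while (attr.size) {
		__u64 remaining = attr.size;

		if (ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attr) == 0)
			return 0;	/* whole remaining range done */
		if (attr.size == remaining)
			return -1;	/* no forward progress, give up */
		/* partial success: retry with the returned address/size */
	}
	return 0;
}
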
> >  5. The kvm_run structure
> >  
> >
> > @@ -8494,6 +8544,16 @@ block sizes is exposed in 
> > KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
> >  64-bit bitmap (each bit describing a block size). The default value is
> >  0, to disable the eager page splitting.
> >
> > +8.41 KVM_CAP_MEMORY_ATTRIBUTES
> > +--
> > +
> > +:Capability: KVM_CAP_MEMORY_ATTRIBUTES
> > +:Architectures: x86
> > +:Type: vm
> > +
> > +This capability indicates KVM supports per-page memory attributes and 
> > ioctls
> > +KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES/KVM_SET_MEMORY_ATTRIBUTES are 
> > available.
> > +
> >  9. Known KVM API problems
> >  =
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index e9ca49d451f3..97db63da6227 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -264,6 +264,7 @@ struct kvm_gfn_range {
> > gfn_t end;
> > union {
> > pte_t pte;
> > +   unsigned long attributes;
> > u64 raw;
> > } arg;
> > bool may_block;
> > @@ -809,6 +810,9 @@ struct kvm {
> >
> >  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
> > struct notifier_block pm_notifier;
> > +#endif
> > +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> > +   struct xarray mem_attr_array;
> >  #endif
> > char stats_id[KVM_STATS_NAME_SIZE];
> >  };
> > @@ -2301,4 +2305,14 @@ static inline void kvm_account_pgtable_pages(void 
> > *virt, int nr)
> >  /* Max number of entries allowed for each kvm dirty ring */
> >  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> >
> > +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> > +static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, 
> > gfn_t gfn)
> > +{
> > +   return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
> > +}
> > +
> > +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> > +struct kvm_gfn_range *range);
> 
> Used but no definition in this patch; it's defined in the next patch, 09.
> How about adding a weak version in this patch and letting arches override it?

It is guarded by CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES.
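To illustrate with a sketch (this is the pattern, not the patch itself): the
declaration and its only caller are both compiled under the same Kconfig
guard, so any architecture selecting CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
must provide the definition, and a weak fallback would only hide a missing
implementation:

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/* Declared and used only when the generic attributes code is built;
 * the definition comes from the arch (patch 09 for x86). */
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
					 struct kvm_gfn_range *range);
#endif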
-- 
Isaku Yamahata 


Re: [PATCH v7 5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn

2023-07-10 Thread Isaku Yamahata
On Fri, Jul 07, 2023 at 10:35:02AM +0900,
David Stevens  wrote:

> > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > > index e44ab512c3a1..b1607e314497 100644
> > > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > >
> > > > ...
> > > >
> > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, 
> > > > > struct kvm_memory_slot *slot,
> > > > >   bool host_writable = !fault || fault->map_writable;
> > > > >   bool prefetch = !fault || fault->prefetch;
> > > > >   bool write_fault = fault && fault->write;
> > > > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> > > >
> > > > Just wonder, what if a non-refcounted page is prefetched?  Or is it 
> > > > possible in
> > > > practice?
> > >
> > > Prefetching is still done via gfn_to_page_many_atomic, which sets
> > > FOLL_GET. That's fixable, but it's not something this series currently
> > > does.
> >
> > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with 
> > this
> > hunk.  kvm_set_page_{dirty, accessed} won't be called as expected for 
> > prefetched
> > spte.  If I read the patch correctly, REFCOUNTED bit in SPTE should 
> > represent
> > whether the corresponding page is ref-countable or not, right?
> >
> > Because direct_pte_prefetch_many() is for legacy KVM MMU and 
> > FNAME(prefetch_pte)
> > is shadow paging, we need to test it with legacy KVM MMU or shadow paging 
> > to hit
> > the issue, though.
> >
> 
> direct_pte_prefetch_many and prefetch_gpte both pass NULL for the
> fault parameter, so is_refcounted will evaluate to true. So the spte's
> refcounted bit will get set in that case.

Oops, my bad.  My point is "unconditionally".  Is the bit always set for
non-refcountable pages?  Or are non-refcountable pages never prefetched?
-- 
Isaku Yamahata 


Re: [PATCH v7 5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn

2023-07-06 Thread Isaku Yamahata
On Thu, Jul 06, 2023 at 01:52:08PM +0900,
David Stevens  wrote:

> On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang  wrote:
> >
> > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > > From: David Stevens 
> > >
> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > > memory into the guest that is backed by un-refcounted struct pages - for
> > > example, higher order non-compound pages allocated by the amdgpu driver
> > > via ttm_pool_alloc_page.
> >
> > I guess you mean the tail pages of the higher order non-compound pages?
> > And as to the head page, it is said to be set to one coincidentally[*],
> > and shall not be considered as refcounted.  IIUC, refcount of this head
> > page will be increased and decreased soon in hva_to_pfn_remapped(), so
> > this may not be a problem(?). But treating this head page differently,
> > as a refcounted one (e.g., to set the A/D flags), is weird.
> >
> > Or maybe I missed some context, e.g., can the head page be allocated to
> > the guest at all?
> 
> Yes, this is to allow mapping the tail pages of higher order
> non-compound pages - I should have been more precise in my wording.
> The head pages can already be mapped into the guest.
> 
> Treating the head and tail pages differently would require changing how
> KVM behaves in a situation it supports today (rather than just adding
> support for an unsupported situation). Currently, without this series,
> KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the
> guest. When that happens, KVM sets the A/D flags. I'm not sure whether
> that's actually valid behavior, nor do I know whether anyone actually
> cares about it. But it's what KVM does today, and I would shy away
> from modifying that behavior without good reason.
> 
> > >
> > > The bulk of this change is tracking the is_refcounted_page flag so that
> > > non-refcounted pages don't trigger page_count() == 0 warnings. This is
> > > done by storing the flag in an unused bit in the sptes.
> >
> > Also, maybe we should mention this only works on x86-64.
> >
> > >
> > > Signed-off-by: David Stevens 
> > > ---
> > >  arch/x86/kvm/mmu/mmu.c  | 44 +
> > >  arch/x86/kvm/mmu/mmu_internal.h |  1 +
> > >  arch/x86/kvm/mmu/paging_tmpl.h  |  9 ---
> > >  arch/x86/kvm/mmu/spte.c |  4 ++-
> > >  arch/x86/kvm/mmu/spte.h | 12 -
> > >  arch/x86/kvm/mmu/tdp_mmu.c  | 22 ++---
> > >  6 files changed, 62 insertions(+), 30 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index e44ab512c3a1..b1607e314497 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> >
> > ...
> >
> > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, 
> > > struct kvm_memory_slot *slot,
> > >   bool host_writable = !fault || fault->map_writable;
> > >   bool prefetch = !fault || fault->prefetch;
> > >   bool write_fault = fault && fault->write;
> > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> >
> > Just wonder, what if a non-refcounted page is prefetched?  Or is it 
> > possible in
> > practice?
> 
> Prefetching is still done via gfn_to_page_many_atomic, which sets
> FOLL_GET. That's fixable, but it's not something this series currently
> does.

So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this
hunk.  kvm_set_page_{dirty, accessed} won't be called as expected for prefetched
spte.  If I read the patch correctly, REFCOUNTED bit in SPTE should represent
whether the corresponding page is ref-countable or not, right?

Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte)
is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit
the issue, though.

Thanks,
-- 
Isaku Yamahata 


Re: [PATCH v7 5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn

2023-07-05 Thread Isaku Yamahata
On Tue, Jul 04, 2023 at 04:50:50PM +0900,
David Stevens  wrote:

> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index cf2c6426a6fc..46c681dc45e6 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page 
> *sp,
>  const struct kvm_memory_slot *slot,
>  unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
>  u64 old_spte, bool prefetch, bool can_unsync,
> -bool host_writable, u64 *new_spte)
> +bool host_writable, bool is_refcounted, u64 *new_spte)
>  {
>   int level = sp->role.level;
>   u64 spte = SPTE_MMU_PRESENT_MASK;
> @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page 
> *sp,
>  
>   if (level > PG_LEVEL_4K)
>   spte |= PT_PAGE_SIZE_MASK;
> + else if (is_refcounted)
> + spte |= SPTE_MMU_PAGE_REFCOUNTED;

Is REFCOUNTED for 4K pages only?  What guarantees that a large page doesn't have
FOLL_GET?  Or can we set the bit for large pages?


>  
>   if (shadow_memtype_mask)
>   spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,

-- 
Isaku Yamahata 


Re: [PATCH v7 4/8] KVM: x86/mmu: Migrate to __kvm_follow_pfn

2023-07-05 Thread Isaku Yamahata
On Tue, Jul 04, 2023 at 04:50:49PM +0900,
David Stevens  wrote:

> From: David Stevens 
> 
> Migrate from __gfn_to_pfn_memslot to __kvm_follow_pfn.
> 
> Signed-off-by: David Stevens 
> ---
>  arch/x86/kvm/mmu/mmu.c | 35 +--
>  1 file changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index ec169f5c7dce..e44ab512c3a1 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4296,7 +4296,12 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, 
> struct kvm_async_pf *work)
>  static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault 
> *fault)
>  {
>   struct kvm_memory_slot *slot = fault->slot;
> - bool async;
> + struct kvm_follow_pfn foll = {
> + .slot = slot,
> + .gfn = fault->gfn,
> + .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0),
> + .allow_write_mapping = true,
> + };
>  
>   /*
>* Retry the page fault if the gfn hit a memslot that is being deleted
> @@ -4325,12 +4330,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, 
> struct kvm_page_fault *fault
>   return RET_PF_EMULATE;
>   }
>  
> - async = false;
> - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, 
> &async,
> -   fault->write, &fault->map_writable,
> -   &fault->hva);
> - if (!async)
> - return RET_PF_CONTINUE; /* *pfn has correct page already */
> + foll.flags |= FOLL_NOWAIT;
> + fault->pfn = __kvm_follow_pfn(&foll);
> +
> + if (!is_error_noslot_pfn(fault->pfn))

We have pfn in struct kvm_follow_pfn as output. Can we make __kvm_follow_pfn()
return int instead of kvm_pfn_t?  KVM_PFN_* seems widely used, though.
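Something like this sketch of the suggested shape (the names follow the
series, but the int-return convention here is only a suggestion):

/* Sketch only: report errors via the return value and treat foll->pfn
 * purely as an output, instead of overloading the returned kvm_pfn_t
 * with KVM_PFN_ERR_* values. */
int __kvm_follow_pfn(struct kvm_follow_pfn *foll);

static int example_caller(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
{
	int r = __kvm_follow_pfn(foll);

	if (r)
		return r;	/* e.g. -EFAULT, or -EAGAIN for NEEDS_IO */
	*pfn = foll->pfn;	/* valid only on success */
	return 0;
}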


> + goto success;
> +
> + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
> + return RET_PF_CONTINUE;
>  
>   if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
>   trace_kvm_try_async_get_page(fault->addr, fault->gfn);
> @@ -4348,9 +4355,17 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, 
> struct kvm_page_fault *fault
>* to wait for IO.  Note, gup always bails if it is unable to quickly
>* get a page and a fatal signal, i.e. SIGKILL, is pending.
>*/
> - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
> -   fault->write, &fault->map_writable,
> -   &fault->hva);
> + foll.flags |= FOLL_INTERRUPTIBLE;
> + foll.flags &= ~FOLL_NOWAIT;
> + fault->pfn = __kvm_follow_pfn(&foll);
> +
> + if (!is_error_noslot_pfn(fault->pfn))
> + goto success;
> +
> + return RET_PF_CONTINUE;
> +success:
> + fault->hva = foll.hva;
> + fault->map_writable = foll.writable;
>   return RET_PF_CONTINUE;
>  }
>  
> -- 
> 2.41.0.255.g8b1d071c50-goog
> 

-- 
Isaku Yamahata 


Re: [PATCH v7 2/8] KVM: Introduce __kvm_follow_pfn function

2023-07-05 Thread Isaku Yamahata
> + struct kvm_follow_pfn foll = {
> + .slot = slot,
> + .gfn = gfn,
> + .flags = FOLL_WRITE,
> + .atomic = true,
> + };
> + return __kvm_follow_pfn(&foll);
>  }
>  EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>  
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 180f1a09e6ba..ed896aee5396 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -20,8 +20,7 @@
>  #define KVM_MMU_UNLOCK(kvm)  spin_unlock(&(kvm)->mmu_lock)
>  #endif /* KVM_HAVE_MMU_RWLOCK */
>  
> -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> -  bool *async, bool write_fault, bool *writable);
> +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll);
>  
>  #ifdef CONFIG_HAVE_KVM_PFNCACHE
>  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
> index 2d6aba677830..e3fefa753a51 100644
> --- a/virt/kvm/pfncache.c
> +++ b/virt/kvm/pfncache.c
> @@ -144,6 +144,12 @@ static kvm_pfn_t hva_to_pfn_retry(struct 
> gfn_to_pfn_cache *gpc)
>   kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
>   void *new_khva = NULL;
>   unsigned long mmu_seq;
> + struct kvm_follow_pfn foll = {
> + .slot = gpc->memslot,
> + .gfn = gpa_to_gfn(gpc->gpa),
> + .flags = FOLL_WRITE,
> + .hva = gpc->uhva,
> + };
>  
> 	lockdep_assert_held(&gpc->refresh_lock);
>  
> @@ -183,7 +189,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache 
> *gpc)
>   }
>  
>   /* We always request a writeable mapping */
> - new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
> 	new_pfn = hva_to_pfn(&foll);
>   if (is_error_noslot_pfn(new_pfn))
>   goto out_error;
>  
> -- 
> 2.41.0.255.g8b1d071c50-goog
> 

-- 
Isaku Yamahata 


Re: [PATCH v2 39/50] KVM: x86: Move CPU compat checks hook to kvm_x86_ops (from kvm_x86_init_ops)

2022-12-05 Thread Isaku Yamahata
> +static bool kvm_is_vmx_supported(void)
>  {
>   if (!cpu_has_vmx()) {
>   pr_err("CPU doesn't support VMX\n");
> @@ -2726,7 +2725,7 @@ static bool __init kvm_is_vmx_supported(void)
>   return true;
>  }
>  
> -static int __init vmx_check_processor_compat(void)
> +static int vmx_check_processor_compat(void)
>  {
>   struct vmcs_config vmcs_conf;
>   struct vmx_capability vmx_cap;
> @@ -8104,6 +8103,8 @@ static void vmx_vm_destroy(struct kvm *kvm)
>  static struct kvm_x86_ops vmx_x86_ops __initdata = {
>   .name = KBUILD_MODNAME,
>  
> + .check_processor_compatibility = vmx_check_processor_compat,
> +
>   .hardware_unsetup = vmx_hardware_unsetup,
>  
>   .hardware_enable = vmx_hardware_enable,
> @@ -8501,7 +8502,6 @@ static __init int hardware_setup(void)
>  }
>  
>  static struct kvm_x86_init_ops vmx_init_ops __initdata = {
> - .check_processor_compatibility = vmx_check_processor_compat,
>   .hardware_setup = hardware_setup,
>   .handle_intel_pt_intr = NULL,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5551f3552f08..ee9af412ffd4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9279,12 +9279,7 @@ static inline void kvm_ops_update(struct 
> kvm_x86_init_ops *ops)
>   kvm_pmu_ops_update(ops->pmu_ops);
>  }
>  
> -struct kvm_cpu_compat_check {
> - struct kvm_x86_init_ops *ops;
> - int *ret;
> -};
> -
> -static int kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops 
> *ops)
> +static int kvm_x86_check_processor_compatibility(void)
>  {
>   struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
>  
> @@ -9294,19 +9289,16 @@ static int 
> kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops *ops)
>   __cr4_reserved_bits(cpu_has, &boot_cpu_data))
>   return -EIO;
>  
> - return ops->check_processor_compatibility();
> + return static_call(kvm_x86_check_processor_compatibility)();
>  }
>  
> -static void kvm_x86_check_cpu_compat(void *data)
> +static void kvm_x86_check_cpu_compat(void *ret)
>  {
> - struct kvm_cpu_compat_check *c = data;
> -
> - *c->ret = kvm_x86_check_processor_compatibility(c->ops);
> + *(int *)ret = kvm_x86_check_processor_compatibility();
>  }
>  
>  static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>  {
> - struct kvm_cpu_compat_check c;
>   u64 host_pat;
>   int r, cpu;
>  
> @@ -9377,12 +9369,12 @@ static int __kvm_x86_vendor_init(struct 
> kvm_x86_init_ops *ops)
>   if (r != 0)
>   goto out_mmu_exit;
>  
> - c.ret = &r;
> - c.ops = ops;
> + kvm_ops_update(ops);
> +
>   for_each_online_cpu(cpu) {
> - smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &c, 1);
> + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);

Ah, here the pointer makes sense. So I retract my comment on the previous patch.

Thanks,

>   if (r < 0)
> - goto out_hardware_unsetup;
> + goto out_unwind_ops;
>   }
>  
>   /*
> @@ -9390,8 +9382,6 @@ static int __kvm_x86_vendor_init(struct 
> kvm_x86_init_ops *ops)
>* absolutely necessary, as most operations from this point forward
>* require unwinding.
>*/
> - kvm_ops_update(ops);
> -
>   kvm_timer_init();
>  
>   if (pi_inject_timer == -1)
> @@ -9427,8 +9417,9 @@ static int __kvm_x86_vendor_init(struct 
> kvm_x86_init_ops *ops)
>   kvm_init_msr_list();
>   return 0;
>  
> -out_hardware_unsetup:
> - ops->runtime_ops->hardware_unsetup();
> +out_unwind_ops:
> + kvm_x86_ops.hardware_enable = NULL;
> + static_call(kvm_x86_hardware_unsetup)();
>  out_mmu_exit:
>   kvm_mmu_vendor_module_exit();
>  out_free_percpu:
> -- 
> 2.38.1.584.g0f3c55d4c2-goog
> 

-- 
Isaku Yamahata 


Re: [PATCH v2 31/50] KVM: x86: Do CPU compatibility checks in x86 code

2022-12-05 Thread Isaku Yamahata
On Wed, Nov 30, 2022 at 11:09:15PM +,
Sean Christopherson  wrote:

> Move the CPU compatibility checks to pure x86 code, i.e. drop x86's use
> of the common kvm_x86_check_cpu_compat() arch hook.  x86 is the only
> architecture that "needs" to do per-CPU compatibility checks, moving
> the logic to x86 will allow dropping the common code, and will also
> give x86 more control over when/how the compatibility checks are
> performed, e.g. TDX will need to enable hardware (do VMXON) in order to
> perform compatibility checks.
> 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/svm/svm.c |  2 +-
>  arch/x86/kvm/vmx/vmx.c |  2 +-
>  arch/x86/kvm/x86.c | 49 --
>  3 files changed, 40 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 19e81a99c58f..d7ea1c1175c2 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -5103,7 +5103,7 @@ static int __init svm_init(void)
>* Common KVM initialization _must_ come last, after this, /dev/kvm is
>* exposed to userspace!
>*/
> - r = kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
> + r = kvm_init(NULL, sizeof(struct vcpu_svm),
>__alignof__(struct vcpu_svm), THIS_MODULE);
>   if (r)
>   goto err_kvm_init;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 654d81f781da..8deb1bd60c10 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8592,7 +8592,7 @@ static int __init vmx_init(void)
>* Common KVM initialization _must_ come last, after this, /dev/kvm is
>* exposed to userspace!
>*/
> - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
> + r = kvm_init(NULL, sizeof(struct vcpu_vmx),
>__alignof__(struct vcpu_vmx), THIS_MODULE);
>   if (r)
>   goto err_kvm_init;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 66f16458aa97..3571bc968cf8 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9277,10 +9277,36 @@ static inline void kvm_ops_update(struct 
> kvm_x86_init_ops *ops)
>   kvm_pmu_ops_update(ops->pmu_ops);
>  }
>  
> +struct kvm_cpu_compat_check {
> + struct kvm_x86_init_ops *ops;
> + int *ret;

minor nitpick: just int ret. I don't see the necessity of the pointer.
Anyway overall it looks good to me.

Reviewed-by: Isaku Yamahata 

> +};
> +
> +static int kvm_x86_check_processor_compatibility(struct kvm_x86_init_ops 
> *ops)
> +{
> + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
> +
> + WARN_ON(!irqs_disabled());
> +
> + if (__cr4_reserved_bits(cpu_has, c) !=
> + __cr4_reserved_bits(cpu_has, &boot_cpu_data))
> + return -EIO;
> +
> + return ops->check_processor_compatibility();
> +}
> +
> +static void kvm_x86_check_cpu_compat(void *data)
> +{
> + struct kvm_cpu_compat_check *c = data;
> +
> + *c->ret = kvm_x86_check_processor_compatibility(c->ops);
> +}
> +
>  static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
>  {
> + struct kvm_cpu_compat_check c;
>   u64 host_pat;
> - int r;
> + int r, cpu;
>  
>   if (kvm_x86_ops.hardware_enable) {
>   pr_err("kvm: already loaded vendor module '%s'\n", 
> kvm_x86_ops.name);
> @@ -9360,6 +9386,14 @@ static int __kvm_x86_vendor_init(struct 
> kvm_x86_init_ops *ops)
>   if (r != 0)
>   goto out_mmu_exit;
>  
> + c.ret = &r;
> + c.ops = ops;
> + for_each_online_cpu(cpu) {
> + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &c, 1);
> + if (r < 0)

Here it can be "c.ret < 0".

> + goto out_hardware_unsetup;
> + }
> +
>   /*
>* Point of no return!  DO NOT add error paths below this point unless
>* absolutely necessary, as most operations from this point forward
> @@ -9402,6 +9436,8 @@ static int __kvm_x86_vendor_init(struct 
> kvm_x86_init_ops *ops)
>   kvm_init_msr_list();
>   return 0;
>  
> +out_hardware_unsetup:
> + ops->runtime_ops->hardware_unsetup();
>  out_mmu_exit:
>   kvm_mmu_vendor_module_exit();
>  out_free_percpu:
> @@ -12037,16 +12073,7 @@ void kvm_arch_hardware_disable(void)
>  
>  int kvm_arch_check_processor_compat(void *opaque)
>  {
> - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
> -     struct kvm_x86_init_ops *ops = opaque;
> -
> - WARN_ON(!irqs_disabled());
> -
> - if (__cr4_reserved_bits(cpu_has, c) !=
> - __cr4_reserved_bits(cpu_has, &boot_cpu_data))
> - return -EIO;
> -
> - return ops->check_processor_compatibility();
> + return 0;
>  }
>  
>  bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
> -- 
> 2.38.1.584.g0f3c55d4c2-goog
> 

-- 
Isaku Yamahata 


Re: [PATCH 00/44] KVM: Rework kvm_init() and hardware enabling

2022-11-07 Thread Isaku Yamahata
On Tue, Nov 08, 2022 at 01:09:27AM +,
"Huang, Kai"  wrote:

> On Mon, 2022-11-07 at 13:46 -0800, Isaku Yamahata wrote:
> > > On Fri, Nov 04, 2022, Isaku Yamahata wrote:
> > > > Thanks for the patch series. I tested the rebased TDX KVM patch series
> > > > and it worked.
> > > > Since cpu offline needs to be rejected in some cases (to keep at least
> > > > one cpu on a package), an arch hook for cpu offline is needed.
> > > 
> > > I hate to bring this up because I doubt there's a real use case for 
> > > SUSPEND
> > > with
> > > TDX, but the CPU offline path isn't just for true offlining of CPUs.  When
> > > the
> > > system enters SUSPEND, only the initiating CPU goes through
> > > kvm_suspend()+kvm_resume(),
> > > all responding CPUs go through CPU offline+online.  I.e. disallowing all
> > > CPUs from
> > > going "offline" will prevent suspending the system.
> > 
> > The current TDX KVM implementation disallows CPU package from offline only
> > when
> > TDs are running.  If no TD is running, CPU offline is allowed.  So before
> > SUSPEND, TDs need to be killed via systemd or something.  After killing TDs,
> > the
> > system can enter into SUSPEND state.
> 
> This seems not correct.  You need one cpu in each package to be online in
> order to create a TD as well, as TDH.MNG.KEY.CONFIG needs to be called on
> all packages, correct?

That's correct. In that case, TD creation fails.  TD creation checks that at
least one cpu is online in each CPU package; if not, it returns an error.
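For illustration, a sketch of such a check (the helper name and the bitmap
approach are hypothetical, not the actual TDX patches):

#include <linux/bits.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

/* Sketch: succeed only if every present CPU package has at least one
 * online CPU, so that per-package operations such as TDH.MNG.KEY.CONFIG
 * can be issued everywhere.  Assumes package ids are small enough to
 * index a single word. */
static bool tdx_all_packages_have_online_cpu(void)
{
	unsigned long present_pkgs = 0, online_pkgs = 0;
	int cpu;

	for_each_present_cpu(cpu)
		present_pkgs |= BIT(topology_physical_package_id(cpu));
	for_each_online_cpu(cpu)
		online_pkgs |= BIT(topology_physical_package_id(cpu));

	return !(present_pkgs & ~online_pkgs);
}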
-- 
Isaku Yamahata 


Re: [PATCH 00/44] KVM: Rework kvm_init() and hardware enabling

2022-11-07 Thread Isaku Yamahata
On Fri, Nov 04, 2022 at 08:27:14PM +,
Sean Christopherson  wrote:

> On Fri, Nov 04, 2022, Isaku Yamahata wrote:
> > Thanks for the patch series. I tested the rebased TDX KVM patch series
> > and it worked.
> > Since cpu offline needs to be rejected in some cases (to keep at least
> > one cpu on a package), an arch hook for cpu offline is needed.
> 
> I hate to bring this up because I doubt there's a real use case for SUSPEND 
> with
> TDX, but the CPU offline path isn't just for true offlining of CPUs.  When the
> system enters SUSPEND, only the initiating CPU goes through 
> kvm_suspend()+kvm_resume(),
> all responding CPUs go through CPU offline+online.  I.e. disallowing all CPUs 
> from
> going "offline" will prevent suspending the system.

The current TDX KVM implementation disallows CPU package from offline only when
TDs are running.  If no TD is running, CPU offline is allowed.  So before
SUSPEND, TDs need to be killed via systemd or something.  After killing TDs, the
system can enter into SUSPEND state.


> I don't see anything in the TDX series or the specs that suggests 
> suspend+resume
> is disallowed when TDX is enabled, so blocking that seems just as wrong as
> preventing software from soft-offlining CPUs.

When it comes to SUSPEND, it means suspend-to-idle, ACPI S1, S3, or S4.
suspend-to-idle doesn't require CPU offline.

Although CPU related spec doesn't mention about S3, the ACPI spec says

  7.4.2.2 System _S1 State (Sleeping with Processor Context Maintained)
  The processor-complex context is maintained.

  7.4.2.4 System _S3 State or 7.4.2.5 System _S4 State
  The processor-complex context is not maintained.

It's safe to say the processor context related to TDX is complex, I think.
Let me summarize the situation. What do you think?

- While no TD running:
  No additional limitation on CPU offline.

- On TD creation:
  If any whole cpu package is software-offlined, TD creation fails.
  Alternative: forcibly online necessary CPUs, create TD, and offline CPUs

- TD running:
  Although it's not required to keep all CPU packages online, keep CPU
  packages from offlining so that TD destruction remains possible.

- TD destruction:
  If any whole cpu package is software-offlined, TD destruction fails.
  The current implementation prevents any cpu package from offlining while
  a TD is running.
  Alternative:
  - forcibly online the necessary CPUs, destroy the TD, offline the CPUs
    again, and allow the CPU package to go offline
  - Stash TDX resources somewhere.  When cpu packages are onlined, release
    those resources.

- On SUSPEND:
  TODO: Allow CPU offline if S1 is requested.
  - suspend-to-idle: nothing to do because cpu offline isn't required
  - ACPI S1: need to allow CPUs to go offline.  This can be implemented by
    checking whether suspend_state_t pm_suspend_target_state is
    PM_SUSPEND_STANDBY, as in the sketch below.
  - ACPI S3/S4: refuse cpu offline.  The system needs to kill all TDs before
    starting the SUSPEND process.  This is what is implemented.
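As a sketch of that S1 special case (kvm_arch_offline_cpu() is the hook
proposed earlier in this thread; tdx_any_td_running() is a placeholder, not
a real helper):

#include <linux/suspend.h>

/* Sketch only: allow CPU offline during suspend-to-standby (S1), where
 * the processor-complex context is maintained, but refuse it while any
 * TD is running. */
int kvm_arch_offline_cpu(unsigned int cpu)
{
	if (pm_suspend_target_state == PM_SUSPEND_STANDBY)
		return 0;

	if (tdx_any_td_running())
		return -EBUSY;

	return 0;
}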

Thanks,
-- 
Isaku Yamahata 


Re: [PATCH 36/44] KVM: x86: Do compatibility checks when onlining CPU

2022-11-04 Thread Isaku Yamahata
On Thu, Nov 03, 2022 at 10:34:10PM +,
Sean Christopherson  wrote:

> On Thu, Nov 03, 2022, Isaku Yamahata wrote:
> > On Wed, Nov 02, 2022 at 11:19:03PM +,
> > Sean Christopherson  wrote:
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index f223c845ed6e..c99222b71fcc 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1666,7 +1666,7 @@ struct kvm_x86_nested_ops {
> > >  };
> > >  
> > >  struct kvm_x86_init_ops {
> > > - int (*check_processor_compatibility)(void);
> > > + int (*check_processor_compatibility)(int cpu);
> > 
> > Is this cpu argument used only for error messages, to include the cpu
> > number while avoiding repeated raw_smp_processor_id() calls in pr_err()?
> 
> Yep.
> 
> > The actual check is done on the current executing cpu.
> > 
> > If cpu != raw_smp_processor_id(), cpu is wrong. Although the function is
> > called in non-preemptive context, it's a bit confusing. So I'm voting to
> > remove it and to use raw_smp_processor_id() directly.
> 
> What if I rename the param is this_cpu?  I 100% agree the argument is 
> confusing
> as-is, but forcing all the helpers to manually grab the cpu is quite annoying.

Makes sense. Let's settle it with this_cpu.
-- 
Isaku Yamahata 


Re: [PATCH 00/44] KVM: Rework kvm_init() and hardware enabling

2022-11-04 Thread Isaku Yamahata
static void hardware_disable_nolock(void *junk)
{
	...
	__this_cpu_write(hardware_enabled, false);
}
 
+__weak int kvm_arch_offline_cpu(unsigned int cpu)
+{
+   return 0;
+}
+
 static int kvm_offline_cpu(unsigned int cpu)
 {
+   int r = 0;
+
mutex_lock(&kvm_lock);
-   if (kvm_usage_count) {
+   r = kvm_arch_offline_cpu(cpu);
+   if (!r && kvm_usage_count) {
preempt_disable();
hardware_disable_nolock(NULL);
preempt_enable();
    }
mutex_unlock(&kvm_lock);
-   return 0;
+   return r;
 }
 
 static void hardware_disable_all_nolock(void)

-- 
Isaku Yamahata 


Re: [PATCH 36/44] KVM: x86: Do compatibility checks when onlining CPU

2022-11-03 Thread Isaku Yamahata
On Wed, Nov 02, 2022 at 11:19:03PM +,
Sean Christopherson  wrote:

> From: Chao Gao 
> 
> Do compatibility checks when enabling hardware to effectively add
> compatibility checks when onlining a CPU.  Abort enabling, i.e. the
> online process, if the (hotplugged) CPU is incompatible with the known
> good setup.
> 
> At init time, KVM does compatibility checks to ensure that all online
> CPUs support hardware virtualization and a common set of features. But
> KVM uses hotplugged CPUs without such compatibility checks. On Intel
> CPUs, this leads to #GP if the hotplugged CPU doesn't support VMX, or
> VM-Entry failure if the hotplugged CPU doesn't support all features
> enabled by KVM.
> 
> Note, this is little more than a NOP on SVM, as SVM already checks for
> full SVM support during hardware enabling.
> 
> Opportunistically add a pr_err() if setup_vmcs_config() fails, and
> tweak all error messages to output which CPU failed.
> 
> Signed-off-by: Chao Gao 
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/svm/svm.c  | 20 +++-
>  arch/x86/kvm/vmx/vmx.c  | 33 +++--
>  arch/x86/kvm/x86.c  |  5 +++--
>  4 files changed, 34 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f223c845ed6e..c99222b71fcc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1666,7 +1666,7 @@ struct kvm_x86_nested_ops {
>  };
>  
>  struct kvm_x86_init_ops {
> - int (*check_processor_compatibility)(void);
> + int (*check_processor_compatibility)(int cpu);

Is this cpu argument used only for error messages, to include the cpu number
while avoiding repeated raw_smp_processor_id() calls in pr_err()?
The actual check is done on the current executing cpu.

If cpu != raw_smp_processor_id(), cpu is wrong. Although the function is called
in non-preemptive context, it's a bit confusing. So I'm voting to remove it and
to use raw_smp_processor_id() directly.

Thanks,
-- 
Isaku Yamahata 


Re: [PATCH v5 27/30] RFC: KVM: powerpc: Move processor compatibility check to hardware setup

2022-09-26 Thread Isaku Yamahata
On Fri, Sep 23, 2022 at 04:58:41PM +1000,
Michael Ellerman  wrote:

> isaku.yamah...@intel.com writes:
> > From: Isaku Yamahata 
> >
> > Move processor compatibility check from kvm_arch_processor_compat() into
>   ^ 
>   kvm_arch_check_processor_compat()
> 
> > kvm_arch_hardware_setup().  The check does model name comparison with a
> > global variable, cur_cpu_spec.  There is no point to check it at run time
> > on all processors.
> 
> A key detail I had to look up is that both kvm_arch_hardware_setup() and
> kvm_arch_check_processor_compat() are called from kvm_init(), one after
> the other. But the latter is called on each CPU.
> 
> And because the powerpc implementation of kvm_arch_check_processor_compat()
> just checks a global, there's no need to call it on every CPU.
> 
> > kvmppc_core_check_processor_compat() checks the global variable.  There are
> > five implementations for it as follows.
> 
> There are three implementations not five.

Thanks. I'll update the commit message.

> >   arch/powerpc/include/asm/cputable.h: extern struct cpu_spec *cur_cpu_spec;
> >   arch/powerpc/kvm/book3s.c: return 0
> >   arch/powerpc/kvm/e500.c: strcmp(cur_cpu_spec->cpu_name, "e500v2")
> >   arch/powerpc/kvm/e500mc.c: strcmp(cur_cpu_spec->cpu_name, "e500mc")
> >  strcmp(cur_cpu_spec->cpu_name, "e5500")
> >  strcmp(cur_cpu_spec->cpu_name, "e6500")
> >
> > Suggested-by: Sean Christopherson 
> > Signed-off-by: Isaku Yamahata 
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: Fabiano Rosas 
> > ---
> >  arch/powerpc/kvm/powerpc.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> > index 7b56d6ccfdfb..31dc4f231e9d 100644
> > --- a/arch/powerpc/kvm/powerpc.c
> > +++ b/arch/powerpc/kvm/powerpc.c
> > @@ -444,12 +444,12 @@ int kvm_arch_hardware_enable(void)
> >  
> >  int kvm_arch_hardware_setup(void *opaque)
> >  {
> > -   return 0;
> > +   return kvmppc_core_check_processor_compat();
> >  }
> >  
> >  int kvm_arch_check_processor_compat(void)
> >  {
> > -   return kvmppc_core_check_processor_compat();
> > +   return 0;
> >  }
> 
> The actual change seems OK. I gave it a quick test boot and ran some
> VMs, everything seems to work as before.
> 
> Acked-by: Michael Ellerman  (powerpc)

Thanks so much for testing. I'll remove RFC.
-- 
Isaku Yamahata 


[PATCH v5 27/30] RFC: KVM: powerpc: Move processor compatibility check to hardware setup

2022-09-22 Thread isaku . yamahata
From: Isaku Yamahata 

Move processor compatibility check from kvm_arch_processor_compat() into
kvm_arch_hardware_setup().  The check does model name comparison with a
global variable, cur_cpu_spec.  There is no point to check it at run time
on all processors.

kvmppc_core_check_processor_compat() checks the global variable.  There are
five implementations for it as follows.

  arch/powerpc/include/asm/cputable.h: extern struct cpu_spec *cur_cpu_spec;
  arch/powerpc/kvm/book3s.c: return 0
  arch/powerpc/kvm/e500.c: strcmp(cur_cpu_spec->cpu_name, "e500v2")
  arch/powerpc/kvm/e500mc.c: strcmp(cur_cpu_spec->cpu_name, "e500mc")
 strcmp(cur_cpu_spec->cpu_name, "e5500")
 strcmp(cur_cpu_spec->cpu_name, "e6500")

Suggested-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Fabiano Rosas 
---
 arch/powerpc/kvm/powerpc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7b56d6ccfdfb..31dc4f231e9d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -444,12 +444,12 @@ int kvm_arch_hardware_enable(void)
 
 int kvm_arch_hardware_setup(void *opaque)
 {
-   return 0;
+   return kvmppc_core_check_processor_compat();
 }
 
 int kvm_arch_check_processor_compat(void)
 {
-   return kvmppc_core_check_processor_compat();
+   return 0;
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-- 
2.25.1



Re: [PATCH v4 23/26] RFC: KVM: powerpc: Move processor compatibility check to hardware setup

2022-09-09 Thread Isaku Yamahata
On Fri, Sep 09, 2022 at 05:55:14AM +,
Christophe Leroy  wrote:

> 
> 
> > On 09/09/2022 at 01:25, isaku.yamah...@intel.com wrote:
> > 
> > From: Isaku Yamahata 
> > 
> > Move processor compatibility check from kvm_arch_processor_compat() into
> > kvm_arch_hardware_setup().  The check does model name comparison with a
> > global variable, cur_cpu_spec.  There is no point to check it at run time
> > on all processors.
> > 
> > Suggested-by: Sean Christopherson 
> > Signed-off-by: Isaku Yamahata 
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: Fabiano Rosas 
> > ---
> >   arch/powerpc/kvm/powerpc.c | 13 +++--
> >   1 file changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> > index 7b56d6ccfdfb..7e3a6659f107 100644
> > --- a/arch/powerpc/kvm/powerpc.c
> > +++ b/arch/powerpc/kvm/powerpc.c
> > @@ -444,12 +444,21 @@ int kvm_arch_hardware_enable(void)
> > 
> >   int kvm_arch_hardware_setup(void *opaque)
> >   {
> > -   return 0;
> > +   /*
> > +* kvmppc_core_check_processor_compat() checks the global variable.
> > +* No point to check on all processors or at runtime.
> > +* arch/powerpc/kvm/book3s.c: return 0
> > +* arch/powerpc/kvm/e500.c: strcmp(cur_cpu_spec->cpu_name, "e500v2")
> > +* arch/powerpc/kvm/e500mc.c: strcmp(cur_cpu_spec->cpu_name, 
> > "e500mc")
> > +*strcmp(cur_cpu_spec->cpu_name, 
> > "e5500")
> > +*strcmp(cur_cpu_spec->cpu_name, 
> > "e6500")
> > +*/
> 
> This explanation shouldn't be in the code. The content of other files may
> change in the future, the files may be renamed or deleted, and new files may
> be added. And there is no added value in that comment.
> 
> That detailed explanation should go in the commit message.

Ok, will move the comment into the commit message.
-- 
Isaku Yamahata 


[PATCH v4 23/26] RFC: KVM: powerpc: Move processor compatibility check to hardware setup

2022-09-08 Thread isaku . yamahata
From: Isaku Yamahata 

Move processor compatibility check from kvm_arch_processor_compat() into
kvm_arch_hardware_setup().  The check does model name comparison with a
global variable, cur_cpu_spec.  There is no point to check it at run time
on all processors.

Suggested-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Fabiano Rosas 
---
 arch/powerpc/kvm/powerpc.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7b56d6ccfdfb..7e3a6659f107 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -444,12 +444,21 @@ int kvm_arch_hardware_enable(void)
 
 int kvm_arch_hardware_setup(void *opaque)
 {
-   return 0;
+   /*
+* kvmppc_core_check_processor_compat() checks the global variable.
+* No point to check on all processors or at runtime.
+* arch/powerpc/kvm/book3s.c: return 0
+* arch/powerpc/kvm/e500.c: strcmp(cur_cpu_spec->cpu_name, "e500v2")
+* arch/powerpc/kvm/e500mc.c: strcmp(cur_cpu_spec->cpu_name, "e500mc")
+*strcmp(cur_cpu_spec->cpu_name, "e5500")
+*strcmp(cur_cpu_spec->cpu_name, "e6500")
+*/
+   return kvmppc_core_check_processor_compat();
 }
 
 int kvm_arch_check_processor_compat(void)
 {
-   return kvmppc_core_check_processor_compat();
+   return 0;
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-- 
2.25.1