[RFC PATCH 06/19] mm: hugetlb: Provide hugetlb_filemap_add_folio()

2023-06-06 Thread Ackerley Tng
hstate_inode() is hugetlbfs-specific, limiting
hugetlb_add_to_page_cache() to hugetlbfs.

hugetlb_filemap_add_folio() allows hstate to be specified and further
separates hugetlb from hugetlbfs.
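
For reference, a minimal sketch (not part of this patch) of how a caller
outside hugetlbfs might use the new helper, passing an explicit hstate
instead of deriving it via hstate_inode(); mapping, h, folio and idx are
illustrative names supplied by that caller:

	/*
	 * Sketch only: add a hugetlb folio to a non-hugetlbfs page cache.
	 * The caller chooses the hstate; nothing is read from the inode.
	 */
	err = hugetlb_filemap_add_folio(mapping, h, folio, idx);
	if (err) {
		/* folio was not added; caller keeps its reference */
		folio_put(folio);
		return err;
	}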

Signed-off-by: Ackerley Tng 
---
 include/linux/hugetlb.h |  2 ++
 mm/hugetlb.c| 13 ++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 14df89d1642c..7d49048c5a2a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -756,6 +756,8 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
					nodemask_t *nmask, gfp_t gfp_mask);
 struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
				      unsigned long address);
+int hugetlb_filemap_add_folio(struct address_space *mapping, struct hstate *h,
+			      struct folio *folio, pgoff_t idx);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
			      pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 99ab4bbdb2ce..d16c6417b90f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5665,11 +5665,10 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
	return present;
 }
 
-int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
-			      pgoff_t idx)
+int hugetlb_filemap_add_folio(struct address_space *mapping, struct hstate *h,
+			      struct folio *folio, pgoff_t idx)
 {
struct inode *inode = mapping->host;
-   struct hstate *h = hstate_inode(inode);
int err;
 
__folio_set_locked(folio);
@@ -5693,6 +5692,14 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
	return 0;
 }
 
+int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
+			      pgoff_t idx)
+{
+   struct hstate *h = hstate_inode(mapping->host);
+
+   return hugetlb_filemap_add_folio(mapping, h, folio, idx);
+}
+
 static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
  struct address_space *mapping,
  pgoff_t idx,
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 04/19] mm: hugetlb: Decouple hstate, subpool from inode

2023-06-06 Thread Ackerley Tng
hstate and subpool being retrievable from inode via hstate_inode() and
subpool_inode() respectively is a hugetlbfs concept.

hugetlb should be agnostic of hugetlbfs and hugetlb accounting
functions should accept hstate (required) and subpool (can be NULL)
independently of inode.

inode is still a parameter for these accounting functions since the
inode's block counts need to be updated during accounting.

The inode's resv_map will also still need to be updated if not NULL.
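
A sketch of the resulting calling convention (not part of the patch; a
hypothetical caller that tracks hstate and subpool itself, where the
subpool may be NULL):

	static void example_unreserve(struct hstate *h,
				      struct hugepage_subpool *spool,
				      struct inode *inode, pgoff_t index)
	{
		/* h is required; spool may be NULL after this patch */
		if (unlikely(hugetlb_unreserve_pages(h, spool, inode,
						     index, index + 1, 1)))
			hugetlb_fix_reserve_counts(h, spool);
	}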

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c| 59 -
 include/linux/hugetlb.h | 32 +-
 mm/hugetlb.c| 49 --
 3 files changed, 95 insertions(+), 45 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4f25df31ae80..0fc49b6252e4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
 
ret = -ENOMEM;
-   if (!hugetlb_reserve_pages(inode,
+   if (!hugetlb_reserve_pages(h, subpool_inode(inode), inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vma->vm_flags))
@@ -550,14 +550,18 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
}
 }
 
-/*
+/**
+ * Remove folio from page_cache and userspace mappings. Also unreserves pages,
+ * updating hstate @h, subpool @spool (if not NULL), @inode block info and
+ * @inode's resv_map (if not NULL).
+ *
  * Called with hugetlb fault mutex held.
  * Returns true if page was actually removed, false otherwise.
  */
-static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
-   struct address_space *mapping,
-   struct folio *folio, pgoff_t index,
-   bool truncate_op)
+static bool remove_mapping_single_folio(
+   struct address_space *mapping, struct folio *folio, pgoff_t index,
+   struct hstate *h, struct hugepage_subpool *spool, struct inode *inode,
+   bool truncate_op)
 {
bool ret = false;
 
@@ -582,9 +586,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
hugetlb_delete_from_page_cache(folio);
ret = true;
if (!truncate_op) {
-		if (unlikely(hugetlb_unreserve_pages(inode, index,
-						index + 1, 1)))
-			hugetlb_fix_reserve_counts(inode);
+		if (unlikely(hugetlb_unreserve_pages(h, spool, inode, index, index + 1, 1)))
+			hugetlb_fix_reserve_counts(h, spool);
}
 
folio_unlock(folio);
@@ -592,7 +595,14 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
 }
 
 /*
- * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * Remove hugetlb page mappings from @mapping between offsets [@lstart, @lend).
+ * Also updates reservations in:
+ * + hstate @h (required)
+ * + subpool @spool (can be NULL)
+ * + resv_map in @inode (can be NULL)
+ * and updates blocks in @inode (required)
+ *
+ * remove_mapping_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
  *
  * truncation is indicated by end of range being LLONG_MAX
@@ -611,10 +621,10 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
-void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
+void remove_mapping_hugepages(struct address_space *mapping,
+ struct hstate *h, struct hugepage_subpool *spool,
+ struct inode *inode, loff_t lstart, loff_t lend)
 {
-   struct hstate *h = hstate_inode(inode);
-	struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
struct folio_batch fbatch;
@@ -636,8 +646,8 @@ void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
/*
 * Remove folio that was part of folio_batch.
 */
-		if (remove_inode_single_folio(h, inode, mapping, folio,
-						index, truncate_op))
+		if (remove_mapping_single_folio(mapping, folio, index,
+						h, spool, inode, truncate_op))
freed++;
 
   

[RFC PATCH 08/19] mm: hugetlb: Refactor restore_reserve_on_error

2023-06-06 Thread Ackerley Tng
Refactor restore_reserve_on_error to allow resv_map to be passed
in. vma_resv_map() assumes the use of hugetlbfs in the way it
retrieves the resv_map from the vma and inode.

Introduce restore_reserve_on_error_vma() which retains original
functionality to simplify refactoring for now.
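
The split leaves two entry points; a sketch of the intended usage
(variable names illustrative, not part of the patch):

	/* VMA-based callers keep the original behaviour via the wrapper: */
	restore_reserve_on_error_vma(h, vma, address, folio);

	/*
	 * Callers that track the resv_map themselves call the refactored
	 * function directly, e.g. for a shared mapping (may_share = true):
	 */
	restore_reserve_on_error(resv, resv_index, true, folio);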

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c|  2 +-
 include/linux/hugetlb.h |  6 --
 mm/hugetlb.c| 37 +
 3 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0fc49b6252e4..44e6ee9a856d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -868,7 +868,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
__folio_mark_uptodate(folio);
error = hugetlb_add_to_page_cache(folio, mapping, index);
if (unlikely(error)) {
-			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
+			restore_reserve_on_error_vma(h, &pseudo_vma, addr, folio);
 			folio_put(folio);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d49048c5a2a..02a2766d89a4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -760,8 +760,10 @@ int hugetlb_filemap_add_folio(struct address_space *mapping, struct hstate *h,
			      struct folio *folio, pgoff_t idx);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
			      pgoff_t idx);
-void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
-   unsigned long address, struct folio *folio);
+void restore_reserve_on_error(struct resv_map *resv, pgoff_t resv_index,
+ bool may_share, struct folio *folio);
+void restore_reserve_on_error_vma(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct folio *folio);
 
 /* arch callback */
 int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d943f83d15a9..4675f9efeba4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2805,15 +2805,10 @@ static long resv_map_del_reservation(struct resv_map *resv, pgoff_t resv_index,
  *
  * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
  */
-void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
-   unsigned long address, struct folio *folio)
+void restore_reserve_on_error(struct resv_map *resv, pgoff_t resv_index,
+ bool may_share, struct folio *folio)
 {
-   long rc;
-   struct resv_map *resv = vma_resv_map(vma);
-   pgoff_t resv_index = vma_hugecache_offset(h, vma, address);
-   bool may_share = vma->vm_flags & VM_MAYSHARE;
-
-   rc = resv_map_needs_reservation(resv, resv_index, may_share);
+   long rc = resv_map_needs_reservation(resv, resv_index, may_share);
 
if (folio_test_hugetlb_restore_reserve(folio)) {
if (unlikely(rc < 0))
@@ -2865,7 +2860,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
 * For shared mappings, no entry in the map indicates
 * no reservation.  We are done.
 */
-   if (!(vma->vm_flags & VM_MAYSHARE))
+   if (!may_share)
/*
 * For private mappings, no entry indicates
 * a reservation is present.  Since we can
@@ -2883,6 +2878,16 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
}
 }
 
+void restore_reserve_on_error_vma(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct folio *folio)
+{
+   struct resv_map *resv = vma_resv_map(vma);
+   pgoff_t resv_index = vma_hugecache_offset(h, vma, address);
+   bool may_share = vma->vm_flags & VM_MAYSHARE;
+
+   restore_reserve_on_error(resv, resv_index, may_share, folio);
+}
+
 /*
  * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
  * the old one
@@ -5109,8 +5114,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
-				restore_reserve_on_error(h, dst_vma, addr,
-							new_folio);
+				restore_reserve_on_error_vma(h, dst_vma, addr,
+							     new_folio);

[RFC PATCH 17/19] KVM: selftests: Add basic selftests for hugetlbfs-backed guest_mem

2023-06-06 Thread Ackerley Tng
Add tests for 2MB and 1GB page sizes.

Signed-off-by: Ackerley Tng 
---
 .../testing/selftests/kvm/guest_memfd_test.c  | 33 ++-
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index 059b33cdecec..6e24631119c6 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -90,20 +90,14 @@ static void test_fallocate(int fd, size_t page_size, size_t total_size)
TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed");
 }
 
-
-int main(int argc, char *argv[])
+void test_guest_mem(struct kvm_vm *vm, uint32_t flags, size_t page_size)
 {
-   size_t page_size;
-   size_t total_size;
int fd;
-   struct kvm_vm *vm;
+   size_t total_size;
 
-   page_size = getpagesize();
total_size = page_size * 4;
 
-   vm = vm_create_barebones();
-
-   fd = vm_create_guest_memfd(vm, total_size, 0);
+   fd = vm_create_guest_memfd(vm, total_size, flags);
 
test_file_read_write(fd);
test_mmap(fd, page_size);
@@ -112,3 +106,24 @@ int main(int argc, char *argv[])
 
close(fd);
 }
+
+int main(int argc, char *argv[])
+{
+   struct kvm_vm *vm = vm_create_barebones();
+
+   printf("Test guest mem 4K\n");
+   test_guest_mem(vm, 0, getpagesize());
+   printf("PASSED\n");
+
+   printf("Test guest mem hugetlb 2M\n");
+	test_guest_mem(
+		vm, KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_2MB, 2UL << 20);
+   printf("PASSED\n");
+
+   printf("Test guest mem hugetlb 1G\n");
+	test_guest_mem(
+		vm, KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_1GB, 1UL << 30);
+   printf("PASSED\n");
+
+   return 0;
+}
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 12/19] mm: truncate: Expose preparation steps for truncate_inode_pages_final

2023-06-06 Thread Ackerley Tng
This will allow the preparation steps to be shared with code that
performs its own final truncation.
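
A sketch of the intended split (an assumption about how a later user such
as hugetlb-backed guest_mem could pair it with remove_mapping_hugepages()
from the earlier patches; h and spool would be supplied by that user):

	static void example_evict_inode(struct inode *inode, struct hstate *h,
					struct hugepage_subpool *spool)
	{
		truncate_inode_pages_final_prepare(inode->i_mapping);
		/* hugetlb-aware removal instead of truncate_inode_pages(mapping, 0) */
		remove_mapping_hugepages(inode->i_mapping, h, spool, inode,
					 0, LLONG_MAX);
	}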

Signed-off-by: Ackerley Tng 
---
 include/linux/mm.h |  1 +
 mm/truncate.c  | 24 ++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..7a8f6b810de0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3053,6 +3053,7 @@ extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
 extern void truncate_inode_pages(struct address_space *, loff_t);
 extern void truncate_inode_pages_range(struct address_space *,
   loff_t lstart, loff_t lend);
+extern void truncate_inode_pages_final_prepare(struct address_space *mapping);
 extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
diff --git a/mm/truncate.c b/mm/truncate.c
index 7b4ea4c4a46b..4a7ae87e03b5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -449,16 +449,7 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 }
 EXPORT_SYMBOL(truncate_inode_pages);
 
-/**
- * truncate_inode_pages_final - truncate *all* pages before inode dies
- * @mapping: mapping to truncate
- *
- * Called under (and serialized by) inode->i_rwsem.
- *
- * Filesystems have to use this in the .evict_inode path to inform the
- * VM that this is the final truncate and the inode is going away.
- */
-void truncate_inode_pages_final(struct address_space *mapping)
+void truncate_inode_pages_final_prepare(struct address_space *mapping)
 {
/*
 * Page reclaim can not participate in regular inode lifetime
@@ -479,7 +470,20 @@ void truncate_inode_pages_final(struct address_space *mapping)
		xa_lock_irq(&mapping->i_pages);
		xa_unlock_irq(&mapping->i_pages);
}
+}
 
+/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_rwsem.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+   truncate_inode_pages_final_prepare(mapping);
truncate_inode_pages(mapping, 0);
 }
 EXPORT_SYMBOL(truncate_inode_pages_final);
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 07/19] mm: hugetlb: Refactor vma_*_reservation functions

2023-06-06 Thread Ackerley Tng
vma_*_reservation functions rely on vma_resv_map(), which assumes the
hugetlbfs convention of the resv_map being stored in a specific field of
the inode.

This refactor enables vma_*_reservation functions, now renamed
resv_map_*_reservation, to be used with non-hugetlbfs filesystems,
further decoupling hugetlb from hugetlbfs.
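
A sketch of the needs/commit/end protocol with the renamed helpers,
assuming the wrappers keep the (resv, resv_index, may_share) signature of
__resv_map_reservation_common() shown below (not part of the patch):

	static long example_reserve(struct resv_map *resv, pgoff_t resv_index,
				    bool may_share, struct folio *folio)
	{
		long chg = resv_map_needs_reservation(resv, resv_index, may_share);

		if (chg < 0)
			return chg;
		if (folio)	/* huge page allocation succeeded */
			return resv_map_commit_reservation(resv, resv_index, may_share);
		resv_map_end_reservation(resv, resv_index, may_share);
		return 0;
	}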

Signed-off-by: Ackerley Tng 
---
 mm/hugetlb.c | 184 +++
 1 file changed, 99 insertions(+), 85 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d16c6417b90f..d943f83d15a9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2643,89 +2643,81 @@ static void return_unused_surplus_pages(struct hstate *h,
 
 
 /*
- * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
- * are used by the huge page allocation routines to manage reservations.
+ * resv_map_needs_reservation, resv_map_commit_reservation and
+ * resv_map_end_reservation are used by the huge page allocation routines to
+ * manage reservations.
  *
- * vma_needs_reservation is called to determine if the huge page at addr
- * within the vma has an associated reservation.  If a reservation is
- * needed, the value 1 is returned.  The caller is then responsible for
- * managing the global reservation and subpool usage counts.  After
- * the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.  If the page allocation fails,
- * the reservation must be ended instead of committed.  vma_end_reservation
- * is called in such cases.
+ * resv_map_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation.  If a reservation is needed,
+ * the value 1 is returned.  The caller is then responsible for managing the
+ * global reservation and subpool usage counts.  After the huge page has been
+ * allocated, resv_map_commit_reservation is called to add the page to the
+ * reservation map.  If the page allocation fails, the reservation must be
+ * ended instead of committed.  resv_map_end_reservation is called in such cases.
  *
- * In the normal case, vma_commit_reservation returns the same value
- * as the preceding vma_needs_reservation call.  The only time this
- * is not the case is if a reserve map was changed between calls.  It
- * is the responsibility of the caller to notice the difference and
- * take appropriate action.
+ * In the normal case, resv_map_commit_reservation returns the same value as the
+ * preceding resv_map_needs_reservation call.  The only time this is not the
+ * case is if a reserve map was changed between calls.  It is the responsibility
+ * of the caller to notice the difference and take appropriate action.
  *
- * vma_add_reservation is used in error paths where a reservation must
- * be restored when a newly allocated huge page must be freed.  It is
- * to be called after calling vma_needs_reservation to determine if a
- * reservation exists.
+ * resv_map_add_reservation is used in error paths where a reservation must be
+ * restored when a newly allocated huge page must be freed.  It is to be called
+ * after calling resv_map_needs_reservation to determine if a reservation
+ * exists.
  *
- * vma_del_reservation is used in error paths where an entry in the reserve
- * map was created during huge page allocation and must be removed.  It is to
- * be called after calling vma_needs_reservation to determine if a reservation
+ * resv_map_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed.  It is to be
+ * called after calling resv_map_needs_reservation to determine if a reservation
  * exists.
  */
-enum vma_resv_mode {
-   VMA_NEEDS_RESV,
-   VMA_COMMIT_RESV,
-   VMA_END_RESV,
-   VMA_ADD_RESV,
-   VMA_DEL_RESV,
+enum resv_map_resv_mode {
+   RESV_MAP_NEEDS_RESV,
+   RESV_MAP_COMMIT_RESV,
+   RESV_MAP_END_RESV,
+   RESV_MAP_ADD_RESV,
+   RESV_MAP_DEL_RESV,
 };
-static long __vma_reservation_common(struct hstate *h,
-				struct vm_area_struct *vma, unsigned long addr,
-				enum vma_resv_mode mode)
+static long __resv_map_reservation_common(struct resv_map *resv, pgoff_t resv_index,
+					  bool may_be_shared_mapping,
+					  enum resv_map_resv_mode mode)
 {
-   struct resv_map *resv;
-   pgoff_t idx;
long ret;
long dummy_out_regions_needed;
 
-   resv = vma_resv_map(vma);
-   if (!resv)
-   return 1;
-
-   idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
-   case VMA_NEEDS_RESV:
-		ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+	case RESV_MAP_NEEDS_RESV:
+		ret = region_chg(resv, resv_index, resv_index + 1, &dummy_out_regions_needed);
/* We assume

[RFC PATCH 14/19] KVM: guest_mem: Refactor cleanup to separate inode and file cleanup

2023-06-06 Thread Ackerley Tng
Cleanup in kvm_gmem_release() should be the reverse of
kvm_gmem_create_file().

Cleanup in kvm_gmem_evict_inode() should be the reverse of
kvm_gmem_create_inode().
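
A sketch of the resulting symmetry (comment only, summarizing the diff
below):

	/*
	 *   kvm_gmem_create_inode()  <->  kvm_gmem_evict_inode()
	 *       (struct kvm_gmem, bindings, truncation, kvm reference)
	 *   kvm_gmem_create_file()   <->  kvm_gmem_release()
	 *       (file-level fields only: f_mapping, private_data)
	 */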

Signed-off-by: Ackerley Tng 
---
 virt/kvm/guest_mem.c | 105 +--
 1 file changed, 71 insertions(+), 34 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index 2f69ef666871..13253af40be6 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -247,42 +247,13 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
 
 static int kvm_gmem_release(struct inode *inode, struct file *file)
 {
-   struct kvm_gmem *gmem = inode->i_mapping->private_data;
-   struct kvm_memory_slot *slot;
-   struct kvm *kvm = gmem->kvm;
-   unsigned long index;
-
/*
-* Prevent concurrent attempts to *unbind* a memslot.  This is the last
-* reference to the file and thus no new bindings can be created, but
-* dereferencing the slot for existing bindings needs to be protected
-* against memslot updates, specifically so that unbind doesn't race
-* and free the memslot (kvm_gmem_get_file() will return NULL).
+* This is called when the last reference to the file is released. Only
+* clean up file-related stuff. struct kvm_gmem is also referred to in
+* the inode, so clean that up in kvm_gmem_evict_inode().
 */
-	mutex_lock(&kvm->slots_lock);
-
-	xa_for_each(&gmem->bindings, index, slot)
-   rcu_assign_pointer(slot->gmem.file, NULL);
-
-   synchronize_rcu();
-
-   /*
-* All in-flight operations are gone and new bindings can be created.
-* Free the backing memory, and more importantly, zap all SPTEs that
-* pointed at this file.
-*/
-   kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul);
-   truncate_inode_pages_final(file->f_mapping);
-   kvm_gmem_invalidate_end(kvm, gmem, 0, -1ul);
-
-	mutex_unlock(&kvm->slots_lock);
-
-	WARN_ON_ONCE(!(mapping_empty(file->f_mapping)));
-
-	xa_destroy(&gmem->bindings);
-   kfree(gmem);
-
-   kvm_put_kvm(kvm);
+   file->f_mapping = NULL;
+   file->private_data = NULL;
 
return 0;
 }
@@ -603,11 +574,77 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
 
+static void kvm_gmem_evict_inode(struct inode *inode)
+{
+   struct kvm_gmem *gmem = inode->i_mapping->private_data;
+   struct kvm_memory_slot *slot;
+   struct kvm *kvm;
+   unsigned long index;
+
+   /*
+* If iput() was called before inode is completely set up due to some
+* error in kvm_gmem_create_inode(), gmem will be NULL.
+*/
+   if (!gmem)
+   goto basic_cleanup;
+
+   kvm = gmem->kvm;
+
+   /*
+* Prevent concurrent attempts to *unbind* a memslot.  This is the last
+* reference to the file and thus no new bindings can be created, but
+* dereferencing the slot for existing bindings needs to be protected
+* against memslot updates, specifically so that unbind doesn't race
+* and free the memslot (kvm_gmem_get_file() will return NULL).
+*/
+	mutex_lock(&kvm->slots_lock);
+
+	xa_for_each(&gmem->bindings, index, slot)
+   rcu_assign_pointer(slot->gmem.file, NULL);
+
+   synchronize_rcu();
+
+   /*
+* All in-flight operations are gone and new bindings can be created.
+* Free the backing memory, and more importantly, zap all SPTEs that
+* pointed at this file.
+*/
+   kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul);
+   truncate_inode_pages_final(inode->i_mapping);
+   kvm_gmem_invalidate_end(kvm, gmem, 0, -1ul);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	WARN_ON_ONCE(!(mapping_empty(inode->i_mapping)));
+
+	xa_destroy(&gmem->bindings);
+   kfree(gmem);
+
+   kvm_put_kvm(kvm);
+
+basic_cleanup:
+   clear_inode(inode);
+}
+
+static const struct super_operations kvm_gmem_super_operations = {
+   /*
+* TODO update statfs handler for kvm_gmem. What should the statfs
+* handler return?
+*/
+   .statfs = simple_statfs,
+   .evict_inode= kvm_gmem_evict_inode,
+};
+
 static int kvm_gmem_init_fs_context(struct fs_context *fc)
 {
+   struct pseudo_fs_context *ctx;
+
if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
return -ENOMEM;
 
+   ctx = fc->fs_private;
+	ctx->ops = &kvm_gmem_super_operations;
+
return 0;
 }
 
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 10/19] mm: hugetlb: Parametrize alloc_hugetlb_folio_from_subpool() by resv_map

2023-06-06 Thread Ackerley Tng
Parametrize alloc_hugetlb_folio_from_subpool() by resv_map to remove
the use of vma_resv_map() and decouple hugetlb from hugetlbfs.

Signed-off-by: Ackerley Tng 
---
 include/linux/hugetlb.h | 2 +-
 mm/hugetlb.c| 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5fe9643826d7..d564802ace4b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -767,7 +767,7 @@ struct huge_bootmem_page {
 
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 struct folio *alloc_hugetlb_folio_from_subpool(
-   struct hugepage_subpool *spool, struct hstate *h,
+   struct hugepage_subpool *spool, struct hstate *h, struct resv_map *resv,
struct vm_area_struct *vma, unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 540634aec181..aebdd8c63439 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3003,7 +3003,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 }
 
 struct folio *alloc_hugetlb_folio_from_subpool(
-   struct hugepage_subpool *spool, struct hstate *h,
+   struct hugepage_subpool *spool, struct hstate *h, struct resv_map *resv,
struct vm_area_struct *vma, unsigned long addr, int avoid_reserve)
 {
struct folio *folio;
@@ -3013,7 +3013,6 @@ struct folio *alloc_hugetlb_folio_from_subpool(
struct hugetlb_cgroup *h_cg = NULL;
bool deferred_reserve;
 
-   struct resv_map *resv = vma_resv_map(vma);
pgoff_t resv_index = vma_hugecache_offset(h, vma, addr);
bool may_share = vma->vm_flags & VM_MAYSHARE;
 
@@ -3141,8 +3140,9 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 {
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
+   struct resv_map *resv = vma_resv_map(vma);
 
-	return alloc_hugetlb_folio_from_subpool(spool, h, vma, addr, avoid_reserve);
+	return alloc_hugetlb_folio_from_subpool(spool, h, resv, vma, addr, avoid_reserve);
 }
 
 int alloc_bootmem_huge_page(struct hstate *h, int nid)
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 00/19] hugetlb support for KVM guest_mem

2023-06-06 Thread Ackerley Tng
Hello,

This patchset builds upon a soon-to-be-posted WIP patchset from Sean,
available at https://github.com/sean-jc/linux/tree/x86/kvm_gmem_solo and
mentioned at [1].

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/gmem-hugetlb-rfc-v1

In this patchset, hugetlb support for KVM's guest_mem (aka gmem) is introduced,
allowing VM private memory (for confidential computing) to be backed by hugetlb
pages.

guest_mem provides userspace with a handle, with which userspace can allocate
and deallocate memory for confidential VMs without mapping the memory into
userspace.
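
As a rough illustration of that handle (a userspace sketch against the
WIP gmem ABI; the ioctl and struct layout are taken from patch 15 of this
series, error handling elided):

	struct kvm_create_guest_memfd gmem = {
		.size = 4 * 1024 * 1024,
		.flags = 0,
	};
	int fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

	/* allocate backing memory, without ever mmap()ing fd */
	fallocate(fd, 0, 0, gmem.size);
	/* deallocate again by punching a hole */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, gmem.size);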

Why use hugetlb instead of introducing a new allocator, like gmem does for 4K
and transparent hugepages?

+ hugetlb provides the following useful functionality, which would otherwise
  have to be reimplemented:
+ Allocation of hugetlb pages at boot time, including
+ Parsing of kernel boot parameters to configure hugetlb
+ Tracking of usage in hstate
+ gmem will share the same system-wide pool of hugetlb pages, so users
  don't have to have separate pools for hugetlb and gmem
+ Page accounting with subpools
+ hugetlb pages are tracked in subpools, which gmem uses to reserve
  pages from the global hstate
+ Memory charging
+ hugetlb provides code that charges memory to cgroups
+ Reporting: hugetlb usage and availability are available at /proc/meminfo,
  etc

The first 11 patches in this patchset are a series of refactorings to
decouple hugetlb and hugetlbfs.

The central thread binding the refactoring is that some functions (like
inode_resv_map(), subpool_inode(), hstate_inode(), etc.) rely on a
hugetlbfs convention: that the resv_map, subpool and hstate are stored in
specific fields of a hugetlbfs inode.

Refactoring to parametrize functions by hstate, subpool, resv_map will allow
hugetlb to be used by gmem and in other places where these data structures
aren't necessarily stored in the same positions in the inode.

The refactoring proposed here is just the minimum required to get a
proof-of-concept working with gmem. I would like to get opinions on this
approach before doing further refactoring. (See TODOs)

TODOs:

+ hugetlb/hugetlbfs refactoring
+ remove_inode_hugepages() no longer needs to be exposed; it is
  hugetlbfs-specific and used only in inode.c
+ remove_mapping_hugepages(), remove_inode_single_folio(),
  hugetlb_unreserve_pages() shouldn't need to take inode as a parameter
+ Updating inode->i_blocks can be refactored to a separate function and
  called from hugetlbfs and gmem
+ alloc_hugetlb_folio_from_subpool() shouldn't need to be parametrized by
  vma
+ hugetlb_reserve_pages() should be refactored to be symmetric with
  hugetlb_unreserve_pages()
+ It should be parametrized by resv_map
+ alloc_hugetlb_folio_from_subpool() could perhaps use
  hugetlb_reserve_pages()?
+ gmem
+ Figure out if resv_map should be used by gmem at all
+ Probably needs more refactoring to decouple resv_map from hugetlb
  functions

Questions for the community:

1. In this patchset, every gmem file backed with hugetlb is given a new
   subpool. Is that desirable?
+ In hugetlbfs, a subpool always belongs to a mount, and hugetlbfs has one
  mount per hugetlb size (2M, 1G, etc)
+ memfd_create(MFD_HUGETLB) effectively returns a full hugetlbfs file, so it
  (rightfully) uses the hugetlbfs kernel mounts and their subpools
+ I gave each file a subpool mostly to speed up implementation and still be
  able to reserve hugetlb pages from the global hstate based on the gmem
  file size.
+ gmem, unlike hugetlbfs, isn't meant to be a full filesystem, so
+ Should there be multiple mounts, one for each hugetlb size?
+ Will the mounts be initialized on boot or on first gmem file creation?
+ Or is one subpool per gmem file fine?
2. Should resv_map be used for gmem at all, since gmem doesn't allow userspace
   reservations?

[1] https://lore.kernel.org/lkml/zem5zq8oo+xna...@google.com/

---

Ackerley Tng (19):
  mm: hugetlb: Expose get_hstate_idx()
  mm: hugetlb: Move and expose hugetlbfs_zero_partial_page
  mm: hugetlb: Expose remove_inode_hugepages
  mm: hugetlb: Decouple hstate, subpool from inode
  mm: hugetlb: Allow alloc_hugetlb_folio() to be parametrized by subpool
and hstate
  mm: hugetlb: Provide hugetlb_filemap_add_folio()
  mm: hugetlb: Refactor vma_*_reservation functions
  mm: hugetlb: Refactor restore_reserve_on_error
  mm: hugetlb: Use restore_reserve_on_error directly in filesystems
  mm: hugetlb: Parametrize alloc_hugetlb_folio_from_subpool() by
resv_map
  mm: hugetlb: Parametrize hugetlb functions by resv_map
  mm: truncate: Expose preparation steps for truncate_inode_pages_final
  KVM: guest_mem: Refactor kvm_gmem fd creation to be in layers
  KVM: guest_mem: Refactor cleanup to separate inode and file cleanup
  

[RFC PATCH 11/19] mm: hugetlb: Parametrize hugetlb functions by resv_map

2023-06-06 Thread Ackerley Tng
Parametrize remove_mapping_hugepages() and hugetlb_unreserve_pages()
by resv_map to remove the use of inode_resv_map() and decouple hugetlb
from hugetlbfs.

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c| 16 ++--
 include/linux/hugetlb.h |  6 --
 mm/hugetlb.c|  4 ++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 53f6a421499d..a7791b1390a6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -560,8 +560,8 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
  */
 static bool remove_mapping_single_folio(
struct address_space *mapping, struct folio *folio, pgoff_t index,
-   struct hstate *h, struct hugepage_subpool *spool, struct inode *inode,
-   bool truncate_op)
+	struct hstate *h, struct hugepage_subpool *spool, struct resv_map *resv_map,
+	struct inode *inode, bool truncate_op)
 {
bool ret = false;
 
@@ -586,7 +586,8 @@ static bool remove_mapping_single_folio(
hugetlb_delete_from_page_cache(folio);
ret = true;
if (!truncate_op) {
-		if (unlikely(hugetlb_unreserve_pages(h, spool, inode, index, index + 1, 1)))
+		if (unlikely(hugetlb_unreserve_pages(h, spool, resv_map,
+						     inode, index, index + 1, 1)))
hugetlb_fix_reserve_counts(h, spool);
}
 
@@ -623,6 +624,7 @@ static bool remove_mapping_single_folio(
  */
 void remove_mapping_hugepages(struct address_space *mapping,
  struct hstate *h, struct hugepage_subpool *spool,
+ struct resv_map *resv_map,
  struct inode *inode, loff_t lstart, loff_t lend)
 {
const pgoff_t start = lstart >> huge_page_shift(h);
@@ -647,7 +649,7 @@ void remove_mapping_hugepages(struct address_space *mapping,
 * Remove folio that was part of folio_batch.
 */
		if (remove_mapping_single_folio(mapping, folio, index,
-						h, spool, inode, truncate_op))
+						h, spool, resv_map, inode, truncate_op))
freed++;
 
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -657,7 +659,8 @@ void remove_mapping_hugepages(struct address_space *mapping,
}
 
	if (truncate_op)
-		(void)hugetlb_unreserve_pages(h, spool, inode, start, LONG_MAX, freed);
+		(void)hugetlb_unreserve_pages(h, spool, resv_map, inode,
+					      start, LONG_MAX, freed);
 }
 
 void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
@@ -665,8 +668,9 @@ void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
	struct address_space *mapping = &inode->i_data;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
+   struct resv_map *resv_map = inode_resv_map(inode);
 
-	return remove_mapping_hugepages(mapping, h, spool, inode, lstart, lend);
+	return remove_mapping_hugepages(mapping, h, spool, resv_map, inode, lstart, lend);
 }
 
 static void hugetlbfs_evict_inode(struct inode *inode)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d564802ace4b..af04588a5afe 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -172,7 +172,8 @@ bool hugetlb_reserve_pages(struct hstate *h, struct hugepage_subpool *spool,
   struct vm_area_struct *vma,
   vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct hstate *h, struct hugepage_subpool *spool,
-			     struct inode *inode, long start, long end, long freed);
+			     struct resv_map *resv_map, struct inode *inode,
+			     long start, long end, long freed);
 bool isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
@@ -263,6 +264,7 @@ void hugetlb_zero_partial_page(struct hstate *h, struct address_space *mapping,
 
 void remove_mapping_hugepages(struct address_space *mapping,
  struct hstate *h, struct hugepage_subpool *spool,
+ struct resv_map *resv_map,
  struct inode *inode, loff_t lstart, loff_t lend);
 void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend);
 
@@ -479,7 +481,7 @@ static inline void hugetlb_zero_partial_page(
 
 static inline void remove_mapping_hugepages(
	struct address_space *mapping, struct hstate *h, struct hugepage_subpool *spool,
-

[RFC PATCH 05/19] mm: hugetlb: Allow alloc_hugetlb_folio() to be parametrized by subpool and hstate

2023-06-06 Thread Ackerley Tng
subpool_inode() and hstate_inode() are hugetlbfs-specific.

By allowing subpool and hstate to be specified, hugetlb is further
modularized from hugetlbfs.

Signed-off-by: Ackerley Tng 
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c| 16 
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2457d7a21974..14df89d1642c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -747,6 +747,9 @@ struct huge_bootmem_page {
 };
 
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+struct folio *alloc_hugetlb_folio_from_subpool(
+   struct hugepage_subpool *spool, struct hstate *h,
+   struct vm_area_struct *vma, unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9da419b930df..99ab4bbdb2ce 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3008,11 +3008,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
 }
 
-struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
-   unsigned long addr, int avoid_reserve)
+struct folio *alloc_hugetlb_folio_from_subpool(
+   struct hugepage_subpool *spool, struct hstate *h,
+   struct vm_area_struct *vma, unsigned long addr, int avoid_reserve)
 {
-   struct hugepage_subpool *spool = subpool_vma(vma);
-   struct hstate *h = hstate_vma(vma);
struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
@@ -3139,6 +3138,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
return ERR_PTR(-ENOSPC);
 }
 
+struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
+   unsigned long addr, int avoid_reserve)
+{
+   struct hugepage_subpool *spool = subpool_vma(vma);
+   struct hstate *h = hstate_vma(vma);
+
+	return alloc_hugetlb_folio_from_subpool(spool, h, vma, addr, avoid_reserve);
+}
+
 int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 09/19] mm: hugetlb: Use restore_reserve_on_error directly in filesystems

2023-06-06 Thread Ackerley Tng
Expose inode_resv_map() so that hugetlbfs can access its own resv_map.

Hide restore_reserve_on_error_vma(); that function is now only used
within mm/hugetlb.c.

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c|  2 +-
 include/linux/hugetlb.h | 21 +++--
 mm/hugetlb.c| 13 -
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 44e6ee9a856d..53f6a421499d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -868,7 +868,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
__folio_mark_uptodate(folio);
error = hugetlb_add_to_page_cache(folio, mapping, index);
if (unlikely(error)) {
-			restore_reserve_on_error_vma(h, &pseudo_vma, addr, folio);
+			restore_reserve_on_error(inode_resv_map(inode), index, true, folio);
 			folio_put(folio);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 02a2766d89a4..5fe9643826d7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -568,6 +568,20 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
return HUGETLBFS_SB(inode->i_sb)->spool;
 }
 
+static inline struct resv_map *inode_resv_map(struct inode *inode)
+{
+   /*
+* At inode evict time, i_mapping may not point to the original
+* address space within the inode.  This original address space
+* contains the pointer to the resv_map.  So, always use the
+* address space embedded within the inode.
+* The VERY common case is inode->mapping == &inode->i_data but,
+* this may not be true for device special inodes.
+*/
+	return (struct resv_map *)(&inode->i_data)->private_data;
+}
+
+
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)false
@@ -588,6 +602,11 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
return NULL;
 }
 
+static inline struct resv_map *inode_resv_map(struct inode *inode)
+{
+   return NULL;
+}
+
 #endif /* !CONFIG_HUGETLBFS */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -762,8 +781,6 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx);
 void restore_reserve_on_error(struct resv_map *resv, pgoff_t resv_index,
  bool may_share, struct folio *folio);
-void restore_reserve_on_error_vma(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address, struct folio *folio);
 
 /* arch callback */
 int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4675f9efeba4..540634aec181 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1091,19 +1091,6 @@ void resv_map_release(struct kref *ref)
kfree(resv_map);
 }
 
-static inline struct resv_map *inode_resv_map(struct inode *inode)
-{
-   /*
-* At inode evict time, i_mapping may not point to the original
-* address space within the inode.  This original address space
-* contains the pointer to the resv_map.  So, always use the
-* address space embedded within the inode.
-* The VERY common case is inode->mapping == &inode->i_data but,
-* this may not be true for device special inodes.
-*/
-	return (struct resv_map *)(&inode->i_data)->private_data;
-}
-
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 {
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 13/19] KVM: guest_mem: Refactor kvm_gmem fd creation to be in layers

2023-06-06 Thread Ackerley Tng
First create a gmem inode, then create a gmem file using the inode,
then install the file into an fd.

Creating the file in layers separates inode concepts (struct kvm_gmem)
from file concepts and makes cleaning up in stages neater.
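
A sketch of the resulting layering (summarizing the diff below, error
handling elided):

	/*
	 * kvm_gmem_create()
	 *   fd = get_unused_fd_flags(0);
	 *   file = kvm_gmem_create_file()             // file layer
	 *            inode = kvm_gmem_create_inode()  // inode layer: kvm_gmem setup
	 *            alloc_file_pseudo(inode, ...);
	 *   fd_install(fd, file);
	 */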

Signed-off-by: Ackerley Tng 
---
 virt/kvm/guest_mem.c | 86 +---
 1 file changed, 50 insertions(+), 36 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index 8708139822d3..2f69ef666871 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -375,41 +375,27 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr= kvm_gmem_setattr,
 };
 
-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
-struct vfsmount *mnt)
+static struct inode *kvm_gmem_create_inode(struct kvm *kvm, loff_t size, u64 flags,
+					   struct vfsmount *mnt)
 {
+   int err;
+   struct inode *inode;
+   struct kvm_gmem *gmem;
const char *anon_name = "[kvm-gmem]";
const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
-   struct kvm_gmem *gmem;
-   struct inode *inode;
-   struct file *file;
-   int fd, err;
-
-   fd = get_unused_fd_flags(0);
-   if (fd < 0)
-   return fd;
 
inode = alloc_anon_inode(mnt->mnt_sb);
-   if (IS_ERR(inode)) {
-   err = PTR_ERR(inode);
-   goto err_fd;
-   }
+   if (IS_ERR(inode))
+   return inode;
 
	err = security_inode_init_security_anon(inode, &qname, NULL);
if (err)
goto err_inode;
 
-	file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
-   if (IS_ERR(file)) {
-   err = PTR_ERR(file);
-   goto err_inode;
-   }
-
+   err = -ENOMEM;
gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
-   if (!gmem) {
-   err = -ENOMEM;
-   goto err_file;
-   }
+   if (!gmem)
+   goto err_inode;
 
	xa_init(&gmem->bindings);
 
@@ -426,24 +412,41 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
mapping_set_large_folios(inode->i_mapping);
mapping_set_unevictable(inode->i_mapping);
 
-   file->f_flags |= O_LARGEFILE;
-   file->f_mapping = inode->i_mapping;
-   file->private_data = gmem;
-
-   fd_install(fd, file);
-   return fd;
+   return inode;
 
-err_file:
-   fput(file);
 err_inode:
iput(inode);
-err_fd:
-   put_unused_fd(fd);
-   return err;
+   return ERR_PTR(err);
+}
+
+
+static struct file *kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags,
+					 struct vfsmount *mnt)
+{
+   struct file *file;
+   struct inode *inode;
+
+   inode = kvm_gmem_create_inode(kvm, size, flags, mnt);
+   if (IS_ERR(inode))
+   return ERR_CAST(inode);
+
+	file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
+   if (IS_ERR(file)) {
+   iput(inode);
+   return file;
+   }
+
+   file->f_flags |= O_LARGEFILE;
+   file->f_mapping = inode->i_mapping;
+   file->private_data = inode->i_mapping->private_data;
+
+   return file;
 }
 
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *gmem)
 {
+   int fd;
+   struct file *file;
loff_t size = gmem->size;
u64 flags = gmem->flags;
 
@@ -462,7 +465,18 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *gmem)
 #endif
}
 
-   return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
+   fd = get_unused_fd_flags(0);
+   if (fd < 0)
+   return fd;
+
+   file = kvm_gmem_create_file(kvm, size, flags, kvm_gmem_mnt);
+   if (IS_ERR(file)) {
+   put_unused_fd(fd);
+   return PTR_ERR(file);
+   }
+
+   fd_install(fd, file);
+   return fd;
 }
 
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 01/19] mm: hugetlb: Expose get_hstate_idx()

2023-06-06 Thread Ackerley Tng
Expose get_hstate_idx() so it can be used from KVM's guest_mem code.

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c|  9 -
 include/linux/hugetlb.h | 14 ++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..406d7366cf3e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1560,15 +1560,6 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
 }
 
-static int get_hstate_idx(int page_size_log)
-{
-   struct hstate *h = hstate_sizelog(page_size_log);
-
-   if (!h)
-   return -1;
-   return hstate_index(h);
-}
-
 /*
  * Note that size should be aligned to proper hugepage size in caller side,
  * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7c977d234aba..37c2edf7beea 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -876,6 +876,15 @@ static inline int hstate_index(struct hstate *h)
return h - hstates;
 }
 
+static inline int get_hstate_idx(int page_size_log)
+{
+   struct hstate *h = hstate_sizelog(page_size_log);
+
+   if (!h)
+   return -1;
+   return hstate_index(h);
+}
+
 extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
@@ -1142,6 +1151,11 @@ static inline int hstate_index(struct hstate *h)
return 0;
 }
 
+static inline int get_hstate_idx(int page_size_log)
+{
+   return 0;
+}
+
 static inline int dissolve_free_huge_page(struct page *page)
 {
return 0;
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 15/19] KVM: guest_mem: hugetlb: initialization and cleanup

2023-06-06 Thread Ackerley Tng
First stage of hugetlb support: add initialization and cleanup
routines
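
The UAPI additions below reuse hugetlb's huge page size encoding; a worked
example of what kvm_gmem_hugetlb_setup() has to decode, assuming the
standard hugetlb_encode.h values (shift of 26, log2 of the page size in
the upper bits):

	/*
	 * KVM_GUEST_MEMFD_HUGE_2MB == HUGETLB_FLAG_ENCODE_2MB == 21 << 26,
	 * since 2MB == 1 << 21.
	 */
	u64 flags = KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_2MB;
	int page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) &
			    KVM_GUEST_MEMFD_HUGE_MASK;
	/* page_size_log == 21; get_hstate_idx(21) then picks the 2MB hstate */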

Signed-off-by: Ackerley Tng 
---
 include/uapi/linux/kvm.h | 25 
 virt/kvm/guest_mem.c | 88 +---
 2 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0fa665e8862a..1df0c802c29f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -13,6 +13,7 @@
 #include <linux/const.h>
 #include <linux/ioctl.h>
 #include <asm/kvm.h>
+#include <linux/hugetlb_encode.h>
 
 #define KVM_API_VERSION 12
 
@@ -2280,6 +2281,30 @@ struct kvm_memory_attributes {
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
 
 #define KVM_GUEST_MEMFD_HUGE_PMD   (1ULL << 0)
+#define KVM_GUEST_MEMFD_HUGETLB(1ULL << 1)
+
+/*
+ * Huge page size encoding when KVM_GUEST_MEMFD_HUGETLB is specified, and a huge
+ * page size other than the default is desired.  See hugetlb_encode.h.  All
+ * known huge page size encodings are provided here.  It is the responsibility
+ * of the application to know which sizes are supported on the running system.
+ * See mmap(2) man page for details.
+ */
+#define KVM_GUEST_MEMFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
+#define KVM_GUEST_MEMFD_HUGE_MASK  HUGETLB_FLAG_ENCODE_MASK
+
+#define KVM_GUEST_MEMFD_HUGE_64KB  HUGETLB_FLAG_ENCODE_64KB
+#define KVM_GUEST_MEMFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
+#define KVM_GUEST_MEMFD_HUGE_1MB   HUGETLB_FLAG_ENCODE_1MB
+#define KVM_GUEST_MEMFD_HUGE_2MB   HUGETLB_FLAG_ENCODE_2MB
+#define KVM_GUEST_MEMFD_HUGE_8MB   HUGETLB_FLAG_ENCODE_8MB
+#define KVM_GUEST_MEMFD_HUGE_16MB  HUGETLB_FLAG_ENCODE_16MB
+#define KVM_GUEST_MEMFD_HUGE_32MB  HUGETLB_FLAG_ENCODE_32MB
+#define KVM_GUEST_MEMFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
+#define KVM_GUEST_MEMFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB
+#define KVM_GUEST_MEMFD_HUGE_1GB   HUGETLB_FLAG_ENCODE_1GB
+#define KVM_GUEST_MEMFD_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
+#define KVM_GUEST_MEMFD_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
 struct kvm_create_guest_memfd {
__u64 size;
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index 13253af40be6..b533143e2878 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <linux/hugetlb.h>
 
 #include 
 
@@ -30,6 +31,11 @@ struct kvm_gmem {
struct kvm *kvm;
u64 flags;
struct xarray bindings;
+   struct {
+   struct hstate *h;
+   struct hugepage_subpool *spool;
+   struct resv_map *resv_map;
+   } hugetlb;
 };
 
 static loff_t kvm_gmem_get_size(struct file *file)
@@ -346,6 +352,46 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr= kvm_gmem_setattr,
 };
 
+static int kvm_gmem_hugetlb_setup(struct inode *inode, struct kvm_gmem *gmem,
+ loff_t size, u64 flags)
+{
+   int page_size_log;
+   int hstate_idx;
+   long hpages;
+   struct resv_map *resv_map;
+   struct hugepage_subpool *spool;
+   struct hstate *h;
+
+	page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
+   hstate_idx = get_hstate_idx(page_size_log);
+   if (hstate_idx < 0)
+   return -ENOENT;
+
+	h = &hstates[hstate_idx];
+	/* Round up to accommodate size requests that don't align with huge pages */
+	hpages = round_up(size, huge_page_size(h)) >> huge_page_shift(h);
+   spool = hugepage_new_subpool(h, hpages, hpages);
+   if (!spool)
+   goto out;
+
+   resv_map = resv_map_alloc();
+   if (!resv_map)
+   goto out_subpool;
+
+   inode->i_blkbits = huge_page_shift(h);
+
+   gmem->hugetlb.h = h;
+   gmem->hugetlb.spool = spool;
+   gmem->hugetlb.resv_map = resv_map;
+
+   return 0;
+
+out_subpool:
+   kfree(spool);
+out:
+   return -ENOMEM;
+}
+
 static struct inode *kvm_gmem_create_inode(struct kvm *kvm, loff_t size, u64 flags,
					   struct vfsmount *mnt)
 {
@@ -368,6 +414,12 @@ static struct inode *kvm_gmem_create_inode(struct kvm *kvm, loff_t size, u64 flags,
if (!gmem)
goto err_inode;
 
+   if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+   err = kvm_gmem_hugetlb_setup(inode, gmem, size, flags);
+   if (err)
+   goto err_gmem;
+   }
+
	xa_init(&gmem->bindings);
 
kvm_get_kvm(kvm);
@@ -385,6 +437,8 @@ static struct inode *kvm_gmem_create_inode(struct kvm *kvm, loff_t size, u64 flags,
 
return inode;
 
+err_gmem:
+   kfree(gmem);
 err_inode:
iput(inode);
return ERR_PTR(err);
@@ -414,6 +468,8 @@ static struct file *kvm_gmem_create_file(struct kvm *kvm, loff_t size, u64 flags,
return file;
 }
 
+#define KVM_GUEST_MEMFD_ALL_FLAGS (KVM_GUEST

[RFC PATCH 18/19] KVM: selftests: Support various types of backing sources for private memory

2023-06-06 Thread Ackerley Tng
Adds support for various types of backing sources for private
memory (in the sense of confidential computing), similar to the
backing sources available for shared memory.
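
A sketch of how a test might consume the new helpers (names and signatures
as declared in the header hunk below; argument parsing elided):

	enum vm_pmem_backing_src_type t =
		parse_pmem_backing_src_type("pmem_hugetlb_2mb");
	uint32_t flags = vm_pmem_backing_src_alias(t)->flag;
	size_t pagesz = get_pmem_backing_src_pagesz(t);

	/* flags feed guest_memfd creation; pagesz sizes/aligns the region */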

Signed-off-by: Ackerley Tng 
---
 .../testing/selftests/kvm/include/test_util.h | 14 
 tools/testing/selftests/kvm/lib/test_util.c   | 74 +++
 2 files changed, 88 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index a6e9f215ce70..899ea15ca8a9 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -122,6 +122,16 @@ struct vm_mem_backing_src_alias {
uint32_t flag;
 };
 
+enum vm_pmem_backing_src_type {
+   VM_PMEM_SRC_GMEM,
+	VM_PMEM_SRC_HUGETLB,  /* Use kernel default page size for hugetlb pages */
+   VM_PMEM_SRC_HUGETLB_2MB,
+   VM_PMEM_SRC_HUGETLB_1GB,
+   NUM_PMEM_SRC_TYPES,
+};
+
+#define DEFAULT_VM_PMEM_SRC VM_PMEM_SRC_GMEM
+
 #define MIN_RUN_DELAY_NS   20UL
 
 bool thp_configured(void);
@@ -132,6 +142,10 @@ size_t get_backing_src_pagesz(uint32_t i);
 bool is_backing_src_hugetlb(uint32_t i);
 void backing_src_help(const char *flag);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
+void pmem_backing_src_help(const char *flag);
+enum vm_pmem_backing_src_type parse_pmem_backing_src_type(const char *type_name);
+const struct vm_mem_backing_src_alias *vm_pmem_backing_src_alias(uint32_t i);
+size_t get_pmem_backing_src_pagesz(uint32_t i);
 long get_run_delay(void);
 
 /*
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index b772193f6c18..62efb7b8ba51 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include <linux/kvm.h>
 #include 
 #include 
 #include 
@@ -287,6 +288,34 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
	return &aliases[i];
 }
 
+const struct vm_mem_backing_src_alias *vm_pmem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   [VM_PMEM_SRC_GMEM] = {
+   .name = "pmem_gmem",
+   .flag = 0,
+   },
+   [VM_PMEM_SRC_HUGETLB] = {
+   .name = "pmem_hugetlb",
+   .flag = KVM_GUEST_MEMFD_HUGETLB,
+   },
+   [VM_PMEM_SRC_HUGETLB_2MB] = {
+   .name = "pmem_hugetlb_2mb",
+			.flag = KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_2MB,
+   },
+   [VM_PMEM_SRC_HUGETLB_1GB] = {
+   .name = "pmem_hugetlb_1gb",
+			.flag = KVM_GUEST_MEMFD_HUGETLB | KVM_GUEST_MEMFD_HUGE_1GB,
+   },
+   };
+   _Static_assert(ARRAY_SIZE(aliases) == NUM_PMEM_SRC_TYPES,
+  "Missing new backing private mem src types?");
+
+	TEST_ASSERT(i < NUM_PMEM_SRC_TYPES, "Private mem backing src type ID %d too big", i);
+
	return &aliases[i];
+}
+
 #define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
 
 size_t get_backing_src_pagesz(uint32_t i)
@@ -307,6 +336,20 @@ size_t get_backing_src_pagesz(uint32_t i)
}
 }
 
+size_t get_pmem_backing_src_pagesz(uint32_t i)
+{
+   uint32_t flag = vm_pmem_backing_src_alias(i)->flag;
+
+   switch (i) {
+   case VM_PMEM_SRC_GMEM:
+   return getpagesize();
+   case VM_PMEM_SRC_HUGETLB:
+   return get_def_hugetlb_pagesz();
+   default:
+   return MAP_HUGE_PAGE_SIZE(flag);
+   }
+}
+
 bool is_backing_src_hugetlb(uint32_t i)
 {
return !!(vm_mem_backing_src_alias(i)->flag & MAP_HUGETLB);
@@ -343,6 +386,37 @@ enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
return -1;
 }
 
+static void print_available_pmem_backing_src_types(const char *prefix)
+{
+   int i;
+
+   printf("%sAvailable private mem backing src types:\n", prefix);
+
+   for (i = 0; i < NUM_PMEM_SRC_TYPES; i++)
+		printf("%s%s\n", prefix, vm_pmem_backing_src_alias(i)->name);
+}
+
+void pmem_backing_src_help(const char *flag)
+{
+   printf(" %s: specify the type of memory that should be used to\n"
+  " back guest private memory. (default: %s)\n",
+	       flag, vm_pmem_backing_src_alias(DEFAULT_VM_PMEM_SRC)->name);
+   print_available_pmem_backing_src_types(" ");
+}
+
+enum vm_pmem_backing_src_type parse_pmem_backing_src_type(const char *type_name)
+{
+   int i;
+
+	for (i = 0; i < NUM_PMEM_SRC_TYPES; i++)
+   if (!strcmp(type_name, vm_pmem_backing_src_alias(i)->name))
+

[RFC PATCH 16/19] KVM: guest_mem: hugetlb: allocate and truncate from hugetlb

2023-06-06 Thread Ackerley Tng
Introduce kvm_gmem_hugetlb_get_folio(), then update
kvm_gmem_allocate() and kvm_gmem_truncate() to use hugetlb functions.

Signed-off-by: Ackerley Tng 
---
 virt/kvm/guest_mem.c | 215 +--
 1 file changed, 188 insertions(+), 27 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index b533143e2878..6271621f6b73 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -43,6 +43,95 @@ static loff_t kvm_gmem_get_size(struct file *file)
return i_size_read(file_inode(file));
 }
 
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(
+   struct file *file, pgoff_t hindex)
+{
+   int err;
+   struct folio *folio;
+   struct kvm_gmem *gmem;
+   struct hstate *h;
+   struct resv_map *resv_map;
+   unsigned long offset;
+   struct vm_area_struct pseudo_vma;
+
+   gmem = file->private_data;
+   h = gmem->hugetlb.h;
+   resv_map = gmem->hugetlb.resv_map;
+   offset = hindex << huge_page_shift(h);
+
+	vma_init(&pseudo_vma, NULL);
+	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+   /* vma infrastructure is dependent on vm_file being set */
+   pseudo_vma.vm_file = file;
+
+   /* TODO setup NUMA policy. Meanwhile, fallback to get_task_policy(). */
+   pseudo_vma.vm_policy = NULL;
+	folio = alloc_hugetlb_folio_from_subpool(
+		gmem->hugetlb.spool, h, resv_map, &pseudo_vma, offset, 0);
+   /* Remember to take and drop refcount from vm_policy */
+   if (IS_ERR(folio))
+   return folio;
+
+   /*
+* FIXME: Skip clearing pages when trusted firmware will do it when
+* assigning memory to the guest.
+*/
+	clear_huge_page(&folio->page, offset, pages_per_huge_page(h));
+   __folio_mark_uptodate(folio);
+   err = hugetlb_filemap_add_folio(file->f_mapping, h, folio, hindex);
+   if (unlikely(err)) {
+   restore_reserve_on_error(resv_map, hindex, true, folio);
+   folio_put(folio);
+   folio = ERR_PTR(err);
+   }
+
+   return folio;
+}
+
+/**
+ * Gets a hugetlb folio, from @file, at @index (in terms of PAGE_SIZE) within
+ * the file.
+ *
+ * The returned folio will be in @file's page cache, and locked.
+ */
+static struct folio *kvm_gmem_hugetlb_get_folio(struct file *file, pgoff_t 
index)
+{
+   struct folio *folio;
+   u32 hash;
+   /* hindex is in terms of huge_page_size(h) and not PAGE_SIZE */
+   pgoff_t hindex;
+   struct kvm_gmem *gmem;
+   struct hstate *h;
+   struct address_space *mapping;
+
+   gmem = file->private_data;
+   h = gmem->hugetlb.h;
+   hindex = index >> huge_page_order(h);
+
+   mapping = file->f_mapping;
+   hash = hugetlb_fault_mutex_hash(mapping, hindex);
+   mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+   rcu_read_lock();
+   folio = filemap_lock_folio(mapping, hindex);
+   rcu_read_unlock();
+   if (folio)
+   goto folio_valid;
+
+   folio = kvm_gmem_hugetlb_alloc_and_cache_folio(file, hindex);
+   /*
+* TODO Perhaps the interface of kvm_gmem_get_folio should change to 
better
+* report errors
+*/
+   if (IS_ERR(folio))
+   folio = NULL;
+
+folio_valid:
+   mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+   return folio;
+}
+
 static struct folio *kvm_gmem_get_huge_folio(struct file *file, pgoff_t index)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -74,36 +163,56 @@ static struct folio *kvm_gmem_get_huge_folio(struct file 
*file, pgoff_t index)
 #endif
 }
 
+/**
+ * Gets a folio, from @file, at @index (in terms of PAGE_SIZE) within the file.
+ *
+ * The returned folio will be in @file's page cache and locked.
+ */
 static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
 {
struct folio *folio;
+   struct kvm_gmem *gmem = file->private_data;
 
-   folio = kvm_gmem_get_huge_folio(file, index);
-   if (!folio) {
-   folio = filemap_grab_folio(file->f_mapping, index);
+   if (gmem->flags & KVM_GUEST_MEMFD_HUGETLB) {
+   folio = kvm_gmem_hugetlb_get_folio(file, index);
+
+   /* hugetlb gmem does not fall back to non-hugetlb pages */
if (!folio)
return NULL;
-   }
 
-   /*
-* TODO: Confirm this won't zero in-use pages, and skip clearing pages
-* when trusted firmware will do it when assigning memory to the guest.
-*/
-   if (!folio_test_uptodate(folio)) {
-   unsigned long nr_pages = folio_nr_pages(folio);
-   unsigned long i;
+   /*
+* Don't need to clear pages because
+* kvm_gmem_hugetlb_alloc_and_cache_folio() already clears pages
+* when allocating
+*/
+   } else {
+   folio = kvm_gmem_get_huge_folio(file, index);

[RFC PATCH 19/19] KVM: selftests: Update test for various private memory backing source types

2023-06-06 Thread Ackerley Tng
Update private_mem_conversions_test for various private memory backing
source types

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 38 ++-
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c 
b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index 6a353cf64f52..27a7e5099b7b 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -240,14 +240,15 @@ static void *__test_mem_conversions(void *__vcpu)
}
 }
 
-static void test_mem_conversions(enum vm_mem_backing_src_type src_type, 
uint32_t nr_vcpus,
-uint32_t nr_memslots)
+static void test_mem_conversions(enum vm_mem_backing_src_type src_type,
+enum vm_pmem_backing_src_type pmem_src_type,
+uint32_t nr_vcpus, uint32_t nr_memslots)
 {
-   const size_t memfd_size = PER_CPU_DATA_SIZE * nr_vcpus;
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
pthread_t threads[KVM_MAX_VCPUS];
struct kvm_vm *vm;
int memfd, i, r;
+   size_t pmem_aligned_size, memfd_size;
size_t test_unit_size;
 
const struct vm_shape shape = {
@@ -270,21 +271,32 @@ static void test_mem_conversions(enum 
vm_mem_backing_src_type src_type, uint32_t
 * Allocate enough memory so that each vCPU's chunk of memory 
can be
 * naturally aligned with respect to the size of the backing 
store.
 */
-   test_unit_size = align_up(PER_CPU_DATA_SIZE, 
get_backing_src_pagesz(src_type));
+   test_unit_size = align_up(PER_CPU_DATA_SIZE,
+ max(get_backing_src_pagesz(src_type),
+ 
get_pmem_backing_src_pagesz(pmem_src_type)));
}
 
-   memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+   pmem_aligned_size = PER_CPU_DATA_SIZE;
+   if (nr_memslots > 1) {
+   pmem_aligned_size = align_up(PER_CPU_DATA_SIZE,
+
get_pmem_backing_src_pagesz(pmem_src_type));
+   }
+
+   memfd_size = pmem_aligned_size * nr_vcpus;
+   memfd = vm_create_guest_memfd(vm, memfd_size,
+ 
vm_pmem_backing_src_alias(pmem_src_type)->flag);
for (i = 0; i < nr_memslots; i++) {
uint64_t gpa =  BASE_DATA_GPA + i * test_unit_size;
-   uint64_t npages = PER_CPU_DATA_SIZE / vm->page_size;
+   uint64_t npages = pmem_aligned_size / vm->page_size;
 
/* Make sure the memslot is large enough for all the test units 
*/
if (nr_memslots == 1)
npages *= nr_vcpus;
 
+   /* Offsets must be aligned to private mem's page size */
vm_mem_add(vm, src_type, gpa,
   BASE_DATA_SLOT + i, npages,
-  KVM_MEM_PRIVATE, memfd, PER_CPU_DATA_SIZE * i);
+  KVM_MEM_PRIVATE, memfd, pmem_aligned_size * i);
}
 
for (i = 0; i < nr_vcpus; i++) {
@@ -324,10 +336,12 @@ static void test_mem_conversions(enum 
vm_mem_backing_src_type src_type, uint32_t
 static void usage(const char *cmd)
 {
puts("");
-   printf("usage: %s [-h] [-m] [-s mem_type] [-n nr_vcpus]\n", cmd);
+   printf("usage: %s [-h] [-m] [-s mem_type] [-p pmem_type] [-n 
nr_vcpus]\n", cmd);
puts("");
backing_src_help("-s");
puts("");
+   pmem_backing_src_help("-p");
+   puts("");
puts(" -n: specify the number of vcpus (default: 1)");
puts("");
puts(" -m: use multiple memslots (default: 1)");
@@ -337,6 +351,7 @@ static void usage(const char *cmd)
 int main(int argc, char *argv[])
 {
enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+   enum vm_pmem_backing_src_type pmem_src_type = DEFAULT_VM_PMEM_SRC;
bool use_multiple_memslots = false;
uint32_t nr_vcpus = 1;
uint32_t nr_memslots;
@@ -345,11 +360,14 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_HYPERCALL));
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & 
BIT(KVM_X86_PROTECTED_VM));
 
-   while ((opt = getopt(argc, argv, "hms:n:")) != -1) {
+   while ((opt = getopt(argc, argv, "hms:p:n:")) != -1) {
switch (opt) {
case 's':
src_type = parse_backing_src_type(optarg);
break;
+   case 'p':
+   pmem_src_type = parse_pmem_backing_src_type(optarg);
+   break;

[RFC PATCH 02/19] mm: hugetlb: Move and expose hugetlbfs_zero_partial_page

2023-06-06 Thread Ackerley Tng
Zeroing of pages is generalizable to hugetlb and is not specific to
hugetlbfs.

Rename hugetlbfs_zero_partial_page => hugetlb_zero_partial_page, move
it to mm/hugetlb.c and expose it in linux/hugetlb.h.

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c| 27 ++-
 include/linux/hugetlb.h |  6 ++
 mm/hugetlb.c| 22 ++
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 406d7366cf3e..3dab50d3ed88 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,29 +688,6 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t 
offset)
remove_inode_hugepages(inode, offset, LLONG_MAX);
 }
 
-static void hugetlbfs_zero_partial_page(struct hstate *h,
-   struct address_space *mapping,
-   loff_t start,
-   loff_t end)
-{
-   pgoff_t idx = start >> huge_page_shift(h);
-   struct folio *folio;
-
-   folio = filemap_lock_folio(mapping, idx);
-   if (!folio)
-   return;
-
-   start = start & ~huge_page_mask(h);
-   end = end & ~huge_page_mask(h);
-   if (!end)
-   end = huge_page_size(h);
-
-   folio_zero_segment(folio, (size_t)start, (size_t)end);
-
-   folio_unlock(folio);
-   folio_put(folio);
-}
-
 static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t 
len)
 {
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
@@ -737,7 +714,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, 
loff_t offset, loff_t len)
 
/* If range starts before first full page, zero partial page. */
if (offset < hole_start)
-   hugetlbfs_zero_partial_page(h, mapping,
+   hugetlb_zero_partial_page(h, mapping,
offset, min(offset + len, hole_start));
 
/* Unmap users of full pages in the hole. */
@@ -750,7 +727,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, 
loff_t offset, loff_t len)
 
/* If range extends beyond last full page, zero partial page. */
if ((offset + len) > hole_end && (offset + len) > hole_start)
-   hugetlbfs_zero_partial_page(h, mapping,
+   hugetlb_zero_partial_page(h, mapping,
hole_end, offset + len);
 
i_mmap_unlock_write(mapping);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 37c2edf7beea..023293ceec25 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -256,6 +256,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
+void hugetlb_zero_partial_page(struct hstate *h, struct address_space *mapping,
+  loff_t start, loff_t end);
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
@@ -464,6 +467,9 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_zero_partial_page(
+   struct hstate *h, struct address_space *mapping, loff_t start, loff_t 
end) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..9c9262833b4f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7407,6 +7407,28 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
 }
 
+void hugetlb_zero_partial_page(struct hstate *h,
+  struct address_space *mapping,
+  loff_t start, loff_t end)
+{
+   pgoff_t idx = start >> huge_page_shift(h);
+   struct folio *folio;
+
+   folio = filemap_lock_folio(mapping, idx);
+   if (!folio)
+   return;
+
+   start = start & ~huge_page_mask(h);
+   end = end & ~huge_page_mask(h);
+   if (!end)
+   end = huge_page_size(h);
+
+   folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+   folio_unlock(folio);
+   folio_put(folio);
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;
 
-- 
2.41.0.rc0.172.g3f132b7071-goog




[RFC PATCH 03/19] mm: hugetlb: Expose remove_inode_hugepages

2023-06-06 Thread Ackerley Tng
Expose remove_inode_hugepages() so that it can be used outside hugetlbfs.

TODO: may want to move this function to mm/hugetlb.c

Signed-off-by: Ackerley Tng 
---
 fs/hugetlbfs/inode.c| 3 +--
 include/linux/hugetlb.h | 4 
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3dab50d3ed88..4f25df31ae80 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -611,8 +611,7 @@ static bool remove_inode_single_folio(struct hstate *h, 
struct inode *inode,
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
-static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
-  loff_t lend)
+void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
 {
struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 023293ceec25..1483020b412b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -259,6 +259,8 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void hugetlb_zero_partial_page(struct hstate *h, struct address_space *mapping,
   loff_t start, loff_t end);
 
+void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend);
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
@@ -470,6 +472,8 @@ static inline void hugetlb_unshare_all_pmds(struct 
vm_area_struct *vma) { }
 static inline void hugetlb_zero_partial_page(
struct hstate *h, struct address_space *mapping, loff_t start, loff_t 
end) {}
 
+static inline void remove_inode_hugepages(struct inode *inode, loff_t lstart, 
loff_t lend) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
-- 
2.41.0.rc0.172.g3f132b7071-goog




Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

2023-04-18 Thread Ackerley Tng

Sean Christopherson  writes:


On Tue, Mar 28, 2023, Chao Peng wrote:

On Fri, Mar 24, 2023 at 10:29:25AM +0800, Xiaoyao Li wrote:
> On 3/24/2023 10:10 AM, Chao Peng wrote:
> > On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
> > > On Wed, Mar 08, 2023 at 03:40:26PM +0800,
> > > Chao Peng  wrote:
> > >
> > > > On Wed, Mar 08, 2023 at 12:13:24AM +, Ackerley Tng wrote:
> > > > > Chao Peng  writes:
> > > > >
> > > > > > On Sat, Jan 14, 2023 at 12:01:01AM +, Sean  
Christopherson wrote:

> > > > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64  
gpa)

> > > > > > +{
> > > > > > + if (!offset)
> > > > > > + return true;
> > > > > > + if (!gpa)
> > > > > > + return false;
> > > > > > +
> > > > > > +	return !!(count_trailing_zeros(offset) >=  
count_trailing_zeros(gpa));

> > >
> > > This check doesn't work expected. For example, offset = 2GB,  
gpa=4GB

> > > this check fails.
> >
> > This case is expected to fail as Sean initially suggested[*]:
> >I would rather reject memslot if the gfn has lesser alignment than
> >the offset. I'm totally ok with this approach _if_ there's a use  
case.
> >Until such a use case presents itself, I would rather be  
conservative

> >from a uAPI perspective.
> >
> > I understand that we put tighter restriction on this but if you see  
such

> > restriction is really a big issue for real usage, instead of a
> > theoretical problem, then we can loosen the check here. But at that  
time

> > below code is kind of x86 specific and may need improve.
> >
> > BTW, in latest code, I replaced count_trailing_zeros() with fls64():
> >return !!(fls64(offset) >= fls64(gpa));
>
> wouldn't it be !!(ffs64(offset) <= ffs64(gpa)) ?



As the function document explains, here we want to return true when
ALIGNMENT(offset) >= ALIGNMENT(gpa), so '>=' is what we need.



It's worth clarifying that in Sean's original suggestion he actually
mentioned the opposite. He said 'reject memslot if the gfn has lesser
alignment than the offset', but I wonder if that was his intention, since
if ALIGNMENT(offset) < ALIGNMENT(gpa), it wouldn't be possible to map
the page as a largepage. Consider the below config:



   gpa=2M, offset=1M



In this case KVM tries to map gpa at 2M as 2M hugepage but the physical
page at the offset(1M) in private_fd cannot provide the 2M page due to
misalignment.



But as we discussed in the off-list thread, here we do find a real use
case indicating this check is too strict. i.e. QEMU immediately fails
when launch a guest > 2G memory. For this case QEMU splits guest memory
space into two slots:



   Slot#1(ram_below_4G): gpa=0x0, offset=0x0, size=2G
   Slot#2(ram_above_4G): gpa=4G,  offset=2G,  size=totalsize-2G



This strict alignment check fails for slot#2 because offset(2G) has less
alignment than gpa(4G). To allow this, one solution can revert to my
previous change in kvm_alloc_memslot_metadata() to disallow hugepage
only when the offset/gpa are not aligned to related page size.



Sean, How do you think?


I agree, a pure alignment check is too restrictive, and not really what I  
intended
despite past me literally saying that's what I wanted :-)  I think I may  
have also
inverted the "less alignment" statement, but luckily I believe that ends  
up being

a moot point.


The goal is to avoid having to juggle scenarios where KVM wants to create  
a hugepage,
but restrictedmem can't provide one because of a misaligned file offset.   
I think
the rule we want is that the offset must be aligned to the largest page  
size allowed
by the memslot _size_.  E.g. on x86, if the memslot size is >=1GiB then  
the offset
must be 1GiB or better, ditto for >=2MiB and >=4KiB (ignoring that 4KiB is  
already a

requirement).


We could loosen that to say the largest size allowed by the memslot, but  
I don't
think that's worth the effort unless it's trivially easy to implement in  
code,
e.g. KVM could technically allow a 4KiB aligned offset if the memslot is  
2MiB
sized but only 4KiB aligned on the GPA.  I doubt there's a real use case  
for such

a memslot, so I want to disallow that unless it's super easy to implement.
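
To make that rule concrete, a check along the lines below would accept the
slot#2 layout above (offset=2G is 1GiB-aligned) while still rejecting offsets
less aligned than the largest page size the slot's size permits. This is only
an illustrative sketch; the function name and the x86-specific size constants
are placeholders, not the actual KVM patch:

	static bool offset_ok_for_memslot(u64 offset, u64 slot_size)
	{
		if (slot_size >= SZ_1G)
			return IS_ALIGNED(offset, SZ_1G);
		if (slot_size >= SZ_2M)
			return IS_ALIGNED(offset, SZ_2M);
		return IS_ALIGNED(offset, SZ_4K);
	}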


Checking my understanding here about why we need this alignment check:

When KVM requests a page from restrictedmem, KVM will provide an offset
into the file in terms of 4K pages.

When shmem is configured to use hugepages, shmem_get_folio() will round
the requested offset down to the nearest hugepage-aligned boundary in
shmem_alloc_hug

Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2023-04-14 Thread Ackerley Tng

Sean Christopherson  writes:


On Thu, Apr 13, 2023, Christian Brauner wrote:

On Thu, Aug 18, 2022 at 04:24:21PM +0300, Kirill A . Shutemov wrote:
> On Wed, Aug 17, 2022 at 10:40:12PM -0700, Hugh Dickins wrote:
> > Here's what I would prefer, and imagine much easier for you to  
maintain;

> > but I'm no system designer, and may be misunderstanding throughout.
> >
> > QEMU gets fd from opening /dev/kvm_something, uses ioctls (or perhaps
> > the fallocate syscall interface itself) to allocate and free the  
memory,
> > ioctl for initializing some of it too.  KVM in control of whether  
that
> > fd can be read or written or mmap'ed or whatever, no need to prevent  
it
> > in shmem.c, no need for flags, seals, notifications to and fro  
because
> > KVM is already in control and knows the history.  If shmem actually  
has
> > value, call into it underneath - somewhat like SysV SHM, and  
/dev/zero
> > mmap, and i915/gem make use of it underneath.  If shmem has nothing  
to

> > add, just allocate and free kernel memory directly, recorded in your
> > own xarray.
>
> I guess shim layer on top of shmem *can* work. I don't see immediately  
why
> it would not. But I'm not sure it is right direction. We risk creating  
yet

> another parallel VM with own rules/locking/accounting that opaque to
> core-mm.



Sorry for necrobumping this thread but I've been reviewing the


No worries, I'm just stoked someone who actually knows what they're doing  
is

chiming in :-)



+1, thanks Christian!


memfd_restricted() extension that Ackerley is currently working on. I
was pointed to this thread as this is what the extension is building
on but I'll reply to both threads here.



 From a glance at v10, memfd_restricted() is currently implemented as an
in-kernel stacking filesystem. A call to memfd_restricted() creates a
new restricted memfd file and a new unlinked tmpfs file and stashes the
tmpfs file into the memfd file's private data member. It then uses the
tmpfs file's f_ops and i_ops to perform the relevant file and inode
operations. So it has the same callstack as a general stacking
filesystem like overlayfs in some cases:



 memfd_restricted->getattr()
 -> tmpfs->getattr()



...



Since you're effectively acting like a stacking filesystem you should
really use the device number of your memfd restricted filesystem. IOW,
sm like:



 stat->dev = memfd_restricted_dentry->d_sb->s_dev;



But then you run into trouble if you want to go forward with Ackerley's
extension that allows to explicitly pass in tmpfs fds to
memfd_restricted(). Afaict, two tmpfs instances might allocate the same
inode number. So now the inode and device number pair isn't unique
anymore.



So you might best be served by allocating and reporting your own inode
numbers as well.



But if you want to preserve the inode number and device number of the
relevant tmpfs instance but still report memfd restricted as your
filesystem type


Unless I missed something along the way, reporting memfd_restricted as a  
distinct
filesystem is very much a non-goal.  AFAIK it's purely a side effect of  
the

proposed implementation.


then I think it's reasonable to ask whether a stacking implementation  
really

makes sense here.



If you extend memfd_restricted() or even consider extending it in the
future to take tmpfs file descriptors as arguments to identify the tmpfs
instance in which to allocate the underlying tmpfs file for the new
restricted memfd file you should really consider a tmpfs based
implementation.



Because at that point it just feels like a pointless wrapper to get
custom f_ops and i_ops. Plus it's wasteful because you allocate dentries
and inodes that you don't really care about at all.



Just off the top of my hat you might be better served:
* by a new ioctl() on tmpfs instances that
   yield regular tmpfs file descriptors with restricted f_ops and i_ops.
   That's not that different from btrfs subvolumes which effectively are
   directories but are created through an ioctl().


I think this is more or less what we want to do, except via a dedicated  
syscall
instead of an ioctl() so that the primary interface isn't strictly tied  
to tmpfs,

e.g. so that it can be extended to other backing types in the future.



* by a mount option to tmpfs that makes it act
   in this restricted manner then you don't need an ioctl() and can get
   away with regular open calls. Such a tmpfs instance would only create
   regular, restricted memfds.


I'd prefer to not go this route, because IIUC, it would require  
relatively invasive
changes to shmem code, and IIUC would require similar changes to other  
support
backings in the future, e.g. hugetlbfs?  And as above, I don't think any  
of the

potential use cases need restrictedmem to be a uniquely identifiable
mount.


FWIW, I'm starting to look at extending restrictedmem to hugetlbfs and
the separation that the current implementation has is very helpful. Also
helps that 

[RFC PATCH 2/6] mm: mempolicy: Refactor out mpol_init_from_nodemask

2023-04-13 Thread Ackerley Tng
Refactor out mpol_init_from_nodemask() to simplify logic in do_mbind().

mpol_init_from_nodemask() will be used to perform similar
functionality in do_memfd_restricted_bind() in a later patch.

Signed-off-by: Ackerley Tng 
---
 mm/mempolicy.c | 32 +---
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a256a241fd1d..a2655b626731 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1254,6 +1254,25 @@ static struct page *new_page(struct page *page, unsigned 
long start)
 }
 #endif
 
+static long mpol_init_from_nodemask(struct mempolicy *mpol, const nodemask_t 
*nmask,
+   bool always_unlock)
+{
+   long err;
+   NODEMASK_SCRATCH(scratch);
+
+   if (!scratch)
+   return -ENOMEM;
+
+   /* Cannot take lock before allocating in NODEMASK_SCRATCH */
+   mmap_write_lock(current->mm);
+   err = mpol_set_nodemask(mpol, nmask, scratch);
+   if (always_unlock || err)
+   mmap_write_unlock(current->mm);
+
+   NODEMASK_SCRATCH_FREE(scratch);
+   return err;
+}
+
 static long do_mbind(unsigned long start, unsigned long len,
 unsigned short mode, unsigned short mode_flags,
 nodemask_t *nmask, unsigned long flags)
@@ -1306,17 +1325,8 @@ static long do_mbind(unsigned long start, unsigned long 
len,
 
lru_cache_disable();
}
-   {
-   NODEMASK_SCRATCH(scratch);
-   if (scratch) {
-   mmap_write_lock(mm);
-   err = mpol_set_nodemask(new, nmask, scratch);
-   if (err)
-   mmap_write_unlock(mm);
-   } else
-   err = -ENOMEM;
-   NODEMASK_SCRATCH_FREE(scratch);
-   }
+
+   err = mpol_init_from_nodemask(new, nmask, false);
if (err)
goto mpol_out;
 
-- 
2.40.0.634.g4ca3ef3211-goog




[RFC PATCH 4/6] mm: mempolicy: Add and expose mpol_create

2023-04-13 Thread Ackerley Tng
mpol_create builds a mempolicy based on mode, nmask and maxnode.

mpol_create is exposed for use in memfd_restricted_bind() in a later
patch.

Signed-off-by: Ackerley Tng 
---
 include/linux/mempolicy.h |  2 ++
 mm/mempolicy.c| 39 +++
 2 files changed, 41 insertions(+)
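
For context, the intended consumer in a later patch of this series uses it
roughly as follows (a sketch based on patches 3 and 5; error handling
abbreviated):

	struct mempolicy *mpol;
	int ret;

	mpol = mpol_create(mode, nmask, maxnode);
	if (IS_ERR(mpol))
		return PTR_ERR(mpol);

	ret = __mpol_set_shared_policy(info, mpol, pgoff_start, npages);
	mpol_put(mpol);
	return ret;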

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 9a2a2dd95432..15facd9de087 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -125,6 +125,8 @@ struct shared_policy {
 };
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
+struct mempolicy *mpol_create(
+   unsigned long mode, const unsigned long __user *nmask, unsigned long 
maxnode)
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
 int __mpol_set_shared_policy(struct shared_policy *info, struct mempolicy 
*mpol,
 unsigned long pgoff_start, unsigned long npages);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f3fa5494e4a8..f4fe241c17ff 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3181,3 +3181,42 @@ void mpol_to_str(char *buffer, int maxlen, struct 
mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
   nodemask_pr_args(&nodes));
 }
+
+/**
+ * mpol_create - build mempolicy based on mode, nmask and maxnode
+ * @mode:  policy mode, as in MPOL_MODE_FLAGS
+ * @nmask:  node mask from userspace
+ * @maxnode:  number of valid bits in nmask
+ *
+ * Will allocate a new struct mempolicy that has to be released with
+ * mpol_put. Will also take and release the write lock mmap_lock in 
current->mm.
+ */
+struct mempolicy *mpol_create(
+   unsigned long mode, const unsigned long __user *nmask, unsigned long 
maxnode)
+{
+   int err;
+   unsigned short mode_flags;
+   nodemask_t nodes;
+   int lmode = mode;
+   struct mempolicy *mpol;
+
+   err = sanitize_mpol_flags(&lmode, &mode_flags);
+   if (err)
+   return ERR_PTR(err);
+
+   err = get_nodes(&nodes, nmask, maxnode);
+   if (err)
+   return ERR_PTR(err);
+
+   mpol = mpol_new(lmode, mode_flags, &nodes);
+   if (IS_ERR(mpol))
+   return mpol;
+
+   err = mpol_init_from_nodemask(mpol, &nodes, true);
+   if (err) {
+   mpol_put(mpol);
+   return ERR_PTR(err);
+   }
+
+   return mpol;
+}
-- 
2.40.0.634.g4ca3ef3211-goog




[RFC PATCH 3/6] mm: mempolicy: Refactor out __mpol_set_shared_policy()

2023-04-13 Thread Ackerley Tng
Refactor out __mpol_set_shared_policy() to remove dependency on struct
vm_area_struct, since only 2 parameters from struct vm_area_struct are
used.

__mpol_set_shared_policy() will be used in a later patch by
restrictedmem_set_shared_policy().

Signed-off-by: Ackerley Tng 
---
 include/linux/mempolicy.h |  2 ++
 mm/mempolicy.c| 29 +++--
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..9a2a2dd95432 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -126,6 +126,8 @@ struct shared_policy {
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
+int __mpol_set_shared_policy(struct shared_policy *info, struct mempolicy 
*mpol,
+unsigned long pgoff_start, unsigned long npages);
 int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a2655b626731..f3fa5494e4a8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2817,30 +2817,39 @@ void mpol_shared_policy_init(struct shared_policy *sp, 
struct mempolicy *mpol)
}
 }
 
-int mpol_set_shared_policy(struct shared_policy *info,
-   struct vm_area_struct *vma, struct mempolicy *npol)
+int __mpol_set_shared_policy(struct shared_policy *info, struct mempolicy 
*mpol,
+unsigned long pgoff_start, unsigned long npages)
 {
int err;
struct sp_node *new = NULL;
-   unsigned long sz = vma_pages(vma);
+   unsigned long pgoff_end = pgoff_start + npages;
 
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
-vma->vm_pgoff,
-sz, npol ? npol->mode : -1,
-npol ? npol->flags : -1,
-npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
+pgoff_start, npages,
+mpol ? mpol->mode : -1,
+mpol ? mpol->flags : -1,
+mpol ? nodes_addr(mpol->nodes)[0] : NUMA_NO_NODE);
 
-   if (npol) {
-   new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+   if (mpol) {
+   new = sp_alloc(pgoff_start, pgoff_end, mpol);
if (!new)
return -ENOMEM;
}
-   err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+
+   err = shared_policy_replace(info, pgoff_start, pgoff_end, new);
+
if (err && new)
sp_free(new);
+
return err;
 }
 
+int mpol_set_shared_policy(struct shared_policy *info,
+   struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+   return __mpol_set_shared_policy(info, mpol, vma->vm_pgoff, 
vma_pages(vma));
+}
+
 /* Free a backing policy store on inode delete. */
 void mpol_free_shared_policy(struct shared_policy *p)
 {
-- 
2.40.0.634.g4ca3ef3211-goog




[RFC PATCH 6/6] selftests: mm: Add selftest for memfd_restricted_bind()

2023-04-13 Thread Ackerley Tng
This selftest uses memfd_restricted_bind() to set the mempolicy for a
restrictedmem file, and then checks that pages were indeed allocated
according to that policy.

Because restrictedmem pages are never mapped into userspace memory,
the usual ways of checking which NUMA node the page was allocated
on (e.g. /proc/pid/numa_maps) cannot be used.

This selftest adds a small kernel module that overloads the ioctl
syscall on /proc/restrictedmem to request a restrictedmem page and get
the node it was allocated on. The page is freed within the ioctl handler.

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/mm/.gitignore |   1 +
 tools/testing/selftests/mm/Makefile   |   8 +
 .../selftests/mm/memfd_restricted_bind.c  | 139 ++
 .../mm/restrictedmem_testmod/Makefile |  21 +++
 .../restrictedmem_testmod.c   |  89 +++
 tools/testing/selftests/mm/run_vmtests.sh |   6 +
 6 files changed, 264 insertions(+)
 create mode 100644 tools/testing/selftests/mm/memfd_restricted_bind.c
 create mode 100644 tools/testing/selftests/mm/restrictedmem_testmod/Makefile
 create mode 100644 
tools/testing/selftests/mm/restrictedmem_testmod/restrictedmem_testmod.c

diff --git a/tools/testing/selftests/mm/.gitignore 
b/tools/testing/selftests/mm/.gitignore
index fb6e4233374d..10c5701b9645 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -31,6 +31,7 @@ map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 memfd_restricted
+memfd_restricted_bind
 memfd_secret
 soft-dirty
 split_huge_page_test
diff --git a/tools/testing/selftests/mm/Makefile 
b/tools/testing/selftests/mm/Makefile
index 5ec338ea1fed..4a6cf922db45 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -46,6 +46,8 @@ TEST_GEN_FILES += map_fixed_noreplace
 TEST_GEN_FILES += map_hugetlb
 TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += memfd_restricted
+TEST_GEN_FILES += memfd_restricted_bind
+TEST_GEN_FILES += restrictedmem_testmod.ko
 TEST_GEN_FILES += memfd_secret
 TEST_GEN_FILES += migration
 TEST_GEN_FILES += mlock-random-test
@@ -171,6 +173,12 @@ $(OUTPUT)/ksm_tests: LDLIBS += -lnuma
 
 $(OUTPUT)/migration: LDLIBS += -lnuma
 
+$(OUTPUT)/memfd_restricted_bind: LDLIBS += -lnuma
+$(OUTPUT)/restrictedmem_testmod.ko: $(wildcard restrictedmem_testmod/Makefile 
restrictedmem_testmod/*.[ch])
+   $(call msg,MOD,,$@)
+   $(Q)$(MAKE) -C restrictedmem_testmod
+   $(Q)cp restrictedmem_testmod/restrictedmem_testmod.ko $@
+
 local_config.mk local_config.h: check_config.sh
/bin/sh ./check_config.sh $(CC)
 
diff --git a/tools/testing/selftests/mm/memfd_restricted_bind.c 
b/tools/testing/selftests/mm/memfd_restricted_bind.c
new file mode 100644
index 000000000000..64aa44c72d09
--- /dev/null
+++ b/tools/testing/selftests/mm/memfd_restricted_bind.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../kselftest_harness.h"
+
+int memfd_restricted(int flags, int fd)
+{
+   return syscall(__NR_memfd_restricted, flags, fd);
+}
+
+int memfd_restricted_bind(
+   int fd, loff_t offset, unsigned long len, unsigned long mode,
+   const unsigned long *nmask, unsigned long maxnode, unsigned int flags)
+{
+   struct file_range range = {
+   .offset = offset,
+   .len = len,
+   };
+
+   return syscall(__NR_memfd_restricted_bind, fd, &range, mode, nmask, 
maxnode, flags);
+}
+
+int memfd_restricted_bind_node(
+   int fd, loff_t offset, unsigned long len,
+   unsigned long mode, int node, unsigned int flags)
+{
+   int ret;
+   struct bitmask *mask = numa_allocate_nodemask();
+
+   numa_bitmask_setbit(mask, node);
+
+   ret = memfd_restricted_bind(fd, offset, len, mode, mask->maskp, 
mask->size, flags);
+
+   numa_free_nodemask(mask);
+
+   return ret;
+}
+
+/**
+ * Allocates a page in restrictedmem_fd, reads the node that the page was
+ * allocated it and returns it. Returns -1 on error.
+ */
+int read_node(int restrictedmem_fd, unsigned long offset)
+{
+   int ret;
+   int fd;
+
+   fd = open("/proc/restrictedmem", O_RDWR);
+   if (fd < 0)
+   return -ENOTSUP;
+
+   ret = ioctl(fd, restrictedmem_fd, offset);
+
+   close(fd);
+
+   return ret;
+}
+
+bool restrictedmem_testmod_loaded(void)
+{
+   struct stat buf;
+
+   return stat("/proc/restrictedmem", &buf) == 0;
+}
+
+FIXTURE(restrictedmem_file)
+{
+   int fd;
+   size_t page_size;
+};
+
+FIXTURE_SETUP(restrictedmem_file)
+{
+   int fd;
+   int ret;
+   struct stat stat;
+
+   fd = memfd_restricted(0, -1);
+   ASSERT_GT(fd, 0);
+
+#define RESTRICTEDMEM_TEST_NPAGES 16
+   ret = ftruncate(fd, getpagesize() * RESTRICTEDMEM_TEST_NPAGES);
+   ASSERT_EQ(ret, 0);
+
+   ret = f

[RFC PATCH 0/6] Setting memory policy for restrictedmem file

2023-04-13 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that was
discussed in the 'KVM: mm: fd-based approach for supporting KVM' patch
series [1].

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-set-memory-policy

In this patchset, a new syscall is introduced, which allows userspace
to set the memory policy (e.g. NUMA bindings) for a restrictedmem
file, to the granularity of offsets within the file.

The offset/length tuple is termed a file_range which is passed to the
kernel via a pointer to get around the limit of 6 arguments for a
syscall.

The following other approaches were also considered:

1. Pre-configuring a mount with a memory policy and providing that
   mount to memfd_restricted() as proposed at [2].
+ Pro: It allows choice of a specific backing mount with custom
  memory policy configurations
+ Con: Will need to create an entirely new mount just to set memory
  policy for a restrictedmem file; files on the same mount cannot
  have different memory policies.

2. Passing memory policy to the memfd_restricted() syscall at creation time.
+ Pro: Only need to make a single syscall to create a file with a
  given memory policy
+ Con: At creation time, the kernel doesn’t know the size of the
  restrictedmem file. Given that memory policy is stored in the
  inode based on ranges (start, end), it is awkward for the kernel
  to store the memory policy and then add hooks to set the memory
  policy when allocation is done.

3. A more generic fbind(): it seems like this new functionality is
   really only needed for restrictedmem files, hence a separate,
   specific syscall was proposed to avoid complexities with handling
   conflicting policies that may be specified via other syscalls like
   mbind()

TODOs

+ Return -EINVAL if file_range is not within the size of the file and
  tests for this

Dependencies:

+ Chao’s work on UPM [3]

[1] 
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/
[2] https://lore.kernel.org/lkml/cover.1681176340.git.ackerley...@google.com/T/
[3] https://github.com/chao-p/linux/commits/privmem-v11.5

---

Ackerley Tng (6):
  mm: shmem: Refactor out shmem_shared_policy() function
  mm: mempolicy: Refactor out mpol_init_from_nodemask
  mm: mempolicy: Refactor out __mpol_set_shared_policy()
  mm: mempolicy: Add and expose mpol_create
  mm: restrictedmem: Add memfd_restricted_bind() syscall
  selftests: mm: Add selftest for memfd_restricted_bind()

 arch/x86/entry/syscalls/syscall_32.tbl|   1 +
 arch/x86/entry/syscalls/syscall_64.tbl|   1 +
 include/linux/mempolicy.h |   4 +
 include/linux/shmem_fs.h  |   7 +
 include/linux/syscalls.h  |   5 +
 include/uapi/asm-generic/unistd.h |   5 +-
 include/uapi/linux/mempolicy.h|   7 +-
 kernel/sys_ni.c   |   1 +
 mm/mempolicy.c| 100 ++---
 mm/restrictedmem.c|  75 ++
 mm/shmem.c|  10 +-
 scripts/checksyscalls.sh  |   1 +
 tools/testing/selftests/mm/.gitignore |   1 +
 tools/testing/selftests/mm/Makefile   |   8 +
 .../selftests/mm/memfd_restricted_bind.c  | 139 ++
 .../mm/restrictedmem_testmod/Makefile |  21 +++
 .../restrictedmem_testmod.c   |  89 +++
 tools/testing/selftests/mm/run_vmtests.sh |   6 +
 18 files changed, 454 insertions(+), 27 deletions(-)
 create mode 100644 tools/testing/selftests/mm/memfd_restricted_bind.c
 create mode 100644 tools/testing/selftests/mm/restrictedmem_testmod/Makefile
 create mode 100644 
tools/testing/selftests/mm/restrictedmem_testmod/restrictedmem_testmod.c

--
2.40.0.634.g4ca3ef3211-goog



[RFC PATCH 5/6] mm: restrictedmem: Add memfd_restricted_bind() syscall

2023-04-13 Thread Ackerley Tng
memfd_restricted_bind() sets the NUMA memory policy, which consists of
a policy mode and zero or more nodes, for an offset within a
restrictedmem file with file descriptor fd and continuing for len
bytes.

This is intended to be like mbind() but specially for restrictedmem
files, which cannot be mmap()ed into userspace and hence has no memory
addresses that can be used with mbind().

Unlike mbind(), memfd_restricted_bind() will override any existing
memory policy if a new memory policy is defined for the same ranges.

For now, memfd_restricted_bind() does not perform migrations and no
flags are supported.

This syscall is specialised just for restrictedmem files because this
functionality is not required by other files.

Signed-off-by: Ackerley Tng 
---
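For illustration, a userspace call might look as follows (a sketch only:
struct file_range follows the selftest in patch 6, and the mode/nmask/maxnode
arguments carry the same meaning as for mbind(2)):

	struct file_range range = {
		.offset = 0,		/* start of the restrictedmem file */
		.len = 2 << 20,		/* apply policy to the first 2MiB */
	};
	unsigned long nodemask = 1UL << 0;	/* NUMA node 0 */

	ret = syscall(__NR_memfd_restricted_bind, fd, &range, MPOL_BIND,
		      &nodemask, sizeof(nodemask) * 8, 0);
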
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 include/linux/mempolicy.h  |  2 +-
 include/linux/syscalls.h   |  5 ++
 include/uapi/asm-generic/unistd.h  |  5 +-
 include/uapi/linux/mempolicy.h |  7 ++-
 kernel/sys_ni.c|  1 +
 mm/restrictedmem.c | 75 ++
 scripts/checksyscalls.sh   |  1 +
 9 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index dc70ba90247e..c94e9ce46cc3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -456,3 +456,4 @@
 449	i386	futex_waitv		sys_futex_waitv
 450	i386	set_mempolicy_home_node	sys_set_mempolicy_home_node
 451	i386	memfd_restricted	sys_memfd_restricted
+452	i386	memfd_restricted_bind	sys_memfd_restricted_bind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 06516abc8318..6bd86b45d63a 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -373,6 +373,7 @@
 449	common	futex_waitv		sys_futex_waitv
 450	common	set_mempolicy_home_node	sys_set_mempolicy_home_node
 451	common	memfd_restricted	sys_memfd_restricted
+452	common	memfd_restricted_bind	sys_memfd_restricted_bind
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 15facd9de087..af62233df0c0 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -126,7 +126,7 @@ struct shared_policy {
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
 struct mempolicy *mpol_create(
-   unsigned long mode, const unsigned long __user *nmask, unsigned long 
maxnode)
+   unsigned long mode, const unsigned long __user *nmask, unsigned long 
maxnode);
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
 int __mpol_set_shared_policy(struct shared_policy *info, struct mempolicy 
*mpol,
 unsigned long pgoff_start, unsigned long npages);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 660be0bf89d5..852b202d3837 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1059,6 +1059,11 @@ asmlinkage long sys_set_mempolicy_home_node(unsigned 
long start, unsigned long l
unsigned long home_node,
unsigned long flags);
 asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted_bind(int fd, struct file_range __user 
*range,
+ unsigned long mode,
+ const unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index e2ea7cd964f8..b5a1385bb4a7 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -889,10 +889,13 @@ __SYSCALL(__NR_set_mempolicy_home_node, 
sys_set_mempolicy_home_node)
 #ifdef __ARCH_WANT_MEMFD_RESTRICTED
 #define __NR_memfd_restricted 451
 __SYSCALL(__NR_memfd_restricted, sys_memfd_restricted)
+
+#define __NR_memfd_restricted_bind 452
+__SYSCALL(__NR_memfd_restricted_bind, sys_memfd_restricted_bind)
 #endif
 
 #undef __NR_syscalls
-#define __NR_syscalls 452
+#define __NR_syscalls 453
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..979499abd253 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -6,9 +6,9 @@
 #ifndef _UAPI_LINUX_MEMPOLICY_H
 #define _UAPI_LINUX_MEMPOLICY_H
 
+#include 
 #include 
 
-
 /*
  * Both the MPOL_* mempolicy mode and the MPOL_F_* optional

[RFC PATCH 1/6] mm: shmem: Refactor out shmem_shared_policy() function

2023-04-13 Thread Ackerley Tng
Refactor out shmem_shared_policy() to allow reading of a file's shared
mempolicy

Signed-off-by: Ackerley Tng 
---
 include/linux/shmem_fs.h |  7 +++
 mm/shmem.c   | 10 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d9e57485a686..bc1eeb4b4bd9 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -134,6 +134,13 @@ static inline bool shmem_file(struct file *file)
return shmem_mapping(file->f_mapping);
 }
 
+static inline struct shared_policy *shmem_shared_policy(struct file *file)
+{
+   struct inode *inode = file_inode(file);
+
+   return &SHMEM_I(inode)->policy;
+}
+
 /*
  * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages
  * beyond i_size's notion of EOF, which fallocate has committed to reserving:
diff --git a/mm/shmem.c b/mm/shmem.c
index b053cd1f12da..4f801f398454 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2248,20 +2248,22 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 }
 
 #ifdef CONFIG_NUMA
+
 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
-   struct inode *inode = file_inode(vma->vm_file);
-   return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
+   struct shared_policy *info;
+
+   info = shmem_shared_policy(vma->vm_file);
+   return mpol_set_shared_policy(info, vma, mpol);
 }
 
 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
  unsigned long addr)
 {
-   struct inode *inode = file_inode(vma->vm_file);
pgoff_t index;
 
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-   return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
+   return mpol_shared_policy_lookup(shmem_shared_policy(vma->vm_file), 
index);
 }
 #endif
 
-- 
2.40.0.634.g4ca3ef3211-goog




Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-04-13 Thread Ackerley Tng

Christian Brauner  writes:


On Wed, Apr 05, 2023 at 09:58:44PM +, Ackerley Tng wrote:



...



> > Why do you even need this flag? It seems that @mount_fd being < 0 is
> > sufficient to indicate that a new restricted memory fd is supposed  
to be

> > created in the system instance.




I'm hoping to have this patch series merged after Chao's patch series
introduces the memfd_restricted() syscall [1].



I'm curious, is there an LSFMM session for this?


As far as I know, there is no LSFMM session for this.



Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory

2023-04-13 Thread Ackerley Tng

Chao Peng  writes:


From: "Kirill A. Shutemov" 



Introduce 'memfd_restricted' system call with the ability to create
memory areas that are restricted from userspace access through ordinary
MMU operations (e.g. read/write/mmap). The memory content is expected to
be used through the new in-kernel interface by a third kernel module.



...



diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
new file mode 100644
index 000000000000..56953c204e5c
--- /dev/null
+++ b/mm/restrictedmem.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "linux/sbitmap.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct restrictedmem_data {
+   struct mutex lock;
+   struct file *memfd;


Can this be renamed to file, or lower_file (as in stacking filesystems)?

It's a little confusing because this pointer doesn't actually refer to
an fd.

'memfd' is already used by udmabuf to refer to an actual fd [1], which
makes this a little misleading.

[1]  
https://elixir.bootlin.com/linux/v6.2.10/source/tools/testing/selftests/drivers/dma-buf/udmabuf.c#L63



+   struct list_head notifiers;
+};
+
...





Re: [RFC PATCH v3 2/2] selftests: restrictedmem: Check hugepage-ness of shmem file backing restrictedmem fd

2023-04-10 Thread Ackerley Tng

David Hildenbrand  writes:


On 01.04.23 01:50, Ackerley Tng wrote:

For memfd_restricted() calls without a userspace mount, the backing
file should be the shmem mount in the kernel, and the size of backing
pages should be as defined by system-wide shmem configuration.



If a userspace mount is provided, the size of backing pages should be
as defined in the mount.



Also includes negative tests for invalid inputs, including fds
representing read-only superblocks/mounts.




When you talk about "hugepage" in this patch, do you mean THP or
hugetlb? I suspect thp, so it might be better to spell that out. IIRC,
there are plans to support actual huge pages in the future, at which
point "hugepage" terminology could be misleading.



Thanks for pointing this out! I've replaced references to hugepage with
thp, please see RFC v4 at
https://lore.kernel.org/lkml/cover.1681176340.git.ackerley...@google.com/T/


Signed-off-by: Ackerley Tng 
---
   tools/testing/selftests/Makefile  |   1 +
   .../selftests/restrictedmem/.gitignore|   3 +
   .../testing/selftests/restrictedmem/Makefile  |  15 +
   .../testing/selftests/restrictedmem/common.c  |   9 +
   .../testing/selftests/restrictedmem/common.h  |   8 +
   .../restrictedmem_hugepage_test.c | 486 ++
   6 files changed, 522 insertions(+)
   create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
   create mode 100644 tools/testing/selftests/restrictedmem/Makefile
   create mode 100644 tools/testing/selftests/restrictedmem/common.c
   create mode 100644 tools/testing/selftests/restrictedmem/common.h
   create mode 100644  
tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c



...





[RFC PATCH v4 0/2] Providing mount in memfd_restricted() syscall

2023-04-10 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that was
discussed in the 'KVM: mm: fd-based approach for supporting KVM' patch
series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-provide-mount-fd-rfc-v4

In this patchset, a modification to the memfd_restricted() syscall is
proposed, which allows userspace to provide a mount on which the
restrictedmem file will be created and then returned by
memfd_restricted().

Allowing userspace to provide a mount allows userspace to control
various memory binding policies via tmpfs mount options, such as
Transparent HugePage memory allocation policy through
'huge=always/never' and NUMA memory allocation policy through
'mpol=local/bind:*'.
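
As an illustration, the intended flow with the new mount API looks roughly
like this (a sketch: MEMFD_RSTD_USERMNT comes from patch 1 of this series;
the fsopen()/fsconfig()/fsmount() calls are the standard mount API):

	fs_fd = syscall(__NR_fsopen, "tmpfs", 0);
	syscall(__NR_fsconfig, fs_fd, FSCONFIG_SET_STRING, "huge", "always", 0);
	syscall(__NR_fsconfig, fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mnt_fd = syscall(__NR_fsmount, fs_fd, 0, 0);

	/* The restrictedmem file is created on, and so inherits
	 * huge=always from, the userspace-provided tmpfs mount. */
	mfd = syscall(__NR_memfd_restricted, MEMFD_RSTD_USERMNT, mnt_fd);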

Changes since RFCv3:
+ Added check to ensure that bind mounts must be bind mounts of the
  whole filesystem
+ Removed inappropriate check on fd’s permissions as Christian
  suggested
+ Renamed RMFD_USERMNT to MEMFD_RSTD_USERMNT as David suggested
+ Added selftest to check that bind mounts must be bind mounts of the
  whole filesystem

Changes since RFCv2:
+ Tightened semantics to accept only fds of the root of a tmpfs mount,
  as Christian suggested
+ Added permissions check on the inode represented by the fd to guard
  against creation of restrictedmem files on read-only tmpfs
  filesystems or mounts
+ Renamed RMFD_TMPFILE to RMFD_USERMNT to better represent providing a
  userspace mount to create a restrictedmem file on
+ Updated selftests for tighter semantics and added selftests to check
  for permissions

Changes since RFCv1:
+ Use fd to represent mount instead of path string, as Kirill
  suggested. I believe using fds makes this syscall interface more
  aligned with the other syscalls like fsopen(), fsconfig(), and
  fsmount() in terms of using and passing around fds
+ Remove unused variable char *orig_shmem_enabled from selftests

Dependencies:
+ Chao’s work on UPM, at
  https://github.com/chao-p/linux/commits/privmem-v11.5

Links to earlier patch series:
+ RFC v3: 
https://lore.kernel.org/lkml/cover.1680306489.git.ackerley...@google.com/T/
+ RFC v2: 
https://lore.kernel.org/lkml/cover.1679428901.git.ackerley...@google.com/T/
+ RFC v1: 
https://lore.kernel.org/lkml/cover.1676507663.git.ackerley...@google.com/T/

Ackerley Tng (2):
  mm: restrictedmem: Allow userspace to specify mount for
memfd_restricted
  selftests: restrictedmem: Check memfd_restricted()'s handling of
provided userspace mount

 include/linux/syscalls.h  |   2 +-
 include/uapi/linux/restrictedmem.h|   8 +
 mm/restrictedmem.c|  73 ++-
 tools/testing/selftests/mm/.gitignore |   1 +
 tools/testing/selftests/mm/Makefile   |   1 +
 .../selftests/mm/memfd_restricted_usermnt.c   | 529 ++
 tools/testing/selftests/mm/run_vmtests.sh |   3 +
 7 files changed, 611 insertions(+), 6 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h
 create mode 100644 tools/testing/selftests/mm/memfd_restricted_usermnt.c

--
2.40.0.577.gac1e443424-goog



[RFC PATCH v4 2/2] selftests: restrictedmem: Check memfd_restricted()'s handling of provided userspace mount

2023-04-10 Thread Ackerley Tng
For memfd_restricted() calls without a userspace mount, the backing
file should be the shmem mount in the kernel, and the size of backing
pages should be as defined by system-wide shmem configuration.

If a userspace mount is provided, the size of backing pages should be
as defined in the mount.

Also includes negative tests for invalid inputs, including fds
representing read-only superblocks/mounts.

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/mm/.gitignore |   1 +
 tools/testing/selftests/mm/Makefile   |   1 +
 .../selftests/mm/memfd_restricted_usermnt.c   | 529 ++
 tools/testing/selftests/mm/run_vmtests.sh |   3 +
 4 files changed, 534 insertions(+)
 create mode 100644 tools/testing/selftests/mm/memfd_restricted_usermnt.c

diff --git a/tools/testing/selftests/mm/.gitignore 
b/tools/testing/selftests/mm/.gitignore
index fb6e4233374d..dba320c8151a 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -31,6 +31,7 @@ map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
 memfd_restricted
+memfd_restricted_usermnt
 memfd_secret
 soft-dirty
 split_huge_page_test
diff --git a/tools/testing/selftests/mm/Makefile 
b/tools/testing/selftests/mm/Makefile
index 5ec338ea1fed..2f5df7a12ea5 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -46,6 +46,7 @@ TEST_GEN_FILES += map_fixed_noreplace
 TEST_GEN_FILES += map_hugetlb
 TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += memfd_restricted
+TEST_GEN_FILES += memfd_restricted_usermnt
 TEST_GEN_FILES += memfd_secret
 TEST_GEN_FILES += migration
 TEST_GEN_FILES += mlock-random-test
diff --git a/tools/testing/selftests/mm/memfd_restricted_usermnt.c 
b/tools/testing/selftests/mm/memfd_restricted_usermnt.c
new file mode 100644
index 000000000000..0be04e3d714d
--- /dev/null
+++ b/tools/testing/selftests/mm/memfd_restricted_usermnt.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE /* for O_PATH */
+#define _POSIX_C_SOURCE /* for PATH_MAX */
+#include 
+#include 
+#include 
+#include 
+
+#include "linux/restrictedmem.h"
+
+#include "../kselftest_harness.h"
+
+static int memfd_restricted(unsigned int flags, int fd)
+{
+   return syscall(__NR_memfd_restricted, flags, fd);
+}
+
+static int get_hpage_pmd_size(void)
+{
+   FILE *fp;
+   char buf[100];
+   char *ret;
+   int size;
+
+   fp = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   if (!fp)
+   return -1;
+
+   ret = fgets(buf, 100, fp);
+   if (ret != buf) {
+   size = -1;
+   goto out;
+   }
+
+   if (sscanf(buf, "%d\n", &size) != 1)
+   size = -1;
+
+out:
+   fclose(fp);
+
+   return size;
+}
+
+static int write_string_to_file(const char *path, const char *string)
+{
+   FILE *fp;
+   size_t len = strlen(string);
+   int ret = -1;
+
+   fp = fopen(path, "w");
+   if (!fp)
+   return ret;
+
+   if (fwrite(string, 1, len, fp) != len)
+   goto out;
+
+   ret = 0;
+
+out:
+   fclose(fp);
+   return ret;
+}
+
+/*
+ * Expect shmem thp policy to be one of always, within_size, advise, never,
+ * deny, force
+ */
+#define POLICY_BUF_SIZE 12
+
+static bool is_valid_shmem_thp_policy(char *policy)
+{
+   if (strcmp(policy, "always") == 0)
+   return true;
+   if (strcmp(policy, "within_size") == 0)
+   return true;
+   if (strcmp(policy, "advise") == 0)
+   return true;
+   if (strcmp(policy, "never") == 0)
+   return true;
+   if (strcmp(policy, "deny") == 0)
+   return true;
+   if (strcmp(policy, "force") == 0)
+   return true;
+
+   return false;
+}
+
+static int get_shmem_thp_policy(char *policy)
+{
+   FILE *fp;
+   char buf[100];
+   char *left = NULL;
+   char *right = NULL;
+   int ret = -1;
+
+   fp = fopen("/sys/kernel/mm/transparent_hugepage/shmem_enabled", "r");
+   if (!fp)
+   return -1;
+
+   if (fgets(buf, 100, fp) != buf)
+   goto out;
+
+   /*
+* Expect shmem_enabled to be of format like "always within_size advise
+* [never] deny force"
+*/
+   left = memchr(buf, '[', 100);
+   if (!left)
+   goto out;
+
+   right = memchr(buf, ']', 100);
+   if (!right)
+   goto out;
+
+   memcpy(policy, left + 1, right - left - 1);
+
+   ret = !is_valid_shmem_thp_policy(policy);
+
+out:
+   fclose(fp);
+   return ret;
+}
+
+static int set_shmem_thp_policy(char *policy)
+{
+   int ret = -1;
+   /* +1 for newline */
+   char to_write[POLICY_BUF_SIZE + 1] = { 0 };
+
+   if (!is_valid_shmem_thp_policy(policy))

[RFC PATCH v4 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-04-10 Thread Ackerley Tng
By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, an optional tmpfs mount can be specified via an fd,
which will be used as the mountpoint for backing the shmem file
associated with a restrictedmem fd.

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage (THP) allocation hints,
NUMA binding hints, etc.

Permissions for the fd passed to memfd_restricted() are modeled after
the openat() syscall, since both of these allow creation of a file
upon a mount/directory.

Permission to reference the mount the fd represents is checked upon fd
creation by other syscalls (e.g. fsmount(), open(), or open_tree(),
etc) and any process that can present memfd_restricted() with a valid
fd is expected to have obtained permission to use the mount
represented by the fd. This behavior is intended to parallel that of
the openat() syscall.

memfd_restricted() will check that the tmpfs superblock is
writable, and that the mount is also writable, before attempting to
create a restrictedmem file on the mount.

Signed-off-by: Ackerley Tng 
---
 include/linux/syscalls.h   |  2 +-
 include/uapi/linux/restrictedmem.h |  8 
 mm/restrictedmem.c | 73 --
 3 files changed, 77 insertions(+), 6 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 660be0bf89d5..90c73b9e14e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1058,7 +1058,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long 
len,
unsigned long home_node,
unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h 
b/include/uapi/linux/restrictedmem.h
new file mode 100644
index ..73e31bce73dc
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define MEMFD_RSTD_USERMNT 0x0001U
+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 55e99e6c09a1..032ad1f15138 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct restrictedmem {
@@ -250,19 +251,20 @@ static struct address_space_operations restricted_aops = {
 #endif
 };
 
-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
 {
struct file *file, *restricted_file;
int fd, err;
 
-   if (flags)
-   return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;
 
-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (mount)
+   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 
0, VM_NORESERVE);
+   else
+   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -286,6 +288,67 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
 }
 
+static struct vfsmount *restrictedmem_get_user_mount(struct file *file)
+{
+   int ret;
+   struct vfsmount *mnt;
+   struct path *path;
+
+   path = >f_path;
+   if (path->dentry != path->mnt->mnt_root)
+   return ERR_PTR(-EINVAL);
+
+   /*
+* Disallow bind-mounts that aren't bind-mounts of the whole
+* filesystem
+*/
+   mnt = path->mnt;
+   if (mnt->mnt_root != mnt->mnt_sb->s_root)
+   return ERR_PTR(-EINVAL);
+
+   if (mnt->mnt_sb->s_magic != TMPFS_MAGIC)
+   return ERR_PTR(-EINVAL);
+
+   ret = mnt_want_write(mnt);
+   if (ret)
+   return ERR_PTR(ret);
+
+   return mnt;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+   int ret;
+   struct fd f = {};
+   struct vfsmount *mnt = NULL;
+
+   if (flags & ~MEMFD_RSTD_USERMNT)
+   return -EINVAL;
+
+   if (flags & MEMFD_RSTD_USERMNT) {
+   f = fdget_raw(mount_fd);
+   if (!f.file)
+   return -EBADF;
+
+   mnt = restrictedmem_get_user_mount(f.file);
+   if (IS_ERR(mnt)) {
+   ret = PTR_ERR(mnt);
+   

Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-04-05 Thread Ackerley Tng



Thanks for reviewing these patches!

"Kirill A. Shutemov"  writes:


On Fri, Mar 31, 2023 at 11:50:39PM +, Ackerley Tng wrote:



...



+static int restrictedmem_create_on_user_mount(int mount_fd)
+{
+   int ret;
+   struct fd f;
+   struct vfsmount *mnt;
+
+   f = fdget_raw(mount_fd);
+   if (!f.file)
+   return -EBADF;
+
+   ret = -EINVAL;
+   if (!is_mount_root(f.file))
+   goto out;
+
+   mnt = f.file->f_path.mnt;
+   if (!is_shmem_mount(mnt))
+   goto out;
+
+   ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);



Why MAY_EXEC?



Christian pointed out that this check does not make sense, I'll be
removing the entire check in the next revision.


+   if (ret)
+   goto out;
+
+   ret = mnt_want_write(mnt);
+   if (unlikely(ret))
+   goto out;
+
+   ret = restrictedmem_create(mnt);
+
+   mnt_drop_write(mnt);
+out:
+   fdput(f);
+
+   return ret;
+}



We need review from fs folks. Look mostly sensible, but I have no
experience in fs.



+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+   if (flags & ~RMFD_USERMNT)
+   return -EINVAL;
+
+   if (flags == RMFD_USERMNT) {
+   if (mount_fd < 0)
+   return -EINVAL;
+
+   return restrictedmem_create_on_user_mount(mount_fd);
+   } else {
+   return restrictedmem_create(NULL);
+   }



Maybe restructure with single restrictedmem_create() call?



struct vfsmount *mnt = NULL;



if (flags == RMFD_USERMNT) {
...
mnt = ...();
}



return restrictedmem_create(mnt);


Will do so in the next revision.


+}
+
  int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
   struct restrictedmem_notifier *notifier, bool exclusive)
  {
--
2.40.0.348.gf938b09366-goog




Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-04-05 Thread Ackerley Tng



Thanks for your review!

David Hildenbrand  writes:


On 01.04.23 01:50, Ackerley Tng wrote:



...


diff --git a/include/uapi/linux/restrictedmem.h  
b/include/uapi/linux/restrictedmem.h

new file mode 100644
index ..22d6f2285f6d
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_USERMNT   0x0001U



I wonder if we can come up with a more expressive prefix than RMFD.
Sounds more like "rm fd" ;) Maybe it should better match the
"memfd_restricted" syscall name, like "MEMFD_RSTD_USERMNT".



RMFD did actually sound vulgar, I'm good with MEMFD_RSTD_USERMNT!


+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..f7b62364a31a 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
   // SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"



Looks like an unrelated change?



Will remove this in the next revision.


+#include 
   #include 
   #include 
   #include 
   #include 
   #include 
   #include 
+#include 
   #include 



   struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file  
*restrictedmem_file_create(struct file *memfd)

return file;
   }



-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
   {
struct file *file, *restricted_file;
int fd, err;



-   if (flags)
-   return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;



-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (mount)
+		file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0,  
VM_NORESERVE);

+   else
+   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int,  
flags)

return err;
   }



+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+   return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static bool is_mount_root(struct file *file)
+{
+   return file->f_path.dentry == file->f_path.mnt->mnt_root;
+}



I'd inline at least that function, pretty self-explaining.



Will inline this in the next revision.


+
+static int restrictedmem_create_on_user_mount(int mount_fd)
+{
+   int ret;
+   struct fd f;
+   struct vfsmount *mnt;
+
+   f = fdget_raw(mount_fd);
+   if (!f.file)
+   return -EBADF;
+
+   ret = -EINVAL;
+   if (!is_mount_root(f.file))
+   goto out;
+
+   mnt = f.file->f_path.mnt;
+   if (!is_shmem_mount(mnt))
+   goto out;
+
+   ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
+   if (ret)
+   goto out;
+
+   ret = mnt_want_write(mnt);
+   if (unlikely(ret))
+   goto out;
+
+   ret = restrictedmem_create(mnt);
+
+   mnt_drop_write(mnt);
+out:
+   fdput(f);
+
+   return ret;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+   if (flags & ~RMFD_USERMNT)
+   return -EINVAL;
+
+   if (flags == RMFD_USERMNT) {
+   if (mount_fd < 0)
+   return -EINVAL;
+
+   return restrictedmem_create_on_user_mount(mount_fd);
+   } else {
+   return restrictedmem_create(NULL);
+   }




You can drop the else case:



if (flags == RMFD_USERMNT) {
...
return restrictedmem_create_on_user_mount(mount_fd);
}
return restrictedmem_create(NULL);



I'll be refactoring this to adopt Kirill's suggestion of using a single
restrictedmem_create(mnt) call.



I do wonder if you want to properly check for a flag instead of
comparing values. Results in a more natural way to deal with flags:



if (flags & RMFD_USERMNT) {



}



Will use this in the next revision.


+}
+
   int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
   struct restrictedmem_notifier *notifier, bool exclusive)
   {



The "memfd_restricted" vs. "restrictedmem" terminology is a bit
unfortunate, but not your fault here.




I'm not a FS person, but it does look good to me.




Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-04-05 Thread Ackerley Tng


Thanks again for your review!

Christian Brauner  writes:

On Tue, Apr 04, 2023 at 03:53:13PM +0200, Christian Brauner wrote:

On Fri, Mar 31, 2023 at 11:50:39PM +, Ackerley Tng wrote:
>
> ...
>
> -SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> +static int restrictedmem_create(struct vfsmount *mount)
>  {
>struct file *file, *restricted_file;
>int fd, err;
>
> -  if (flags)
> -  return -EINVAL;
> -
>fd = get_unused_fd_flags(0);



Any reasons the file descriptors aren't O_CLOEXEC by default? I don't
see any reasons why we should introduce new fdtypes that aren't
O_CLOEXEC by default. The "don't mix-and-match" train has already left
the station anyway as we do have seccomp noitifer fds and pidfds both of
which are O_CLOEXEC by default.



Thanks for pointing this out. I agree with using O_CLOEXEC, but didn’t
notice this before. Let us discuss this under the original series at
[1].


>if (fd < 0)
>return fd;
>
> -  file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> +  if (mount)
> +		file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0,  
VM_NORESERVE);

> +  else
> +  file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> +
>if (IS_ERR(file)) {
>err = PTR_ERR(file);
>goto err_fd;
> @@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int,  
flags)

>return err;
>  }
>
> +static bool is_shmem_mount(struct vfsmount *mnt)
> +{
> +  return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;



This can just be if (mnt->mnt_sb->s_magic == TMPFS_MAGIC).



Will simplify this in the next revision.


> +}
> +
> +static bool is_mount_root(struct file *file)
> +{
> +  return file->f_path.dentry == file->f_path.mnt->mnt_root;



mount -t tmpfs tmpfs /mnt
touch /mnt/bla
touch /mnt/ble
mount --bind /mnt/bla /mnt/ble
fd = open("/mnt/ble")
fd_restricted = memfd_restricted(fd)



IOW, this doesn't restrict it to the tmpfs root. It only restricts it to
paths that refer to the root of any tmpfs mount. To exclude bind-mounts
that aren't bind-mounts of the whole filesystem you want:



path->dentry == path->mnt->mnt_root &&
path->mnt->mnt_root == path->mnt->mnt_sb->s_root



Will adopt this in the next revision and add a selftest to check
this. Thanks for pointing this out!


> +}
> +
> +static int restrictedmem_create_on_user_mount(int mount_fd)
> +{
> +  int ret;
> +  struct fd f;
> +  struct vfsmount *mnt;
> +
> +  f = fdget_raw(mount_fd);
> +  if (!f.file)
> +  return -EBADF;
> +
> +  ret = -EINVAL;
> +  if (!is_mount_root(f.file))
> +  goto out;
> +
> +  mnt = f.file->f_path.mnt;
> +  if (!is_shmem_mount(mnt))
> +  goto out;
> +
> +  ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);



With the current semantics you're asking whether you have write
permissions on the /mnt/ble file in order to get answer to the question
whether you're allowed to create an unlinked restricted memory file.
That doesn't make much sense afaict.



That's true. Since mnt_want_write() already checks for write permissions
and this syscall creates an unlinked file on the mount, we don't have to
check permissions on the file then. Will remove this in the next
revision!


> +  if (ret)
> +  goto out;
> +
> +  ret = mnt_want_write(mnt);
> +  if (unlikely(ret))
> +  goto out;
> +
> +  ret = restrictedmem_create(mnt);
> +
> +  mnt_drop_write(mnt);
> +out:
> +  fdput(f);
> +
> +  return ret;
> +}
> +
> +SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
> +{
> +  if (flags & ~RMFD_USERMNT)
> +  return -EINVAL;
> +
> +  if (flags == RMFD_USERMNT) {



Why do you even need this flag? It seems that @mount_fd being < 0 is
sufficient to indicate that a new restricted memory fd is supposed to be
created in the system instance.



I'm hoping to have this patch series merged after Chao's patch series
introduces the memfd_restricted() syscall [1].

This flag is necessary to indicate the validity of the second argument.

With this flag, we can definitively return an error if the fd is
invalid, which I think is a better experience for the userspace
programmer than if we just silently default to the kernel mount when the
fd provided is invalid.


> +  if (mount_fd < 0)
> +  return -EINVAL;
> +
> +  return restrictedmem_create_on_user_mount(mount_fd);
> +  } else {
> +  return restrictedmem_create(NULL);
> +  }
> +}


I have to say that I'm very confused by all of this the more I look at  
it.



Effectively memfd r

Re: [RFC PATCH v2 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-03-31 Thread Ackerley Tng

Christian Brauner  writes:


On Tue, Mar 21, 2023 at 08:15:32PM +, Ackerley Tng wrote:

By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.



...


Thanks for reviewing this patch!



This looks like you can just pass in some tmpfs fd and you just use it
to identify the mnt and then you create a restricted memfd area in that
instance. So if I did:



mount -t tmpfs tmpfs /mnt
mknod /mnt/bla c 0 0
fd = open("/mnt/bla")
memfd_restricted(fd)



then it would create a memfd restricted entry in the tmpfs instance
using the arbitrary dummy device node to infer the tmpfs instance.



Looking at the older thread briefly and the cover letter. Afaict, the
new mount api shouldn't figure into the design of this. fsopen() returns
fds referencing a VFS-internal fs_context object. They can't be used to
create or lookup files or identify mounts. The mount doesn't exist at
that time. Not even a superblock might exist at the time before
fsconfig(FSCONFIG_CMD_CREATE).



When fsmount() is called after superblock setup then it's similar to any
other fd from open() or open_tree() or whatever (glossing over some
details that are irrelevant here). Difference is that open_tree() and
fsmount() would refer to the root of a mount.


This is correct, memfd_restricted() needs an fd returned from fsmount()
and not fsopen(). Usage examples of this new parameter in
memfd_restricted() are available in selftests.



At first I wondered why this doesn't just use standard *at() semantics
but I guess the restricted memfd is unlinked and doesn't show up in the
tmpfs instance.



So if you go down that route then I would suggest to enforce that the
provided fd refer to the root of a tmpfs mount. IOW, it can't just be an
arbitrary file descriptor in a tmpfs instance. That seems cleaner to me:



sb = f_path->mnt->mnt_sb;
sb->s_magic == TMPFS_MAGIC && f_path->mnt->mnt_root == sb->s_root



and has much tigher semantics than just allowing any kind of fd.


Thanks for your suggestion, I've tightened the semantics as you
suggested. memfd_restricted() now only accepts fds representing the root
of the mount.



Another wrinkly I find odd but that's for you to judge is that this
bypasses the permission model of the tmpfs instance. IOW, as long as you
have a handle to the root of a tmpfs mount you can just create
restricted memfds in there. So if I provided a completely sandboxed
service - running in a user namespace or whatever - with an fd to the
host's tmpfs instance they can just create restricted memfds in there no
questions asked.



Maybe that's fine but it's certainly something to spell out and think
about the implications.


Thanks for pointing this out! I added a permissions check in RFC v3, and
clarified the permissions model (please see patch 1 of 2):
https://lore.kernel.org/lkml/cover.1680306489.git.ackerley...@google.com/



[RFC PATCH v3 2/2] selftests: restrictedmem: Check hugepage-ness of shmem file backing restrictedmem fd

2023-03-31 Thread Ackerley Tng
For memfd_restricted() calls without a userspace mount, the backing
file should be created on the kernel's internal shmem mount, and the
size of backing pages should be as defined by the system-wide shmem
configuration.

If a userspace mount is provided, the size of backing pages should be
as defined in the mount.

Also includes negative tests for invalid inputs, including fds
representing read-only superblocks/mounts.
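
One such negative test might look like the sketch below (illustrative
only: the assertion macros follow kselftest_harness.h conventions, and
the exact errno is an assumption based on mnt_want_write() failing with
-EROFS on a read-only mount):

  /* memfd_restricted() should refuse a read-only tmpfs mount */
  fs_fd = fsopen("tmpfs", 0);
  fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
  mnt_fd = fsmount(fs_fd, 0, MOUNT_ATTR_RDONLY);

  fd = memfd_restricted(RMFD_USERMNT, mnt_fd);
  ASSERT_EQ(-1, fd);
  ASSERT_EQ(EROFS, errno);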

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  15 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 486 ++
 6 files changed, 522 insertions(+)
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 
tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f07aef7c592c..44078eeefb79 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -60,6 +60,7 @@ TARGETS += pstore
 TARGETS += ptrace
 TARGETS += openat2
 TARGETS += resctrl
+TARGETS += restrictedmem
 TARGETS += rlimits
 TARGETS += rseq
 TARGETS += rtc
diff --git a/tools/testing/selftests/restrictedmem/.gitignore 
b/tools/testing/selftests/restrictedmem/.gitignore
new file mode 100644
index ..2581bcc8ff29
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+restrictedmem_hugepage_test
diff --git a/tools/testing/selftests/restrictedmem/Makefile 
b/tools/testing/selftests/restrictedmem/Makefile
new file mode 100644
index ..8e5378d20226
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS = $(KHDR_INCLUDES)
+CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -std=gnu99
+
+TEST_GEN_PROGS += restrictedmem_hugepage_test
+
+include ../lib.mk
+
+EXTRA_CLEAN = $(OUTPUT)/common.o
+
+$(OUTPUT)/common.o: common.c
+   $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
+
+$(TEST_GEN_PROGS): $(OUTPUT)/common.o
diff --git a/tools/testing/selftests/restrictedmem/common.c 
b/tools/testing/selftests/restrictedmem/common.c
new file mode 100644
index ..03dac843404f
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include 
+#include 
+
+int memfd_restricted(unsigned int flags, int mount_fd)
+{
+   return syscall(__NR_memfd_restricted, flags, mount_fd);
+}
diff --git a/tools/testing/selftests/restrictedmem/common.h 
b/tools/testing/selftests/restrictedmem/common.h
new file mode 100644
index ..06284ed86baf
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTESTS_RESTRICTEDMEM_COMMON_H
+#define SELFTESTS_RESTRICTEDMEM_COMMON_H
+
+int memfd_restricted(unsigned int flags, int mount_fd);
+
+#endif  // SELFTESTS_RESTRICTEDMEM_COMMON_H
diff --git 
a/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c 
b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
new file mode 100644
index ..9ed319b83cb8
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE /* for O_PATH */
+#define _POSIX_C_SOURCE /* for PATH_MAX */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "linux/restrictedmem.h"
+
+#include "common.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Expect policy to be one of always, within_size, advise, never,
+ * deny, force
+ */
+#define POLICY_BUF_SIZE 12
+
+static int get_hpage_pmd_size(void)
+{
+   FILE *fp;
+   char buf[100];
+   char *ret;
+   int size;
+
+   fp = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   if (!fp)
+   return -1;
+
+   ret = fgets(buf, 100, fp);
+   if (ret != buf) {
+   size = -1;
+   goto out;
+   }
+
+   if (sscanf(buf, "%d\n", &size) != 1)
+   size = -1;
+
+out:
+   fclose(fp);
+
+   return size;
+}
+
+static bool is_valid_shmem_thp_policy(char *policy)
+{
+   if (strcmp(policy, "always") == 0)
+   return true;
+   if (strcmp(policy, "within_size") == 0)
+   return true;
+   if (strcmp(policy, "advise") == 0)
+   return true;

[RFC PATCH v3 0/2] Providing mount in memfd_restricted() syscall

2023-03-31 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that was
discussed in the ‘KVM: mm: fd-based approach for supporting KVM’ patch
series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-provide-mount-fd-rfc-v3

In this patchset, a modification to the memfd_restricted() syscall is
proposed, which allows userspace to provide a mount, on which the
restrictedmem file will be created and returned from the
memfd_restricted().

Allowing userspace to provide a mount allows userspace to control
various memory binding policies via tmpfs mount options, such as
Transparent HugePage memory allocation policy through
‘huge=always/never’ and NUMA memory allocation policy through
‘mpol=local/bind:*’.
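
For instance, a VMM could configure such a mount with the new mount API
before passing it to memfd_restricted() (a sketch, assuming the
fsopen()/fsconfig()/fsmount() syscall wrappers are available):

  fs_fd = fsopen("tmpfs", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "huge", "always", 0); /* THP policy */
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "mpol", "bind:0", 0); /* NUMA policy */
  fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
  mnt_fd = fsmount(fs_fd, 0, 0);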

Changes since RFCv2:
+ Tightened semantics to accept only fds of the root of a tmpfs mount,
  as Christian suggested
+ Added permissions check on the inode represented by the fd to guard
  against creation of restrictedmem files on read-only tmpfs
  filesystems or mounts
+ Renamed RMFD_TMPFILE to RMFD_USERMNT to better represent providing a
  userspace mount to create a restrictedmem file on
+ Updated selftests for tighter semantics and added selftests to check
  for permissions

Changes since RFCv1:
+ Use fd to represent mount instead of path string, as Kirill
  suggested. I believe using fds makes this syscall interface more
  aligned with the other syscalls like fsopen(), fsconfig(), and
  fsmount() in terms of using and passing around fds
+ Remove unused variable char *orig_shmem_enabled from selftests

Dependencies:
+ Sean’s iteration of the ‘KVM: mm: fd-based approach for supporting
  KVM’ patch series at
  https://github.com/sean-jc/linux/tree/x86/upm_base_support
+ Proposed fixes for these issues mentioned on the mailing list:
+ 
https://lore.kernel.org/lkml/diqzzga0fv96@ackerleytng-cloudtop-sg.c.googlers.com/

Links to earlier patch series:
+ RFC v2: 
https://lore.kernel.org/lkml/cover.1679428901.git.ackerley...@google.com/T/
+ RFC v1: 
https://lore.kernel.org/lkml/cover.1676507663.git.ackerley...@google.com/T/

---

Ackerley Tng (2):
  mm: restrictedmem: Allow userspace to specify mount for
memfd_restricted
  selftests: restrictedmem: Check hugepage-ness of shmem file backing
restrictedmem fd

 include/linux/syscalls.h  |   2 +-
 include/uapi/linux/restrictedmem.h|   8 +
 mm/restrictedmem.c|  74 ++-
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  15 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 486 ++
 9 files changed, 599 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 
tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

--
2.40.0.348.gf938b09366-goog



[RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-03-31 Thread Ackerley Tng
By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, an optional tmpfs mount can be specified via an fd,
which will be used as the mountpoint for backing the shmem file
associated with a restrictedmem fd.

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage allocation hints, NUMA
binding hints, etc.

Permissions for the fd passed to memfd_restricted() are modeled after
the openat() syscall, since both of these allow creation of a file
upon a mount/directory.

Permission to reference the mount the fd represents is checked upon fd
creation by other syscalls (e.g. fsmount(), open(), or open_tree(),
etc) and any process that can present memfd_restricted() with a valid
fd is expected to have obtained permission to use the mount
represented by the fd. This behavior is intended to parallel that of
the openat() syscall.

memfd_restricted() will check that the tmpfs superblock is
writable, and that the mount is also writable, before attempting to
create a restrictedmem file on the mount.

Signed-off-by: Ackerley Tng 
---
 include/linux/syscalls.h   |  2 +-
 include/uapi/linux/restrictedmem.h |  8 
 mm/restrictedmem.c | 74 +++---
 3 files changed, 77 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9e9e0c820c5..a23c4c385cd3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long 
len,
unsigned long home_node,
unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);

 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h 
b/include/uapi/linux/restrictedmem.h
new file mode 100644
index ..22d6f2285f6d
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_USERMNT   0x0001U
+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..f7b62364a31a 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 

 struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file 
*memfd)
return file;
 }

-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
 {
struct file *file, *restricted_file;
int fd, err;

-   if (flags)
-   return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;

-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (mount)
+   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 
0, VM_NORESERVE);
+   else
+   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
 }

+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+   return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static bool is_mount_root(struct file *file)
+{
+   return file->f_path.dentry == file->f_path.mnt->mnt_root;
+}
+
+static int restrictedmem_create_on_user_mount(int mount_fd)
+{
+   int ret;
+   struct fd f;
+   struct vfsmount *mnt;
+
+   f = fdget_raw(mount_fd);
+   if (!f.file)
+   return -EBADF;
+
+   ret = -EINVAL;
+   if (!is_mount_root(f.file))
+   goto out;
+
+   mnt = f.file->f_path.mnt;
+   if (!is_shmem_mount(mnt))
+   goto out;
+
+   ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
+   if (ret)
+   goto out;
+
+   ret = mnt_want_write(mnt);
+   if (unlikely(ret))
+   goto out;
+
+   ret = restrictedmem_create(mnt);
+
+   mnt_drop_write(mnt);
+out:
+   fdput(f);
+
+   return ret;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+   if (flags & ~RMFD_USERMNT)
return -EINVAL;

[RFC PATCH v2 0/2] Providing mount in memfd_restricted() syscall

2023-03-21 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that was
discussed in the 'KVM: mm: fd-based approach for supporting KVM' patch
series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/#m7e944d7892afdd1d62a03a287bd488c56e377b0c

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-provide-mount-fd

In this patchset, a modification to the memfd_restricted() syscall is
proposed, which allows userspace to provide a mount, on which the
restrictedmem file will be created and returned from the
memfd_restricted().

Allowing userspace to provide a mount allows userspace to control
various memory binding policies via tmpfs mount options, such as
Transparent HugePage memory allocation policy through
'huge=always/never' and NUMA memory allocation policy through
'mpol=local/bind:*'.

Changes since RFCv1:
+ Use fd to represent mount instead of path string, as Kirill
  suggested. I believe using fds makes this syscall interface more
  aligned with the other syscalls like fsopen(), fsconfig(), and
  fsmount() in terms of using and passing around fds
+ Remove unused variable char *orig_shmem_enabled from selftests

Dependencies:
+ Sean's iteration of the ‘KVM: mm: fd-based approach for supporting
  KVM’ patch series at
  https://github.com/sean-jc/linux/tree/x86/upm_base_support
+ Proposed fixes for these issues mentioned on the mailing list:
+ 
https://lore.kernel.org/lkml/diqzzga0fv96@ackerleytng-cloudtop-sg.c.googlers.com/

Links to earlier patch series:
+ RFC v1:
  https://lore.kernel.org/lkml/cover.1676507663.git.ackerley...@google.com/T/

Ackerley Tng (2):
  mm: restrictedmem: Allow userspace to specify mount for
memfd_restricted
  selftests: restrictedmem: Check hugepage-ness of shmem file backing
restrictedmem fd

 include/linux/syscalls.h  |   2 +-
 include/uapi/linux/restrictedmem.h|   8 +
 mm/restrictedmem.c|  63 ++-
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  15 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 459 ++
 9 files changed, 561 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 
tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

--
2.40.0.rc2.332.ga46443480c-goog



[RFC PATCH v2 2/2] selftests: restrictedmem: Check hugepage-ness of shmem file backing restrictedmem fd

2023-03-21 Thread Ackerley Tng
For memfd_restricted() calls without a userspace mount, the backing
file should be created on the kernel's internal shmem mount, and the
size of backing pages should be as defined by the system-wide shmem
configuration.

If a userspace mount is provided, the size of backing pages should be
as defined in the mount.

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  15 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 459 ++
 6 files changed, 495 insertions(+)
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 
tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f07aef7c592c..44078eeefb79 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -60,6 +60,7 @@ TARGETS += pstore
 TARGETS += ptrace
 TARGETS += openat2
 TARGETS += resctrl
+TARGETS += restrictedmem
 TARGETS += rlimits
 TARGETS += rseq
 TARGETS += rtc
diff --git a/tools/testing/selftests/restrictedmem/.gitignore 
b/tools/testing/selftests/restrictedmem/.gitignore
new file mode 100644
index ..2581bcc8ff29
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+restrictedmem_hugepage_test
diff --git a/tools/testing/selftests/restrictedmem/Makefile 
b/tools/testing/selftests/restrictedmem/Makefile
new file mode 100644
index ..8e5378d20226
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS = $(KHDR_INCLUDES)
+CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -std=gnu99
+
+TEST_GEN_PROGS += restrictedmem_hugepage_test
+
+include ../lib.mk
+
+EXTRA_CLEAN = $(OUTPUT)/common.o
+
+$(OUTPUT)/common.o: common.c
+   $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
+
+$(TEST_GEN_PROGS): $(OUTPUT)/common.o
diff --git a/tools/testing/selftests/restrictedmem/common.c 
b/tools/testing/selftests/restrictedmem/common.c
new file mode 100644
index ..03dac843404f
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include 
+#include 
+
+int memfd_restricted(unsigned int flags, int mount_fd)
+{
+   return syscall(__NR_memfd_restricted, flags, mount_fd);
+}
diff --git a/tools/testing/selftests/restrictedmem/common.h 
b/tools/testing/selftests/restrictedmem/common.h
new file mode 100644
index ..06284ed86baf
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTESTS_RESTRICTEDMEM_COMMON_H
+#define SELFTESTS_RESTRICTEDMEM_COMMON_H
+
+int memfd_restricted(unsigned int flags, int mount_fd);
+
+#endif  // SELFTESTS_RESTRICTEDMEM_COMMON_H
diff --git 
a/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c 
b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
new file mode 100644
index ..ae37148342fe
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE /* for O_PATH */
+#define _POSIX_C_SOURCE /* for PATH_MAX */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "linux/restrictedmem.h"
+
+#include "common.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Expect policy to be one of always, within_size, advise, never,
+ * deny, force
+ */
+#define POLICY_BUF_SIZE 12
+
+static int get_hpage_pmd_size(void)
+{
+   FILE *fp;
+   char buf[100];
+   char *ret;
+   int size;
+
+   fp = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   if (!fp)
+   return -1;
+
+   ret = fgets(buf, 100, fp);
+   if (ret != buf) {
+   size = -1;
+   goto out;
+   }
+
+   if (sscanf(buf, "%d\n", &size) != 1)
+   size = -1;
+
+out:
+   fclose(fp);
+
+   return size;
+}
+
+static bool is_valid_shmem_thp_policy(char *policy)
+{
+   if (strcmp(policy, "always") == 0)
+   return true;
+   if (strcmp(policy, "within_size") == 0)
+   return true;
+   if (strcmp(policy, "advise") == 0)
+   return true;
+   if (strcmp(policy, "never") == 0)
+   return true;

[RFC PATCH v2 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

2023-03-21 Thread Ackerley Tng
By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, an optional tmpfs mount can be specified via an fd,
which will be used as the mountpoint for backing the shmem file
associated with a restrictedmem fd.

This change is modeled after how sys_open() can create an unnamed
temporary file in a given directory with O_TMPFILE.
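
For comparison, this is the O_TMPFILE flow being mirrored (standard
open(2) usage, not part of this patch; the path is a placeholder):

  /* unnamed temporary file on the tmpfs mounted at /mnt/tmpfs; it
   * inherits that mount's properties and vanishes on close */
  int fd = open("/mnt/tmpfs", O_TMPFILE | O_RDWR, 0600);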

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage allocation hints, NUMA
binding hints, etc.

Signed-off-by: Ackerley Tng 
---
 include/linux/syscalls.h   |  2 +-
 include/uapi/linux/restrictedmem.h |  8 
 mm/restrictedmem.c | 63 +++---
 3 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9e9e0c820c5..a23c4c385cd3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long 
len,
unsigned long home_node,
unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h 
b/include/uapi/linux/restrictedmem.h
new file mode 100644
index ..9f108dd1ac4c
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_TMPFILE   0x0001U
+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..4d83b949d84e 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file 
*memfd)
return file;
 }
 
-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
 {
struct file *file, *restricted_file;
int fd, err;
 
-   if (flags)
-   return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;
 
-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (mount)
+   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 
0, VM_NORESERVE);
+   else
+   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,55 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
 }
 
+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+   return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static int restrictedmem_create_from_file(int mount_fd)
+{
+   int ret;
+   struct fd f;
+   struct vfsmount *mnt;
+
+   f = fdget_raw(mount_fd);
+   if (!f.file)
+   return -EBADF;
+
+   mnt = f.file->f_path.mnt;
+   if (!is_shmem_mount(mnt)) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   ret = mnt_want_write(mnt);
+   if (unlikely(ret))
+   goto out;
+
+   ret = restrictedmem_create(mnt);
+
+   mnt_drop_write(mnt);
+out:
+   fdput(f);
+
+   return ret;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+   if (flags & ~RMFD_TMPFILE)
+   return -EINVAL;
+
+   if (flags == RMFD_TMPFILE) {
+   if (mount_fd < 0)
+   return -EINVAL;
+
+   return restrictedmem_create_from_file(mount_fd);
+   } else {
+   return restrictedmem_create(NULL);
+   }
+}
+
 int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
   struct restrictedmem_notifier *notifier, bool exclusive)
 {
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 02/10] KVM: selftests: Test that ftruncate to non-page-aligned size on a restrictedmem fd should fail

2023-03-15 Thread Ackerley Tng
Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/vm/memfd_restricted.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/vm/memfd_restricted.c 
b/tools/testing/selftests/vm/memfd_restricted.c
index 43a512f273f7..9c4e6a0becbc 100644
--- a/tools/testing/selftests/vm/memfd_restricted.c
+++ b/tools/testing/selftests/vm/memfd_restricted.c
@@ -38,6 +38,11 @@ static void test_file_size(int fd)
 {
struct stat sb;
 
+   if (!ftruncate(fd, page_size + 1)) {
+   fail("ftruncate to non page-aligned sizes should fail\n");
+   return;
+   }
+
if (ftruncate(fd, page_size)) {
fail("ftruncate failed\n");
return;
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 09/10] KVM: selftests: Add tests around sharing a restrictedmem fd

2023-03-15 Thread Ackerley Tng
Tests that

+ Different memslots in the same VM should be able to share a
  restrictedmem_fd
+ A second VM cannot share the same offsets in a restrictedmem_fd
+ Different VMs should be able to share the same restrictedmem_fd, as
  long as the offsets in the restrictedmem_fd are different

Signed-off-by: Ackerley Tng 
---
 .../selftests/kvm/set_memory_region_test.c| 29 +--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c 
b/tools/testing/selftests/kvm/set_memory_region_test.c
index cc727d11569e..789c413e2a67 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -401,7 +401,7 @@ static bool set_private_region_failed(struct kvm_vm *vm, 
void *hva,
 static void test_private_regions(void)
 {
int ret;
-   struct kvm_vm *vm;
+   struct kvm_vm *vm, *vm2;
void *mem;
int fd;
 
@@ -416,7 +416,7 @@ static void test_private_regions(void)
 
vm = __vm_create(shape, 1, 0);
 
-   mem = mmap(NULL, MEM_REGION_SIZE * 2, PROT_READ | PROT_WRITE,
+   mem = mmap(NULL, MEM_REGION_SIZE * 3, PROT_READ | PROT_WRITE,
   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
 
@@ -448,8 +448,31 @@ static void test_private_regions(void)
TEST_ASSERT(ret == -1 && errno == EINVAL,
"Set overlapping restrictedmem_offset should fail");
 
-   munmap(mem, MEM_REGION_SIZE * 2);
+   ret = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT + 1,
+  KVM_MEM_PRIVATE,
+  MEM_REGION_GPA + MEM_REGION_SIZE,
+  MEM_REGION_SIZE,
+  mem + MEM_REGION_SIZE,
+  fd, MEM_REGION_SIZE);
+   TEST_ASSERT(!ret,
+   "Different memslots should be able to share a 
restrictedmem_fd");
+
+   vm2 = __vm_create(shape, 1, 0);
+   TEST_ASSERT(set_private_region_failed(vm2, mem + 2 * MEM_REGION_SIZE, 
fd, 0),
+   "Pages (offsets) of a restrictedmem_fd should be exclusive 
to a VM");
+
+   ret = __vm_set_user_memory_region2(vm2, MEM_REGION_SLOT,
+  KVM_MEM_PRIVATE,
+  MEM_REGION_GPA + 2 * MEM_REGION_SIZE,
+  MEM_REGION_SIZE,
+  mem + 2 * MEM_REGION_SIZE,
+  fd, 2 * MEM_REGION_SIZE);
+   TEST_ASSERT(!ret,
+   "Different VMs should be able to share a restrictedmem_fd");
+
+   munmap(mem, MEM_REGION_SIZE * 3);
kvm_vm_free(vm);
+   kvm_vm_free(vm2);
 }
 
 int main(int argc, char *argv[])
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 08/10] KVM: selftests: Default private_mem_conversions_test to use 1 restrictedmem file for test data

2023-03-15 Thread Ackerley Tng
Default the private/shared memory conversion tests to use a single
file (when multiple memslots are requested), while executing on
multiple vCPUs in parallel, to stress-test the restrictedmem subsystem.

Also add a flag to allow multiple files to be used.

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 52 ++-
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c 
b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index afaf8d0e52e6..ca30f0f05c39 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -324,7 +324,8 @@ void *thread_function(void *input)
 }
 
 static void add_memslot_for_vcpu(
-   struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint8_t 
vcpu_id)
+   struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint8_t 
vcpu_id,
+   int restrictedmem_fd, uint64_t restrictedmem_offset)
 {
uint64_t gpa = data_gpa_base_for_vcpu_id(vcpu_id);
uint32_t slot = DATA_SLOT_BASE + vcpu_id;
@@ -336,7 +337,8 @@ static void add_memslot_for_vcpu(
 
 static void test_mem_conversions(enum vm_mem_backing_src_type src_type,
 uint8_t nr_vcpus, uint32_t iterations,
-bool use_multiple_memslots)
+bool use_multiple_memslots,
+bool use_different_restrictedmem_files)
 {
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
pthread_t threads[KVM_MAX_VCPUS];
@@ -356,21 +358,28 @@ static void test_mem_conversions(enum 
vm_mem_backing_src_type src_type,
vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
 
npages_for_all_vcpus = DATA_SIZE / vm->page_size * nr_vcpus;
+   virt_map(vm, DATA_GPA_BASE, DATA_GPA_BASE, npages_for_all_vcpus);
 
if (use_multiple_memslots) {
-   for (i = 0; i < nr_vcpus; i++)
-   add_memslot_for_vcpu(vm, src_type, i);
+   int fd = memfd_restricted(0);
+   int offset = 0;
+
+   for (i = 0; i < nr_vcpus; i++) {
+   if (use_different_restrictedmem_files) {
+   if (i > 0)
+   fd = memfd_restricted(0);
+   } else {
+   offset = i * DATA_GPA_SPACING;
+   }
+
+   add_memslot_for_vcpu(vm, src_type, i, fd, offset);
+   }
} else {
vm_userspace_mem_region_add(
vm, src_type, DATA_GPA_BASE, DATA_SLOT_BASE,
npages_for_all_vcpus, KVM_MEM_PRIVATE);
}
 
-   virt_map(vm, DATA_GPA_BASE, DATA_GPA_BASE, npages_for_all_vcpus);
-
-   for (i = 0; i < nr_vcpus; i++)
-   add_memslot_for_vcpu(vm, src_type, i);
-
for (i = 0; i < nr_vcpus; i++) {
args[i].vm = vm;
args[i].vcpu = vcpus[i];
@@ -382,7 +391,7 @@ static void test_mem_conversions(enum 
vm_mem_backing_src_type src_type,
for (i = 0; i < nr_vcpus; i++)
pthread_join(threads[i], NULL);
 
-   if (!use_multiple_memslots)
+   if (!use_multiple_memslots || !use_different_restrictedmem_files)
test_invalidation_code_unbound(vm, 1, DATA_SIZE * nr_vcpus);
else
test_invalidation_code_unbound(vm, nr_vcpus, DATA_SIZE);
@@ -391,8 +400,9 @@ static void test_mem_conversions(enum 
vm_mem_backing_src_type src_type,
 static void usage(const char *command)
 {
puts("");
-   printf("usage: %s [-h] [-m] [-s mem-type] [-n number-of-vcpus] [-i 
number-of-iterations]\n",
-  command);
+   printf("usage: %s\n", command);
+   printf("   [-h] [-m] [-f] [-s mem-type]\n");
+   printf("   [-n number-of-vcpus] [-i number-of-iterations]\n");
puts("");
backing_src_help("-s");
puts("");
@@ -404,6 +414,9 @@ static void usage(const char *command)
puts("");
puts(" -m: use multiple memslots (default: use 1 memslot)");
puts("");
+   puts(" -f: use different restrictedmem files for each memslot");
+   puts(" (default: use 1 restrictedmem file for all memslots)");
+   puts("");
 }
 
 int main(int argc, char *argv[])
@@ -412,12 +425,13 @@ int main(int argc, char *argv[])
uint8_t nr_vcpus = 2;
uint32_t iterations = 10;
bool use_multiple_memslots = false;
+   bool use_different_restrictedmem_files = false;
int opt;
 
TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_HYPERCALL));
TEST_REQUIRE(kvm_check_c

[RFC PATCH 10/10] KVM: selftests: Test KVM exit behavior for private memory/access

2023-03-15 Thread Ackerley Tng
"Testing private access when memslot gets deleted" tests the behavior
of KVM when a private memslot gets deleted while the VM is using the
private memslot. When KVM looks up the deleted (slot = NULL) memslot,
KVM should exit to userspace with KVM_EXIT_MEMORY_FAULT.

In the second test, upon a private access to non-private memslot, KVM
should also exit to userspace with KVM_EXIT_MEMORY_FAULT.

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../kvm/x86_64/private_mem_kvm_exits_test.c   | 124 ++
 2 files changed, 125 insertions(+)
 create mode 100644 
tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index bafee3c43b2e..0ad588852a1d 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -80,6 +80,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test
+TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c 
b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
new file mode 100644
index ..c8667dfbbf0a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022, Google LLC.
+ */
+#include "kvm_util_base.h"
+#include 
+#include 
+#include 
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+/* Arbitrarily selected to avoid overlaps with anything else */
+#define EXITS_TEST_GVA 0xc000
+#define EXITS_TEST_GPA EXITS_TEST_GVA
+#define EXITS_TEST_NPAGES 1
+#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE)
+#define EXITS_TEST_SLOT 10
+
+static uint64_t guest_repeatedly_read(void)
+{
+   volatile uint64_t value;
+
+   while (true)
+   value = *((uint64_t *) EXITS_TEST_GVA);
+
+   return value;
+}
+
+static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+   vcpu_run(vcpu);
+
+   return vcpu->run->exit_reason;
+}
+
+const struct vm_shape protected_vm_shape = {
+   .mode = VM_MODE_DEFAULT,
+   .type = KVM_X86_PROTECTED_VM,
+};
+
+static void test_private_access_memslot_deleted(void)
+{
+   struct kvm_vm *vm;
+   struct kvm_vcpu *vcpu;
+   pthread_t vm_thread;
+   void *thread_return;
+   uint32_t exit_reason;
+
+   vm = vm_create_shape_with_one_vcpu(protected_vm_shape, ,
+  guest_repeatedly_read);
+
+   vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+   EXITS_TEST_GPA, EXITS_TEST_SLOT,
+   EXITS_TEST_NPAGES,
+   KVM_MEM_PRIVATE);
+
+   virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+   /* Request to access page privately */
+   vm_mem_map_shared_or_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE, 
false);
+
+   pr_info("Testing private access when memslot gets deleted\n");
+
+   pthread_create(_thread, NULL,
+  (void *(*)(void *))run_vcpu_get_exit_reason,
+  (void *)vcpu);
+
+   vm_mem_region_delete(vm, EXITS_TEST_SLOT);
+
+   pthread_join(vm_thread, _return);
+   exit_reason = (uint32_t)(uint64_t)thread_return;
+
+   ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+   ASSERT_EQ(vcpu->run->memory.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+   ASSERT_EQ(vcpu->run->memory.gpa, EXITS_TEST_GPA);
+   ASSERT_EQ(vcpu->run->memory.size, EXITS_TEST_SIZE);
+
+   pr_info("\t ... PASSED\n");
+
+   kvm_vm_free(vm);
+}
+
+static void test_private_access_memslot_not_private(void)
+{
+   struct kvm_vm *vm;
+   struct kvm_vcpu *vcpu;
+   uint32_t exit_reason;
+
+   vm = vm_create_shape_with_one_vcpu(protected_vm_shape, ,
+  guest_repeatedly_read);
+
+   /* Add a non-private memslot (flags = 0) */
+   vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+   EXITS_TEST_GPA, EXITS_TEST_SLOT,
+   EXITS_TEST_NPAGES, 0);
+
+   virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+   /* Request to access page privately */
+   vm_set_memory_attributes(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE,
+KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+   pr_info("Testing private access to n

[RFC PATCH 07/10] KVM: selftests: Add vm_userspace_mem_region_add_with_restrictedmem

2023-03-15 Thread Ackerley Tng
Provide new function to allow restrictedmem's fd and offset to be
specified in selftests.

No functional change intended to vm_userspace_mem_region_add.
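
For illustration, a caller could then back two memslots with a single
restrictedmem fd at different offsets (a sketch; the gpa, slot, npages
and size values are placeholders):

  int fd = memfd_restricted(0);

  vm_userspace_mem_region_add_with_restrictedmem(
  vm, VM_MEM_SRC_ANONYMOUS, gpa, slot, npages,
  KVM_MEM_PRIVATE, fd, 0);
  vm_userspace_mem_region_add_with_restrictedmem(
  vm, VM_MEM_SRC_ANONYMOUS, gpa + size, slot + 1, npages,
  KVM_MEM_PRIVATE, fd, size);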

Signed-off-by: Ackerley Tng 
---
 .../selftests/kvm/include/kvm_util_base.h |  4 ++
 tools/testing/selftests/kvm/lib/kvm_util.c| 46 +--
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h 
b/tools/testing/selftests/kvm/include/kvm_util_base.h
index b6531a4063bb..c1ac82332ca4 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -486,6 +486,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
enum vm_mem_backing_src_type src_type,
uint64_t guest_paddr, uint32_t slot, uint64_t npages,
uint32_t flags);
+void vm_userspace_mem_region_add_with_restrictedmem(struct kvm_vm *vm,
+   enum vm_mem_backing_src_type src_type,
+   uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+   uint32_t flags, int restrictedmem_fd, uint64_t restrictedmem_offset);
 
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
 void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index d0e6b10f140f..d6bfcfc5cdea 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -898,6 +898,43 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
enum vm_mem_backing_src_type src_type,
uint64_t guest_paddr, uint32_t slot, uint64_t npages,
uint32_t flags)
+{
+   int restrictedmem_fd;
+
+   restrictedmem_fd = flags & KVM_MEM_PRIVATE ? memfd_restricted(0) : 0;
+   vm_userspace_mem_region_add_with_restrictedmem(
+   vm, src_type, guest_paddr, slot, npages, flags,
+   restrictedmem_fd, 0);
+}
+
+/*
+ * VM Userspace Memory Region Add With restrictedmem
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   src_type - Storage source for this region.
+ *  NULL to use anonymous memory.
+ *   guest_paddr - Starting guest physical address
+ *   slot - KVM region slot
+ *   npages - Number of physical pages
+ *   flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
+ *   restrictedmem_fd - restrictedmem_fd for use with restrictedmem
+ *   restrictedmem_offset - offset within restrictedmem_fd to be used
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Allocates a memory area of the number of pages specified by npages
+ * and maps it to the VM specified by vm, at a starting physical address
+ * given by guest_paddr.  The region is created with a KVM region slot
+ * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM.  The
+ * region is created with the flags given by flags.
+ */
+void vm_userspace_mem_region_add_with_restrictedmem(struct kvm_vm *vm,
+   enum vm_mem_backing_src_type src_type,
+   uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+   uint32_t flags, int restrictedmem_fd, uint64_t restrictedmem_offset)
 {
int ret;
struct userspace_mem_region *region;
@@ -1011,8 +1048,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->backing_src_type = src_type;
 
if (flags & KVM_MEM_PRIVATE) {
-   region->region.restrictedmem_fd = memfd_restricted(0);
-   region->region.restrictedmem_offset = 0;
+   region->region.restrictedmem_fd = restrictedmem_fd;
+   region->region.restrictedmem_offset = restrictedmem_offset;
 
TEST_ASSERT(region->region.restrictedmem_fd >= 0,
"Failed to create restricted memfd");
@@ -1030,10 +1067,11 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
"  rc: %i errno: %i\n"
"  slot: %u flags: 0x%x\n"
-   "  guest_phys_addr: 0x%lx size: 0x%lx restricted fd: %d\n",
+   "  guest_phys_addr: 0x%lx size: 0x%lx\n"
+   "  restricted fd: %d restricted_offset: 0x%llx\n",
ret, errno, slot, flags,
guest_paddr, (uint64_t) region->region.memory_size,
-   region->region.restrictedmem_fd);
+   region->region.restrictedmem_fd, region->region.restrictedmem_offset);
 
/* Add to quick lookup data structures */
vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
-- 
2.40.0.rc2.332.ga46443480c-goog
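
A minimal usage sketch of the new helper (not part of the patch; vm,
DATA_GPA_BASE, DATA_SLOT_BASE and DATA_SIZE are assumed from this
series), sharing one restrictedmem fd between two private memslots at
different offsets:

	int fd = memfd_restricted(0);
	size_t npages = DATA_SIZE / vm->page_size;

	TEST_ASSERT(fd >= 0, "memfd_restricted() failed");

	/* Two private memslots backed by disjoint ranges of the same fd. */
	vm_userspace_mem_region_add_with_restrictedmem(
		vm, VM_MEM_SRC_ANONYMOUS, DATA_GPA_BASE, DATA_SLOT_BASE,
		npages, KVM_MEM_PRIVATE, fd, 0);
	vm_userspace_mem_region_add_with_restrictedmem(
		vm, VM_MEM_SRC_ANONYMOUS, DATA_GPA_BASE + DATA_SIZE,
		DATA_SLOT_BASE + 1, npages, KVM_MEM_PRIVATE, fd, DATA_SIZE);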




[RFC PATCH 05/10] KVM: selftests: Generalize private_mem_conversions_test for parallel execution

2023-03-15 Thread Ackerley Tng
By running the private/shared memory conversion tests on multiple
vCPUs in parallel, we stress-test the restrictedmem subsystem with
conversions of non-overlapping GPA ranges in multiple memslots.
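
For example, with a 4K base page size, DATA_GPA_BASE = 1 << 32 and
DATA_GPA_SPACING = DATA_SIZE = 2M + 4K = 0x201000, vCPU 0 converts GPAs
starting at 0x100000000, vCPU 1 at 0x100201000, and so on, so the
per-vCPU ranges never overlap.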

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 203 +-
 1 file changed, 150 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index 7741916818db..14aa90e9a89b 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -5,6 +5,7 @@
 #define _GNU_SOURCE /* for program_invocation_short_name */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -22,9 +23,10 @@
 #include 
 #include 
 
-#define DATA_SLOT  10
-#define DATA_GPA   ((uint64_t)(1ull << 32))
-#define DATA_SIZE  ((uint64_t)(SZ_2M + PAGE_SIZE))
+#define DATA_SLOT_BASE   10
+#define DATA_GPA_BASE((uint64_t)(1ull << 32))
+#define DATA_SIZE((uint64_t)(SZ_2M + PAGE_SIZE))
+#define DATA_GPA_SPACING DATA_SIZE
 
 /* Horrific macro so that the line info is captured accurately :-( */
 #define memcmp_g(gpa, pattern,  size)  \
@@ -83,7 +85,9 @@ static void memcmp_ne_h(uint8_t *mem, uint8_t pattern, size_t size)
 #define REQUEST_HOST_R_PRIVATE(gpa, size, expected_pattern) \
ucall(UCALL_R_PRIVATE, 3, gpa, size, expected_pattern)
 
-static void guest_code(void)
+const uint8_t init_p = 0xcc;
+
+static void guest_test_conversions(uint64_t gpa_base)
 {
struct {
uint64_t offset;
@@ -96,17 +100,11 @@ static void guest_code(void)
GUEST_STAGE(PAGE_SIZE, SZ_2M),
GUEST_STAGE(SZ_2M, PAGE_SIZE),
};
-   const uint8_t init_p = 0xcc;
uint64_t j;
int i;
 
-   /* Memory should be shared by default. */
-   memset((void *)DATA_GPA, ~init_p, DATA_SIZE);
-   REQUEST_HOST_RW_SHARED(DATA_GPA, DATA_SIZE, ~init_p, init_p);
-   memcmp_g(DATA_GPA, init_p, DATA_SIZE);
-
for (i = 0; i < ARRAY_SIZE(stages); i++) {
-   uint64_t gpa = DATA_GPA + stages[i].offset;
+   uint64_t gpa = gpa_base + stages[i].offset;
uint64_t size = stages[i].size;
uint8_t p1 = 0x11;
uint8_t p2 = 0x22;
@@ -140,11 +138,11 @@ static void guest_code(void)
 * that shared memory still holds the initial pattern.
 */
memcmp_g(gpa, p2, size);
-   if (gpa > DATA_GPA)
-   memcmp_g(DATA_GPA, init_p, gpa - DATA_GPA);
-   if (gpa + size < DATA_GPA + DATA_SIZE)
+   if (gpa > gpa_base)
+   memcmp_g(gpa_base, init_p, gpa - gpa_base);
+   if (gpa + size < gpa_base + DATA_SIZE)
memcmp_g(gpa + size, init_p,
-(DATA_GPA + DATA_SIZE) - (gpa + size));
+(gpa_base + DATA_SIZE) - (gpa + size));
 
/*
 * Convert odd-number page frames back to shared to verify KVM
@@ -182,6 +180,19 @@ static void guest_code(void)
/* Reset the shared memory back to the initial pattern. */
memset((void *)gpa, init_p, size);
}
+}
+
+static void guest_code(uint64_t gpa_base, uint32_t iterations)
+{
+   int i;
+
+   /* Memory should be shared by default. */
+   memset((void *)gpa_base, ~init_p, DATA_SIZE);
+   REQUEST_HOST_RW_SHARED(gpa_base, DATA_SIZE, ~init_p, init_p);
+   memcmp_g(gpa_base, init_p, DATA_SIZE);
+
+   for (i = 0; i < iterations; i++)
+   guest_test_conversions(gpa_base);
 
GUEST_DONE();
 }
@@ -203,15 +214,27 @@ static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
run->hypercall.ret = 0;
 }
 
-static void test_invalidation_code_unbound(struct kvm_vm *vm)
+static uint64_t data_gpa_base_for_vcpu_id(uint8_t n)
+{
+   return DATA_GPA_BASE + n * DATA_GPA_SPACING;
+}
+
+static void test_invalidation_code_unbound(struct kvm_vm *vm, uint8_t nr_memslots,
+					   off_t data_size)
 {
-   uint32_t fd;
-   uint64_t offset;
-   struct userspace_mem_region *region;
+   struct {
+   uint32_t fd;
+   uint64_t offset;
+   } params[KVM_MAX_VCPUS];
+   int i;
+
+   for (i = 0; i < nr_memslots; i++) {
+   struct userspace_mem_region *region;
 
-   region = memslot2region(vm, DATA_SLOT);
-   fd = region->region.restrictedmem_fd;
-   offset = region->region.restrictedmem_offset;
+   region = memslot2region(vm, DATA_SLOT_BASE + i);
+   params[i].fd = region->region.restrictedmem_fd;
+   params[i]

[RFC PATCH 01/10] KVM: selftests: Test error message fixes for memfd_restricted selftests

2023-03-15 Thread Ackerley Tng
Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/vm/memfd_restricted.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/memfd_restricted.c 
b/tools/testing/selftests/vm/memfd_restricted.c
index 3a556b570129..43a512f273f7 100644
--- a/tools/testing/selftests/vm/memfd_restricted.c
+++ b/tools/testing/selftests/vm/memfd_restricted.c
@@ -49,12 +49,12 @@ static void test_file_size(int fd)
}
 
if (sb.st_size != page_size) {
-   fail("unexpected file size after ftruncate");
+   fail("unexpected file size after ftruncate\n");
return;
}
 
if (!ftruncate(fd, page_size * 2)) {
-   fail("unexpected ftruncate\n");
+   fail("size of file cannot be changed once set\n");
return;
}
 
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 06/10] KVM: selftests: Default private_mem_conversions_test to use 1 memslot for test data

2023-03-15 Thread Ackerley Tng
Default the private/shared memory conversion tests to use a single
memslot, while executing on multiple vCPUs in parallel, to stress-test
the restrictedmem subsystem.

Also add a flag to allow multiple memslots to be used.
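
For example (a hypothetical invocation; the flags are as documented in
usage() below):

	./private_mem_conversions_test -m -n 4 -i 100

runs 4 vCPUs for 100 iterations each with one memslot per vCPU, while
dropping -m packs the test data of all vCPUs into a single memslot.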

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 30 +++
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index 14aa90e9a89b..afaf8d0e52e6 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -335,7 +335,8 @@ static void add_memslot_for_vcpu(
 }
 
 static void test_mem_conversions(enum vm_mem_backing_src_type src_type,
-uint8_t nr_vcpus, uint32_t iterations)
+uint8_t nr_vcpus, uint32_t iterations,
+bool use_multiple_memslots)
 {
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
pthread_t threads[KVM_MAX_VCPUS];
@@ -355,6 +356,16 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type,
vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
 
npages_for_all_vcpus = DATA_SIZE / vm->page_size * nr_vcpus;
+
+   if (use_multiple_memslots) {
+   for (i = 0; i < nr_vcpus; i++)
+   add_memslot_for_vcpu(vm, src_type, i);
+   } else {
+   vm_userspace_mem_region_add(
+   vm, src_type, DATA_GPA_BASE, DATA_SLOT_BASE,
+   npages_for_all_vcpus, KVM_MEM_PRIVATE);
+   }
+
virt_map(vm, DATA_GPA_BASE, DATA_GPA_BASE, npages_for_all_vcpus);
 
for (i = 0; i < nr_vcpus; i++)
@@ -371,13 +382,16 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type,
for (i = 0; i < nr_vcpus; i++)
pthread_join(threads[i], NULL);
 
-   test_invalidation_code_unbound(vm, nr_vcpus, DATA_SIZE);
+   if (!use_multiple_memslots)
+   test_invalidation_code_unbound(vm, 1, DATA_SIZE * nr_vcpus);
+   else
+   test_invalidation_code_unbound(vm, nr_vcpus, DATA_SIZE);
 }
 
 static void usage(const char *command)
 {
puts("");
-   printf("usage: %s [-h] [-s mem-type] [-n number-of-vcpus] [-i 
number-of-iterations]\n",
+   printf("usage: %s [-h] [-m] [-s mem-type] [-n number-of-vcpus] [-i 
number-of-iterations]\n",
   command);
puts("");
backing_src_help("-s");
@@ -388,6 +402,8 @@ static void usage(const char *command)
puts(" -i: specify the number iterations of memory conversion");
puts(" tests to run. (default: 10)");
puts("");
+   puts(" -m: use multiple memslots (default: use 1 memslot)");
+   puts("");
 }
 
 int main(int argc, char *argv[])
@@ -395,12 +411,13 @@ int main(int argc, char *argv[])
enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
uint8_t nr_vcpus = 2;
uint32_t iterations = 10;
+   bool use_multiple_memslots = false;
int opt;
 
TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_HYPERCALL));
TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_PROTECTED_VM));
 
-   while ((opt = getopt(argc, argv, "hs:n:i:")) != -1) {
+   while ((opt = getopt(argc, argv, "mhs:n:i:")) != -1) {
switch (opt) {
case 'n':
nr_vcpus = atoi_positive("nr_vcpus", optarg);
@@ -411,6 +428,9 @@ int main(int argc, char *argv[])
case 's':
src_type = parse_backing_src_type(optarg);
break;
+   case 'm':
+   use_multiple_memslots = true;
+   break;
case 'h':
default:
usage(argv[0]);
@@ -418,6 +438,6 @@ int main(int argc, char *argv[])
}
}
 
-   test_mem_conversions(src_type, nr_vcpus, iterations);
+   test_mem_conversions(src_type, nr_vcpus, iterations, use_multiple_memslots);
return 0;
 }
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 04/10] KVM: selftests: Exercise restrictedmem allocation and truncation code after KVM invalidation code has been unbound

2023-03-15 Thread Ackerley Tng
The kernel interfaces restrictedmem_bind and restrictedmem_unbind are
used by KVM to bind/unbind kvm functions to restrictedmem's
invalidate_start and invalidate_end callbacks.

After the KVM VM is freed, the KVM functions should have been unbound
from the restrictedmem_fd's callbacks.

In this test, we exercise fallocate() to back and then unback memory
using the restrictedmem fd, and expect no problems (e.g. crashes) after
the KVM functions have been unbound.
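
(For reference: fallocate() with mode 0 allocates backing pages for the
given range, while FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE releases
them again without changing the file size, so the pair exercises both
the allocation and the truncation paths in restrictedmem.)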

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 26 ++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index f2c1e4450b0e..7741916818db 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -203,6 +203,30 @@ static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
run->hypercall.ret = 0;
 }
 
+static void test_invalidation_code_unbound(struct kvm_vm *vm)
+{
+   uint32_t fd;
+   uint64_t offset;
+   struct userspace_mem_region *region;
+
+   region = memslot2region(vm, DATA_SLOT);
+   fd = region->region.restrictedmem_fd;
+   offset = region->region.restrictedmem_offset;
+
+   kvm_vm_free(vm);
+
+   /*
+* At this point the KVM invalidation code should have been unbound from
+* the vm. We do allocation and truncation to exercise the restrictedmem
+* code. There should be no issues after the unbinding happens.
+*/
+   if (fallocate(fd, 0, offset, DATA_SIZE))
+   TEST_FAIL("Unexpected error in fallocate");
+   if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, DATA_SIZE))
+   TEST_FAIL("Unexpected error in fallocate");
+}
+
 static void test_mem_conversions(enum vm_mem_backing_src_type src_type)
 {
struct kvm_vcpu *vcpu;
@@ -270,7 +294,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type)
}
 
 done:
-   kvm_vm_free(vm);
+   test_invalidation_code_unbound(vm);
 }
 
 int main(int argc, char *argv[])
-- 
2.40.0.rc2.332.ga46443480c-goog




[RFC PATCH 03/10] KVM: selftests: Test that VM private memory should not be readable from host

2023-03-15 Thread Ackerley Tng
After VM memory is remapped as private memory and the guest has
written to it, request the host to read the corresponding hva for that
private memory.

The host should not be able to read the value in private memory.

This selftest shows that private memory contents of the guest are not
accessible to host userspace via the HVA.
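
Roughly, the host side handles the new ucall as follows (a sketch using
the helpers defined in this patch; the actual hunk is at the end of
this diff):

	case UCALL_R_PRIVATE: {
		uint8_t *hva = addr_gpa2hva(vm, uc.args[0]);
		uint64_t size = uc.args[1];

		/* The guest's pattern (uc.args[2]) must not be visible. */
		memcmp_ne_h(hva, uc.args[2], size);
		break;
	}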

Signed-off-by: Ackerley Tng 
---
 .../kvm/x86_64/private_mem_conversions_test.c | 54 ---
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
index ef9894340a2b..f2c1e4450b0e 100644
--- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -47,6 +47,16 @@ static void memcmp_h(uint8_t *mem, uint8_t pattern, size_t size)
pattern, i, mem[i]);
 }
 
+static void memcmp_ne_h(uint8_t *mem, uint8_t pattern, size_t size)
+{
+   size_t i;
+
+   for (i = 0; i < size; i++)
+   TEST_ASSERT(mem[i] != pattern,
+   "Expected not to find 0x%x at offset %lu but got 
0x%x",
+   pattern, i, mem[i]);
+}
+
 /*
  * Run memory conversion tests with explicit conversion:
  * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit
@@ -64,8 +74,14 @@ static void memcmp_h(uint8_t *mem, uint8_t pattern, size_t size)
 
 #define GUEST_STAGE(o, s) { .offset = o, .size = s }
 
-#define GUEST_SYNC4(gpa, size, current_pattern, new_pattern) \
-   ucall(UCALL_SYNC, 4, gpa, size, current_pattern, new_pattern)
+#define UCALL_RW_SHARED (0xca11 - 0)
+#define UCALL_R_PRIVATE (0xca11 - 1)
+
+#define REQUEST_HOST_RW_SHARED(gpa, size, current_pattern, new_pattern) \
+   ucall(UCALL_RW_SHARED, 4, gpa, size, current_pattern, new_pattern)
+
+#define REQUEST_HOST_R_PRIVATE(gpa, size, expected_pattern) \
+   ucall(UCALL_R_PRIVATE, 3, gpa, size, expected_pattern)
 
 static void guest_code(void)
 {
@@ -86,7 +102,7 @@ static void guest_code(void)
 
/* Memory should be shared by default. */
memset((void *)DATA_GPA, ~init_p, DATA_SIZE);
-   GUEST_SYNC4(DATA_GPA, DATA_SIZE, ~init_p, init_p);
+   REQUEST_HOST_RW_SHARED(DATA_GPA, DATA_SIZE, ~init_p, init_p);
memcmp_g(DATA_GPA, init_p, DATA_SIZE);
 
for (i = 0; i < ARRAY_SIZE(stages); i++) {
@@ -113,6 +129,12 @@ static void guest_code(void)
kvm_hypercall_map_private(gpa, size);
memset((void *)gpa, p2, size);
 
+   /*
+* Host should not be able to read the values written to private
+* memory
+*/
+   REQUEST_HOST_R_PRIVATE(gpa, size, p2);
+
/*
 * Verify that the private memory was set to pattern two, and
 * that shared memory still holds the initial pattern.
@@ -133,11 +155,20 @@ static void guest_code(void)
continue;
 
kvm_hypercall_map_shared(gpa + j, PAGE_SIZE);
-   GUEST_SYNC4(gpa + j, PAGE_SIZE, p1, p3);
+   REQUEST_HOST_RW_SHARED(gpa + j, PAGE_SIZE, p1, p3);
 
memcmp_g(gpa + j, p3, PAGE_SIZE);
}
 
+   /*
+* Even-number pages are still mapped as private, host should
+* not be able to read those values.
+*/
+   for (j = 0; j < size; j += PAGE_SIZE) {
+   if (!((j >> PAGE_SHIFT) & 1))
+   REQUEST_HOST_R_PRIVATE(gpa + j, PAGE_SIZE, p2);
+   }
+
/*
 * Convert the entire region back to shared, explicitly write
 * pattern three to fill in the even-number frames before
@@ -145,7 +176,7 @@ static void guest_code(void)
 */
kvm_hypercall_map_shared(gpa, size);
memset((void *)gpa, p3, size);
-   GUEST_SYNC4(gpa, size, p3, p4);
+   REQUEST_HOST_RW_SHARED(gpa, size, p3, p4);
memcmp_g(gpa, p4, size);
 
/* Reset the shared memory back to the initial pattern. */
@@ -209,7 +240,18 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type)
switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
REPORT_GUEST_ASSERT_4(uc, "%lx %lx %lx %lx");
-   case UCALL_SYNC: {
+   case UCALL_R_PRIVATE: {
+   uint8_t *hva = addr_gpa2hva(vm, uc.args[0]);
+   uint64_t size = uc.args[1];
+
+   /*
+* Try to read hva for private gpa from host, should not
+* be able to read private data
+   

[RFC PATCH 00/10] Additional selftests for restrictedmem

2023-03-15 Thread Ackerley Tng
Hello,

This is a series containing additional selftests for restrictedmem,
prepared to be used with the next iteration of the restrictedmem
series after v10.

restrictedmem v10 is available at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/.

The tree can be found at
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-additional-selftests-rfc-v1/.

Dependencies
+ The next iteration of the restrictedmem series
+ branch: https://github.com/chao-p/linux/commits/privmem-v11.4
+ commit: https://github.com/chao-p/linux/tree/ddd2c92b268a2fdc6158f82a6169ad1a57f2a01d
+ Proposed fix to adjust VM's initial stack address to align with SysV
  ABI spec: 
https://lore.kernel.org/lkml/20230227180601.104318-1-ackerley...@google.com/

Ackerley Tng (10):
  KVM: selftests: Test error message fixes for memfd_restricted
selftests
  KVM: selftests: Test that ftruncate to non-page-aligned size on a
restrictedmem fd should fail
  KVM: selftests: Test that VM private memory should not be readable
from host
  KVM: selftests: Exercise restrictedmem allocation and truncation code
after KVM invalidation code has been unbound
  KVM: selftests: Generalize private_mem_conversions_test for parallel
execution
  KVM: selftests: Default private_mem_conversions_test to use 1 memslot
for test data
  KVM: selftests: Add vm_userspace_mem_region_add_with_restrictedmem
  KVM: selftests: Default private_mem_conversions_test to use 1
restrictedmem file for test data
  KVM: selftests: Add tests around sharing a restrictedmem fd
  KVM: selftests: Test KVM exit behavior for private memory/access

 tools/testing/selftests/kvm/Makefile  |   1 +
 .../selftests/kvm/include/kvm_util_base.h |   4 +
 tools/testing/selftests/kvm/lib/kvm_util.c|  46 ++-
 .../selftests/kvm/set_memory_region_test.c|  29 +-
 .../kvm/x86_64/private_mem_conversions_test.c | 295 +++---
 .../kvm/x86_64/private_mem_kvm_exits_test.c   | 124 
 tools/testing/selftests/vm/memfd_restricted.c |   9 +-
 7 files changed, 455 insertions(+), 53 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c

--
2.40.0.rc2.332.ga46443480c-goog



Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

2023-03-07 Thread Ackerley Tng

Chao Peng  writes:


On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:

On Fri, Dec 02, 2022, Chao Peng wrote:

...

Strongly prefer to use similar logic to existing code that detects wraps:



mem->restricted_offset + mem->memory_size < mem->restricted_offset


This is also where I'd like to add the "gfn is aligned to offset" check, though
my brain is too fried to figure that out right now.
my brain is too fried to figure that out right now.



Used count_trailing_zeros() for this TODO; unsure whether we have a
better approach.



diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index afc8c26fa652..fd34c5f7cd2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
  #include 
  #include 
  #include 
+#include 



  #include "coalesced_mmio.h"
  #include "async_pf.h"
@@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,

return false;
  }



+/*
+ * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
+ */
+static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
+{
+   if (!offset)
+   return true;
+   if (!gpa)
+   return false;
+
+   return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));


Perhaps we could do something like

#define lowest_set_bit(val) (val & -val)

and use

return lowest_set_bit(offset) >= lowest_set_bit(gpa);
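
As a worked example: offset = 0x10000 has lowest set bit 0x10000 (16
trailing zeros) and gpa = 0x4000 has lowest set bit 0x4000 (14 trailing
zeros), so both formulations agree that offset is at least as aligned
as gpa. For nonzero values the two are equivalent, since
x & -x == 1 << count_trailing_zeros(x).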

Please help me to understand: why must ALIGNMENT(offset) >=
ALIGNMENT(gpa)? Why is it not sufficient to have both gpa and offset be
aligned to PAGE_SIZE?


+}
+
  /*
   * Allocate some memory and give it an address in the guest physical  
address

   * space.
@@ -2128,7 +2142,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (mem->flags & KVM_MEM_PRIVATE &&
(mem->restrictedmem_offset & (PAGE_SIZE - 1) ||
	 mem->restrictedmem_offset + mem->memory_size < mem->restrictedmem_offset ||

-0 /* TODO: require gfn be aligned with restricted offset */))
+!kvm_check_rmem_offset_alignment(mem->restrictedmem_offset,
+ mem->guest_phys_addr)))
return -EINVAL;
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
return -EINVAL;




Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

2023-03-07 Thread Ackerley Tng

Chao Peng  writes:


Register/unregister private memslot to fd-based memory backing store
restrictedmem and implement the callbacks for restrictedmem_notifier:
   - invalidate_start()/invalidate_end() to zap the existing memory
 mappings in the KVM page table.
   - error() to request KVM_REQ_MEMORY_MCE and later exit to userspace
 with KVM_EXIT_SHUTDOWN.



Expose KVM_MEM_PRIVATE for memslot and KVM_MEMORY_ATTRIBUTE_PRIVATE for
KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES to userspace but either are
controlled by kvm_arch_has_private_mem() which should be rewritten by
architecture code.


Could we perhaps rename KVM_MEM_PRIVATE to KVM_MEM_PROTECTED, to be in
line with KVM_X86_PROTECTED_VM?

I feel that a memslot that has the KVM_MEM_PRIVATE flag need not always
be private; it can sometimes provide memory that is shared and also
accessible from the host.

KVM_MEMORY_ATTRIBUTE_PRIVATE is fine as-is because this flag is set when
the guest memory is meant to be backed by private memory.

KVM_MEMORY_EXIT_FLAG_PRIVATE is also okay because the flag is used to
indicate when the memory error is caused by a private access (as opposed
to a shared access).

kvm_slot_can_be_private() could perhaps be renamed kvm_is_protected_slot()?



Co-developed-by: Yu Zhang 
Signed-off-by: Yu Zhang 
Signed-off-by: Chao Peng 
Reviewed-by: Fuad Tabba 
---
  arch/x86/include/asm/kvm_host.h |   1 +
  arch/x86/kvm/x86.c  |  13 +++
  include/linux/kvm_host.h|   3 +
  virt/kvm/kvm_main.c | 179 +++-
  4 files changed, 191 insertions(+), 5 deletions(-)


diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7772ab37ac89..27ef31133352 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -114,6 +114,7 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
  #define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_MEMORY_MCE KVM_ARCH_REQ(33)



  #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5aefcff614d2..c67e22f3e2ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6587,6 +6587,13 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)

  }
  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */



+#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
+void kvm_arch_memory_mce(struct kvm *kvm)
+{
+   kvm_make_all_cpus_request(kvm, KVM_REQ_MEMORY_MCE);
+}
+#endif
+
  static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
  {
struct kvm_clock_data data = { 0 };
@@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)



if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+
+   if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
+   vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+   r = 0;
+   goto out;
+   }
}



if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 153842bb33df..f032d878e034 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -590,6 +590,7 @@ struct kvm_memory_slot {
struct file *restricted_file;
loff_t restricted_offset;
struct restrictedmem_notifier notifier;
+   struct kvm *kvm;
  };


static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
@@ -2363,6 +2364,8 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,

*pfn = page_to_pfn(page);
return ret;
  }
+
+void kvm_arch_memory_mce(struct kvm *kvm);
  #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */



  #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e107afea32f0..ac835fc77273 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -936,6 +936,121 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)



  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */



+#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
+static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
+pgoff_t start, pgoff_t end,
+gfn_t *gfn_start, gfn_t *gfn_end)
+{
+   unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
+
+   if (start > base_pgoff)
+   *gfn_start = slot->base_gfn + start - base_pgoff;
+   else
+   *gfn_start = slot->base_gfn;
+
+   if (end < base_pgoff + slot->npages)
+   *gfn_end = slot->base_gfn + end - base_pgoff;
+   else
+   *gfn_end = slot->base_gfn + slot->npages;
+
+  

Re: [RFC PATCH 0/2] Add flag as THP allocation hint for memfd_restricted() syscall

2023-02-22 Thread Ackerley Tng

Yuan Yao  writes:


On Sat, Feb 18, 2023 at 12:43:00AM +0000, Ackerley Tng wrote:

Hello,



This patchset builds upon the memfd_restricted() system call that has
been discussed in the ‘KVM: mm: fd-based approach for supporting KVM’
patch series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/#m7e944d7892afdd1d62a03a287bd488c56e377b0c



The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-rmfd-hugepage



Following the RFC to provide mount for memfd_restricted() syscall at
https://lore.kernel.org/lkml/cover.1676507663.git.ackerley...@google.com/T/#u,
this patchset adds the RMFD_HUGEPAGE flag to the memfd_restricted()
syscall, which will hint the kernel to use Transparent HugePages to
back restrictedmem pages.



This supplements the interface proposed earlier, which requires the
creation of a tmpfs mount to be passed to memfd_restricted(), with a
more direct per-file hint.



Dependencies:



+ Sean’s iteration of the ‘KVM: mm: fd-based approach for supporting
   KVM’ patch series at
   https://github.com/sean-jc/linux/tree/x86/upm_base_support
+ Proposed fix for restrictedmem_getattr() as mentioned on the mailing
   list at

https://lore.kernel.org/lkml/diqzzga0fv96@ackerleytng-cloudtop-sg.c.googlers.com/

+ Hugh’s patch:

https://lore.kernel.org/lkml/c140f56a-1aa3-f7ae-b7d1-93da7d5a3...@google.com/,

   which provides functionality in shmem that reads the VM_HUGEPAGE
   flag in key functions shmem_is_huge() and shmem_get_inode()



Will Hugh's patch be merged into 6.3 ? I didn't find it in 6.2-rc8.
IMHO this patch won't work without Hugh's patch, or at least needs
another way, e.g. SHMEM_SB(inode->i_sb)->huge.



Hugh's patch is still pending discussion and may not be merged so
soon. These patches will not work without Hugh's patch.

I would like to understand what the community thinks of the proposed
interface (RMFD_HUGEPAGE flag, passed to the memfd_restricted()
syscall). If this interface is favorably received, we can definitely
find another way for shmem to support this interface.

If I understand correctly, SHMEM_SB(inode->i_sb)->huge checks the state
of hugepage-ness for the superblock. Since the proposed interface will
only affect a single file, we will need something closer to

bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
   pgoff_t index, bool shmem_huge_force)
{
...

if (SHMEM_I(inode)->flags & VM_HUGEPAGE)
return true;

...
}

from Hugh's patch.


Re: [RFC PATCH 1/2] mm: restrictedmem: Allow userspace to specify mount_path for memfd_restricted

2023-02-22 Thread Ackerley Tng



"Kirill A. Shutemov"  writes:


On Thu, Feb 16, 2023 at 12:41:16AM +0000, Ackerley Tng wrote:

By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.



With this patch, an optional tmpfs mount can be specified, which will
be used as the mountpoint for backing the shmem file associated with a
restrictedmem fd.



This change is modeled after how sys_open() can create an unnamed
temporary file in a given directory with O_TMPFILE.



This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage allocation hints, NUMA
binding hints, etc.



Signed-off-by: Ackerley Tng 
---
  include/linux/syscalls.h   |  2 +-
  include/uapi/linux/restrictedmem.h |  8 
  mm/restrictedmem.c | 63 +++---
  3 files changed, 66 insertions(+), 7 deletions(-)
  create mode 100644 include/uapi/linux/restrictedmem.h



diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9e9e0c820c5..4b8efe9a8680 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
 		unsigned long home_node,
 		unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, const char __user *mount_path);



  /*
   * Architecture-specific system calls



I'm not sure what the right practice is now: do we provide a string that
contains the mount path, or an fd that represents the filesystem (returned
from fsmount(2) or open_tree(2))?

An fd seems more flexible: it allows specifying detached mounts (mounts
not attached to any path).


I tried out the suggestion of passing fds to memfd_restricted() instead
of strings.

One benefit I see of using fds is interface uniformity: it feels more
aligned with other syscalls like fsopen(), fsconfig(), and fsmount() in
terms of using and passing around fds.

Other than being able to use a mount without a path attached to the
mount, are there any other benefits of using fds over using the path string?

Should I post the patches that allows specifying a mount using fds?
Should I post them as a separate RFC, or as a new revision to this RFC?
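
For concreteness, a sketch of what the fd-based flow could look like
from userspace with the new mount API (RMFD_USERMNT and the fd argument
are hypothetical and not part of this RFC):

	#include <linux/mount.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* Detached tmpfs instance configured with huge=always. */
		int fsfd = syscall(__NR_fsopen, "tmpfs", 0);

		syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "huge", "always", 0);
		syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

		/* mntfd refers to the mount without attaching it to any path. */
		int mntfd = syscall(__NR_fsmount, fsfd, 0, 0);

		/* Hypothetical: memfd_restricted(RMFD_USERMNT, mntfd); */
		return mntfd < 0;
	}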



[RFC PATCH 2/2] selftests: restrictedmem: Add selftest for RMFD_HUGEPAGE

2023-02-17 Thread Ackerley Tng
Tests that when RMFD_HUGEPAGE is specified, restrictedmem will be
backed by Transparent HugePages.

Signed-off-by: Ackerley Tng 
---
 .../restrictedmem_hugepage_test.c | 25 +++
 1 file changed, 25 insertions(+)

diff --git 
a/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c 
b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
index 0d9cf2ced754..75283d68696f 100644
--- a/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
+++ b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
@@ -180,6 +180,31 @@ TEST_F(reset_shmem_enabled, 
restrictedmem_fstat_shmem_enabled_always)
close(mfd);
 }
 
+TEST(restrictedmem_invalid_flags)
+{
+   int mfd = memfd_restricted(99, NULL);
+
+   ASSERT_EQ(-1, mfd);
+   ASSERT_EQ(EINVAL, errno);
+}
+
+TEST_F(reset_shmem_enabled, restrictedmem_rmfd_hugepage)
+{
+   int mfd = -1;
+   struct stat stat;
+
+   ASSERT_EQ(0, set_shmem_thp_policy("never"));
+
+   mfd = memfd_restricted(RMFD_HUGEPAGE, NULL);
+   ASSERT_NE(-1, mfd);
+
+   ASSERT_EQ(0, fstat(mfd, &stat));
+
+   ASSERT_EQ(stat.st_blksize, get_hpage_pmd_size());
+
+   close(mfd);
+}
+
 TEST(restrictedmem_tmpfile_no_mount_path)
 {
int mfd = memfd_restricted(RMFD_TMPFILE, NULL);
-- 
2.39.2.637.g21b0678d19-goog




[RFC PATCH 1/2] mm: restrictedmem: Add flag as THP allocation hint for memfd_restricted() syscall

2023-02-17 Thread Ackerley Tng
Allow userspace to hint the kernel to use Transparent HugePages to
back restricted memory on a per-file basis.

Signed-off-by: Ackerley Tng 
---
 include/uapi/linux/restrictedmem.h |  1 +
 mm/restrictedmem.c | 27 +--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/restrictedmem.h 
b/include/uapi/linux/restrictedmem.h
index 9f108dd1ac4c..f671ccbb43bc 100644
--- a/include/uapi/linux/restrictedmem.h
+++ b/include/uapi/linux/restrictedmem.h
@@ -4,5 +4,6 @@
 
 /* flags for memfd_restricted */
 #define RMFD_TMPFILE   0x0001U
+#define RMFD_HUGEPAGE  0x0002U
 
 #endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 97f3e2159e8b..87c829960b31 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -190,19 +190,25 @@ static struct file *restrictedmem_file_create(struct file 
*memfd)
return file;
 }
 
-static int restrictedmem_create(struct vfsmount *mount)
+static int restrictedmem_create(unsigned int flags, struct vfsmount *mount)
 {
struct file *file, *restricted_file;
int fd, err;
+   unsigned long shmem_setup_flags = VM_NORESERVE;
 
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;
 
-   if (mount)
-   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
-   else
-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (flags & RMFD_HUGEPAGE)
+   shmem_setup_flags |= VM_HUGEPAGE;
+
+   if (mount) {
+   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem",
+0, shmem_setup_flags);
+   } else {
+   file = shmem_file_setup("memfd:restrictedmem", 0, shmem_setup_flags);
+   }
 
if (IS_ERR(file)) {
err = PTR_ERR(file);
@@ -230,7 +236,8 @@ static bool is_shmem_mount(struct vfsmount *mnt)
return mnt->mnt_sb->s_magic == TMPFS_MAGIC;
 }
 
-static int restrictedmem_create_from_path(const char __user *mount_path)
+static int restrictedmem_create_from_path(unsigned int flags,
+ const char __user *mount_path)
 {
int ret;
struct path path;
@@ -250,7 +257,7 @@ static int restrictedmem_create_from_path(const char __user *mount_path)
if (unlikely(ret))
goto out;
 
-   ret = restrictedmem_create(path.mnt);
+   ret = restrictedmem_create(flags, path.mnt);
 
mnt_drop_write(path.mnt);
 out:
@@ -261,16 +268,16 @@ static int restrictedmem_create_from_path(const char __user *mount_path)
 
SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, const char __user *, mount_path)
 {
-   if (flags & ~RMFD_TMPFILE)
+   if (flags & ~(RMFD_TMPFILE | RMFD_HUGEPAGE))
return -EINVAL;
 
if (flags == RMFD_TMPFILE) {
if (!mount_path)
return -EINVAL;
 
-   return restrictedmem_create_from_path(mount_path);
+   return restrictedmem_create_from_path(flags, mount_path);
} else {
-   return restrictedmem_create(NULL);
+   return restrictedmem_create(flags, NULL);
}
 }
 
-- 
2.39.2.637.g21b0678d19-goog




[RFC PATCH 0/2] Add flag as THP allocation hint for memfd_restricted() syscall

2023-02-17 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that has
been discussed in the ‘KVM: mm: fd-based approach for supporting KVM’
patch series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/#m7e944d7892afdd1d62a03a287bd488c56e377b0c

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-rmfd-hugepage

Following the RFC to provide mount for memfd_restricted() syscall at
https://lore.kernel.org/lkml/cover.1676507663.git.ackerley...@google.com/T/#u,
this patchset adds the RMFD_HUGEPAGE flag to the memfd_restricted()
syscall, which will hint the kernel to use Transparent HugePages to
back restrictedmem pages.

This supplements the interface proposed earlier, which requires the
creation of a tmpfs mount to be passed to memfd_restricted(), with a
more direct per-file hint.

Dependencies:

+ Sean’s iteration of the ‘KVM: mm: fd-based approach for supporting
  KVM’ patch series at
  https://github.com/sean-jc/linux/tree/x86/upm_base_support
+ Proposed fix for restrictedmem_getattr() as mentioned on the mailing
  list at
  
https://lore.kernel.org/lkml/diqzzga0fv96@ackerleytng-cloudtop-sg.c.googlers.com/
+ Hugh’s patch:
  https://lore.kernel.org/lkml/c140f56a-1aa3-f7ae-b7d1-93da7d5a3...@google.com/,
  which provides functionality in shmem that reads the VM_HUGEPAGE
  flag in key functions shmem_is_huge() and shmem_get_inode()

Future work/TODOs:
+ man page for the memfd_restricted() syscall
+ Support for per file NUMA binding hints

Ackerley Tng (2):
  mm: restrictedmem: Add flag as THP allocation hint for
memfd_restricted() syscall
  selftests: restrictedmem: Add selftest for RMFD_HUGEPAGE

 include/uapi/linux/restrictedmem.h|  1 +
 mm/restrictedmem.c| 27 ---
 .../restrictedmem_hugepage_test.c | 25 +
 3 files changed, 43 insertions(+), 10 deletions(-)

--
2.39.2.637.g21b0678d19-goog



[RFC PATCH 2/2] selftests: restrictedmem: Check hugepage-ness of shmem file backing restrictedmem fd

2023-02-15 Thread Ackerley Tng
For memfd_restricted() calls without a userspace mount, the backing
file should be created on shmem's kernel-internal mount, and the size
of backing pages should be as defined by the system-wide shmem
configuration.

If a userspace mount is provided, the size of backing pages should be
as defined in the mount.
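
(The tests infer the size of backing pages from fstat()'s st_blksize on
the restrictedmem fd, comparing it against
/sys/kernel/mm/transparent_hugepage/hpage_pmd_size.)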

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  14 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 344 ++
 6 files changed, 379 insertions(+)
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index f07aef7c592c..44078eeefb79 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -60,6 +60,7 @@ TARGETS += pstore
 TARGETS += ptrace
 TARGETS += openat2
 TARGETS += resctrl
+TARGETS += restrictedmem
 TARGETS += rlimits
 TARGETS += rseq
 TARGETS += rtc
diff --git a/tools/testing/selftests/restrictedmem/.gitignore 
b/tools/testing/selftests/restrictedmem/.gitignore
new file mode 100644
index ..2581bcc8ff29
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+restrictedmem_hugepage_test
diff --git a/tools/testing/selftests/restrictedmem/Makefile 
b/tools/testing/selftests/restrictedmem/Makefile
new file mode 100644
index ..da9665718c8a
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS = $(KHDR_INCLUDES)
+
+TEST_GEN_PROGS += restrictedmem_hugepage_test
+
+include ../lib.mk
+
+EXTRA_CLEAN = $(OUTPUT)/common.o
+
+$(OUTPUT)/common.o: common.c
+   $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
+
+$(TEST_GEN_PROGS): $(OUTPUT)/common.o
diff --git a/tools/testing/selftests/restrictedmem/common.c 
b/tools/testing/selftests/restrictedmem/common.c
new file mode 100644
index ..79b2ac98cc89
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int memfd_restricted(unsigned int flags, char *mount_path)
+{
+   return syscall(__NR_memfd_restricted, flags, mount_path);
+}
diff --git a/tools/testing/selftests/restrictedmem/common.h 
b/tools/testing/selftests/restrictedmem/common.h
new file mode 100644
index ..5d59edc4f23f
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/common.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTESTS_RESTRICTEDMEM_COMMON_H
+#define SELFTESTS_RESTRICTEDMEM_COMMON_H
+
+int memfd_restricted(unsigned int flags, char *mount_path);
+
+#endif  // SELFTESTS_RESTRICTEDMEM_COMMON_H
diff --git 
a/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c 
b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
new file mode 100644
index ..0d9cf2ced754
--- /dev/null
+++ b/tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "linux/limits.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "linux/restrictedmem.h"
+
+#include "common.h"
+#include "../kselftest_harness.h"
+
+static int get_hpage_pmd_size(void)
+{
+   FILE *fp;
+   char buf[100];
+   char *ret;
+   int size;
+
+   fp = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   if (!fp)
+   return -1;
+
+   ret = fgets(buf, 100, fp);
+   if (ret != buf) {
+   size = -1;
+   goto out;
+   }
+
+   if (sscanf(buf, "%d\n", &size) != 1)
+   size = -1;
+
+out:
+   fclose(fp);
+
+   return size;
+}
+
+static bool is_valid_shmem_thp_policy(char *policy)
+{
+   if (strcmp(policy, "always") == 0)
+   return true;
+   if (strcmp(policy, "within_size") == 0)
+   return true;
+   if (strcmp(policy, "advise") == 0)
+   return true;
+   if (strcmp(policy, "never") == 0)
+   return true;
+   if (strcmp(policy, "deny") == 0)
+   return true;
+   if (strcmp(policy, "force") == 0)
+   return true;
+
+   return false;
+}
+
+static int

[RFC PATCH 0/2] Providing mount for memfd_restricted() syscall

2023-02-15 Thread Ackerley Tng
Hello,

This patchset builds upon the memfd_restricted() system call that has
been discussed in the ‘KVM: mm: fd-based approach for supporting KVM’
patch series, at
https://lore.kernel.org/lkml/20221202061347.1070246-1-chao.p.p...@linux.intel.com/T/#m7e944d7892afdd1d62a03a287bd488c56e377b0c

The tree can be found at:
https://github.com/googleprodkernel/linux-cc/tree/restrictedmem-provide-mount-path

In this patchset, a modification to the memfd_restricted() syscall is
proposed, which allows userspace to provide a mount, on which the file
will be created and returned from the memfd_restricted().

Allowing userspace to provide a mount allows userspace to control
various memory binding policies via tmpfs mount options, such as
Transparent HugePage memory allocation policy through
‘huge=always/never’ and NUMA memory allocation policy through
‘mpol=local/bind:*’.
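
For example, a sketch of the intended flow (the mount point name is
made up; RMFD_TMPFILE is introduced in patch 1):

	/*
	 * Assumes an existing tmpfs mount, e.g. one created with:
	 *   mount -t tmpfs -o huge=always tmpfs /mnt/restrictedmem
	 */
	int mfd = memfd_restricted(RMFD_TMPFILE, "/mnt/restrictedmem");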

Dependencies:
+ Sean’s iteration of the ‘KVM: mm: fd-based approach for supporting
  KVM’ patch series at
  https://github.com/sean-jc/linux/tree/x86/upm_base_support
+ Proposed fixes for these issues mentioned on the mailing list:
+ 
https://lore.kernel.org/lkml/diqzzga0fv96@ackerleytng-cloudtop-sg.c.googlers.com/

Future work/TODOs:
+ man page for the memfd_restricted() syscall
+ Support for per file Transparent HugePage allocation hints
+ Support for per file NUMA binding hints

Ackerley Tng (2):
  mm: restrictedmem: Allow userspace to specify mount_path for
memfd_restricted
  selftests: restrictedmem: Check hugepage-ness of shmem file backing
restrictedmem fd

 include/linux/syscalls.h  |   2 +-
 include/uapi/linux/restrictedmem.h|   8 +
 mm/restrictedmem.c|  63 +++-
 tools/testing/selftests/Makefile  |   1 +
 .../selftests/restrictedmem/.gitignore|   3 +
 .../testing/selftests/restrictedmem/Makefile  |  14 +
 .../testing/selftests/restrictedmem/common.c  |   9 +
 .../testing/selftests/restrictedmem/common.h  |   8 +
 .../restrictedmem_hugepage_test.c | 344 ++
 9 files changed, 445 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h
 create mode 100644 tools/testing/selftests/restrictedmem/.gitignore
 create mode 100644 tools/testing/selftests/restrictedmem/Makefile
 create mode 100644 tools/testing/selftests/restrictedmem/common.c
 create mode 100644 tools/testing/selftests/restrictedmem/common.h
 create mode 100644 tools/testing/selftests/restrictedmem/restrictedmem_hugepage_test.c

--
2.39.1.637.g21b0678d19-goog



[RFC PATCH 1/2] mm: restrictedmem: Allow userspace to specify mount_path for memfd_restricted

2023-02-15 Thread Ackerley Tng
By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, an optional tmpfs mount can be specified, which will
be used as the mountpoint for backing the shmem file associated with a
restrictedmem fd.

This change is modeled after how sys_open() can create an unnamed
temporary file in a given directory with O_TMPFILE.

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage allocation hints, NUMA
binding hints, etc.

Signed-off-by: Ackerley Tng 
---
 include/linux/syscalls.h   |  2 +-
 include/uapi/linux/restrictedmem.h |  8 
 mm/restrictedmem.c | 63 +++---
 3 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9e9e0c820c5..4b8efe9a8680 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
 asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
 		unsigned long home_node,
 		unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, const char __user *mount_path);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h 
b/include/uapi/linux/restrictedmem.h
new file mode 100644
index ..9f108dd1ac4c
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_TMPFILE   0x0001U
+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..97f3e2159e8b 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file *memfd)
return file;
 }
 
-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
 {
struct file *file, *restricted_file;
int fd, err;
 
-   if (flags)
-   return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;
 
-   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+   if (mount)
+   file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
+   else
+   file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,55 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
 }
 
+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+   return mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static int restrictedmem_create_from_path(const char __user *mount_path)
+{
+   int ret;
+   struct path path;
+
+   ret = user_path_at(AT_FDCWD, mount_path,
+  LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT,
+  &path);
+   if (ret)
+   return ret;
+
+   if (!is_shmem_mount(path.mnt)) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   ret = mnt_want_write(path.mnt);
+   if (unlikely(ret))
+   goto out;
+
+   ret = restrictedmem_create(path.mnt);
+
+   mnt_drop_write(path.mnt);
+out:
+   path_put(&path);
+
+   return ret;
+}
+
SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, const char __user *, mount_path)
+{
+   if (flags & ~RMFD_TMPFILE)
+   return -EINVAL;
+
+   if (flags == RMFD_TMPFILE) {
+   if (!mount_path)
+   return -EINVAL;
+
+   return restrictedmem_create_from_path(mount_path);
+   } else {
+   return restrictedmem_create(NULL);
+   }
+}
+
 int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
   struct restrictedmem_notifier *notifier, bool exclusive)
 {
-- 
2.39.1.637.g21b0678d19-goog




Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory

2023-01-29 Thread Ackerley Tng




+static int restrictedmem_getattr(struct user_namespace *mnt_userns,
+const struct path *path, struct kstat *stat,
+u32 request_mask, unsigned int query_flags)
+{
+   struct inode *inode = d_inode(path->dentry);
+   struct restrictedmem_data *data = inode->i_mapping->private_data;
+   struct file *memfd = data->memfd;
+
+   return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
+request_mask, query_flags);


Instead of calling shmem's getattr() with path, we should be using
the memfd's path.

Otherwise, shmem's getattr() will use restrictedmem's inode instead of
shmem's inode. The private fields will be of the wrong type, and the
host will crash when shmem_is_huge() does SHMEM_SB(inode->i_sb)->huge,
since inode->i_sb->s_fs_info is NULL for the restrictedmem superblock.

Here's the patch:

diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 37191cd9eed1..06b72d593bd8 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -84,7 +84,7 @@ static int restrictedmem_getattr(struct user_namespace *mnt_userns,

struct restrictedmem *rm = inode->i_mapping->private_data;
struct file *memfd = rm->memfd;

-   return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
+   return memfd->f_inode->i_op->getattr(mnt_userns, &memfd->f_path, stat,
 request_mask, query_flags);
 }


+}
+
+static int restrictedmem_setattr(struct user_namespace *mnt_userns,
+struct dentry *dentry, struct iattr *attr)
+{
+   struct inode *inode = d_inode(dentry);
+   struct restrictedmem_data *data = inode->i_mapping->private_data;
+   struct file *memfd = data->memfd;
+   int ret;
+
+   if (attr->ia_valid & ATTR_SIZE) {
+   if (memfd->f_inode->i_size)
+   return -EPERM;
+
+   if (!PAGE_ALIGNED(attr->ia_size))
+   return -EINVAL;
+   }
+
+   ret = memfd->f_inode->i_op->setattr(mnt_userns,
+   file_dentry(memfd), attr);
+   return ret;
+}
+
+static const struct inode_operations restrictedmem_iops = {
+   .getattr = restrictedmem_getattr,
+   .setattr = restrictedmem_setattr,
+};




Re: [PATCH v9 7/8] KVM: Handle page fault for private memory

2022-11-16 Thread Ackerley Tng
> A memslot with KVM_MEM_PRIVATE being set can include both fd-based
> private memory and hva-based shared memory. Architecture code (like TDX
> code) can tell whether the on-going fault is private or not. This patch
> adds a 'is_private' field to kvm_page_fault to indicate this and
> architecture code is expected to set it.
>
> To handle page fault for such memslot, the handling logic is different
> depending on whether the fault is private or shared. KVM checks if
> 'is_private' matches the host's view of the page (maintained in
> mem_attr_array).
>   - For a successful match, private pfn is obtained with
> restrictedmem_get_page () from private fd and shared pfn is obtained
> with existing get_user_pages().
>   - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> userspace. Userspace then can convert memory between private/shared
> in host's view and retry the fault.
>
> Co-developed-by: Yu Zhang 
> Signed-off-by: Yu Zhang 
> Signed-off-by: Chao Peng 
> ---
>  arch/x86/kvm/mmu/mmu.c  | 56 +++--
>  arch/x86/kvm/mmu/mmu_internal.h | 14 -
>  arch/x86/kvm/mmu/mmutrace.h |  1 +
>  arch/x86/kvm/mmu/spte.h |  6 
>  arch/x86/kvm/mmu/tdp_mmu.c  |  3 +-
>  include/linux/kvm_host.h| 28 +
>  6 files changed, 103 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 67a9823a8c35..10017a9f26ee 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3030,7 +3030,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
>
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
> const struct kvm_memory_slot *slot, gfn_t gfn,
> -   int max_level)
> +   int max_level, bool is_private)
>  {
>   struct kvm_lpage_info *linfo;
>   int host_level;
> @@ -3042,6 +3042,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>   break;
>   }
>
> + if (is_private)
> + return max_level;
> +
>   if (max_level == PG_LEVEL_4K)
>   return PG_LEVEL_4K;
>
> @@ -3070,7 +3073,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>* level, which will be used to do precise, accurate accounting.
>*/
>   fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> -  fault->gfn, fault->max_level);
> +  fault->gfn, fault->max_level,
> +  fault->is_private);
>   if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
>   return;
>
> @@ -4141,6 +4145,32 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
>   kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
>  }
>
> +static inline u8 order_to_level(int order)
> +{
> + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
> + return PG_LEVEL_1G;
> +
> + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
> + return PG_LEVEL_2M;
> +
> + return PG_LEVEL_4K;
> +}
> +
> +static int kvm_faultin_pfn_private(struct kvm_page_fault *fault)
> +{
> +	int order;
> +	struct kvm_memory_slot *slot = fault->slot;
> +
> +	if (kvm_restricted_mem_get_pfn(slot, fault->gfn, &fault->pfn, &order))
> +		return RET_PF_RETRY;
> +
> +	fault->max_level = min(order_to_level(order), fault->max_level);
> +	fault->map_writable = !(slot->flags & KVM_MEM_READONLY);
> +	return RET_PF_CONTINUE;
> +}
> +
> static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
>   struct kvm_memory_slot *slot = fault->slot;
> @@ -4173,6 +4203,22 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 			return RET_PF_EMULATE;
> 	}
>
> +	if (kvm_slot_can_be_private(slot) &&
> +	    fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
> +		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> +		if (fault->is_private)
> +			vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
> +		else
> +			vcpu->run->memory.flags = 0;
> +		vcpu->run->memory.padding = 0;
> +		vcpu->run->memory.gpa = fault->gfn << PAGE_SHIFT;
> +		vcpu->run->memory.size = PAGE_SIZE;
> +		return RET_PF_USER;
> +	}
> +
> +	if (fault->is_private)
> +		return kvm_faultin_pfn_private(fault);
> +

Since each memslot may also not be backed by restricted memory, we
should also check if the memslot has been set up for private memory
with

if (fault->is_private && kvm_slot_can_be_private(slot))
return