date:20230807

[PATCH v2 2/2] ocxl: use pci_find_next_dvsec_capability() to simplify the code

2023-08-07 Thread Xiongfeng Wang

PCI core add pci_find_next_dvsec_capability() to query the next DVSEC.
We can use that core API to simplify the code. Also remove the unused
macros.

Signed-off-by: Xiongfeng Wang 
Reviewed-by: Andrew Donnellan 
---
 arch/powerpc/platforms/powernv/ocxl.c | 20 ++--
 drivers/misc/ocxl/config.c| 21 ++---
 include/misc/ocxl-config.h|  4 
 3 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/ocxl.c 
b/arch/powerpc/platforms/powernv/ocxl.c
index 629067781cec..8dbc1a9535fc 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -71,29 +71,13 @@ static DEFINE_MUTEX(links_list_lock);
  * the AFUs, by pro-rating if needed.
  */
 
-static int find_dvsec_from_pos(struct pci_dev *dev, int dvsec_id, int pos)
-{
-   int vsec = pos;
-   u16 vendor, id;
-
-   while ((vsec = pci_find_next_ext_capability(dev, vsec,
-   OCXL_EXT_CAP_ID_DVSEC))) {
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
-   );
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, );
-   if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
-   return vsec;
-   }
-   return 0;
-}
-
 static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
 {
int vsec = 0;
u8 idx;
 
-   while ((vsec = find_dvsec_from_pos(dev, OCXL_DVSEC_AFU_CTRL_ID,
-  vsec))) {
+   while ((vsec = pci_find_next_dvsec_capability(dev, vsec,
+   PCI_VENDOR_ID_IBM, OCXL_DVSEC_AFU_CTRL_ID))) {
pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
);
if (idx == afu_idx)
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index 92ab49705f64..6c0fca32e6db 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -39,23 +39,14 @@ static int find_dvsec(struct pci_dev *dev, int dvsec_id)
 static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
 {
int vsec = 0;
-   u16 vendor, id;
u8 idx;
 
-   while ((vsec = pci_find_next_ext_capability(dev, vsec,
-   OCXL_EXT_CAP_ID_DVSEC))) {
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
-   );
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, );
-
-   if (vendor == PCI_VENDOR_ID_IBM &&
-   id == OCXL_DVSEC_AFU_CTRL_ID) {
-   pci_read_config_byte(dev,
-   vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
-   );
-   if (idx == afu_idx)
-   return vsec;
-   }
+   while ((vsec = pci_find_next_dvsec_capability(dev, vsec,
+   PCI_VENDOR_ID_IBM, OCXL_DVSEC_AFU_CTRL_ID))) {
+   pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+);
+   if (idx == afu_idx)
+   return vsec;
}
return 0;
 }
diff --git a/include/misc/ocxl-config.h b/include/misc/ocxl-config.h
index ccfd3b463517..40cf1b143170 100644
--- a/include/misc/ocxl-config.h
+++ b/include/misc/ocxl-config.h
@@ -10,10 +10,6 @@
  * It follows the specification for opencapi 3.0
  */
 
-#define OCXL_EXT_CAP_ID_DVSEC 0x23
-
-#define OCXL_DVSEC_VENDOR_OFFSET  0x4
-#define OCXL_DVSEC_ID_OFFSET  0x8
 #define OCXL_DVSEC_TL_ID  0xF000
 #define   OCXL_DVSEC_TL_BACKOFF_TIMERS  0x10
 #define   OCXL_DVSEC_TL_RECV_CAP0x18
-- 
2.20.1

[PATCH v2 1/2] PCI: Add pci_find_next_dvsec_capability to find next Designated VSEC

2023-08-07 Thread Xiongfeng Wang

Some devices may have several DVSEC (Designated Vendor-Specific Extended
Capability) entries with the same DVSEC ID. Add
pci_find_next_dvsec_capability() to find them all.

Signed-off-by: Xiongfeng Wang 
Reviewed-by: Andrew Donnellan 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/pci.c   | 39 ++-
 include/linux/pci.h |  2 ++
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 60230da957e0..2ff5b1ce0eec 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -749,35 +749,48 @@ u16 pci_find_vsec_capability(struct pci_dev *dev, u16 
vendor, int cap)
 EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
 
 /**
- * pci_find_dvsec_capability - Find DVSEC for vendor
+ * pci_find_next_dvsec_capability - Find next DVSEC for vendor
  * @dev: PCI device to query
+ * @start: Address at which to start looking (0 to start at beginning of list)
  * @vendor: Vendor ID to match for the DVSEC
- * @dvsec: Designated Vendor-specific capability ID
+ * @dvsec: Vendor-defined DVSEC ID
  *
- * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
- * offset in config space; otherwise return 0.
+ * Returns the address of the next DVSEC if the DVSEC has Vendor ID @vendor and
+ * DVSEC ID @dvsec; otherwise return 0. DVSEC can occur several times with the
+ * same DVSEC ID for some devices, and this provides a way to find them all.
  */
-u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
+u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 vendor,
+  u16 dvsec)
 {
-   int pos;
+   u16 pos = start;
 
-   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
-   if (!pos)
-   return 0;
-
-   while (pos) {
+   while ((pos = pci_find_next_ext_capability(dev, pos,
+ PCI_EXT_CAP_ID_DVSEC))) {
u16 v, id;
 
pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, );
pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, );
if (vendor == v && dvsec == id)
return pos;
-
-   pos = pci_find_next_ext_capability(dev, pos, 
PCI_EXT_CAP_ID_DVSEC);
}
 
return 0;
 }
+EXPORT_SYMBOL_GPL(pci_find_next_dvsec_capability);
+
+/**
+ * pci_find_dvsec_capability - Find DVSEC for vendor
+ * @dev: PCI device to query
+ * @vendor: Vendor ID to match for the DVSEC
+ * @dvsec: Vendor-defined DVSEC ID
+ *
+ * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
+ * offset in config space; otherwise return 0.
+ */
+u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
+{
+   return pci_find_next_dvsec_capability(dev, 0, vendor, dvsec);
+}
 EXPORT_SYMBOL_GPL(pci_find_dvsec_capability);
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c69a2cc1f412..82bb905daf72 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1168,6 +1168,8 @@ u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 
pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
 u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
 u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec);
+u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 vendor,
+  u16 dvsec);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
-- 
2.20.1

[PATCH v2 0/2] Introduce pci_find_next_dvsec_capability() to simplify the code

2023-08-07 Thread Xiongfeng Wang

Some devices may have several DVSEC (Designated Vendor-Specific Extended
Capability) entries with the same DVSEC ID. Introduce
pci_find_next_dvsec_capability() to simplify the code.

ChangeLog:
v1->v2:  Add Reviewed-by and Acked-by tags
 Modify commit message and document a little for the first patch

Xiongfeng Wang (2):
  PCI: Add pci_find_next_dvsec_capability to find next Designated VSEC
  ocxl: use pci_find_next_dvsec_capability() to simplify the code

 arch/powerpc/platforms/powernv/ocxl.c | 20 ++
 drivers/misc/ocxl/config.c| 21 +--
 drivers/pci/pci.c | 39 ++-
 include/linux/pci.h   |  2 ++
 include/misc/ocxl-config.h|  4 ---
 5 files changed, 36 insertions(+), 50 deletions(-)

-- 
2.20.1

[PATCH -next] ASoC: imx-audio-rpmsg: Remove redundant initialization owner in imx_audio_rpmsg_driver

2023-08-07 Thread Li Zetao

The module_rpmsg_driver() will set "THIS_MODULE" to driver.owner when
register a rpmsg_driver driver, so it is redundant initialization to set
driver.owner in the statement. Remove it for clean code.

Signed-off-by: Li Zetao 
---
 sound/soc/fsl/imx-audio-rpmsg.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sound/soc/fsl/imx-audio-rpmsg.c b/sound/soc/fsl/imx-audio-rpmsg.c
index d5234ac4b09b..289e47c03d40 100644
--- a/sound/soc/fsl/imx-audio-rpmsg.c
+++ b/sound/soc/fsl/imx-audio-rpmsg.c
@@ -116,7 +116,6 @@ static struct rpmsg_device_id imx_audio_rpmsg_id_table[] = {
 
 static struct rpmsg_driver imx_audio_rpmsg_driver = {
.drv.name   = "imx_audio_rpmsg",
-   .drv.owner  = THIS_MODULE,
.id_table   = imx_audio_rpmsg_id_table,
.probe  = imx_audio_rpmsg_probe,
.callback   = imx_audio_rpmsg_cb,
-- 
2.34.1

Re: [RFC PATCH v11 28/29] KVM: selftests: Add basic selftest for guest_memfd()

2023-08-07 Thread Ackerley Tng

Sean Christopherson  writes:

> Add a selftest to verify the basic functionality of guest_memfd():
>
> 

Here's one more test:

>From 72dc6836f01bdd613d64d4c6a4f2af8f2b777ba2 Mon Sep 17 00:00:00 2001
From: Ackerley Tng 
Date: Tue, 1 Aug 2023 18:02:50 +
Subject: [PATCH] KVM: selftests: Add tests - invalid inputs for
 KVM_CREATE_GUEST_MEMFD

Test that invalid inputs for KVM_CREATE_GUEST_MEMFD, such as
non-page-aligned page size and invalid flags, are rejected by the
KVM_CREATE_GUEST_MEMFD with EINVAL

Signed-off-by: Ackerley Tng 
---
 tools/testing/selftests/kvm/guest_memfd_test.c  | 17 +
 .../selftests/kvm/include/kvm_util_base.h   | 11 +--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c 
b/tools/testing/selftests/kvm/guest_memfd_test.c
index eb93c608a7e0..ad20f11b2d2c 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -90,6 +90,21 @@ static void test_fallocate(int fd, size_t page_size, size_t 
total_size)
TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed");
 }
 
+static void test_create_guest_memfd_invalid(struct kvm_vm *vm, size_t 
page_size)
+{
+   int fd;
+
+   /* Non-page-aligned page_size */
+   fd = __vm_create_guest_memfd(vm, 1, 0);
+   ASSERT_EQ(fd, -1);
+   ASSERT_EQ(errno, EINVAL);
+
+   /* Invalid flags */
+   fd = __vm_create_guest_memfd(vm, page_size, 99);
+   ASSERT_EQ(fd, -1);
+   ASSERT_EQ(errno, EINVAL);
+}
+
 
 int main(int argc, char *argv[])
 {
@@ -103,6 +118,8 @@ int main(int argc, char *argv[])
 
vm = vm_create_barebones();
 
+   test_create_guest_memfd_invalid(vm, page_size);
+
fd = vm_create_guest_memfd(vm, total_size, 0);
 
test_file_read_write(fd);
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h 
b/tools/testing/selftests/kvm/include/kvm_util_base.h
index 39b38c75b99c..8bdfadd72349 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -474,7 +474,8 @@ static inline uint64_t vm_get_stat(struct kvm_vm *vm, const 
char *stat_name)
 }
 
 void vm_create_irqchip(struct kvm_vm *vm);
-static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
+
+static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
uint64_t flags)
 {
struct kvm_create_guest_memfd gmem = {
@@ -482,7 +483,13 @@ static inline int vm_create_guest_memfd(struct kvm_vm *vm, 
uint64_t size,
.flags = flags,
};
 
-   int fd = __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, );
+   return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, );
+}
+
+static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
+   uint64_t flags)
+{
+   int fd = __vm_create_guest_memfd(vm, size, flags);
 
TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd));
return fd;
-- 
2.41.0.640.ga95def55d0-goog

Re: [RFC PATCH v11 28/29] KVM: selftests: Add basic selftest for guest_memfd()

2023-08-07 Thread Ackerley Tng

Sean Christopherson  writes:

> Add a selftest to verify the basic functionality of guest_memfd():
>
> + file descriptor created with the guest_memfd() ioctl does not allow
>   read/write/mmap operations
> + file size and block size as returned from fstat are as expected
> + fallocate on the fd checks that offset/length on
>   fallocate(FALLOC_FL_PUNCH_HOLE) should be page aligned
>

> 

> +
> +static void test_fallocate(int fd, size_t page_size, size_t total_size)
> +{
> + int ret;
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, total_size);
> + TEST_ASSERT(!ret, "fallocate with aligned offset and size should 
> succeed");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + page_size - 1, page_size);
> + TEST_ASSERT(ret, "fallocate with unaligned offset should fail");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size, page_size);
> + TEST_ASSERT(ret, "fallocate beginning at total_size should fail");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size + page_size, 
> page_size);
> + TEST_ASSERT(ret, "fallocate beginning at total_size should fail");

This should be

TEST_ASSERT(ret, "fallocate beginning after total_size should fail");

> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + total_size, page_size);
> + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should succeed");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + total_size + page_size, page_size);
> + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) after total_size should 
> succeed");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + page_size, page_size - 1);
> + TEST_ASSERT(ret, "fallocate with unaligned size should fail");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + page_size, page_size);
> + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) with aligned offset and size 
> should succeed");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, page_size, page_size);
> + TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed");
> +}

>

Re: [RFC PATCH v11 27/29] KVM: selftests: Expand set_memory_region_test to validate guest_memfd()

2023-08-07 Thread Ackerley Tng

Sean Christopherson  writes:

> From: Chao Peng 
>
> Expand set_memory_region_test to exercise various positive and negative
> testcases for private memory.
>
>  - Non-guest_memfd() file descriptor for private memory
>  - guest_memfd() from different VM
>  - Overlapping bindings
>  - Unaligned bindings
>
> Signed-off-by: Chao Peng 
> Co-developed-by: Ackerley Tng 
> Signed-off-by: Ackerley Tng 
> [sean: trim the testcases to remove duplicate coverage]
> Signed-off-by: Sean Christopherson 
> ---
>  .../selftests/kvm/include/kvm_util_base.h | 10 ++
>  .../selftests/kvm/set_memory_region_test.c| 99 +++
>  2 files changed, 109 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h 
> b/tools/testing/selftests/kvm/include/kvm_util_base.h
> index 334df27a6f43..39b38c75b99c 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util_base.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
> @@ -789,6 +789,16 @@ static inline struct kvm_vm *vm_create_barebones(void)
>   return vm_create(VM_SHAPE_DEFAULT);
>  }
>  

> 

> +
> +static void test_add_private_memory_region(void)
> +{
> + struct kvm_vm *vm, *vm2;
> + int memfd, i;
> +
> + pr_info("Testing ADD of KVM_MEM_PRIVATE memory regions\n");
> +
> + vm = vm_create_barebones_protected_vm();
> +
> + test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail");
> + test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail");
> +
> + memfd = kvm_memfd_alloc(MEM_REGION_SIZE, false);
> + test_invalid_guest_memfd(vm, vm->fd, 0, "Regular memfd() should fail");

This should be

test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail");

> + close(memfd);
> +
> + vm2 = vm_create_barebones_protected_vm();
> + memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0);
> + test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should 
> fail");
> +
> + vm_set_user_memory_region2(vm2, MEM_REGION_SLOT, KVM_MEM_PRIVATE,
> +MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 
> 0);
> + close(memfd);
> + kvm_vm_free(vm2);
> +
> + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0);
> + for (i = 1; i < PAGE_SIZE; i++)
> + test_invalid_guest_memfd(vm, memfd, i, "Unaligned offset should 
> fail");
> +
> + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_PRIVATE,
> +MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 
> 0);
> + close(memfd);
> +
> + kvm_vm_free(vm);
> +}
> +

>

Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-08-07 Thread Ackerley Tng

Sean Christopherson  writes:

>   

> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> + struct kvm_gmem *gmem = file->private_data;
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + /*
> +  * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> +  * reference to the file and thus no new bindings can be created, but
> +  * dereferencing the slot for existing bindings needs to be protected
> +  * against memslot updates, specifically so that unbind doesn't race
> +  * and free the memslot (kvm_gmem_get_file() will return NULL).
> +  */
> + mutex_lock(>slots_lock);
> +
> + xa_for_each(>bindings, index, slot)
> + rcu_assign_pointer(slot->gmem.file, NULL);
> +
> + synchronize_rcu();
> +
> + /*
> +  * All in-flight operations are gone and new bindings can be created.
> +  * Zap all SPTEs pointed at by this file.  Do not free the backing
> +  * memory, as its lifetime is associated with the inode, not the file.
> +  */
> + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> + mutex_unlock(>slots_lock);
> +
> + list_del(>entry);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + xa_destroy(>bindings);
> + kfree(gmem);
> +
> + kvm_put_kvm(kvm);
> +
> + return 0;
> +}
> +

> 

> +
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> +   unsigned int fd, loff_t offset)
> +{
> + loff_t size = slot->npages << PAGE_SHIFT;
> + unsigned long start, end, flags;
> + struct kvm_gmem *gmem;
> + struct inode *inode;
> + struct file *file;
> +
> + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> +
> + file = fget(fd);
> + if (!file)
> + return -EINVAL;
> +
> + if (file->f_op != _gmem_fops)
> + goto err;
> +
> + gmem = file->private_data;
> + if (gmem->kvm != kvm)
> + goto err;
> +
> + inode = file_inode(file);
> + flags = (unsigned long)inode->i_private;
> +
> + /*
> +  * For simplicity, require the offset into the file and the size of the
> +  * memslot to be aligned to the largest possible page size used to back
> +  * the file (same as the size of the file itself).
> +  */
> + if (!kvm_gmem_is_valid_size(offset, flags) ||
> + !kvm_gmem_is_valid_size(size, flags))
> + goto err;
> +
> + if (offset + size > i_size_read(inode))
> + goto err;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + start = offset >> PAGE_SHIFT;
> + end = start + slot->npages;
> +
> + if (!xa_empty(>bindings) &&
> + xa_find(>bindings, , end - 1, XA_PRESENT)) {
> + filemap_invalidate_unlock(inode->i_mapping);
> + goto err;
> + }
> +
> + /*
> +  * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> +  * be see either a NULL file or this new file, no need for them to go
> +  * away.
> +  */
> + rcu_assign_pointer(slot->gmem.file, file);
> + slot->gmem.pgoff = start;
> +
> + xa_store_range(>bindings, start, end - 1, slot, GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + /*
> +  * Drop the reference to the file, even on success.  The file pins KVM,
> +  * not the other way 'round.  Active bindings are invalidated if the
> +  * file is closed before memslots are destroyed.
> +  */
> + fput(file);
> + return 0;
> +
> +err:
> + fput(file);
> + return -EINVAL;
> +}
> +

I’d like to propose an alternative to the refcounting approach between
the gmem file and associated kvm, where we think of KVM’s memslots as
users of the gmem file.

Instead of having the gmem file pin the VM (i.e. take a refcount on
kvm), we could let memslot take a refcount on the gmem file when the
memslots are configured.

Here’s a POC patch that flips the refcounting (and modified selftests in
the next commit):
https://github.com/googleprodkernel/linux-cc/commit/7f487b029b89b9f3e9b094a721bc0772f3c8c797

One side effect of having the gmem file pin the VM is that now the gmem
file becomes sort of a false handle on the VM:

+ Closing the file destroys the file pointers in the VM and invalidates
  the pointers
+ Keeping the file open keeps the VM around in the kernel even though
  the VM fd may already be closed.

I feel that memslots form a natural way of managing usage of the gmem
file. When a memslot is created, it is using the file; hence we take a
refcount on the gmem file, and as memslots are removed, we drop
refcounts on the gmem file.

The KVM pointer is shared among all the bindings in gmem’s xarray, and we can 
enforce that a gmem file is used only with one VM:

+ When binding a memslot to the file, if a kvm

[PATCH] powerpc/pseries: Rework lppaca_shared_proc() to avoid DEBUG_PREEMPT

2023-08-07 Thread Russell Currey

lppaca_shared_proc() takes a pointer to the lppaca which is typically
accessed through get_lppaca().  With DEBUG_PREEMPT enabled, this leads
to checking if preemption is enabled, for example:

BUG: using smp_processor_id() in preemptible [] code: grep/10693
caller is lparcfg_data+0x408/0x19a0
CPU: 4 PID: 10693 Comm: grep Not tainted 6.5.0-rc3 #2
Call Trace:
dump_stack_lvl+0x154/0x200 (unreliable)
check_preemption_disabled+0x214/0x220
lparcfg_data+0x408/0x19a0
...

This isn't actually a problem however, as it does not matter which
lppaca is accessed, the shared proc state will be the same.
vcpudispatch_stats_procfs_init() already works around this by disabling
preemption, but the lparcfg code does not, erroring any time
/proc/powerpc/lparcfg is accessed with DEBUG_PREEMPT enabled.

Instead of disabling preemption on the caller side, rework
lppaca_shared_proc() to not take a pointer and instead directly access
the lppaca, bypassing any potential preemption checks.

Fixes: f13c13a00512 ("powerpc: Stop using non-architected shared_proc field in 
lppaca")
Signed-off-by: Russell Currey 
---
Fixes tag might be a bit overkill.
---
 arch/powerpc/include/asm/lppaca.h|  9 -
 arch/powerpc/include/asm/paca.h  |  5 +
 arch/powerpc/platforms/pseries/lpar.c| 10 +-
 arch/powerpc/platforms/pseries/lparcfg.c |  4 ++--
 arch/powerpc/platforms/pseries/setup.c   |  2 +-
 drivers/cpuidle/cpuidle-pseries.c|  8 +---
 6 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/lppaca.h 
b/arch/powerpc/include/asm/lppaca.h
index 34d44cb17c87..c12e1a6e3595 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -127,7 +127,14 @@ struct lppaca {
  */
 #define LPPACA_OLD_SHARED_PROC 2
 
-static inline bool lppaca_shared_proc(struct lppaca *l)
+/*
+ * All CPUs should have the same shared proc value, so directly access the PACA
+ * to avoid false positives from DEBUG_PREEMPT.
+ *
+ * local_paca can't be referenced directly from lppaca.h, hence the macro.
+ */
+#define lppaca_shared_proc() (__lppaca_shared_proc(local_paca->lppaca_ptr))
+static inline bool __lppaca_shared_proc(struct lppaca *l)
 {
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return false;
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index cb325938766a..f77337b92ccf 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -49,6 +49,11 @@ extern unsigned int debug_smp_processor_id(void); /* from 
linux/smp.h */
 
 #ifdef CONFIG_PPC_PSERIES
 #define get_lppaca()   (get_paca()->lppaca_ptr)
+/*
+ * All CPUs should have the same shared proc value, so directly access the PACA
+ * to avoid false positives from DEBUG_PREEMPT.
+ */
+#define lppaca_shared_proc() (__lppaca_shared_proc(local_paca->lppaca_ptr))
 #endif
 
 #define get_slb_shadow()   (get_paca()->slb_shadow_ptr)
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 2eab323f6970..cb2f1211f7eb 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -639,16 +639,8 @@ static const struct proc_ops 
vcpudispatch_stats_freq_proc_ops = {
 
 static int __init vcpudispatch_stats_procfs_init(void)
 {
-   /*
-* Avoid smp_processor_id while preemptible. All CPUs should have
-* the same value for lppaca_shared_proc.
-*/
-   preempt_disable();
-   if (!lppaca_shared_proc(get_lppaca())) {
-   preempt_enable();
+   if (!lppaca_shared_proc())
return 0;
-   }
-   preempt_enable();
 
if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
_stats_proc_ops))
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
b/arch/powerpc/platforms/pseries/lparcfg.c
index 8acc70509520..1c151d77e74b 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -206,7 +206,7 @@ static void parse_ppp_data(struct seq_file *m)
   ppp_data.active_system_procs);
 
/* pool related entries are appropriate for shared configs */
-   if (lppaca_shared_proc(get_lppaca())) {
+   if (lppaca_shared_proc()) {
unsigned long pool_idle_time, pool_procs;
 
seq_printf(m, "pool=%d\n", ppp_data.pool_num);
@@ -560,7 +560,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
   partition_potential_processors);
 
seq_printf(m, "shared_processor_mode=%d\n",
-  lppaca_shared_proc(get_lppaca()));
+  lppaca_shared_proc());
 
 #ifdef CONFIG_PPC_64S_HASH_MMU
if (!radix_enabled())
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index

[PATCH mm-unstable v9 31/31] mm: Remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers

2023-08-07 Thread Vishal Moola (Oracle)

These functions are no longer necessary. Remove them and cleanup
Documentation referencing them.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 Documentation/mm/split_page_table_lock.rst| 12 +--
 .../zh_CN/mm/split_page_table_lock.rst| 14 ++---
 include/linux/mm.h| 20 ---
 3 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/Documentation/mm/split_page_table_lock.rst 
b/Documentation/mm/split_page_table_lock.rst
index a834fad9de12..e4f6972eb6c0 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -58,7 +58,7 @@ Support of split page table lock by an architecture
 ===
 
 There's no need in special enabling of PTE split page table lock: everything
-required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
 must be called on PTE table allocation / freeing.
 
 Make sure the architecture doesn't use slab allocator for page table
@@ -68,8 +68,8 @@ This field shares storage with page->ptl.
 PMD split lock only makes sense if you have more than two page table
 levels.
 
-PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
-allocation and pgtable_pmd_page_dtor() on freeing.
+PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
+allocation and pagetable_pmd_dtor() on freeing.
 
 Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
 pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
@@ -77,7 +77,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
 
 With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
 
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+NOTE: pagetable_pte_ctor() and pagetable_pmd_ctor() can fail -- it must
 be handled properly.
 
 page->ptl
@@ -97,7 +97,7 @@ trick:
split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
one more cache line for indirect access;
 
-The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
-pgtable_pmd_page_ctor() for PMD table.
+The spinlock_t allocated in pagetable_pte_ctor() for PTE table and in
+pagetable_pmd_ctor() for PMD table.
 
 Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst 
b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
index 4fb7aa666037..a2c288670a24 100644
--- a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -56,16 +56,16 @@ Hugetlb特定的辅助函数:
 架构对分页表锁的支持
 
 
-没有必要特别启用PTE分页表锁：所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成，它们必须在PTE表分配/释放时被调用。
+没有必要特别启用PTE分页表锁：所有需要的东西都由pagetable_pte_ctor()
+和pagetable_pte_dtor()完成，它们必须在PTE表分配/释放时被调用。
 
 确保架构不使用slab分配器来分配页表：slab使用page->slab_cache来分配其页
 面。这个区域与page->ptl共享存储。
 
 PMD分页锁只有在你有两个以上的页表级别时才有意义。
 
-启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor()，在释放时调
-用pgtable_pmd_page_dtor()。
+启用PMD分页锁需要在PMD表分配时调用pagetable_pmd_ctor()，在释放时调
+用pagetable_pmd_dtor()。
 
 分配通常发生在pmd_alloc_one()中，释放发生在pmd_free()和pmd_free_tlb()
 中，但要确保覆盖所有的PMD表分配/释放路径：即X86_PAE在pgd_alloc()中预先
@@ -73,7 +73,7 @@ PMD分页锁只有在你有两个以上的页表级别时才有意义。
 
 一切就绪后，你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
 
-注意：pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
+注意：pagetable_pte_ctor()和pagetable_pmd_ctor()可能失败--必
 须正确处理。
 
 page->ptl
@@ -90,7 +90,7 @@ page->ptl用于访问分割页表锁，其中'page'是包含该表的页面struc
的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
情况下使用分页锁，但由于间接访问而多花了一个缓存行。
 
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中，PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
+PTE表的spinlock_t分配在pagetable_pte_ctor()中，PMD表的spinlock_t
+分配在pagetable_pmd_ctor()中。
 
 请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6310e0c59efe..6a95dfed4957 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2932,11 +2932,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc 
*ptdesc)
return true;
 }
 
-static inline bool pgtable_pte_page_ctor(struct page *page)
-{
-   return pagetable_pte_ctor(page_ptdesc(page));
-}
-
 static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
 {
struct folio *folio = ptdesc_folio(ptdesc);
@@ -2946,11 +2941,6 @@ static inline void pagetable_pte_dtor(struct ptdesc 
*ptdesc)
lruvec_stat_sub_folio(folio, NR_PAGETABLE);
 }
 
-static inline void pgtable_pte_page_dtor(struct page *page)
-{
-   pagetable_pte_dtor(page_ptdesc(page));
-}
-
 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
 static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
 {
@@ -3057,11 +3047,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc 
*ptdesc)
return true;
 }

[PATCH mm-unstable v9 30/31] um: Convert {pmd, pte}_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents. Also cleans up some spacing issues.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/um/include/asm/pgalloc.h | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 8ec7cd46dd96..de5e31c64793 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -25,19 +25,19 @@
  */
 extern pgd_t *pgd_alloc(struct mm_struct *);
 
-#define __pte_free_tlb(tlb,pte, address)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb),(pte));   \
+#define __pte_free_tlb(tlb, pte, address)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #ifdef CONFIG_3_LEVEL_PGTABLES
 
-#define __pmd_free_tlb(tlb, pmd, address)  \
-do {   \
-   pgtable_pmd_page_dtor(virt_to_page(pmd));   \
-   tlb_remove_page((tlb),virt_to_page(pmd));   \
-} while (0)\
+#define __pmd_free_tlb(tlb, pmd, address)  \
+do {   \
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));\
+   tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \
+} while (0)
 
 #endif
 
-- 
2.40.1

[PATCH mm-unstable v9 29/31] sparc: Convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable pte constructor/destructors with
ptdesc equivalents.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/sparc/mm/srmmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index 13f027afc875..8393faa3e596 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm)
return NULL;
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(>page_table_lock);
-   if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) {
+   if (page_ref_inc_return(page) == 2 &&
+   !pagetable_pte_ctor(page_ptdesc(page))) {
page_ref_dec(page);
ptep = NULL;
}
@@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep)
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(>page_table_lock);
if (page_ref_dec_return(page) == 1)
-   pgtable_pte_page_dtor(page);
+   pagetable_pte_dtor(page_ptdesc(page));
spin_unlock(>page_table_lock);
 
srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE);
-- 
2.40.1

[PATCH mm-unstable v9 28/31] sparc64: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/sparc/mm/init_64.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 680ef206565c..f83017992eaa 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2907,14 +2907,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 
 pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-   struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-   if (!page)
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(page)) {
-   __free_page(page);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
-   return (pte_t *) page_address(page);
+   return ptdesc_address(ptdesc);
 }
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
@@ -2924,10 +2925,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 static void __pte_free(pgtable_t pte)
 {
-   struct page *page = virt_to_page(pte);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pte);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 void pte_free(struct mm_struct *mm, pgtable_t pte)
-- 
2.40.1

[PATCH mm-unstable v9 27/31] sh: Convert pte_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents. Also cleans up some spacing issues.

Reviewed-by: Geert Uytterhoeven 
Acked-by: John Paul Adrian Glaubitz 
Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/sh/include/asm/pgalloc.h | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index a9e98233c4d4..5d8577ab1591 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,6 +2,7 @@
 #ifndef __ASM_SH_PGALLOC_H
 #define __ASM_SH_PGALLOC_H
 
+#include 
 #include 
 
 #define __HAVE_ARCH_PMD_ALLOC_ONE
@@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t 
*pmd,
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
 }
 
-#define __pte_free_tlb(tlb,pte,addr)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif /* __ASM_SH_PGALLOC_H */
-- 
2.40.1

[PATCH mm-unstable v9 26/31] riscv: Convert alloc_{pmd, pte}_late() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Palmer Dabbelt 
Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/riscv/include/asm/pgalloc.h |  8 
 arch/riscv/mm/init.c | 16 ++--
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 59dc12b5b7e8..d169a4f41a2e 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-#define __pte_free_tlb(tlb, pte, buf)   \
-do {\
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, buf)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 #endif /* CONFIG_MMU */
 
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 9ce504737d18..430a3d05a841 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -353,12 +353,10 @@ static inline phys_addr_t __init 
alloc_pte_fixmap(uintptr_t va)
 
 static phys_addr_t __init alloc_pte_late(uintptr_t va)
 {
-   unsigned long vaddr;
-
-   vaddr = __get_free_page(GFP_KERNEL);
-   BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr)));
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
-   return __pa(vaddr);
+   BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc));
+   return __pa((pte_t *)ptdesc_address(ptdesc));
 }
 
 static void __init create_pte_mapping(pte_t *ptep,
@@ -436,12 +434,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
 
 static phys_addr_t __init alloc_pmd_late(uintptr_t va)
 {
-   unsigned long vaddr;
-
-   vaddr = __get_free_page(GFP_KERNEL);
-   BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr)));
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
-   return __pa(vaddr);
+   BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc));
+   return __pa((pmd_t *)ptdesc_address(ptdesc));
 }
 
 static void __init create_pmd_mapping(pmd_t *pmdp,
-- 
2.40.1

[PATCH mm-unstable v9 25/31] openrisc: Convert __pte_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/openrisc/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/openrisc/include/asm/pgalloc.h 
b/arch/openrisc/include/asm/pgalloc.h
index b7b2b8d16fad..c6a73772a546 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
-#define __pte_free_tlb(tlb, pte, addr) \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif
-- 
2.40.1

[PATCH mm-unstable v9 24/31] nios2: Convert __pte_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Acked-by: Mike Rapoport (IBM) 
Acked-by: Dinh Nguyen 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/nios2/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index ecd1657bb2ce..ce6bb8e74271 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t 
*pmd,
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
-#define __pte_free_tlb(tlb, pte, addr) \
-   do {\
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+   do {\
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
} while (0)
 
 #endif /* _ASM_NIOS2_PGALLOC_H */
-- 
2.40.1

[PATCH mm-unstable v9 23/31] mips: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/mips/include/asm/pgalloc.h | 32 ++--
 arch/mips/mm/pgtable.c  |  8 +---
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index f72e737dda21..40e40a7eb94a 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-   free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
+   pagetable_free(virt_to_ptdesc(pgd));
 }
 
-#define __pte_free_tlb(tlb,pte,address)\
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, address)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -65,18 +65,18 @@ do {
\
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pmd_t *pmd;
-   struct page *pg;
+   struct ptdesc *ptdesc;
 
-   pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
-   if (!pg)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
+   if (!ptdesc)
return NULL;
 
-   if (!pgtable_pmd_page_ctor(pg)) {
-   __free_pages(pg, PMD_TABLE_ORDER);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pmd = (pmd_t *)page_address(pg);
+   pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
 }
@@ -90,10 +90,14 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long address)
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pud_t *pud;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
+   PUD_TABLE_ORDER);
 
-   pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER);
-   if (pud)
-   pud_init(pud);
+   if (!ptdesc)
+   return NULL;
+   pud = ptdesc_address(ptdesc);
+
+   pud_init(pud);
return pud;
 }
 
diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c
index b13314be5d0e..1506e458040d 100644
--- a/arch/mips/mm/pgtable.c
+++ b/arch/mips/mm/pgtable.c
@@ -10,10 +10,12 @@
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-   pgd_t *ret, *init;
+   pgd_t *init, *ret = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM,
+   PGD_TABLE_ORDER);
 
-   ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
-   if (ret) {
+   if (ptdesc) {
+   ret = ptdesc_address(ptdesc);
init = pgd_offset(_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
-- 
2.40.1

[PATCH mm-unstable v9 22/31] m68k: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Mike Rapoport (IBM) 
Acked-by: Geert Uytterhoeven 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/m68k/include/asm/mcf_pgalloc.h  | 47 ++--
 arch/m68k/include/asm/sun3_pgalloc.h |  8 ++---
 arch/m68k/mm/motorola.c  |  4 +--
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/arch/m68k/include/asm/mcf_pgalloc.h 
b/arch/m68k/include/asm/mcf_pgalloc.h
index 5c2c0a864524..302c5bf67179 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -5,22 +5,22 @@
 #include 
 #include 
 
-extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-   free_page((unsigned long) pte);
+   pagetable_free(virt_to_ptdesc(pte));
 }
 
 extern const char bad_pmd_string[];
 
-extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
-   unsigned long page = __get_free_page(GFP_DMA);
+   struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_ZERO) &
+   ~__GFP_HIGHMEM, 0);
 
-   if (!page)
+   if (!ptdesc)
return NULL;
 
-   memset((void *)page, 0, PAGE_SIZE);
-   return (pte_t *) (page);
+   return ptdesc_address(ptdesc);
 }
 
 extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
@@ -35,36 +35,34 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned 
long address)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
  unsigned long address)
 {
-   struct page *page = virt_to_page(pgtable);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-   struct page *page = alloc_pages(GFP_DMA, 0);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0);
pte_t *pte;
 
-   if (!page)
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(page)) {
-   __free_page(page);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pte = page_address(page);
-   clear_page(pte);
-
+   pte = ptdesc_address(ptdesc);
return pte;
 }
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
 {
-   struct page *page = virt_to_page(pgtable);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 /*
@@ -75,16 +73,19 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t 
pgtable)
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-   free_page((unsigned long) pgd);
+   pagetable_free(virt_to_ptdesc(pgd));
 }
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
pgd_t *new_pgd;
+   struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_NOWARN) &
+   ~__GFP_HIGHMEM, 0);
 
-   new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN);
-   if (!new_pgd)
+   if (!ptdesc)
return NULL;
+   new_pgd = ptdesc_address(ptdesc);
+
memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT);
return new_pgd;
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h 
b/arch/m68k/include/asm/sun3_pgalloc.h
index 198036aff519..ff48573db2c0 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -17,10 +17,10 @@
 
 extern const char bad_pmd_string[];
 
-#define __pte_free_tlb(tlb,pte,addr)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t 
*pte)
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index 8bca46e51e94..c1761d309fc6 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -161,7 +161,7 @@

[PATCH mm-unstable v9 21/31] loongarch: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/loongarch/include/asm/pgalloc.h | 27 +++
 arch/loongarch/mm/pgtable.c  |  7 ---
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/arch/loongarch/include/asm/pgalloc.h 
b/arch/loongarch/include/asm/pgalloc.h
index af1d1e4a6965..23f5b1107246 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -45,9 +45,9 @@ extern void pagetable_init(void);
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 #define __pte_free_tlb(tlb, pte, address)  \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -55,18 +55,18 @@ do {
\
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pmd_t *pmd;
-   struct page *pg;
+   struct ptdesc *ptdesc;
 
-   pg = alloc_page(GFP_KERNEL_ACCOUNT);
-   if (!pg)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0);
+   if (!ptdesc)
return NULL;
 
-   if (!pgtable_pmd_page_ctor(pg)) {
-   __free_page(pg);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pmd = (pmd_t *)page_address(pg);
+   pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
 }
@@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long address)
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pud_t *pud;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
-   pud = (pud_t *) __get_free_page(GFP_KERNEL);
-   if (pud)
-   pud_init(pud);
+   if (!ptdesc)
+   return NULL;
+   pud = ptdesc_address(ptdesc);
+
+   pud_init(pud);
return pud;
 }
 
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index 1260cf30e3ee..b14343e211b6 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -11,10 +11,11 @@
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-   pgd_t *ret, *init;
+   pgd_t *init, *ret = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
-   ret = (pgd_t *) __get_free_page(GFP_KERNEL);
-   if (ret) {
+   if (ptdesc) {
+   ret = (pgd_t *)ptdesc_address(ptdesc);
init = pgd_offset(_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
-- 
2.40.1

[PATCH mm-unstable v9 20/31] hexagon: Convert __pte_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/hexagon/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/hexagon/include/asm/pgalloc.h 
b/arch/hexagon/include/asm/pgalloc.h
index f0c47e6a7427..55988625e6fb 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct 
*mm, pmd_t *pmd,
max_kernel_seg = pmdindex;
 }
 
-#define __pte_free_tlb(tlb, pte, addr) \
-do {   \
-   pgtable_pte_page_dtor((pte));   \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor((page_ptdesc(pte))); \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif
-- 
2.40.1

[PATCH mm-unstable v9 19/31] csky: Convert __pte_free_tlb() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Acked-by: Guo Ren 
Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/csky/include/asm/pgalloc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 7d57e5da0914..9c84c9012e53 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 #define __pte_free_tlb(tlb, pte, address)  \
 do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page(tlb, pte);  \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc(tlb, page_ptdesc(pte));  \
 } while (0)
 
 extern void pagetable_init(void);
-- 
2.40.1

[PATCH mm-unstable v9 18/31] arm64: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Acked-by: Mike Rapoport (IBM) 
Acked-by: Catalin Marinas 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/arm64/include/asm/tlb.h | 14 --
 arch/arm64/mm/mmu.c  |  7 ---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..2c29239d05c3 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
  unsigned long addr)
 {
-   pgtable_pte_page_dtor(pte);
-   tlb_remove_table(tlb, pte);
+   struct ptdesc *ptdesc = page_ptdesc(pte);
+
+   pagetable_pte_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
  unsigned long addr)
 {
-   struct page *page = virt_to_page(pmdp);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
 
-   pgtable_pmd_page_dtor(page);
-   tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 #endif
 
@@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, 
pmd_t *pmdp,
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
  unsigned long addr)
 {
-   tlb_remove_table(tlb, virt_to_page(pudp));
+   tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp));
 }
 #endif
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 95d360805f8a..47781bec6171 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
 static phys_addr_t pgd_pgtable_alloc(int shift)
 {
phys_addr_t pa = __pgd_pgtable_alloc(shift);
+   struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
 
/*
 * Call proper page table ctor in case later we need to
@@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
 * this pre-allocated page table.
 *
 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
-* folded, and if so pgtable_pmd_page_ctor() becomes nop.
+* folded, and if so pagetable_pte_ctor() becomes nop.
 */
if (shift == PAGE_SHIFT)
-   BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
+   BUG_ON(!pagetable_pte_ctor(ptdesc));
else if (shift == PMD_SHIFT)
-   BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
+   BUG_ON(!pagetable_pmd_ctor(ptdesc));
 
return pa;
 }
-- 
2.40.1

[PATCH mm-unstable v9 17/31] arm: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

late_alloc() also uses the __get_free_pages() helper function. Convert
this to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/arm/include/asm/tlb.h | 12 +++-
 arch/arm/mm/mmu.c  |  7 ---
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..f40d06ad5d2a 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table)
 static inline void
 __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
 {
-   pgtable_pte_page_dtor(pte);
+   struct ptdesc *ptdesc = page_ptdesc(pte);
+
+   pagetable_pte_dtor(ptdesc);
 
 #ifndef CONFIG_ARM_LPAE
/*
@@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, 
unsigned long addr)
__tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE);
 #endif
 
-   tlb_remove_table(tlb, pte);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 
 static inline void
 __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
 {
 #ifdef CONFIG_ARM_LPAE
-   struct page *page = virt_to_page(pmdp);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
 
-   pgtable_pmd_page_dtor(page);
-   tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 #endif
 }
 
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index c9981c23e8e9..674ed71573a8 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -737,11 +737,12 @@ static void __init *early_alloc(unsigned long sz)
 
 static void *__init late_alloc(unsigned long sz)
 {
-   void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz));
+   void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM,
+   get_order(sz));
 
-   if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr)))
+   if (!ptdesc || !pagetable_pte_ctor(ptdesc))
BUG();
-   return ptr;
+   return ptdesc_to_virt(ptdesc);
 }
 
 static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
-- 
2.40.1

[PATCH mm-unstable v9 16/31] pgalloc: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/asm-generic/pgalloc.h | 88 +--
 1 file changed, 52 insertions(+), 36 deletions(-)

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index a7cf825befae..c75d4a753849 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -8,7 +8,7 @@
 #define GFP_PGTABLE_USER   (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
 
 /**
- * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
  * @mm: the mm_struct of the current context
  *
  * This function is intended for architectures that need
@@ -18,12 +18,17 @@
  */
 static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
 {
-   return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL &
+   ~__GFP_HIGHMEM, 0);
+
+   if (!ptdesc)
+   return NULL;
+   return ptdesc_address(ptdesc);
 }
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
 /**
- * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
  * @mm: the mm_struct of the current context
  *
  * Return: pointer to the allocated memory or %NULL on error
@@ -35,40 +40,40 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct 
*mm)
 #endif
 
 /**
- * pte_free_kernel - free PTE-level kernel page table page
+ * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
  * @pte: pointer to the memory containing the page table
  */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-   free_page((unsigned long)pte);
+   pagetable_free(virt_to_ptdesc(pte));
 }
 
 /**
- * __pte_alloc_one - allocate a page for PTE-level user page table
+ * __pte_alloc_one - allocate memory for a PTE-level user page table
  * @mm: the mm_struct of the current context
  * @gfp: GFP flags to use for the allocation
  *
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
  *
  * This function is intended for architectures that need
  * anything beyond simple page allocation or must have custom GFP flags.
  *
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
  */
 static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
 {
-   struct page *pte;
+   struct ptdesc *ptdesc;
 
-   pte = alloc_page(gfp);
-   if (!pte)
+   ptdesc = pagetable_alloc(gfp, 0);
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(pte)) {
-   __free_page(pte);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   return pte;
+   return ptdesc_page(ptdesc);
 }
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE
@@ -76,9 +81,9 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, 
gfp_t gfp)
  * pte_alloc_one - allocate a page for PTE-level user page table
  * @mm: the mm_struct of the current context
  *
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
  *
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
  */
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
@@ -92,14 +97,16 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
  */
 
 /**
- * pte_free - free PTE-level user page table page
+ * pte_free - free PTE-level user page table memory
  * @mm: the mm_struct of the current context
- * @pte_page: the `struct page` representing the page table
+ * @pte_page: the `struct page` referencing the ptdesc
  */
 static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
 {
-   pgtable_pte_page_dtor(pte_page);
-   __free_page(pte_page);
+   struct ptdesc *ptdesc = page_ptdesc(pte_page);
+
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 
@@ -107,10 +114,11 @@ static inline void pte_free(struct mm_struct *mm, struct 
page *pte_page)
 
 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE
 /**
- * pmd_alloc_one - allocate a page for PMD-level page table
+ * pmd_alloc_one - allocate memory for a PMD-level page table
  * @mm: the mm_struct of the current context
  *
- * Allocates a page and runs the

[PATCH mm-unstable v9 15/31] mm: remove page table members from struct page

2023-08-07 Thread Vishal Moola (Oracle)

The page table members are now split out into their own ptdesc struct.
Remove them from struct page.

Signed-off-by: Vishal Moola (Oracle) 
Acked-by: Mike Rapoport (IBM) 
---
 include/linux/mm_types.h | 21 -
 1 file changed, 21 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ea34b22b4cbf..f5ba5b0bc836 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -141,24 +141,6 @@ struct page {
struct {/* Tail pages of compound page */
unsigned long compound_head;/* Bit zero is set */
};
-   struct {/* Page table pages */
-   unsigned long _pt_pad_1;/* compound_head */
-   pgtable_t pmd_huge_pte; /* protected by page->ptl */
-   /*
-* A PTE page table page might be freed by use of
-* rcu_head: which overlays those two fields above.
-*/
-   unsigned long _pt_pad_2;/* mapping */
-   union {
-   struct mm_struct *pt_mm; /* x86 pgds only */
-   atomic_t pt_frag_refcount; /* powerpc */
-   };
-#if ALLOC_SPLIT_PTLOCKS
-   spinlock_t *ptl;
-#else
-   spinlock_t ptl;
-#endif
-   };
struct {/* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
@@ -454,10 +436,7 @@ struct ptdesc {
 TABLE_MATCH(flags, __page_flags);
 TABLE_MATCH(compound_head, pt_list);
 TABLE_MATCH(compound_head, _pt_pad_1);
-TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
 TABLE_MATCH(mapping, __page_mapping);
-TABLE_MATCH(pt_mm, pt_mm);
-TABLE_MATCH(ptl, ptl);
 TABLE_MATCH(rcu_head, pt_rcu_head);
 TABLE_MATCH(page_type, __page_type);
 TABLE_MATCH(_refcount, _refcount);
-- 
2.40.1

[PATCH mm-unstable v9 14/31] s390: Convert various pgalloc functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/s390/include/asm/pgalloc.h |   4 +-
 arch/s390/include/asm/tlb.h |   4 +-
 arch/s390/mm/pgalloc.c  | 128 
 3 files changed, 69 insertions(+), 67 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 89a9d5ef94f8..376b4b23bdaa 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long vmaddr)
if (!table)
return NULL;
crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
-   if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+   if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) {
crst_table_free(mm, table);
return NULL;
}
@@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
if (mm_pmd_folded(mm))
return;
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
crst_table_free(mm, (unsigned long *) pmd);
 }
 
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index b91f4a9b044c..383b1f91442c 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, 
pmd_t *pmd,
 {
if (mm_pmd_folded(tlb->mm))
return;
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_puds = 1;
-   tlb_remove_table(tlb, pmd);
+   tlb_remove_ptdesc(tlb, pmd);
 }
 
 /*
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index d7374add7820..07fc660a24aa 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl);
 
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
-   struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
 
-   if (!page)
+   if (!ptdesc)
return NULL;
-   arch_set_page_dat(page, CRST_ALLOC_ORDER);
-   return (unsigned long *) page_to_virt(page);
+   arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
+   return (unsigned long *) ptdesc_to_virt(ptdesc);
 }
 
 void crst_table_free(struct mm_struct *mm, unsigned long *table)
 {
-   free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+   pagetable_free(virt_to_ptdesc(table));
 }
 
 static void __crst_table_upgrade(void *arg)
@@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, 
unsigned int bits)
 
 struct page *page_table_alloc_pgste(struct mm_struct *mm)
 {
-   struct page *page;
+   struct ptdesc *ptdesc;
u64 *table;
 
-   page = alloc_page(GFP_KERNEL);
-   if (page) {
-   table = (u64 *)page_to_virt(page);
+   ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+   if (ptdesc) {
+   table = (u64 *)ptdesc_to_virt(ptdesc);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
-   return page;
+   return ptdesc_page(ptdesc);
 }
 
 void page_table_free_pgste(struct page *page)
 {
-   __free_page(page);
+   pagetable_free(page_ptdesc(page));
 }
 
 #endif /* CONFIG_PGSTE */
@@ -242,7 +242,7 @@ void page_table_free_pgste(struct page *page)
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
unsigned long *table;
-   struct page *page;
+   struct ptdesc *ptdesc;
unsigned int mask, bit;
 
/* Try to get a fragment of a 4K page as a 2K page table */
@@ -250,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = NULL;
spin_lock_bh(>context.lock);
if (!list_empty(>context.pgtable_list)) {
-   page = list_first_entry(>context.pgtable_list,
-   struct page, lru);
-   mask = atomic_read(>_refcount) >> 24;
+   ptdesc = list_first_entry(>context.pgtable_list,
+   struct ptdesc, pt_list);
+   mask = atomic_read(>_refcount) >> 24;
/*
 * The pending removal bits must also be checked.

[PATCH mm-unstable v9 13/31] x86: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

In order to split struct ptdesc from struct page, convert various
functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/x86/mm/pgtable.c | 47 ++-
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 15a8009a4480..d3a93e8766ee 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -52,7 +52,7 @@ early_param("userpte", setup_userpte);
 
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
-   pgtable_pte_page_dtor(pte);
+   pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
paravirt_tlb_remove_table(tlb, pte);
 }
@@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 #if CONFIG_PGTABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-   struct page *page = virt_to_page(pmd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
 * NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
 #endif
-   pgtable_pmd_page_dtor(page);
-   paravirt_tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
@@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 
 static inline void pgd_list_add(pgd_t *pgd)
 {
-   struct page *page = virt_to_page(pgd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
 
-   list_add(>lru, _list);
+   list_add(>pt_list, _list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-   struct page *page = virt_to_page(pgd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
 
-   list_del(>lru);
+   list_del(>pt_list);
 }
 
 #define UNSHARED_PTRS_PER_PGD  \
@@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd)
 
 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 {
-   virt_to_page(pgd)->pt_mm = mm;
+   virt_to_ptdesc(pgd)->pt_mm = mm;
 }
 
 struct mm_struct *pgd_page_get_mm(struct page *page)
 {
-   return page->pt_mm;
+   return page_ptdesc(page)->pt_mm;
 }
 
 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
@@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, 
pmd_t *pmd)
 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
 {
int i;
+   struct ptdesc *ptdesc;
 
for (i = 0; i < count; i++)
if (pmds[i]) {
-   pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
-   free_page((unsigned long)pmds[i]);
+   ptdesc = virt_to_ptdesc(pmds[i]);
+
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
mm_dec_nr_pmds(mm);
}
 }
@@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t 
*pmds[], int count)
 
if (mm == _mm)
gfp &= ~__GFP_ACCOUNT;
+   gfp &= ~__GFP_HIGHMEM;
 
for (i = 0; i < count; i++) {
-   pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
-   if (!pmd)
+   pmd_t *pmd = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
+
+   if (!ptdesc)
failed = true;
-   if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
-   free_page((unsigned long)pmd);
-   pmd = NULL;
+   if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
+   ptdesc = NULL;
failed = true;
}
-   if (pmd)
+   if (ptdesc) {
mm_inc_nr_pmds(mm);
+   pmd = ptdesc_address(ptdesc);
+   }
+
pmds[i] = pmd;
}
 
@@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 
free_page((unsigned long)pmd_sv);
 
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
free_page((unsigned long)pmd);
 
return 1;
-- 
2.40.1

[PATCH mm-unstable v9 12/31] powerpc: Convert various functions to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

In order to split struct ptdesc from struct page, convert various
functions to use ptdescs.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/powerpc/mm/book3s64/mmu_context.c | 10 ++---
 arch/powerpc/mm/book3s64/pgtable.c | 32 +++---
 arch/powerpc/mm/pgtable-frag.c | 58 +-
 3 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/mmu_context.c 
b/arch/powerpc/mm/book3s64/mmu_context.c
index c766e4c26e42..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx)
 static void pmd_frag_destroy(void *pmd_frag)
 {
int count;
-   struct page *page;
+   struct ptdesc *ptdesc;
 
-   page = virt_to_page(pmd_frag);
+   ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
-   if (atomic_sub_and_test(PMD_FRAG_NR - count, >pt_frag_refcount)) {
-   pgtable_pmd_page_dtor(page);
-   __free_page(page);
+   if (atomic_sub_and_test(PMD_FRAG_NR - count, 
>pt_frag_refcount)) {
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 75b938268b04..1498ccd08367 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -384,22 +384,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
 static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
 {
void *ret = NULL;
-   struct page *page;
+   struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
 
if (mm == _mm)
gfp &= ~__GFP_ACCOUNT;
-   page = alloc_page(gfp);
-   if (!page)
+   ptdesc = pagetable_alloc(gfp, 0);
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pmd_page_ctor(page)) {
-   __free_pages(page, 0);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   atomic_set(>pt_frag_refcount, 1);
+   atomic_set(>pt_frag_refcount, 1);
 
-   ret = page_address(page);
+   ret = ptdesc_address(ptdesc);
/*
 * if we support only one fragment just return the
 * allocated page.
@@ -409,12 +409,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
 
spin_lock(>page_table_lock);
/*
-* If we find pgtable_page set, we return
+* If we find ptdesc_page set, we return
 * the allocated page with single fragment
 * count.
 */
if (likely(!mm->context.pmd_frag)) {
-   atomic_set(>pt_frag_refcount, PMD_FRAG_NR);
+   atomic_set(>pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(>page_table_lock);
@@ -435,15 +435,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned 
long vmaddr)
 
 void pmd_fragment_free(unsigned long *pmd)
 {
-   struct page *page = virt_to_page(pmd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
 
-   if (PageReserved(page))
-   return free_reserved_page(page);
+   if (pagetable_is_reserved(ptdesc))
+   return free_reserved_ptdesc(ptdesc);
 
-   BUG_ON(atomic_read(>pt_frag_refcount) <= 0);
-   if (atomic_dec_and_test(>pt_frag_refcount)) {
-   pgtable_pmd_page_dtor(page);
-   __free_page(page);
+   BUG_ON(atomic_read(>pt_frag_refcount) <= 0);
+   if (atomic_dec_and_test(>pt_frag_refcount)) {
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 0c6b68130025..8c31802f97e8 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -18,15 +18,15 @@
 void pte_frag_destroy(void *pte_frag)
 {
int count;
-   struct page *page;
+   struct ptdesc *ptdesc;
 
-   page = virt_to_page(pte_frag);
+   ptdesc = virt_to_ptdesc(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
-   if (atomic_sub_and_test(PTE_FRAG_NR - count, >pt_frag_refcount)) {
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   if (atomic_sub_and_test(PTE_FRAG_NR - count, 
>pt_frag_refcount)) {
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
@@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
 static pte_t

[PATCH mm-unstable v9 11/31] mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}

2023-08-07 Thread Vishal Moola (Oracle)

Create pagetable_pte_ctor(), pagetable_pmd_ctor(), pagetable_pte_dtor(),
and pagetable_pmd_dtor() and make the original pgtable
constructor/destructors wrappers.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 56 ++
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 94984d49ab01..6310e0c59efe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2921,20 +2921,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { 
return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
 {
-   if (!ptlock_init(page_ptdesc(page)))
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   if (!ptlock_init(ptdesc))
return false;
-   __SetPageTable(page);
-   inc_lruvec_page_state(page, NR_PAGETABLE);
+   __folio_set_pgtable(folio);
+   lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
 }
 
+static inline bool pgtable_pte_page_ctor(struct page *page)
+{
+   return pagetable_pte_ctor(page_ptdesc(page));
+}
+
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+{
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   ptlock_free(ptdesc);
+   __folio_clear_pgtable(folio);
+   lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 static inline void pgtable_pte_page_dtor(struct page *page)
 {
-   ptlock_free(page_ptdesc(page));
-   __ClearPageTable(page);
-   dec_lruvec_page_state(page, NR_PAGETABLE);
+   pagetable_pte_dtor(page_ptdesc(page));
 }
 
 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
@@ -3032,20 +3046,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct 
*mm, pmd_t *pmd)
return ptl;
 }
 
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
 {
-   if (!pmd_ptlock_init(page_ptdesc(page)))
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   if (!pmd_ptlock_init(ptdesc))
return false;
-   __SetPageTable(page);
-   inc_lruvec_page_state(page, NR_PAGETABLE);
+   __folio_set_pgtable(folio);
+   lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
 }
 
+static inline bool pgtable_pmd_page_ctor(struct page *page)
+{
+   return pagetable_pmd_ctor(page_ptdesc(page));
+}
+
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
+{
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   pmd_ptlock_free(ptdesc);
+   __folio_clear_pgtable(folio);
+   lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 static inline void pgtable_pmd_page_dtor(struct page *page)
 {
-   pmd_ptlock_free(page_ptdesc(page));
-   __ClearPageTable(page);
-   dec_lruvec_page_state(page, NR_PAGETABLE);
+   pagetable_pmd_dtor(page_ptdesc(page));
 }
 
 /*
-- 
2.40.1

[PATCH mm-unstable v9 10/31] mm: Convert ptlock_free() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 mm/memory.c|  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa6f77c71453..94984d49ab01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2861,7 +2861,7 @@ static inline void pagetable_free(struct ptdesc *pt)
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
-extern void ptlock_free(struct page *page);
+void ptlock_free(struct ptdesc *ptdesc);
 
 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
@@ -2877,7 +2877,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc)
return true;
 }
 
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
 {
 }
 
@@ -2918,7 +2918,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
 }
 static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static inline bool pgtable_pte_page_ctor(struct page *page)
@@ -2932,7 +2932,7 @@ static inline bool pgtable_pte_page_ctor(struct page 
*page)
 
 static inline void pgtable_pte_page_dtor(struct page *page)
 {
-   ptlock_free(page);
+   ptlock_free(page_ptdesc(page));
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
 }
@@ -3006,7 +3006,7 @@ static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
 #endif
-   ptlock_free(ptdesc_page(ptdesc));
+   ptlock_free(ptdesc);
 }
 
 #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
diff --git a/mm/memory.c b/mm/memory.c
index 3606ef72ba70..d003076b218d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6145,8 +6145,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
return true;
 }
 
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
 {
-   kmem_cache_free(page_ptl_cachep, page->ptl);
+   kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
-- 
2.40.1

[PATCH mm-unstable v9 09/31] mm: Convert pmd_ptlock_free() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 13947b17f25e..aa6f77c71453 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3001,12 +3001,12 @@ static inline bool pmd_ptlock_init(struct ptdesc 
*ptdesc)
return ptlock_init(ptdesc);
 }
 
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+   VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
 #endif
-   ptlock_free(page);
+   ptlock_free(ptdesc_page(ptdesc));
 }
 
 #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
@@ -3019,7 +3019,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
 }
 
 static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
 
 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
 
@@ -3043,7 +3043,7 @@ static inline bool pgtable_pmd_page_ctor(struct page 
*page)
 
 static inline void pgtable_pmd_page_dtor(struct page *page)
 {
-   pmd_ptlock_free(page);
+   pmd_ptlock_free(page_ptdesc(page));
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
 }
-- 
2.40.1

[PATCH mm-unstable v9 08/31] mm: Convert ptlock_init() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 040982fe9063..13947b17f25e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2892,7 +2892,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
 }
 
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
 {
/*
 * prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2901,10 +2901,10 @@ static inline bool ptlock_init(struct page *page)
 * It can happen if arch try to use slab for page table allocation:
 * slab code uses page->slab_cache, which share storage with page->ptl.
 */
-   VM_BUG_ON_PAGE(*(unsigned long *)>ptl, page);
-   if (!ptlock_alloc(page_ptdesc(page)))
+   VM_BUG_ON_PAGE(*(unsigned long *)>ptl, ptdesc_page(ptdesc));
+   if (!ptlock_alloc(ptdesc))
return false;
-   spin_lock_init(ptlock_ptr(page_ptdesc(page)));
+   spin_lock_init(ptlock_ptr(ptdesc));
return true;
 }
 
@@ -2917,13 +2917,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return >page_table_lock;
 }
 static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static inline bool pgtable_pte_page_ctor(struct page *page)
 {
-   if (!ptlock_init(page))
+   if (!ptlock_init(page_ptdesc(page)))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
@@ -2998,7 +2998,7 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptdesc->pmd_huge_pte = NULL;
 #endif
-   return ptlock_init(ptdesc_page(ptdesc));
+   return ptlock_init(ptdesc);
 }
 
 static inline void pmd_ptlock_free(struct page *page)
-- 
2.40.1

[PATCH mm-unstable v9 07/31] mm: Convert pmd_ptlock_init() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc82a64e5f01..040982fe9063 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2993,12 +2993,12 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return ptlock_ptr(pmd_ptdesc(pmd));
 }
 
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   page->pmd_huge_pte = NULL;
+   ptdesc->pmd_huge_pte = NULL;
 #endif
-   return ptlock_init(page);
+   return ptlock_init(ptdesc_page(ptdesc));
 }
 
 static inline void pmd_ptlock_free(struct page *page)
@@ -3018,7 +3018,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return >page_table_lock;
 }
 
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void pmd_ptlock_free(struct page *page) {}
 
 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -3034,7 +3034,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, 
pmd_t *pmd)
 
 static inline bool pgtable_pmd_page_ctor(struct page *page)
 {
-   if (!pmd_ptlock_init(page))
+   if (!pmd_ptlock_init(page_ptdesc(page)))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
-- 
2.40.1

[PATCH mm-unstable v9 06/31] mm: Convert ptlock_ptr() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 arch/x86/xen/mmu_pv.c |  2 +-
 include/linux/mm.h| 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e0a975165de7..8796ec310483 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct 
mm_struct *mm)
spinlock_t *ptl = NULL;
 
 #if USE_SPLIT_PTE_PTLOCKS
-   ptl = ptlock_ptr(page);
+   ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, >page_table_lock);
 #endif
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6aea8fb671f1..bc82a64e5f01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2863,9 +2863,9 @@ void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
 extern void ptlock_free(struct page *page);
 
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
-   return page->ptl;
+   return ptdesc->ptl;
 }
 #else /* ALLOC_SPLIT_PTLOCKS */
 static inline void ptlock_cache_init(void)
@@ -2881,15 +2881,15 @@ static inline void ptlock_free(struct page *page)
 {
 }
 
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
-   return >ptl;
+   return >ptl;
 }
 #endif /* ALLOC_SPLIT_PTLOCKS */
 
 static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(pmd_page(*pmd));
+   return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
 }
 
 static inline bool ptlock_init(struct page *page)
@@ -2904,7 +2904,7 @@ static inline bool ptlock_init(struct page *page)
VM_BUG_ON_PAGE(*(unsigned long *)>ptl, page);
if (!ptlock_alloc(page_ptdesc(page)))
return false;
-   spin_lock_init(ptlock_ptr(page));
+   spin_lock_init(ptlock_ptr(page_ptdesc(page)));
return true;
 }
 
@@ -2990,7 +2990,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
 
 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd)));
+   return ptlock_ptr(pmd_ptdesc(pmd));
 }
 
 static inline bool pmd_ptlock_init(struct page *page)
-- 
2.40.1

[PATCH mm-unstable v9 05/31] mm: Convert ptlock_alloc() to use ptdescs

2023-08-07 Thread Vishal Moola (Oracle)

This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 6 +++---
 mm/memory.c| 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6d14a5fe747..6aea8fb671f1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2860,7 +2860,7 @@ static inline void pagetable_free(struct ptdesc *pt)
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
 extern void ptlock_free(struct page *page);
 
 static inline spinlock_t *ptlock_ptr(struct page *page)
@@ -2872,7 +2872,7 @@ static inline void ptlock_cache_init(void)
 {
 }
 
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
 {
return true;
 }
@@ -2902,7 +2902,7 @@ static inline bool ptlock_init(struct page *page)
 * slab code uses page->slab_cache, which share storage with page->ptl.
 */
VM_BUG_ON_PAGE(*(unsigned long *)>ptl, page);
-   if (!ptlock_alloc(page))
+   if (!ptlock_alloc(page_ptdesc(page)))
return false;
spin_lock_init(ptlock_ptr(page));
return true;
diff --git a/mm/memory.c b/mm/memory.c
index 956aad8aff34..3606ef72ba70 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6134,14 +6134,14 @@ void __init ptlock_cache_init(void)
SLAB_PANIC, NULL);
 }
 
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
 {
spinlock_t *ptl;
 
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
-   page->ptl = ptl;
+   ptdesc->ptl = ptl;
return true;
 }
 
-- 
2.40.1

[PATCH mm-unstable v9 04/31] mm: Convert pmd_pgtable_page() callers to use pmd_ptdesc()

2023-08-07 Thread Vishal Moola (Oracle)

Converts internal pmd_pgtable_page() callers to use pmd_ptdesc(). This
removes some direct accesses to struct page, working towards splitting
out struct ptdesc from struct page.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 54dc176b90ea..f6d14a5fe747 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2990,7 +2990,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
 
 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(pmd_pgtable_page(pmd));
+   return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd)));
 }
 
 static inline bool pmd_ptlock_init(struct page *page)
@@ -3009,7 +3009,7 @@ static inline void pmd_ptlock_free(struct page *page)
ptlock_free(page);
 }
 
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
 
 #else
 
-- 
2.40.1

[PATCH mm-unstable v9 03/31] mm: add utility functions for ptdesc

2023-08-07 Thread Vishal Moola (Oracle)

Introduce utility functions setting the foundation for ptdescs.  These
will also assist in the splitting out of ptdesc from struct page.

Functions that focus on the descriptor are prefixed with ptdesc_* while
functions that focus on the pagetable are prefixed with pagetable_*.

pagetable_alloc() is defined to allocate new ptdesc pages as compound
pages.  This is to standardize ptdescs by allowing for one allocation and
one free function, in contrast to 2 allocation and 2 free functions.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/asm-generic/tlb.h | 11 +++
 include/linux/mm.h| 61 +++
 include/linux/mm_types.h  | 12 
 3 files changed, 84 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index bc32a2284c56..129a3a759976 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -480,6 +480,17 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, 
struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
 
+static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
+{
+   tlb_remove_table(tlb, pt);
+}
+
+/* Like tlb_remove_ptdesc, but for page-like page directories. */
+static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct 
ptdesc *pt)
+{
+   tlb_remove_page(tlb, ptdesc_page(pt));
+}
+
 static inline void tlb_change_page_size(struct mmu_gather *tlb,
 unsigned int page_size)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ec15ebc6def1..54dc176b90ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2806,6 +2806,57 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, 
pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU */
 
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+   return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+   return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+   return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+   return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp:GFP flags
+ * @order:  desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+   struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+   return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt:The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+   struct page *page = ptdesc_page(pt);
+
+   __free_pages(page, compound_order(page));
+}
+
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
@@ -2932,6 +2983,11 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
return virt_to_page((void *)((unsigned long) pmd & mask));
 }
 
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
+{
+   return page_ptdesc(pmd_pgtable_page(pmd));
+}
+
 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
return ptlock_ptr(pmd_pgtable_page(pmd));
@@ -3044,6 +3100,11 @@ static inline void mark_page_reserved(struct page *page)
adjust_managed_page_count(page, -1);
 }
 
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+   free_reserved_page(ptdesc_page(pt));
+}
+
 /*
  * Default method to free all the __init memory into the buddy system.
  * The freed pages will be poisoned with pattern "poison" if it's within
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cb47438ae17f..ea34b22b4cbf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -467,6 +467,18 @@ TABLE_MATCH(memcg_data, pt_memcg_data);
 #undef TABLE_MATCH
 static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
 
+#define ptdesc_page(pt)(_Generic((pt), 
\
+   const struct ptdesc *:  (const struct page *)(pt),  \
+   struct ptdesc *:(struct page *)(pt)))
+
+#define ptdesc_folio(pt)   (_Generic((pt), \
+   const struct ptdesc *:  (const struct folio *)(pt), \
+   struct ptdesc *:(struct folio *)(pt)))
+
+#define page_ptdesc(p) (_Generic((p),  \
+   const struct page *:(const struct ptdesc *)(p), \
+   struct page *:  (struct ptdesc *)(p)))
+
 /*
  * Used for

[PATCH mm-unstable v9 02/31] pgtable: create struct ptdesc

2023-08-07 Thread Vishal Moola (Oracle)

Currently, page table information is stored within struct page.  As part
of simplifying struct page, create struct ptdesc for page table
information.

Signed-off-by: Vishal Moola (Oracle) 
Acked-by: Mike Rapoport (IBM) 
---
 include/linux/mm_types.h | 70 
 1 file changed, 70 insertions(+)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 18c8c3d793b0..cb47438ae17f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -397,6 +397,76 @@ FOLIO_MATCH(flags, _flags_2);
 FOLIO_MATCH(compound_head, _head_2);
 #undef FOLIO_MATCH
 
+/**
+ * struct ptdesc -Memory descriptor for page tables.
+ * @__page_flags: Same as page flags. Unused for page tables.
+ * @pt_rcu_head:  For freeing page table pages.
+ * @pt_list:  List of used page tables. Used for s390 and x86.
+ * @_pt_pad_1:Padding that aliases with page's compound head.
+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
+ * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
+ * @pt_mm:Used for x86 pgds.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 
only.
+ * @_pt_pad_2:Padding to ensure proper alignment.
+ * @ptl:  Lock for the page table.
+ * @__page_type:  Same as page->page_type. Unused for page tables.
+ * @_refcount:Same as page refcount. Used for s390 page tables.
+ * @pt_memcg_data:Memcg data. Tracked for page tables here.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues.
+ */
+struct ptdesc {
+   unsigned long __page_flags;
+
+   union {
+   struct rcu_head pt_rcu_head;
+   struct list_head pt_list;
+   struct {
+   unsigned long _pt_pad_1;
+   pgtable_t pmd_huge_pte;
+   };
+   };
+   unsigned long __page_mapping;
+
+   union {
+   struct mm_struct *pt_mm;
+   atomic_t pt_frag_refcount;
+   };
+
+   union {
+   unsigned long _pt_pad_2;
+#if ALLOC_SPLIT_PTLOCKS
+   spinlock_t *ptl;
+#else
+   spinlock_t ptl;
+#endif
+   };
+   unsigned int __page_type;
+   atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+   unsigned long pt_memcg_data;
+#endif
+};
+
+#define TABLE_MATCH(pg, pt)\
+   static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
+TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(compound_head, pt_list);
+TABLE_MATCH(compound_head, _pt_pad_1);
+TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
+TABLE_MATCH(mapping, __page_mapping);
+TABLE_MATCH(pt_mm, pt_mm);
+TABLE_MATCH(ptl, ptl);
+TABLE_MATCH(rcu_head, pt_rcu_head);
+TABLE_MATCH(page_type, __page_type);
+TABLE_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+TABLE_MATCH(memcg_data, pt_memcg_data);
+#endif
+#undef TABLE_MATCH
+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
+
 /*
  * Used for sizing the vmemmap region on some architectures
  */
-- 
2.40.1

[PATCH mm-unstable v9 01/31] mm: Add PAGE_TYPE_OP folio functions

2023-08-07 Thread Vishal Moola (Oracle)

No folio equivalents for page type operations have been defined, so
define them for later folio conversions.

Also changes the Page##uname macros to take in const struct page* since
we only read the memory here.

Acked-by: Mike Rapoport (IBM) 
Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/page-flags.h | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92a2063a0a23..9218028caf33 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page)
 
 #define PageType(page, flag)   \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+#define folio_test_type(folio, flag)   \
+   ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
 
 static inline int page_type_has_type(unsigned int page_type)
 {
@@ -919,27 +921,41 @@ static inline int page_has_type(struct page *page)
return page_type_has_type(page->page_type);
 }
 
-#define PAGE_TYPE_OPS(uname, lname)\
-static __always_inline int Page##uname(struct page *page)  \
+#define PAGE_TYPE_OPS(uname, lname, fname) \
+static __always_inline int Page##uname(const struct page *page)
\
 {  \
return PageType(page, PG_##lname);  \
 }  \
+static __always_inline int folio_test_##fname(const struct folio *folio)\
+{  \
+   return folio_test_type(folio, PG_##lname);  \
+}  \
 static __always_inline void __SetPage##uname(struct page *page)
\
 {  \
VM_BUG_ON_PAGE(!PageType(page, 0), page);   \
page->page_type &= ~PG_##lname; \
 }  \
+static __always_inline void __folio_set_##fname(struct folio *folio)   \
+{  \
+   VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
+   folio->page.page_type &= ~PG_##lname;   \
+}  \
 static __always_inline void __ClearPage##uname(struct page *page)  \
 {  \
VM_BUG_ON_PAGE(!Page##uname(page), page);   \
page->page_type |= PG_##lname;  \
-}
+}  \
+static __always_inline void __folio_clear_##fname(struct folio *folio) \
+{  \
+   VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
+   folio->page.page_type |= PG_##lname;\
+}  \
 
 /*
  * PageBuddy() indicates that the page is free and in the buddy system
  * (see mm/page_alloc.c).
  */
-PAGE_TYPE_OPS(Buddy, buddy)
+PAGE_TYPE_OPS(Buddy, buddy, buddy)
 
 /*
  * PageOffline() indicates that the page is logically offline although the
@@ -963,7 +979,7 @@ PAGE_TYPE_OPS(Buddy, buddy)
  * pages should check PageOffline() and synchronize with such drivers using
  * page_offline_freeze()/page_offline_thaw().
  */
-PAGE_TYPE_OPS(Offline, offline)
+PAGE_TYPE_OPS(Offline, offline, offline)
 
 extern void page_offline_freeze(void);
 extern void page_offline_thaw(void);
@@ -973,12 +989,12 @@ extern void page_offline_end(void);
 /*
  * Marks pages in use as page tables.
  */
-PAGE_TYPE_OPS(Table, table)
+PAGE_TYPE_OPS(Table, table, pgtable)
 
 /*
  * Marks guardpages used with debug_pagealloc.
  */
-PAGE_TYPE_OPS(Guard, guard)
+PAGE_TYPE_OPS(Guard, guard, guard)
 
 extern bool is_free_buddy_page(struct page *page);
 
-- 
2.40.1

[PATCH mm-unstable v9 00/31] Split ptdesc from struct page

2023-08-07 Thread Vishal Moola (Oracle)

The MM subsystem is trying to shrink struct page. This patchset
introduces a memory descriptor for page table tracking - struct ptdesc.

This patchset introduces ptdesc, splits ptdesc from struct page, and
converts many callers of page table constructor/destructors to use ptdescs.

Ptdesc is a foundation to further standardize page tables, and eventually
allow for dynamic allocation of page tables independent of struct page.
However, the use of pages for page table tracking is quite deeply
ingrained and varied across archictectures, so there is still a lot of
work to be done before that can happen.

This applies cleanly onto the current unstable after dropping v8 of this
series.

v9:
  Fix build errors for NOMMU configs - trying to define ptdesc before
spinlock_t and struct page were defined.
  Moved definition of struct ptdesc to include/linux/mm_types.h instead
include/linux/pgtable.h

Vishal Moola (Oracle) (31):
  mm: Add PAGE_TYPE_OP folio functions
  pgtable: create struct ptdesc
  mm: add utility functions for ptdesc
  mm: Convert pmd_pgtable_page() callers to use pmd_ptdesc()
  mm: Convert ptlock_alloc() to use ptdescs
  mm: Convert ptlock_ptr() to use ptdescs
  mm: Convert pmd_ptlock_init() to use ptdescs
  mm: Convert ptlock_init() to use ptdescs
  mm: Convert pmd_ptlock_free() to use ptdescs
  mm: Convert ptlock_free() to use ptdescs
  mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}
  powerpc: Convert various functions to use ptdescs
  x86: Convert various functions to use ptdescs
  s390: Convert various pgalloc functions to use ptdescs
  mm: remove page table members from struct page
  pgalloc: Convert various functions to use ptdescs
  arm: Convert various functions to use ptdescs
  arm64: Convert various functions to use ptdescs
  csky: Convert __pte_free_tlb() to use ptdescs
  hexagon: Convert __pte_free_tlb() to use ptdescs
  loongarch: Convert various functions to use ptdescs
  m68k: Convert various functions to use ptdescs
  mips: Convert various functions to use ptdescs
  nios2: Convert __pte_free_tlb() to use ptdescs
  openrisc: Convert __pte_free_tlb() to use ptdescs
  riscv: Convert alloc_{pmd, pte}_late() to use ptdescs
  sh: Convert pte_free_tlb() to use ptdescs
  sparc64: Convert various functions to use ptdescs
  sparc: Convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents
  um: Convert {pmd, pte}_free_tlb() to use ptdescs
  mm: Remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers

 Documentation/mm/split_page_table_lock.rst|  12 +-
 .../zh_CN/mm/split_page_table_lock.rst|  14 +-
 arch/arm/include/asm/tlb.h|  12 +-
 arch/arm/mm/mmu.c |   7 +-
 arch/arm64/include/asm/tlb.h  |  14 +-
 arch/arm64/mm/mmu.c   |   7 +-
 arch/csky/include/asm/pgalloc.h   |   4 +-
 arch/hexagon/include/asm/pgalloc.h|   8 +-
 arch/loongarch/include/asm/pgalloc.h  |  27 ++--
 arch/loongarch/mm/pgtable.c   |   7 +-
 arch/m68k/include/asm/mcf_pgalloc.h   |  47 +++---
 arch/m68k/include/asm/sun3_pgalloc.h  |   8 +-
 arch/m68k/mm/motorola.c   |   4 +-
 arch/mips/include/asm/pgalloc.h   |  32 ++--
 arch/mips/mm/pgtable.c|   8 +-
 arch/nios2/include/asm/pgalloc.h  |   8 +-
 arch/openrisc/include/asm/pgalloc.h   |   8 +-
 arch/powerpc/mm/book3s64/mmu_context.c|  10 +-
 arch/powerpc/mm/book3s64/pgtable.c|  32 ++--
 arch/powerpc/mm/pgtable-frag.c|  58 +++
 arch/riscv/include/asm/pgalloc.h  |   8 +-
 arch/riscv/mm/init.c  |  16 +-
 arch/s390/include/asm/pgalloc.h   |   4 +-
 arch/s390/include/asm/tlb.h   |   4 +-
 arch/s390/mm/pgalloc.c| 128 +++
 arch/sh/include/asm/pgalloc.h |   9 +-
 arch/sparc/mm/init_64.c   |  17 +-
 arch/sparc/mm/srmmu.c |   5 +-
 arch/um/include/asm/pgalloc.h |  18 +--
 arch/x86/mm/pgtable.c |  47 +++---
 arch/x86/xen/mmu_pv.c |   2 +-
 include/asm-generic/pgalloc.h |  88 +-
 include/asm-generic/tlb.h |  11 ++
 include/linux/mm.h| 151 +-
 include/linux/mm_types.h  |  97 ---
 include/linux/page-flags.h|  30 +++-
 mm/memory.c   |   8 +-
 37 files changed, 585 insertions(+), 385 deletions(-)

-- 
2.40.1

Re: linux-next: Tree for Aug 7 (sound/soc/fsl/)

2023-08-07 Thread Randy Dunlap



On 8/6/23 22:47, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20230804:
> 

on PPC32:

WARNING: unmet direct dependencies detected for SND_SOC_MPC5200_AC97
  Depends on [n]: SOUND [=y] && SND [=y] && SND_SOC [=y] && SND_POWERPC_SOC 
[=y] && PPC_MPC52xx [=y] && PPC_BESTCOMM [=n]
  Selected by [y]:
  - SND_MPC52xx_SOC_PCM030 [=y] && SOUND [=y] && SND [=y] && SND_SOC [=y] && 
SND_POWERPC_SOC [=y] && PPC_MPC5200_SIMPLE [=y]
  - SND_MPC52xx_SOC_EFIKA [=y] && SOUND [=y] && SND [=y] && SND_SOC [=y] && 
SND_POWERPC_SOC [=y] && PPC_EFIKA [=y]

powerpc-linux-ld: sound/soc/fsl/mpc5200_psc_ac97.o: in function 
`psc_ac97_of_remove':
mpc5200_psc_ac97.c:(.text.psc_ac97_of_remove+0x1c): undefined reference to 
`mpc5200_audio_dma_destroy'
powerpc-linux-ld: sound/soc/fsl/mpc5200_psc_ac97.o: in function 
`psc_ac97_of_probe':
mpc5200_psc_ac97.c:(.text.psc_ac97_of_probe+0x20): undefined reference to 
`mpc5200_audio_dma_create'


Full randconfig file is attached.

-- 
~Randy

config-r4919.gz
Description: application/gzip

Re: [PATCH V2 2/2] tools/perf/tests: perf all metricgroups test fails when perf_event access is restricted

2023-08-07 Thread Disha Goel


On 04/08/23 10:30 am, Athira Rajeev wrote:

Perf all metricgroups test fails as below when perf_event access
is restricted.

 ./perf test -v "perf all metricgroups test"
 Testing Memory_BW
 Error:
 Access to performance monitoring and observability operations is limited.
 Enforced MAC policy settings (SELinux) can limit access to performance
 access to performance monitoring and observability operations for processes
 without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.

 test child finished with -1
  end 
 perf all metricgroups test: FAILED!

Fix the testcase to skip those metric events which needs perf_event access
explicitly. The exit code of the testcase is based on return code of
the perf stat command ( enabled by set -e option ). Hence save the
exit status in a variable and use that to decide success or fail for the
testcase.

Signed-off-by: Athira Rajeev


With this patch applied(on power) perf metricgroups test works correctly when 
perf_event access is restricted.

 # ./perf test "perf all metricgroups test"
 96: perf all metricgroups test  : Ok

Tested-by: Disha Goel


---
Changelog:
v1 -> v2:
  Changed the condition to use "echo" and "grep" so it works on
  Posix shell as well.

  tools/perf/tests/shell/stat_all_metricgroups.sh | 14 +++---
  1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh 
b/tools/perf/tests/shell/stat_all_metricgroups.sh
index cb35e488809a..eaa5e1172294 100755
--- a/tools/perf/tests/shell/stat_all_metricgroups.sh
+++ b/tools/perf/tests/shell/stat_all_metricgroups.sh
@@ -2,11 +2,19 @@
  # perf all metricgroups test
  # SPDX-License-Identifier: GPL-2.0

-set -e
-
  for m in $(perf list --raw-dump metricgroups); do
echo "Testing $m"
-  perf stat -M "$m" -a true
+  result=$(perf stat -M "$m" -a true 2>&1)
+  rc=$?
+  # Skip if there is no access to perf_events monitoring
+  # Otherwise exit based on the return code of perf comamnd.
+  if echo "$result" | grep -q "Access to performance monitoring and observability 
operations is limited";
+  then
+  continue
+  else
+  [ $rc -ne 0 ] && exit $rc
+  fi
+
  done

  exit 0

Re: [PATCH V2 2/2] tools/perf/tests: perf all metricgroups test fails when perf_event access is restricted

2023-08-07 Thread Arnaldo Carvalho de Melo

Em Mon, Aug 07, 2023 at 08:14:39PM +0530, Disha Goel escreveu:
> On 04/08/23 10:30 am, Athira Rajeev wrote:
> > Perf all metricgroups test fails as below when perf_event access
> > is restricted.
> > 
> >  ./perf test -v "perf all metricgroups test"
> >  Testing Memory_BW
> >  Error:
> >  Access to performance monitoring and observability operations is 
> > limited.
> >  Enforced MAC policy settings (SELinux) can limit access to performance
> >  access to performance monitoring and observability operations for 
> > processes
> >  without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.
> > 
> >  test child finished with -1
> >   end 
> >  perf all metricgroups test: FAILED!
> > 
> > Fix the testcase to skip those metric events which needs perf_event access
> > explicitly. The exit code of the testcase is based on return code of
> > the perf stat command ( enabled by set -e option ). Hence save the
> > exit status in a variable and use that to decide success or fail for the
> > testcase.

I wonder if we shouldn't somehow check if the credentials needed to
performing a test shouldn't be checked before trying it. This way we
would check if the check that the tool or the kernel is doing is the
appropriate one.

I.e. the kernel refusal for doing something may be an error.

- Arnaldo

> > Signed-off-by: Athira Rajeev
> 
> With this patch applied(on power) perf metricgroups test works correctly when 
> perf_event access is restricted.
> 
>  # ./perf test "perf all metricgroups test"
>  96: perf all metricgroups test  : Ok
> 
> Tested-by: Disha Goel
> 
> > ---
> > Changelog:
> > v1 -> v2:
> >   Changed the condition to use "echo" and "grep" so it works on
> >   Posix shell as well.
> > 
> >   tools/perf/tests/shell/stat_all_metricgroups.sh | 14 +++---
> >   1 file changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh 
> > b/tools/perf/tests/shell/stat_all_metricgroups.sh
> > index cb35e488809a..eaa5e1172294 100755
> > --- a/tools/perf/tests/shell/stat_all_metricgroups.sh
> > +++ b/tools/perf/tests/shell/stat_all_metricgroups.sh
> > @@ -2,11 +2,19 @@
> >   # perf all metricgroups test
> >   # SPDX-License-Identifier: GPL-2.0
> > 
> > -set -e
> > -
> >   for m in $(perf list --raw-dump metricgroups); do
> > echo "Testing $m"
> > -  perf stat -M "$m" -a true
> > +  result=$(perf stat -M "$m" -a true 2>&1)
> > +  rc=$?
> > +  # Skip if there is no access to perf_events monitoring
> > +  # Otherwise exit based on the return code of perf comamnd.
> > +  if echo "$result" | grep -q "Access to performance monitoring and 
> > observability operations is limited";
> > +  then
> > +  continue
> > +  else
> > +  [ $rc -ne 0 ] && exit $rc
> > +  fi
> > +
> >   done
> > 
> >   exit 0

-- 

- Arnaldo

Re: [PATCH 1/2] PCI: Add pci_find_next_dvsec_capability to find next designated VSEC

2023-08-07 Thread Bjorn Helgaas

[+cc David since drivers/platform/x86/intel/vsec.c does some similar
things, although it seems to iterate over all Intel DVSEC IDs at once]

In subject:

  PCI: Add pci_find_next_dvsec_capability() to find next Designated VSEC

On Mon, Aug 07, 2023 at 11:18:45AM +0800, Xiongfeng Wang wrote:
> Some devices may have several DVSEC(Designated Vendor-Specific Extended
> Capability) entries with the same DVSEC ID. Add
> pci_find_next_dvsec_capability() to find them all.

Add space between "DVSEC" and "(Designated ...)".

> Signed-off-by: Xiongfeng Wang 

Acked-by: Bjorn Helgaas 

so you can merge this along with the ocxl patch that uses it.

> ---
>  drivers/pci/pci.c   | 37 +
>  include/linux/pci.h |  2 ++
>  2 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 60230da957e0..3455ca7306ae 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -749,35 +749,48 @@ u16 pci_find_vsec_capability(struct pci_dev *dev, u16 
> vendor, int cap)
>  EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
>  
>  /**
> - * pci_find_dvsec_capability - Find DVSEC for vendor
> + * pci_find_next_dvsec_capability - Find next DVSEC for vendor
>   * @dev: PCI device to query
> + * @start: address at which to start looking (0 to start at beginning of 
> list)

s/address/Address/ to match other parameters

>   * @vendor: Vendor ID to match for the DVSEC
>   * @dvsec: Designated Vendor-specific capability ID

There are a lot of IDs floating around here, so to better match the
spec language:

  @dvsec: Vendor-defined DVSEC ID

> - * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
> - * offset in config space; otherwise return 0.
> + * Returns the address of the next DVSEC if the DVSEC has Vendor ID @vendor 
> and
> + * DVSEC ID @dvsec; otherwise return 0. DVSEC can occur several times with 
> the
> + * same DVSEC ID for some devices, and this provides a way to find them all.
>   */
> -u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
> +u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 
> vendor,
> +u16 dvsec)
>  {
> - int pos;
> + u16 pos = start;
>  
> - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
> - if (!pos)
> - return 0;
> -
> - while (pos) {
> + while ((pos = pci_find_next_ext_capability(dev, pos,
> +   PCI_EXT_CAP_ID_DVSEC))) {
>   u16 v, id;
>  
>   pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, );
>   pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, );
>   if (vendor == v && dvsec == id)
>   return pos;
> -
> - pos = pci_find_next_ext_capability(dev, pos, 
> PCI_EXT_CAP_ID_DVSEC);
>   }
>  
>   return 0;
>  }
> +EXPORT_SYMBOL_GPL(pci_find_next_dvsec_capability);
> +
> +/**
> + * pci_find_dvsec_capability - Find DVSEC for vendor
> + * @dev: PCI device to query
> + * @vendor: Vendor ID to match for the DVSEC
> + * @dvsec: Designated Vendor-specific capability ID
> + *
> + * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
> + * offset in config space; otherwise return 0.
> + */
> +u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
> +{
> + return pci_find_next_dvsec_capability(dev, 0, vendor, dvsec);
> +}
>  EXPORT_SYMBOL_GPL(pci_find_dvsec_capability);
>  
>  /**
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index c69a2cc1f412..82bb905daf72 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1168,6 +1168,8 @@ u16 pci_find_next_ext_capability(struct pci_dev *dev, 
> u16 pos, int cap);
>  struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
>  u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
>  u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec);
> +u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 
> vendor,
> +u16 dvsec);
>  
>  u64 pci_get_dsn(struct pci_dev *dev);
>  
> -- 
> 2.20.1
>

Re: [PATCH v7 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-08-07 Thread David Hildenbrand


On 07.08.23 14:41, David Hildenbrand wrote:

On 07.08.23 14:27, Michal Hocko wrote:

On Sat 05-08-23 19:54:23, Aneesh Kumar K V wrote:
[...]

Do you see a need for firmware-managed memory to be hotplugged in with
different memory block sizes?


In short. Yes. Slightly longer, a fixed size memory block semantic is
just standing in the way and I would even argue it is actively harmful.
Just have a look at ridicously small memory blocks on ppc. I do
understand that it makes some sense to be aligned to the memory model
(so sparsmem section aligned). In an ideal world, memory hotplug v2
interface (if we ever go that path) should be physical memory range based.


Yes, we discussed that a couple of times already (and so far nobody
cared to implement any of that).

Small memory block sizes are very beneficial for use cases like PPC
dlar, virtio-mem, hyperv-balloon, ... essentially in most virtual
environments where you might want to add/remove memory in very small
granularity. I don't see that changing any time soon. Rather the opposite.

Small memory block sizes are suboptimal for large machines where you
might never end up removing such memory (boot memory), or when dealing
with devices that can only be removed in one piece (DIMM/kmem). We
already have memory groups in place to model that.

For the latter it might be beneficial to have memory blocks of larger
size that correspond to the physical memory ranges. That might also make
a memmap (re-)configuration easier.

Not sure if that is standing in any way or is harmful, though.



Just because I thought of something right now, I'll share it, maybe it 
makes sense.


Assume when we get add_memory*(MHP_MEMMAP_ON_MEMORY) and it is enabled 
by the admin:


1) We create a single altmap at the beginning of the memory

2) We create the existing fixed-size memory block devices, but flag them
   to be part of a single "altmap" unit.

3) Whenever we trigger offlining of a single such memory block, we
   offline *all* memory blocks belonging to that altmap, essentially
   using a single offline_pages() call and updating all memory block
   states accordingly.

4) Whenever we trigger onlining of a single such memory block, we
   online *all* memory blocks belonging to that altmap, using a single
   online_pages() call.

5) We fail remove_memory() if it doesn't cover the same (altmap) range.

So we can avoid having a memory block v2 (and all that comes with that 
...) for now and still get that altmap stuff sorted out. As that altmap 
behavior can be controlled by the admin, we should be fine for now.


I think all memory notifiers should already be able to handle bigger 
granularity, but it would be easy to check. Some internal things might 
require a bit of tweaking.


Just a thought.

--
Cheers,

David / dhildenb

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Herve Codina

Hi Linus, Andrew,

On Mon, 7 Aug 2023 15:17:11 +0200
Andrew Lunn  wrote:

> On Mon, Aug 07, 2023 at 03:06:42PM +0200, Linus Walleij wrote:
> > On Mon, Aug 7, 2023 at 3:05 PM Linus Walleij  
> > wrote:
> >   
> > > > Signed-off-by: Herve Codina   
> > >
> > > So it is a bridge chip? Please use that terminology since Linux
> > > DRM often talks about bridges.  
> > 
> > Replying to self: no it's not a bridge, it's a WAN thingy.
> > 
> > So perhaps write that this is a WAN interface adapter chip.  
> 
> Hi Linus
> 
> In the E1/T1/J1 world, framer is a well understood concept. Maybe the
> text needs a bit more background information to explain what this is
> to somebody who does not have an old school telecoms background.
> 
>Andrew

Maybe I can add in my commit log:
--- 8< ---
This kind of component can be found in old telecommunication system.
It was used to digital transmission of many simultaneous telephone calls
by time-division multiplexing. Also using HDLC protocol, WAN networks
can be reached through the framer.
--- 8< ---

Do you think it will be better ?

Regards,
Hervé Codina

-- 
Hervé Codina, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Herve Codina

Hi Linus,

On Mon, 7 Aug 2023 15:05:15 +0200
Linus Walleij  wrote:

> Hi Herve,
> 
> thanks for your patch!
> 
> First: is this patch something we could merge separately? I don't see
> any dependency on the other patches.

It depends on pef2256:
in drivers/pinctrl/Kconfig:
--- 8< ---
+config PINCTRL_PEF2256
+   tristate "Lantiq PEF2256 (FALC56) pin controller driver"
+   depends on OF && FRAMER_PEF2256
--- 8< ---
in drivers/pinctrl/pinctrl-pef2256.c
--- 8< ---
+#include 
--- 8< ---

All the pef2256 it depends on is provided by
 path 23/28 "net: wan: framer: Add support for the Lantiq PEF2256 framer"

> 
> On Wed, Jul 26, 2023 at 5:04 PM Herve Codina  wrote:
> 
> > The Lantiq PEF2256 is a framer and line interface component designed to
> > fulfill all required interfacing between an analog E1/T1/J1 line and the
> > digital PCM system highway/H.100 bus.
> >
> > This pinmux support handles the pin muxing part (pins RP(A..D) and pins
> > XP(A..D)) of the PEF2256.
> >
> > Signed-off-by: Herve Codina   
> 
> So it is a bridge chip? Please use that terminology since Linux
> DRM often talks about bridges.
> 
> > +++ b/drivers/pinctrl/pinctrl-pef2256-regs.h  
> (...)
> > +#include "linux/bitfield.h"  
> 
> Really? I don't think there is such a file there.
> 
> Do you mean  and does this even compile?

Yes and it compiles (even with quoted included file).
I will be changed to  in the next interation.

> 
> > diff --git a/drivers/pinctrl/pinctrl-pef2256.c 
> > b/drivers/pinctrl/pinctrl-pef2256.c  
> (...)
> > +struct pef2256_pinctrl {
> > +   struct device *dev;
> > +   struct regmap *regmap;
> > +   enum pef2256_version version;
> > +   struct {
> > +   struct pinctrl_desc pctrl_desc;
> > +   const struct pef2256_function_desc *functions;
> > +   unsigned int nfunctions;
> > +   } pinctrl;  
> 
> Uh anonymous struct... can't you just define the struct separately
> with a name? Or fold it into struct pef2256_pinctrl without the
> additional struct? Thanks.

I will fold it into struct pef2256_pinctrl in the next iteration.

Thanks
Hervé

> 
> Otherwise it looks neat!
> 
> Yours,
> Linus Walleij

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Andrew Lunn

On Mon, Aug 07, 2023 at 03:06:42PM +0200, Linus Walleij wrote:
> On Mon, Aug 7, 2023 at 3:05 PM Linus Walleij  wrote:
> 
> > > Signed-off-by: Herve Codina 
> >
> > So it is a bridge chip? Please use that terminology since Linux
> > DRM often talks about bridges.
> 
> Replying to self: no it's not a bridge, it's a WAN thingy.
> 
> So perhaps write that this is a WAN interface adapter chip.

Hi Linus

In the E1/T1/J1 world, framer is a well understood concept. Maybe the
text needs a bit more background information to explain what this is
to somebody who does not have an old school telecoms background.

   Andrew

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Mark Brown

On Mon, Aug 07, 2023 at 03:05:15PM +0200, Linus Walleij wrote:
> On Wed, Jul 26, 2023 at 5:04 PM Herve Codina  wrote:

> > +#include "linux/bitfield.h"

> Really? I don't think there is such a file there.

> Do you mean  and does this even compile?

#include "" means "try the local directory first then fall back to
system includes" so it'll work, it picks up extra stuff on top of what
<> does.  There's a stylistic issue though.

signature.asc
Description: PGP signature

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Linus Walleij

On Mon, Aug 7, 2023 at 3:05 PM Linus Walleij  wrote:

> > Signed-off-by: Herve Codina 
>
> So it is a bridge chip? Please use that terminology since Linux
> DRM often talks about bridges.

Replying to self: no it's not a bridge, it's a WAN thingy.

So perhaps write that this is a WAN interface adapter chip.

Yours,
Linus Walleij

Re: [PATCH v2 24/28] pinctrl: Add support for the Lantic PEF2256 pinmux

2023-08-07 Thread Linus Walleij

Hi Herve,

thanks for your patch!

First: is this patch something we could merge separately? I don't see
any dependency on the other patches.

On Wed, Jul 26, 2023 at 5:04 PM Herve Codina  wrote:

> The Lantiq PEF2256 is a framer and line interface component designed to
> fulfill all required interfacing between an analog E1/T1/J1 line and the
> digital PCM system highway/H.100 bus.
>
> This pinmux support handles the pin muxing part (pins RP(A..D) and pins
> XP(A..D)) of the PEF2256.
>
> Signed-off-by: Herve Codina 

So it is a bridge chip? Please use that terminology since Linux
DRM often talks about bridges.

> +++ b/drivers/pinctrl/pinctrl-pef2256-regs.h
(...)
> +#include "linux/bitfield.h"

Really? I don't think there is such a file there.

Do you mean  and does this even compile?

> diff --git a/drivers/pinctrl/pinctrl-pef2256.c 
> b/drivers/pinctrl/pinctrl-pef2256.c
(...)
> +struct pef2256_pinctrl {
> +   struct device *dev;
> +   struct regmap *regmap;
> +   enum pef2256_version version;
> +   struct {
> +   struct pinctrl_desc pctrl_desc;
> +   const struct pef2256_function_desc *functions;
> +   unsigned int nfunctions;
> +   } pinctrl;

Uh anonymous struct... can't you just define the struct separately
with a name? Or fold it into struct pef2256_pinctrl without the
additional struct? Thanks.

Otherwise it looks neat!

Yours,
Linus Walleij

Re: [PATCH v7 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-08-07 Thread David Hildenbrand


On 03.08.23 13:30, Michal Hocko wrote:

On Thu 03-08-23 11:24:08, David Hildenbrand wrote:
[...]

would be readable only when the block is offline and it would reallocate
vmemmap on the change. Makes sense? Are there any risks? Maybe pfn
walkers?


The question is: is it of any real value such that it would be worth the
cost and risk?


One of the primary reasons for memmap_on_memory is that *memory hotplug*
succeeds even in low-memory situations (including, low on ZONE_NORMAL
situations).


Sorry for the late reply, I'm busy with 100 other things.



One usecase I would have in mind is a mix of smaller and larger memory
blocks. For larger ones you want to have memmap_on_memory in general
because they do not eat memory from outside but small(er) ones might be
more tricky because now you can add a lot of blocks that would be
internally fragmented to prevent larger allocations to form.


Okay, I see what you mean.

The internal fragmentation might become an issue at some point: for 
x86-64 with 128 MiB blocks / 2 MiB THP it's not a real issue right now. 
For a arm64 64k with 512 MiB blocks and 512 MiB THP / hugelb it could be 
one.


I recall discussing that with Oscar back when he added memmap_on_memory, 
where we also discussed the variable-sized memory blocks to avoid such 
internal fragmentation.


For small ones you probably want to only use memmap_on_memory when 
unavoidable: for example, when adding without memmap_on_memory would 
fail / already failed. Possibly some later memmap relocation might make 
sense in some scenarios.





So you want that behavior already when hotplugging such
devices. While there might be value to relocate it later, I'm not sure if
that is really worth it, and it does not solve the main use case.


Is it worth it? TBH I am not sure same as I am not sure the global
default should be writable after boot. If we want to make it more
dynamic we should however talk about the proper layer this is
implemented on.


Agreed.

--
Cheers,

David / dhildenb

Re: [PATCH v7 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-08-07 Thread David Hildenbrand


On 07.08.23 14:27, Michal Hocko wrote:

On Sat 05-08-23 19:54:23, Aneesh Kumar K V wrote:
[...]

Do you see a need for firmware-managed memory to be hotplugged in with
different memory block sizes?


In short. Yes. Slightly longer, a fixed size memory block semantic is
just standing in the way and I would even argue it is actively harmful.
Just have a look at ridicously small memory blocks on ppc. I do
understand that it makes some sense to be aligned to the memory model
(so sparsmem section aligned). In an ideal world, memory hotplug v2
interface (if we ever go that path) should be physical memory range based.


Yes, we discussed that a couple of times already (and so far nobody 
cared to implement any of that).


Small memory block sizes are very beneficial for use cases like PPC 
dlar, virtio-mem, hyperv-balloon, ... essentially in most virtual 
environments where you might want to add/remove memory in very small 
granularity. I don't see that changing any time soon. Rather the opposite.


Small memory block sizes are suboptimal for large machines where you 
might never end up removing such memory (boot memory), or when dealing 
with devices that can only be removed in one piece (DIMM/kmem). We 
already have memory groups in place to model that.


For the latter it might be beneficial to have memory blocks of larger 
size that correspond to the physical memory ranges. That might also make 
a memmap (re-)configuration easier.


Not sure if that is standing in any way or is harmful, though.

--
Cheers,

David / dhildenb

Re: [PATCH v4 1/2] nmi_backtrace: Allow excluding an arbitrary CPU

2023-08-07 Thread Chen-Yu Tsai

On Fri, Aug 4, 2023 at 10:01 PM Douglas Anderson  wrote:
>
> The APIs that allow backtracing across CPUs have always had a way to
> exclude the current CPU. This convenience means callers didn't need to
> find a place to allocate a CPU mask just to handle the common case.
>
> Let's extend the API to take a CPU ID to exclude instead of just a
> boolean. This isn't any more complex for the API to handle and allows
> the hardlockup detector to exclude a different CPU (the one it already
> did a trace for) without needing to find space for a CPU mask.
>
> Arguably, this new API also encourages safer behavior. Specifically if
> the caller wants to avoid tracing the current CPU (maybe because they
> already traced the current CPU) this makes it more obvious to the
> caller that they need to make sure that the current CPU ID can't
> change.
>
> Acked-by: Michal Hocko 
> Signed-off-by: Douglas Anderson 
> ---
>
> Changes in v4:
> - Renamed trigger_allbutself_cpu_backtrace() for when trigger is unsupported.
>
> Changes in v3:
> - ("nmi_backtrace: Allow excluding an arbitrary CPU") new for v3.
>
>  arch/arm/include/asm/irq.h   |  2 +-
>  arch/arm/kernel/smp.c|  4 ++--
>  arch/loongarch/include/asm/irq.h |  2 +-
>  arch/loongarch/kernel/process.c  |  4 ++--
>  arch/mips/include/asm/irq.h  |  2 +-
>  arch/mips/kernel/process.c   |  4 ++--
>  arch/powerpc/include/asm/irq.h   |  2 +-
>  arch/powerpc/kernel/stacktrace.c |  4 ++--
>  arch/powerpc/kernel/watchdog.c   |  4 ++--
>  arch/sparc/include/asm/irq_64.h  |  2 +-
>  arch/sparc/kernel/process_64.c   |  6 +++---
>  arch/x86/include/asm/irq.h   |  2 +-
>  arch/x86/kernel/apic/hw_nmi.c|  4 ++--
>  include/linux/nmi.h  | 14 +++---
>  kernel/watchdog.c|  2 +-
>  lib/nmi_backtrace.c  |  6 +++---
>  16 files changed, 32 insertions(+), 32 deletions(-)
>

[...]

> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index e3e6a64b98e0..7cf7801856a1 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -157,31 +157,31 @@ static inline void touch_nmi_watchdog(void)
>  #ifdef arch_trigger_cpumask_backtrace
>  static inline bool trigger_all_cpu_backtrace(void)
>  {
> -   arch_trigger_cpumask_backtrace(cpu_online_mask, false);
> +   arch_trigger_cpumask_backtrace(cpu_online_mask, -1);
> return true;
>  }
>
> -static inline bool trigger_allbutself_cpu_backtrace(void)
> +static inline bool trigger_allbutcpu_cpu_backtrace(int exclude_cpu)
>  {
> -   arch_trigger_cpumask_backtrace(cpu_online_mask, true);
> +   arch_trigger_cpumask_backtrace(cpu_online_mask, exclude_cpu);
> return true;
>  }
>
>  static inline bool trigger_cpumask_backtrace(struct cpumask *mask)
>  {
> -   arch_trigger_cpumask_backtrace(mask, false);
> +   arch_trigger_cpumask_backtrace(mask, -1);
> return true;
>  }
>
>  static inline bool trigger_single_cpu_backtrace(int cpu)
>  {
> -   arch_trigger_cpumask_backtrace(cpumask_of(cpu), false);
> +   arch_trigger_cpumask_backtrace(cpumask_of(cpu), -1);
> return true;
>  }
>
>  /* generic implementation */
>  void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
> -  bool exclude_self,
> +  int exclude_cpu,
>void (*raise)(cpumask_t *mask));
>  bool nmi_cpu_backtrace(struct pt_regs *regs);
>
> @@ -190,7 +190,7 @@ static inline bool trigger_all_cpu_backtrace(void)
>  {
> return false;
>  }
> -static inline bool trigger_allbutself_cpu_backtrace(void)
> +static inline bool trigger_allbutcpu_cpu_backtrace(void)
  ^
The parameter here is still wrong. It should be "int exclude_cpu".

This patch in Andrew's queue is causing build errors on next-20230807 on arm64:

kernel/watchdog.c: In function ‘watchdog_timer_fn’:
kernel/watchdog.c:521:25: error: too many arguments to function
‘trigger_allbutcpu_cpu_backtrace’
  521 |
trigger_allbutcpu_cpu_backtrace(smp_processor_id());
  | ^~~
In file included from kernel/watchdog.c:17:
./include/linux/nmi.h:193:20: note: declared here
  193 | static inline bool trigger_allbutcpu_cpu_backtrace(void)
  |^~~
make[3]: *** [scripts/Makefile.build:243: kernel/watchdog.o] Error 1


ChenYu

Re: [PATCH v7 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-08-07 Thread Michal Hocko

On Sat 05-08-23 19:54:23, Aneesh Kumar K V wrote:
[...]
> Do you see a need for firmware-managed memory to be hotplugged in with
> different memory block sizes?

In short. Yes. Slightly longer, a fixed size memory block semantic is
just standing in the way and I would even argue it is actively harmful.
Just have a look at ridicously small memory blocks on ppc. I do
understand that it makes some sense to be aligned to the memory model
(so sparsmem section aligned). In an ideal world, memory hotplug v2
interface (if we ever go that path) should be physical memory range based.
-- 
Michal Hocko
SUSE Labs

Re: [PATCH v3 1/2] nmi_backtrace: Allow excluding an arbitrary CPU

2023-08-07 Thread Michal Hocko

On Fri 04-08-23 09:06:07, Doug Anderson wrote:
> Hi,
> 
> On Fri, Aug 4, 2023 at 8:02 AM Michal Hocko  wrote:
> >
> > > > It would have been slightly safer to modify 
> > > > arch_trigger_cpumask_backtrace
> > > > by switching arguments so that some leftovers are captured easier.
> > >
> > > I'm not sure I understand. Oh, you're saying make the prototype of
> > > arch_trigger_cpumask_backtrace() incompatible so that if someone is
> > > directly calling it then it'll be a compile-time error?
> >
> > exactly. bool to int promotion would be too easy to miss while the
> > pointer to int would complain loudly.
> >
> > > I guess the
> > > hope is that nobody is calling that directly and they're calling
> > > through the trigger_...() functions.
> >
> > Hope is one thing, being preventive another.
> >
> > > For now I'm going to leave this alone.
> >
> > If you are going to send another version then please consider this. Not
> > a hard requirement but better.
> 
> If I do send another version, do you have any suggestions for how to
> change this to make it incompatible?

I would swap parameters as this seems simplest.

> I guess swapping the order of the
> parameters would be best? I considered doing that for v4 but I felt
> like long term the current order of the parameters was better.

Yes the current ordering is better but having it other way around is not
really horrendous either.

> I also
> considered a rename, but that different problems. ;-) If I rename both
> the #define and the function then if someone has an out-of-tree patch
> adding arch_trigger_cpumask_backtrace() for another architecture, like
> say arm64, then there would be no compile-time failure indicating that
> the out-of-tree patch needs updating. I could rename the functions but
> _not_ the #define, I guess?

I think that swapping would be simplest as the type mismatch should
catch also pending out-of-tree potential implementations.

-- 
Michal Hocko
SUSE Labs

[PATCH v8 2/2] PCI: rpaphp: Error out on busy status from get-sensor-state

2023-08-07 Thread Mahesh Salgaonkar

When certain PHB HW failure causes pHyp to recover PHB, it marks the PE
state as temporarily unavailable until recovery is complete. This also
triggers an EEH handler in Linux which needs to notify drivers, and perform
recovery. But before notifying the driver about the PCI error it uses
get_adapter_status()->rpaphp_get_sensor_state()->rtas_call(get-sensor-state)
operation of the hotplug_slot to determine if the slot contains a device or
not. If the slot is empty, the recovery is skipped entirely.

eeh_event_handler()
  ->eeh_handle_normal_event()
->eeh_slot_presence_check()
  ->get_adapter_status()
->rpaphp_get_sensor_state()
  ->rtas_get_sensor()
->rtas_call(get-sensor-state)

However on certain PHB failures, the RTAS call rtas_call(get-sensor-state)
returns extended busy error (9902) until PHB is recovered by pHyp. Once PHB
is recovered, the rtas_call(get-sensor-state) returns success with correct
presence status. The RTAS call interface rtas_get_sensor() loops over the
RTAS call on extended delay return code (9902) until the return value is
either success (0) or error (-1). This causes the EEH handler to get stuck
for ~6 seconds before it could notify that the PCI error has been detected
and stop any active operations. Hence with running I/O traffic, during this
6 seconds, the network driver continues its operation and hits a timeout
(netdev watchdog).


[52732.244731] DEBUG: ibm_read_slot_reset_state2()
[52732.244762] DEBUG: ret = 0, rets[0]=5, rets[1]=1, rets[2]=4000, rets[3]=>
[52732.244798] DEBUG: in eeh_slot_presence_check
[52732.244804] DEBUG: error state check
[52732.244807] DEBUG: Is slot hotpluggable
[52732.244810] DEBUG: hotpluggable ops ?
[52732.244953] DEBUG: Calling ops->get_adapter_status
[52732.244958] DEBUG: calling rpaphp_get_sensor_state
[52736.564262] [ cut here ]
[52736.564299] NETDEV WATCHDOG: enP64p1s0f3 (tg3): transmit queue 0 timed o>
[52736.564324] WARNING: CPU: 1442 PID: 0 at net/sched/sch_generic.c:478 dev>
[...]
[52736.564505] NIP [c0c32368] dev_watchdog+0x438/0x440
[52736.564513] LR [c0c32364] dev_watchdog+0x434/0x440


On timeouts, network driver starts dumping debug information to console
(e.g bnx2 driver calls bnx2x_panic_dump()), and go into recovery path while
pHyp is still recovering the PHB. As part of recovery, the driver tries to
reset the device and it keeps failing since every PCI read/write returns
ff's. And when EEH recovery kicks-in, the driver is unable to recover the
device. This impacts the ssh connection and leads to the system being
inaccessible. To get the NIC working again it needs a reboot or re-assign
the I/O adapter from HMC.

[ 9531.168587] EEH: Beginning: 'slot_reset'
[ 9531.168601] PCI 0013:01:00.0#1: EEH: Invoking bnx2x->slot_reset()
[...]
[ 9614.110094] bnx2x: [bnx2x_func_stop:9129(enP19p1s0f0)]FUNC_STOP ramrod 
failed. Running a dry transaction
[ 9614.110300] bnx2x: [bnx2x_igu_int_disable:902(enP19p1s0f0)]BUG! Proper val 
not read from IGU!
[ 9629.178067] bnx2x: [bnx2x_fw_command:3055(enP19p1s0f0)]FW failed to respond!
[ 9629.178085] bnx2x 0013:01:00.0 enP19p1s0f0: bc 7.10.4
[ 9629.178091] bnx2x: [bnx2x_fw_dump_lvl:789(enP19p1s0f0)]Cannot dump MCP info 
while in PCI error
[ 9644.241813] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f0)]IO slot reset --> 
driver unload
[...]
[ 9644.241819] PCI 0013:01:00.0#1: EEH: bnx2x driver reports: 'disconnect'
[ 9644.241823] PCI 0013:01:00.1#1: EEH: Invoking bnx2x->slot_reset()
[ 9644.241827] bnx2x: [bnx2x_io_slot_reset:14229(enP19p1s0f1)]IO slot reset 
initializing...
[ 9644.241916] bnx2x 0013:01:00.1: enabling device (0140 -> 0142)
[ 9644.258604] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f1)]IO slot reset --> 
driver unload
[ 9644.258612] PCI 0013:01:00.1#1: EEH: bnx2x driver reports: 'disconnect'
[ 9644.258615] EEH: Finished:'slot_reset' with aggregate recovery 
state:'disconnect'
[ 9644.258620] EEH: Unable to recover from failure from PHB#13-PE#1.
[ 9644.261811] EEH: Beginning: 'error_detected(permanent failure)'
[...]
[ 9644.261823] EEH: Finished:'error_detected(permanent failure)'

Hence, it becomes important to inform driver about the PCI error detection
as early as possible, so that driver is aware of PCI error and waits for
EEH handler's next action for successful recovery.

Current implementation uses rtas_get_sensor() API which blocks the slot
check state until RTAS call returns success. To avoid this, fix the PCI
hotplug driver (rpaphp) to return an error (-EBUSY) if the slot presence
state can not be detected immediately while PE is in EEH recovery state.
Change rpaphp_get_sensor_state() to invoke rtas_call(get-sensor-state)
directly only if the respective PE is in EEH recovery state, and take
actions based on RTAS return status. This way EEH handler will not be
blocked on rpaphp_get_sensor_state() and can immediately notify driver
about the PCI error and stop any active

[PATCH v8 1/2] powerpc/rtas: Rename rtas_error_rc to rtas_generic_errno

2023-08-07 Thread Mahesh Salgaonkar

rtas_generic_errno() function will convert the generic rtas return codes
into errno. Also, #define descriptive names for rtas return codes and use
it instead of numeric values.

Signed-off-by: Mahesh Salgaonkar 
---

(no changes since v7)

Change in V7:
- Until v6 there was only one patch with subject "PCI hotplug: rpaphp:
  Error out on busy status from get-sensor-state". Starting from v7, adding
  this new patch to introduce rtas_generic_errno() to handle generic rtas
  error codes.
  https://lore.kernel.org/all/20220429162545.GA79541@bhelgaas/
---
 arch/powerpc/include/asm/rtas.h |   10 +++
 arch/powerpc/kernel/rtas.c  |   53 ---
 2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 3abe15ac79db1..5572a0a2f6e18 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -202,7 +202,9 @@ typedef struct {
 #define RTAS_USER_REGION_SIZE (64 * 1024)
 
 /* RTAS return status codes */
-#define RTAS_BUSY  -2/* RTAS Busy */
+#define RTAS_HARDWARE_ERROR(-1)  /* Hardware Error */
+#define RTAS_BUSY  (-2)  /* RTAS Busy */
+#define RTAS_INVALID_PARAMETER (-3)  /* Invalid indicator/domain/sensor etc. */
 #define RTAS_EXTENDED_DELAY_MIN9900
 #define RTAS_EXTENDED_DELAY_MAX9905
 
@@ -212,6 +214,11 @@ typedef struct {
 #define RTAS_THREADS_ACTIVE -9005 /* Multiple processor threads active */
 #define RTAS_OUTSTANDING_COPROC -9006 /* Outstanding coprocessor operations */
 
+/* statuses specific to get-sensor-state */
+#define RTAS_SLOT_UNISOLATED   (-9000)
+#define RTAS_SLOT_NOT_UNISOLATED   (-9001)
+#define RTAS_SLOT_NOT_USABLE   (-9002)
+
 /* RTAS event classes */
 #define RTAS_INTERNAL_ERROR0x8000 /* set bit 0 */
 #define RTAS_EPOW_WARNING  0x4000 /* set bit 1 */
@@ -425,6 +432,7 @@ extern int rtas_set_indicator(int indicator, int index, int 
new_value);
 extern int rtas_set_indicator_fast(int indicator, int index, int new_value);
 extern void rtas_progress(char *s, unsigned short hex);
 int rtas_ibm_suspend_me(int *fw_status);
+int rtas_generic_errno(int rtas_rc);
 
 struct rtc_time;
 extern time64_t rtas_get_boot_time(void);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index c087320ff..80b6099e8ce20 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1330,33 +1330,34 @@ bool __ref rtas_busy_delay(int status)
 }
 EXPORT_SYMBOL_GPL(rtas_busy_delay);
 
-static int rtas_error_rc(int rtas_rc)
+int rtas_generic_errno(int rtas_rc)
 {
int rc;
 
switch (rtas_rc) {
-   case -1:/* Hardware Error */
-   rc = -EIO;
-   break;
-   case -3:/* Bad indicator/domain/etc */
-   rc = -EINVAL;
-   break;
-   case -9000: /* Isolation error */
-   rc = -EFAULT;
-   break;
-   case -9001: /* Outstanding TCE/PTE */
-   rc = -EEXIST;
-   break;
-   case -9002: /* No usable slot */
-   rc = -ENODEV;
-   break;
-   default:
-   pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
-   rc = -ERANGE;
-   break;
+   case RTAS_HARDWARE_ERROR:   /* Hardware Error */
+   rc = -EIO;
+   break;
+   case RTAS_INVALID_PARAMETER:/* Bad indicator/domain/etc */
+   rc = -EINVAL;
+   break;
+   case RTAS_SLOT_UNISOLATED:  /* Isolation error */
+   rc = -EFAULT;
+   break;
+   case RTAS_SLOT_NOT_UNISOLATED:  /* Outstanding TCE/PTE */
+   rc = -EEXIST;
+   break;
+   case RTAS_SLOT_NOT_USABLE:  /* No usable slot */
+   rc = -ENODEV;
+   break;
+   default:
+   pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
+   rc = -ERANGE;
+   break;
}
return rc;
 }
+EXPORT_SYMBOL(rtas_generic_errno);
 
 int rtas_get_power_level(int powerdomain, int *level)
 {
@@ -1370,7 +1371,7 @@ int rtas_get_power_level(int powerdomain, int *level)
udelay(1);
 
if (rc < 0)
-   return rtas_error_rc(rc);
+   return rtas_generic_errno(rc);
return rc;
 }
 EXPORT_SYMBOL_GPL(rtas_get_power_level);
@@ -1388,7 +1389,7 @@ int rtas_set_power_level(int powerdomain, int level, int 
*setlevel)
} while (rtas_busy_delay(rc));
 
if (rc < 0)
-   return rtas_error_rc(rc);
+   return rtas_generic_errno(rc);
return rc;
 }
 EXPORT_SYMBOL_GPL(rtas_set_power_level);
@@

Re: [PATCH] perf test: Fix parse-events tests to skip parametrized events

2023-08-07 Thread Sachin Sant




> On 07-Aug-2023, at 10:20 AM, Athira Rajeev  
> wrote:
> 
> Testcase "Parsing of all PMU events from sysfs" parse events for
> all PMUs, and not just cpu. In case of powerpc, the PowerVM
> environment supports events from hv_24x7 and hv_gpci PMU which
> is of example format like below:
> 
> - hv_24x7/CPM_ADJUNCT_INST,domain=?,core=?/
> - hv_gpci/event,partition_id=?/
> 
> The value for "?" needs to be filled in depending on system
> configuration. It is better to skip these parametrized events
> in this test as it is done in:
> 'commit b50d691e50e6 ("perf test: Fix "all PMU test" to skip
> parametrized events")' which handled a simialr instance with
> "all PMU test".
> 
> Fix parse-events test to skip parametrized events since
> it needs proper setup of the parameters.
> 
> Signed-off-by: Athira Rajeev 
> —

Thanks Athira for the fix. With this fix applied the reported problem
Is fixed.

6.1: Test event parsing: Ok
  6.2: Parsing of all PMU events from sysfs : Ok
  6.3: Parsing of given PMU events from sysfs: Ok

Tested-by: Sachin Sant 

- Sachin

Re: [PATCH] tools/perf: Fix bpf__probe to set bpf_prog_type type only if differs from the desired one

2023-08-07 Thread Sachin Sant




> On 07-Aug-2023, at 10:22 AM, Athira Rajeev  
> wrote:
> 
> The test "BPF prologue generation" fails as below:
> 
>   Writing event: p:perf_bpf_probe/func _text+10423200 f_mode=+20(%gpr3):x32 
> offset=%gpr4:s64 orig=%gpr5:s32
>   In map_prologue, ntevs=1
>   mapping[0]=0
>   libbpf: prog 'bpf_func__null_lseek': BPF program load failed: Permission 
> denied
>   libbpf: prog 'bpf_func__null_lseek': -- BEGIN PROG LOAD LOG --
>   btf_vmlinux is malformed
>   reg type unsupported for arg#0 function bpf_func__null_lseek#5
>   0: R1=ctx(off=0,imm=0) R10=fp0
>   ;
>   0: (57) r3 &= 2
>   R3 !read_ok
>   processed 1 insns (limit 100) max_states_per_insn 0 total_states 0 
> peak_states 0 mark_read 0
>   -- END PROG LOAD LOG --
>   libbpf: prog 'bpf_func__null_lseek': failed to load: -13
>   libbpf: failed to load object '[bpf_prologue_test]'
>   bpf: load objects failed: err=-13: (Permission denied)
>   Failed to add events selected by BPF
> 
> This fails occurs after this commit:
> commit d6e6286a12e7 ("libbpf: disassociate section handler
> on explicit bpf_program__set_type() call")'
> 
> With this change, SEC_DEF handler libbpf which is determined
> initially based on program's SEC() is set to NULL. The change
> is made because sec_def is not valid when user sets the program
> type with bpf_program__set_type function. This commit also fixed
> bpf_prog_test_load() helper in selftests/bpf to force-set program
> type only if it differs from the desired one.
> 
> The "bpf__probe" function in util/bpf-loader.c, also calls
> bpf_program__set_type to set bpf_prog_type. Add similar fix in
> here as well to avoid setting sec_def to NULL.
> 
> Reported-by: Sachin Sant 
> Signed-off-by: Athira Rajeev 
> ---

Thanks Athira for the fix.
With this patch applied perf BPF prologue sub test works correctly.

 42: BPF filter :
 42.1: Basic BPF filtering: Ok
 42.2: BPF pinning  : Ok
 42.3: BPF prologue generation  : Ok

Tested-by: Sachin Sant 

Can you please use the above mentioned id(without vnet) in the reported-by ?

- Sachin

61 matches

Mail list logo