[PATCH 1/2] PCI: endpoint: Clean up hardware description for BARs

2024-02-09 Thread Niklas Cassel
The hardware description for BARs is scattered in many different variables
in pci_epc_features. Some of these things are mutually exclusive, so it
can create confusion over which variable has precedence over another.

Improve the situation by creating a struct pci_epc_bar_desc, and a new
enum pci_epc_bar_type, and convert the endpoint controller drivers to use
this better-defined format.

Signed-off-by: Niklas Cassel 
---
 drivers/pci/controller/dwc/pci-imx6.c |  3 +-
 drivers/pci/controller/dwc/pci-keystone.c | 12 +++
 .../pci/controller/dwc/pci-layerscape-ep.c|  5 ++-
 drivers/pci/controller/dwc/pcie-keembay.c |  8 +++--
 drivers/pci/controller/dwc/pcie-rcar-gen4.c   |  4 ++-
 drivers/pci/controller/dwc/pcie-tegra194.c| 10 --
 drivers/pci/controller/dwc/pcie-uniphier-ep.c | 15 ++--
 drivers/pci/controller/pcie-rcar-ep.c | 14 +---
 drivers/pci/endpoint/functions/pci-epf-ntb.c  |  4 +--
 drivers/pci/endpoint/functions/pci-epf-test.c |  8 ++---
 drivers/pci/endpoint/functions/pci-epf-vntb.c |  2 +-
 drivers/pci/endpoint/pci-epc-core.c   | 32 +
 drivers/pci/endpoint/pci-epf-core.c   | 15 
 include/linux/pci-epc.h   | 34 +++
 14 files changed, 108 insertions(+), 58 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-imx6.c 
b/drivers/pci/controller/dwc/pci-imx6.c
index dc2c036ab28c..47a9a96484ed 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -1081,7 +1081,8 @@ static const struct pci_epc_features 
imx8m_pcie_epc_features = {
.linkup_notifier = false,
.msi_capable = true,
.msix_capable = false,
-   .reserved_bar = 1 << BAR_1 | 1 << BAR_3,
+   .bar[BAR_1] = { .type = BAR_RESERVED, },
+   .bar[BAR_3] = { .type = BAR_RESERVED, },
.align = SZ_64K,
 };
 
diff --git a/drivers/pci/controller/dwc/pci-keystone.c 
b/drivers/pci/controller/dwc/pci-keystone.c
index c0c62533a3f1..b2b93b4fa82d 100644
--- a/drivers/pci/controller/dwc/pci-keystone.c
+++ b/drivers/pci/controller/dwc/pci-keystone.c
@@ -924,12 +924,12 @@ static const struct pci_epc_features 
ks_pcie_am654_epc_features = {
.linkup_notifier = false,
.msi_capable = true,
.msix_capable = true,
-   .reserved_bar = 1 << BAR_0 | 1 << BAR_1,
-   .bar_fixed_64bit = 1 << BAR_0,
-   .bar_fixed_size[2] = SZ_1M,
-   .bar_fixed_size[3] = SZ_64K,
-   .bar_fixed_size[4] = 256,
-   .bar_fixed_size[5] = SZ_1M,
+   .bar[BAR_0] = { .type = BAR_RESERVED, .only_64bit = true, },
+   .bar[BAR_1] = { .type = BAR_RESERVED, },
+   .bar[BAR_2] = { .type = BAR_FIXED, .fixed_size = SZ_1M, },
+   .bar[BAR_3] = { .type = BAR_FIXED, .fixed_size = SZ_64K, },
+   .bar[BAR_4] = { .type = BAR_FIXED, .fixed_size = 256, },
+   .bar[BAR_5] = { .type = BAR_FIXED, .fixed_size = SZ_1M, },
.align = SZ_1M,
 };
 
diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index 2e398494e7c0..1f6ee1460ec2 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -250,7 +250,10 @@ static int __init ls_pcie_ep_probe(struct platform_device 
*pdev)
pci->dev = dev;
pci->ops = pcie->drvdata->dw_pcie_ops;
 
-   ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4);
+   ls_epc->bar[BAR_2].only_64bit = true;
+   ls_epc->bar[BAR_3].type = BAR_RESERVED;
+   ls_epc->bar[BAR_4].only_64bit = true;
+   ls_epc->bar[BAR_5].type = BAR_RESERVED;
ls_epc->linkup_notifier = true;
 
pcie->pci = pci;
diff --git a/drivers/pci/controller/dwc/pcie-keembay.c 
b/drivers/pci/controller/dwc/pcie-keembay.c
index 208d3b0ba196..5e8e54f597dd 100644
--- a/drivers/pci/controller/dwc/pcie-keembay.c
+++ b/drivers/pci/controller/dwc/pcie-keembay.c
@@ -312,8 +312,12 @@ static const struct pci_epc_features 
keembay_pcie_epc_features = {
.linkup_notifier= false,
.msi_capable= true,
.msix_capable   = true,
-   .reserved_bar   = BIT(BAR_1) | BIT(BAR_3) | BIT(BAR_5),
-   .bar_fixed_64bit= BIT(BAR_0) | BIT(BAR_2) | BIT(BAR_4),
+   .bar[BAR_0] = { .only_64bit = true, },
+   .bar[BAR_1] = { .type = BAR_RESERVED, },
+   .bar[BAR_2] = { .only_64bit = true, },
+   .bar[BAR_3] = { .type = BAR_RESERVED, },
+   .bar[BAR_4] = { .only_64bit = true, },
+   .bar[BAR_5] = { .type = BAR_RESERVED, },
.align  = SZ_16K,
 };
 
diff --git a/drivers/pci/controller/dwc/pcie-rcar-gen4.c 
b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
index e9166619b1f9..0be760ed420b 100644
--- a/drivers/pci/controller/dwc/pcie-rcar-gen4.c
+++ b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
@@ -383,7 +383,9 @@ static const struct 

[PATCH 0/2] PCI endpoint BAR hardware description cleanup

2024-02-09 Thread Niklas Cassel
The series is based on top of:
https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git/log/?h=endpoint


Hello all,

This series cleans up the hardware description for PCI endpoint BARs.

The problems with the existing hardware description:
-The documentation is lackluster.
-Some of the names are confusingly similar, e.g. fixed_64bit and
 fixed_size, even though these are for completely unrelated things.
-The way that the BARs are defined in the endpoint controller drivers
 is messy, because the left hand side is not a BAR, so you can mark a
 BAR as e.g. both fixed size and reserved.

This series tries to address all the problems above.

Personally, I think that the code is more readable, both the endpoint
controller drivers, but also pci-epc-core.c.

(Oh, and as you can probably guess, I will be sending out a patch series
that adds BAR_RESIZABLE to enum pci_epc_bar_type in the coming week(s).)


Kind regards,
Niklas


Niklas Cassel (2):
  PCI: endpoint: Clean up hardware description for BARs
  PCI: endpoint: Drop only_64bit on reserved BARs

 drivers/pci/controller/dwc/pci-imx6.c |  3 +-
 drivers/pci/controller/dwc/pci-keystone.c | 12 +++---
 .../pci/controller/dwc/pci-layerscape-ep.c|  5 ++-
 drivers/pci/controller/dwc/pcie-keembay.c |  8 +++-
 drivers/pci/controller/dwc/pcie-rcar-gen4.c   |  4 +-
 drivers/pci/controller/dwc/pcie-tegra194.c| 10 +++--
 drivers/pci/controller/dwc/pcie-uniphier-ep.c | 15 ++--
 drivers/pci/controller/pcie-rcar-ep.c | 14 ---
 drivers/pci/endpoint/functions/pci-epf-ntb.c  |  4 +-
 drivers/pci/endpoint/functions/pci-epf-test.c |  8 ++--
 drivers/pci/endpoint/functions/pci-epf-vntb.c |  2 +-
 drivers/pci/endpoint/pci-epc-core.c   | 25 +---
 drivers/pci/endpoint/pci-epf-core.c   | 15 
 include/linux/pci-epc.h   | 38 ---
 14 files changed, 105 insertions(+), 58 deletions(-)

-- 
2.43.0



[PATCH 4/4] powerpc: ibmebus: make ibmebus_bus_type const

2024-02-09 Thread Ricardo B. Marliere
Now that the driver core can properly handle constant struct bus_type,
move the ibmebus_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/ibmebus.h   | 2 +-
 arch/powerpc/platforms/pseries/ibmebus.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/ibmebus.h 
b/arch/powerpc/include/asm/ibmebus.h
index 6f33253a364a..46fe406f461c 100644
--- a/arch/powerpc/include/asm/ibmebus.h
+++ b/arch/powerpc/include/asm/ibmebus.h
@@ -48,7 +48,7 @@
 
 struct platform_driver;
 
-extern struct bus_type ibmebus_bus_type;
+extern const struct bus_type ibmebus_bus_type;
 
 int ibmebus_register_driver(struct platform_driver *drv);
 void ibmebus_unregister_driver(struct platform_driver *drv);
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c 
b/arch/powerpc/platforms/pseries/ibmebus.c
index 998e3aff2457..4bb611afaba4 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -432,7 +432,7 @@ static int ibmebus_bus_modalias(const struct device *dev, 
struct kobj_uevent_env
return of_device_uevent_modalias(dev, env);
 }
 
-struct bus_type ibmebus_bus_type = {
+const struct bus_type ibmebus_bus_type = {
.name  = "ibmebus",
.uevent= ibmebus_bus_modalias,
.bus_groups = ibmbus_bus_groups,

-- 
2.43.0



[PATCH 3/4] powerpc: mpic: make mpic_subsys const

2024-02-09 Thread Ricardo B. Marliere
Now that the driver core can properly handle constant struct bus_type,
move the mpic_subsys variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/mpic.h | 2 +-
 arch/powerpc/sysdev/mpic.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h
index 58353c5bd3fb..0c03a98986cd 100644
--- a/arch/powerpc/include/asm/mpic.h
+++ b/arch/powerpc/include/asm/mpic.h
@@ -336,7 +336,7 @@ struct mpic
 #endif
 };
 
-extern struct bus_type mpic_subsys;
+extern const struct bus_type mpic_subsys;
 
 /*
  * MPIC flags (passed to mpic_alloc)
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index dabbdd356664..d94cf36b0f65 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -49,7 +49,7 @@
 #define DBG(fmt...)
 #endif
 
-struct bus_type mpic_subsys = {
+const struct bus_type mpic_subsys = {
.name = "mpic",
.dev_name = "mpic",
 };

-- 
2.43.0



[PATCH 2/4] powerpc: vio: make vio_bus_type const

2024-02-09 Thread Ricardo B. Marliere
Now that the driver core can properly handle constant struct bus_type,
move the vio_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/vio.h   | 2 +-
 arch/powerpc/platforms/pseries/vio.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index cc9b787627ad..6faf2a931755 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -39,7 +39,7 @@
  */
 #define VIO_CMO_MIN_ENT 1562624
 
-extern struct bus_type vio_bus_type;
+extern const struct bus_type vio_bus_type;
 
 struct iommu_table;
 
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 6c58824190a2..90ff85c879bf 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1615,7 +1615,7 @@ static struct attribute *vio_cmo_dev_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_cmo_dev);
 
-struct bus_type vio_bus_type = {
+const struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_cmo_dev_groups,
.bus_groups = vio_bus_groups,
@@ -1634,7 +1634,7 @@ static struct attribute *vio_dev_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_dev);
 
-struct bus_type vio_bus_type = {
+const struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_dev_groups,
.uevent = vio_hotplug,

-- 
2.43.0



[PATCH 1/4] powerpc: vio: move device attributes into a new ifdef

2024-02-09 Thread Ricardo B. Marliere
In order to make the distinction of the vio_bus_type variable based on
CONFIG_PPC_SMLPAR more explicit, move the required structs into a new
ifdef block. This is needed in order to make vio_bus_type const and
because the distinction is made explicit, there is no need to set the
fields within the vio_cmo_sysfs_init function.

Cc: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/platforms/pseries/vio.c | 59 +---
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 2dc9cbc4bcd8..6c58824190a2 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -991,18 +991,6 @@ static DEVICE_ATTR_RO(cmo_allocated);
 static DEVICE_ATTR_RW(cmo_desired);
 static DEVICE_ATTR_RW(cmo_allocs_failed);
 
-static struct attribute *vio_cmo_dev_attrs[] = {
-   _attr_name.attr,
-   _attr_devspec.attr,
-   _attr_modalias.attr,
-   _attr_cmo_entitled.attr,
-   _attr_cmo_allocated.attr,
-   _attr_cmo_desired.attr,
-   _attr_cmo_allocs_failed.attr,
-   NULL,
-};
-ATTRIBUTE_GROUPS(vio_cmo_dev);
-
 /* sysfs bus functions and data structures for CMO */
 
 #define viobus_cmo_rd_attr(name)\
@@ -1062,11 +1050,7 @@ static struct attribute *vio_bus_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_bus);
 
-static void __init vio_cmo_sysfs_init(void)
-{
-   vio_bus_type.dev_groups = vio_cmo_dev_groups;
-   vio_bus_type.bus_groups = vio_bus_groups;
-}
+static void __init vio_cmo_sysfs_init(void) { }
 #else /* CONFIG_PPC_SMLPAR */
 int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
 void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
@@ -1584,14 +1568,6 @@ static ssize_t modalias_show(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RO(modalias);
 
-static struct attribute *vio_dev_attrs[] = {
-   _attr_name.attr,
-   _attr_devspec.attr,
-   _attr_modalias.attr,
-   NULL,
-};
-ATTRIBUTE_GROUPS(vio_dev);
-
 void vio_unregister_device(struct vio_dev *viodev)
 {
device_unregister(>dev);
@@ -1626,6 +1602,38 @@ static int vio_hotplug(const struct device *dev, struct 
kobj_uevent_env *env)
return 0;
 }
 
+#ifdef CONFIG_PPC_SMLPAR
+static struct attribute *vio_cmo_dev_attrs[] = {
+   _attr_name.attr,
+   _attr_devspec.attr,
+   _attr_modalias.attr,
+   _attr_cmo_entitled.attr,
+   _attr_cmo_allocated.attr,
+   _attr_cmo_desired.attr,
+   _attr_cmo_allocs_failed.attr,
+   NULL,
+};
+ATTRIBUTE_GROUPS(vio_cmo_dev);
+
+struct bus_type vio_bus_type = {
+   .name = "vio",
+   .dev_groups = vio_cmo_dev_groups,
+   .bus_groups = vio_bus_groups,
+   .uevent = vio_hotplug,
+   .match = vio_bus_match,
+   .probe = vio_bus_probe,
+   .remove = vio_bus_remove,
+   .shutdown = vio_bus_shutdown,
+};
+#else /* CONFIG_PPC_SMLPAR */
+static struct attribute *vio_dev_attrs[] = {
+   _attr_name.attr,
+   _attr_devspec.attr,
+   _attr_modalias.attr,
+   NULL,
+};
+ATTRIBUTE_GROUPS(vio_dev);
+
 struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_dev_groups,
@@ -1635,6 +1643,7 @@ struct bus_type vio_bus_type = {
.remove = vio_bus_remove,
.shutdown = vio_bus_shutdown,
 };
+#endif /* CONFIG_PPC_SMLPAR */
 
 /**
  * vio_get_attribute: - get attribute for virtual device

-- 
2.43.0



[PATCH 0/4] powerpc: struct bus_type cleanup

2024-02-09 Thread Ricardo B. Marliere
This series is part of an effort to cleanup the users of the driver
core, as can be seen in many recent patches authored by Greg across the
tree (e.g. [1]). Patch 1/4 is a prerequisite to 2/4, but the others have
no dependency. They were built using bootlin's without warnings using
powerpc64le-power8--glibc--stable-2023.11-1 toolchain.

---
[1]: 
https://lore.kernel.org/lkml/?q=f%3Agregkh%40linuxfoundation.org+s%3A%22make%22+and+s%3A%22const%22

Cc: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 

---
Ricardo B. Marliere (4):
  powerpc: vio: move device attributes into a new ifdef
  powerpc: vio: make vio_bus_type const
  powerpc: mpic: make mpic_subsys const
  powerpc: ibmebus: make ibmebus_bus_type const

 arch/powerpc/include/asm/ibmebus.h   |  2 +-
 arch/powerpc/include/asm/mpic.h  |  2 +-
 arch/powerpc/include/asm/vio.h   |  2 +-
 arch/powerpc/platforms/pseries/ibmebus.c |  2 +-
 arch/powerpc/platforms/pseries/vio.c | 61 ++--
 arch/powerpc/sysdev/mpic.c   |  2 +-
 6 files changed, 40 insertions(+), 31 deletions(-)
---
base-commit: 41bccc98fb7931d63d03f326a746ac4d429c1dd3
change-id: 20240209-bus_cleanup-powerpc2-498426fccb98

Best regards,
-- 
Ricardo B. Marliere 



Re: [PATCH v5 00/25] Transparent Contiguous PTEs for User Mappings

2024-02-09 Thread Ryan Roberts
On 09/02/2024 22:16, David Hildenbrand wrote:
>>> 1) Convert READ_ONCE() -> ptep_get()
>>> 2) Convert set_pte_at() -> set_ptes()
>>> 3) All the "New layer" renames and addition of the trivial wrappers
>>
>> Yep that makes sense. I'll start prepping that today. I'll hold off reposting
>> until I have your comments on 19-25. I'm also hoping that David will repost 
>> the
>> zap series today so that it can get into mm-unstable by mid-next week. Then 
>> I'll
>> repost on top of that, hopefully by end of next week, folding in all your
>> comments. This should give plenty of time to soak in linux-next.
> 
> Just sent out v2. Will review this series (early) next week.
> 
> Have a great weekend!

Cheers, David - you too!

> 



Re: [PATCH] mm/hugetlb: Move page order check inside hugetlb_cma_reserve()

2024-02-09 Thread Jane Chu

On 2/8/2024 9:42 PM, Anshuman Khandual wrote:


All platforms could benefit from a page order check against MAX_PAGE_ORDER
before allocating a CMA area for gigantic hugetlb pages. Let's move this
check from individual platforms to generic hugetlb.

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Anshuman Khandual 
---
This applies on v6.8-rc3
  
  arch/arm64/mm/hugetlbpage.c   | 7 ---

  arch/powerpc/mm/hugetlbpage.c | 4 +---
  mm/hugetlb.c  | 7 +++
  3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 8116ac599f80..6720ec8d50e7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void)
else
order = CONT_PMD_SHIFT - PAGE_SHIFT;
  
-	/*

-* HugeTLB CMA reservation is required for gigantic
-* huge pages which could not be allocated via the
-* page allocator. Just warn if there is any change
-* breaking this assumption.
-*/
-   WARN_ON(order <= MAX_PAGE_ORDER);
hugetlb_cma_reserve(order);
  }
  #endif /* CONFIG_CMA */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0a540b37aab6..16557d008eef 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -614,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void)
 */
order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
  
-	if (order) {

-   VM_WARN_ON(order <= MAX_PAGE_ORDER);
+   if (order)
hugetlb_cma_reserve(order);
-   }
  }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf9c9b2906ea..345b3524df35 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7699,6 +7699,13 @@ void __init hugetlb_cma_reserve(int order)
bool node_specific_cma_alloc = false;
int nid;
  
+	/*

+* HugeTLB CMA reservation is required for gigantic
+* huge pages which could not be allocated via the
+* page allocator. Just warn if there is any change
+* breaking this assumption.
+*/
+   VM_WARN_ON(order <= MAX_PAGE_ORDER);
cma_reserve_called = true;
  
  	if (!hugetlb_cma_size)


Looks straight forward to me.

Reviewed-by: Jane Chu 




Re: [PATCH v3 01/15] arm64/mm: Make set_ptes() robust when OAs cross 48-bit boundary

2024-02-09 Thread David Hildenbrand

On 08.02.24 07:10, Mike Rapoport wrote:

On Mon, Jan 29, 2024 at 01:46:35PM +0100, David Hildenbrand wrote:

From: Ryan Roberts 

Since the high bits [51:48] of an OA are not stored contiguously in the
PTE, there is a theoretical bug in set_ptes(), which just adds PAGE_SIZE
to the pte to get the pte with the next pfn. This works until the pfn
crosses the 48-bit boundary, at which point we overflow into the upper
attributes.

Of course one could argue (and Matthew Wilcox has :) that we will never
see a folio cross this boundary because we only allow naturally aligned
power-of-2 allocation, so this would require a half-petabyte folio. So
its only a theoretical bug. But its better that the code is robust
regardless.

I've implemented pte_next_pfn() as part of the fix, which is an opt-in
core-mm interface. So that is now available to the core-mm, which will
be needed shortly to support forthcoming fork()-batching optimizations.

Link: https://lkml.kernel.org/r/20240125173534.1659317-1-ryan.robe...@arm.com
Fixes: 4a169d61c2ed ("arm64: implement the new page table range API")
Closes: 
https://lore.kernel.org/linux-mm/fdaeb9a5-d890-499a-92c8-d171df43a...@arm.com/
Signed-off-by: Ryan Roberts 
Reviewed-by: Catalin Marinas 
Reviewed-by: David Hildenbrand 
Signed-off-by: David Hildenbrand 


Reviewed-by: Mike Rapoport (IBM) 


Thanks for the review Mike, appreciated!

--
Cheers,

David / dhildenb



Re: [PATCH v5 00/25] Transparent Contiguous PTEs for User Mappings

2024-02-09 Thread David Hildenbrand

1) Convert READ_ONCE() -> ptep_get()
2) Convert set_pte_at() -> set_ptes()
3) All the "New layer" renames and addition of the trivial wrappers


Yep that makes sense. I'll start prepping that today. I'll hold off reposting
until I have your comments on 19-25. I'm also hoping that David will repost the
zap series today so that it can get into mm-unstable by mid-next week. Then I'll
repost on top of that, hopefully by end of next week, folding in all your
comments. This should give plenty of time to soak in linux-next.


Just sent out v2. Will review this series (early) next week.

Have a great weekend!

--
Cheers,

David / dhildenb



[PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-09 Thread David Hildenbrand
It's a pain that we have to handle cond_resched() in
tlb_batch_pages_flush() manually and cannot simply handle it in
release_pages() -- release_pages() can be called from atomic context.
Well, in a perfect world we wouldn't have to make our code more complicated at all.

With page poisoning and init_on_free, we might now run into soft lockups
when we free a lot of rather large folio fragments, because page freeing
time then depends on the actual memory size we are freeing instead of on
the number of folios that are involved.

In the absolute (unlikely) worst case, on arm64 with 64k we will be able
to free up to 256 folio fragments that each span 512 MiB: zeroing out 128
GiB does sound like it might take a while. But instead of ignoring this
unlikely case, let's just handle it.

So, let's teach tlb_batch_pages_flush() that there are some
configurations where page freeing is horribly slow, and let's reschedule
more frequently -- similar to what we did before we had large folio
fragments in there. Note that we might end up freeing only a single folio
fragment at a time that might exceed the old 512 pages limit: but if we
cannot even free a single MAX_ORDER page on a system without running into
soft lockups, something else is already completely bogus.

In the future, we might want to detect if handling cond_resched() is
required at all, and just not do any of that with full preemption enabled.

Signed-off-by: David Hildenbrand 
---
 mm/mmu_gather.c | 50 -
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index d175c0f1e2c8..2774044b5790 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -91,18 +91,19 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct 
vm_area_struct *vma)
 }
 #endif
 
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
 {
-   struct mmu_gather_batch *batch;
-
-   for (batch = >local; batch && batch->nr; batch = batch->next) {
-   struct encoded_page **pages = batch->encoded_pages;
+   struct encoded_page **pages = batch->encoded_pages;
+   unsigned int nr, nr_pages;
 
+   /*
+* We might end up freeing a lot of pages. Reschedule on a regular
+* basis to avoid soft lockups in configurations without full
+* preemption enabled. The magic number of 512 folios seems to work.
+*/
+   if (!page_poisoning_enabled_static() && !want_init_on_free()) {
while (batch->nr) {
-   /*
-* limit free batch count when PAGE_SIZE > 4K
-*/
-   unsigned int nr = min(512U, batch->nr);
+   nr = min(512, batch->nr);
 
/*
 * Make sure we cover page + nr_pages, and don't leave
@@ -119,6 +120,37 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
cond_resched();
}
}
+
+   /*
+* With page poisoning and init_on_free, the time it takes to free
+* memory grows proportionally with the actual memory size. Therefore,
+* limit based on the actual memory size and not the number of involved
+* folios.
+*/
+   while (batch->nr) {
+   for (nr = 0, nr_pages = 0;
+nr < batch->nr && nr_pages < 512; nr++) {
+   if (unlikely(encoded_page_flags(pages[nr]) &
+ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+   nr_pages += encoded_nr_pages(pages[++nr]);
+   else
+   nr_pages++;
+   }
+
+   free_pages_and_swap_cache(pages, nr);
+   pages += nr;
+   batch->nr -= nr;
+
+   cond_resched();
+   }
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+   struct mmu_gather_batch *batch;
+
+   for (batch = >local; batch && batch->nr; batch = batch->next)
+   __tlb_batch_free_encoded_pages(batch);
tlb->active = >local;
 }
 
-- 
2.43.0



[PATCH v2 10/10] mm/memory: optimize unmap/zap with PTE-mapped THP

2024-02-09 Thread David Hildenbrand
Similar to how we optimized fork(), let's implement PTE batching when
consecutive (present) PTEs map consecutive pages of the same large
folio.

Most infrastructure we need for batching (mmu gather, rmap) is already
there. We only have to add get_and_clear_full_ptes() and
clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to
process a PTE range.

We won't bother sanity-checking the mapcount of all subpages, but only
check the mapcount of the first subpage we process. If there is a real
problem hiding somewhere, we can trigger it simply by using small
folios, or when we zap single pages of a large folio. Ideally, we had
that check in rmap code (including for delayed rmap), but then we cannot
print the PTE. Let's keep it simple for now. If we ever have a cheap
folio_mapcount(), we might just want to check for underflows there.

To keep small folios as fast as possible force inlining of a specialized
variant using __always_inline with nr=1.

Signed-off-by: David Hildenbrand 
---
 include/linux/pgtable.h | 70 +++
 mm/memory.c | 92 +
 2 files changed, 136 insertions(+), 26 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index aab227e12493..49ab1f73b5c2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct 
mm_struct *mm,
 }
 #endif
 
+#ifndef get_and_clear_full_ptes
+/**
+ * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
+ *  the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
+ * returned PTE.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+   unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+   pte_t pte, tmp_pte;
+
+   pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+   while (--nr) {
+   ptep++;
+   addr += PAGE_SIZE;
+   tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+   if (pte_dirty(tmp_pte))
+   pte = pte_mkdirty(pte);
+   if (pte_young(tmp_pte))
+   pte = pte_mkyoung(pte);
+   }
+   return pte;
+}
+#endif
+
+#ifndef clear_full_ptes
+/**
+ * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
+ *  folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+   pte_t *ptep, unsigned int nr, int full)
+{
+   for (;;) {
+   ptep_get_and_clear_full(mm, addr, ptep, full);
+   if (--nr == 0)
+   break;
+   ptep++;
+   addr += PAGE_SIZE;
+   }
+}
+#endif
 
 /*
  * If two threads concurrently fault at the same page, the thread that
diff --git a/mm/memory.c b/mm/memory.c
index a3efc4da258a..3b8e56eb08a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1515,7 +1515,7 @@ static inline bool zap_drop_file_uffd_wp(struct 
zap_details *details)
  */
 static inline void
 zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte,
+ unsigned long addr, pte_t *pte, int nr,
  struct zap_details *details, pte_t pteval)
 {
/* Zap on anonymous always means dropping everything */
@@ -1525,20 +1525,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct 
*vma,
if (zap_drop_file_uffd_wp(details))
return;
 
-   pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+   for (;;) {
+   

[PATCH v2 08/10] mm/mmu_gather: add __tlb_remove_folio_pages()

2024-02-09 Thread David Hildenbrand
Add __tlb_remove_folio_pages(), which will remove multiple consecutive
pages that belong to the same large folio, instead of only a single
page. We'll be using this function when optimizing unmapping/zapping of
large folios that are mapped by PTEs.

We're using the remaining spare bit in an encoded_page to indicate that
the next encoded page in an array contains actually shifted "nr_pages".
Teach swap/freeing code about putting multiple folio references, and
delayed rmap handling to remove page ranges of a folio.

This extension allows for still gathering almost as many small folios
as we used to (-1, because we have to prepare for a possibly bigger next
entry), but still allows for gathering consecutive pages that belong to the
same large folio.

Note that we don't pass the folio pointer, because it is not required for
now. Further, we don't support page_size != PAGE_SIZE, it won't be
required for simple PTE batching.

We have to provide a separate s390 implementation, but it's fairly
straight forward.

Another, more invasive and likely more expensive, approach would be to
use folio+range or a PFN range instead of page+nr_pages. But, we should
do that consistently for the whole mmu_gather. For now, let's keep it
simple and add "nr_pages" only.

Note that it is now possible to gather significantly more pages: In the
past, we were able to gather ~10000 pages, now we can
also gather ~5000 folio fragments that span multiple pages. A folio
fragment on x86-64 can be up to 512 pages (2 MiB THP) and on arm64 with
64k in theory 8192 pages (512 MiB THP). Gathering more memory is not
considered something we should worry about, especially because these are
already corner cases.

While we can gather more total memory, we won't free more folio
fragments. As long as page freeing time primarily only depends on the
number of involved folios, there is no effective change for !preempt
configurations. However, we'll adjust tlb_batch_pages_flush() separately to
handle corner cases where page freeing time grows proportionally with the
actual memory size.

Signed-off-by: David Hildenbrand 
---
 arch/s390/include/asm/tlb.h | 17 +++
 include/asm-generic/tlb.h   |  8 +
 include/linux/mm_types.h| 20 
 mm/mmu_gather.c | 61 +++--
 mm/swap.c   | 12 ++--
 mm/swap_state.c | 15 +++--
 6 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 48df896d5b79..e95b2c8081eb 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table);
 static inline void tlb_flush(struct mmu_gather *tlb);
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, bool delay_rmap, int page_size);
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
+   struct page *page, unsigned int nr_pages, bool delay_rmap);
 
 #define tlb_flush tlb_flush
 #define pte_free_tlb pte_free_tlb
@@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct mmu_gather 
*tlb,
return false;
 }
 
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
+   struct page *page, unsigned int nr_pages, bool delay_rmap)
+{
+   struct encoded_page *encoded_pages[] = {
+   encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT),
+   encode_nr_pages(nr_pages),
+   };
+
+   VM_WARN_ON_ONCE(delay_rmap);
+   VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
+
+   free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
+   return false;
+}
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
__tlb_flush_mm_lazy(tlb->mm);
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 95d60a4f468a..bd00dd238b79 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -69,6 +69,7 @@
  *
  *  - tlb_remove_page() / __tlb_remove_page()
  *  - tlb_remove_page_size() / __tlb_remove_page_size()
+ *  - __tlb_remove_folio_pages()
  *
  *__tlb_remove_page_size() is the basic primitive that queues a page for
  *freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
@@ -78,6 +79,11 @@
  *tlb_remove_page() and tlb_remove_page_size() imply the call to
  *tlb_flush_mmu() when required and has no return value.
  *
+ *__tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
+ *instead of removing a single page, remove the given number of consecutive
+ *pages that are all part of the same (large) folio: just like calling
+ *__tlb_remove_page() on each page individually.
+ *
  *  - tlb_change_page_size()
  *
  *call before __tlb_remove_page*() to set the current page-size; implies a
@@ -262,6 +268,8 @@ struct mmu_gather_batch {
 
 extern bool __tlb_remove_page_size(struct mmu_gather 

[PATCH v2 07/10] mm/mmu_gather: add tlb_remove_tlb_entries()

2024-02-09 Thread David Hildenbrand
Let's add a helper that lets us batch-process multiple consecutive PTEs.

Note that the loop will get optimized out on all architectures except on
powerpc. We have to add an early define of __tlb_remove_tlb_entry() on
ppc to make the compiler happy (and avoid making tlb_remove_tlb_entries() a
macro).

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/include/asm/tlb.h |  2 ++
 include/asm-generic/tlb.h  | 20 
 2 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b3de6102a907..1ca7d4c4b90d 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,6 +19,8 @@
 
 #include 
 
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
+ unsigned long address);
 #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry
 
 #define tlb_flush tlb_flush
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 2eb7b0d4f5d2..95d60a4f468a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -608,6 +608,26 @@ static inline void tlb_flush_p4d_range(struct mmu_gather 
*tlb,
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
+/**
+ * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
+ * later tlb invalidation.
+ *
+ * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
+ * consecutive ptes instead of only a single one.
+ */
+static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
+   pte_t *ptep, unsigned int nr, unsigned long address)
+{
+   tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);
+   for (;;) {
+   __tlb_remove_tlb_entry(tlb, ptep, address);
+   if (--nr == 0)
+   break;
+   ptep++;
+   address += PAGE_SIZE;
+   }
+}
+
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)   \
do {\
unsigned long _sz = huge_page_size(h);  \
-- 
2.43.0



[PATCH v2 06/10] mm/mmu_gather: define ENCODED_PAGE_FLAG_DELAY_RMAP

2024-02-09 Thread David Hildenbrand
Nowadays, encoded pages are only used in mmu_gather handling. Let's
update the documentation, and define ENCODED_PAGE_BIT_DELAY_RMAP. While at
it, rename ENCODE_PAGE_BITS to ENCODED_PAGE_BITS.

If encoded page pointers would ever be used in other context again, we'd
likely want to change the defines to reflect their context (e.g.,
ENCODED_PAGE_FLAG_MMU_GATHER_DELAY_RMAP). For now, let's keep it simple.

This is a preparation for using the remaining spare bit to indicate that
the next item in an array of encoded pages is a "nr_pages" argument and
not an encoded page.

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 include/linux/mm_types.h | 17 +++--
 mm/mmu_gather.c  |  5 +++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..1b89eec0d6df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -210,8 +210,8 @@ struct page {
  *
  * An 'encoded_page' pointer is a pointer to a regular 'struct page', but
  * with the low bits of the pointer indicating extra context-dependent
- * information. Not super-common, but happens in mmu_gather and mlock
- * handling, and this acts as a type system check on that use.
+ * information. Only used in mmu_gather handling, and this acts as a type
+ * system check on that use.
  *
  * We only really have two guaranteed bits in general, although you could
  * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -220,21 +220,26 @@ struct page {
  * Use the supplied helper functions to endcode/decode the pointer and bits.
  */
 struct encoded_page;
-#define ENCODE_PAGE_BITS 3ul
+
+#define ENCODED_PAGE_BITS  3ul
+
+/* Perform rmap removal after we have flushed the TLB. */
+#define ENCODED_PAGE_BIT_DELAY_RMAP1ul
+
 static __always_inline struct encoded_page *encode_page(struct page *page, 
unsigned long flags)
 {
-   BUILD_BUG_ON(flags > ENCODE_PAGE_BITS);
+   BUILD_BUG_ON(flags > ENCODED_PAGE_BITS);
return (struct encoded_page *)(flags | (unsigned long)page);
 }
 
 static inline unsigned long encoded_page_flags(struct encoded_page *page)
 {
-   return ENCODE_PAGE_BITS & (unsigned long)page;
+   return ENCODED_PAGE_BITS & (unsigned long)page;
 }
 
 static inline struct page *encoded_page_ptr(struct encoded_page *page)
 {
-   return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
+   return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page);
 }
 
 /*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index ac733d81b112..6540c99c6758 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -53,7 +53,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch 
*batch, struct vm_area_
for (int i = 0; i < batch->nr; i++) {
struct encoded_page *enc = batch->encoded_pages[i];
 
-   if (encoded_page_flags(enc)) {
+   if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
struct page *page = encoded_page_ptr(enc);
folio_remove_rmap_pte(page_folio(page), page, vma);
}
@@ -119,6 +119,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
bool delay_rmap, int page_size)
 {
+   int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
struct mmu_gather_batch *batch;
 
VM_BUG_ON(!tlb->end);
@@ -132,7 +133,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct 
page *page,
 * Add the page and check if we are full. If so
 * force a flush.
 */
-   batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap);
+   batch->encoded_pages[batch->nr++] = encode_page(page, flags);
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
-- 
2.43.0



[PATCH v2 05/10] mm/mmu_gather: pass "delay_rmap" instead of encoded page to __tlb_remove_page_size()

2024-02-09 Thread David Hildenbrand
We have two bits available in the encoded page pointer to store
additional information. Currently, we use one bit to request delay of the
rmap removal until after a TLB flush.

We want to make use of the remaining bit internally for batching of
multiple pages of the same folio, specifying that the next encoded page
pointer in an array is actually "nr_pages". So pass page + delay_rmap flag
instead of an encoded page, to handle the encoding internally.

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 arch/s390/include/asm/tlb.h | 13 ++---
 include/asm-generic/tlb.h   | 12 ++--
 mm/mmu_gather.c |  7 ---
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index d1455a601adc..48df896d5b79 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -25,8 +25,7 @@
 void __tlb_remove_table(void *_table);
 static inline void tlb_flush(struct mmu_gather *tlb);
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct encoded_page *page,
- int page_size);
+   struct page *page, bool delay_rmap, int page_size);
 
 #define tlb_flush tlb_flush
 #define pte_free_tlb pte_free_tlb
@@ -42,14 +41,14 @@ static inline bool __tlb_remove_page_size(struct mmu_gather 
*tlb,
  * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
  * has already been freed, so just do free_page_and_swap_cache.
  *
- * s390 doesn't delay rmap removal, so there is nothing encoded in
- * the page pointer.
+ * s390 doesn't delay rmap removal.
  */
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
- struct encoded_page *page,
- int page_size)
+   struct page *page, bool delay_rmap, int page_size)
 {
-   free_page_and_swap_cache(encoded_page_ptr(page));
+   VM_WARN_ON_ONCE(delay_rmap);
+
+   free_page_and_swap_cache(page);
return false;
 }
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 129a3a759976..2eb7b0d4f5d2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -260,9 +260,8 @@ struct mmu_gather_batch {
  */
 #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH)
 
-extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
-  struct encoded_page *page,
-  int page_size);
+extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+   bool delay_rmap, int page_size);
 
 #ifdef CONFIG_SMP
 /*
@@ -462,13 +461,14 @@ static inline void tlb_flush_mmu_tlbonly(struct 
mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
 {
-   if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size))
+   if (__tlb_remove_page_size(tlb, page, false, page_size))
tlb_flush_mmu(tlb);
 }
 
-static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct 
page *page, unsigned int flags)
+static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb,
+   struct page *page, bool delay_rmap)
 {
-   return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE);
+   return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE);
 }
 
 /* tlb_remove_page
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 604ddf08affe..ac733d81b112 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -116,7 +116,8 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
 }
 
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, 
int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+   bool delay_rmap, int page_size)
 {
struct mmu_gather_batch *batch;
 
@@ -131,13 +132,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, 
struct encoded_page *page, i
 * Add the page and check if we are full. If so
 * force a flush.
 */
-   batch->encoded_pages[batch->nr++] = page;
+   batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap);
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
-   VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
+   VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
return false;
 }
-- 
2.43.0



[PATCH v2 04/10] mm/memory: factor out zapping folio pte into zap_present_folio_pte()

2024-02-09 Thread David Hildenbrand
Let's prepare for further changes by factoring it out into a separate
function.

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 mm/memory.c | 53 -
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7a3ebb6e5909..a3efc4da258a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1528,30 +1528,14 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct 
*vma,
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 }
 
-static inline void zap_present_pte(struct mmu_gather *tlb,
-   struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
-   unsigned long addr, struct zap_details *details,
-   int *rss, bool *force_flush, bool *force_break)
+static inline void zap_present_folio_pte(struct mmu_gather *tlb,
+   struct vm_area_struct *vma, struct folio *folio,
+   struct page *page, pte_t *pte, pte_t ptent, unsigned long addr,
+   struct zap_details *details, int *rss, bool *force_flush,
+   bool *force_break)
 {
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
-   struct folio *folio;
-   struct page *page;
-
-   page = vm_normal_page(vma, addr, ptent);
-   if (!page) {
-   /* We don't need up-to-date accessed/dirty bits. */
-   ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
-   arch_check_zapped_pte(vma, ptent);
-   tlb_remove_tlb_entry(tlb, pte, addr);
-   VM_WARN_ON_ONCE(userfaultfd_wp(vma));
-   ksm_might_unmap_zero_page(mm, ptent);
-   return;
-   }
-
-   folio = page_folio(page);
-   if (unlikely(!should_zap_folio(details, folio)))
-   return;
 
if (!folio_test_anon(folio)) {
ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
@@ -1586,6 +1570,33 @@ static inline void zap_present_pte(struct mmu_gather 
*tlb,
}
 }
 
+static inline void zap_present_pte(struct mmu_gather *tlb,
+   struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+   unsigned long addr, struct zap_details *details,
+   int *rss, bool *force_flush, bool *force_break)
+{
+   struct mm_struct *mm = tlb->mm;
+   struct folio *folio;
+   struct page *page;
+
+   page = vm_normal_page(vma, addr, ptent);
+   if (!page) {
+   /* We don't need up-to-date accessed/dirty bits. */
+   ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+   arch_check_zapped_pte(vma, ptent);
+   tlb_remove_tlb_entry(tlb, pte, addr);
+   VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+   ksm_might_unmap_zero_page(mm, ptent);
+   return;
+   }
+
+   folio = page_folio(page);
+   if (unlikely(!should_zap_folio(details, folio)))
+   return;
+   zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details,
+ rss, force_flush, force_break);
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
-- 
2.43.0



[PATCH v2 03/10] mm/memory: further separate anon and pagecache folio handling in zap_present_pte()

2024-02-09 Thread David Hildenbrand
We don't need up-to-date accessed-dirty information for anon folios and can
simply work with the ptent we already have. Also, we know the RSS counter
we want to update.

We can safely move arch_check_zapped_pte() + tlb_remove_tlb_entry() +
zap_install_uffd_wp_if_needed() after updating the folio and RSS.

While at it, only call zap_install_uffd_wp_if_needed() if there is even
any chance that pte_install_uffd_wp_if_needed() would do *something*.
That is, just don't bother if uffd-wp does not apply.

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 mm/memory.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 4da6923709b2..7a3ebb6e5909 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1552,12 +1552,9 @@ static inline void zap_present_pte(struct mmu_gather 
*tlb,
folio = page_folio(page);
if (unlikely(!should_zap_folio(details, folio)))
return;
-   ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
-   arch_check_zapped_pte(vma, ptent);
-   tlb_remove_tlb_entry(tlb, pte, addr);
-   zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
 
if (!folio_test_anon(folio)) {
+   ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
if (pte_dirty(ptent)) {
folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
@@ -1567,8 +1564,17 @@ static inline void zap_present_pte(struct mmu_gather 
*tlb,
}
if (pte_young(ptent) && likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
+   rss[mm_counter(folio)]--;
+   } else {
+   /* We don't need up-to-date accessed/dirty bits. */
+   ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+   rss[MM_ANONPAGES]--;
}
-   rss[mm_counter(folio)]--;
+   arch_check_zapped_pte(vma, ptent);
+   tlb_remove_tlb_entry(tlb, pte, addr);
+   if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+   zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+
if (!delay_rmap) {
folio_remove_rmap_pte(folio, page, vma);
if (unlikely(page_mapcount(page) < 0))
-- 
2.43.0



[PATCH v2 02/10] mm/memory: handle !page case in zap_present_pte() separately

2024-02-09 Thread David Hildenbrand
We don't need uptodate accessed/dirty bits, so in theory we could
replace ptep_get_and_clear_full() by an optimized ptep_clear_full()
function. Let's rely on the provided pte.

Further, there is no scenario where we would have to insert uffd-wp
markers when zapping something that is not a normal page (i.e., zeropage).
Add a sanity check to make sure this remains true.

should_zap_folio() no longer has to handle NULL pointers. This change
replaces 2/3 "!page/!folio" checks by a single "!page" one.

Note that arch_check_zapped_pte() on x86-64 checks the HW-dirty bit to
detect shadow stack entries. But for shadow stack entries, the HW dirty
bit (in combination with non-writable PTEs) is set by software. So for the
arch_check_zapped_pte() check, we don't have to sync against HW setting
the HW dirty bit concurrently, it is always set.

Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 mm/memory.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 5b0dc33133a6..4da6923709b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1497,10 +1497,6 @@ static inline bool should_zap_folio(struct zap_details 
*details,
if (should_zap_cows(details))
return true;
 
-   /* E.g. the caller passes NULL for the case of a zero folio */
-   if (!folio)
-   return true;
-
/* Otherwise we should only zap non-anon folios */
return !folio_test_anon(folio);
 }
@@ -1538,24 +1534,28 @@ static inline void zap_present_pte(struct mmu_gather 
*tlb,
int *rss, bool *force_flush, bool *force_break)
 {
struct mm_struct *mm = tlb->mm;
-   struct folio *folio = NULL;
bool delay_rmap = false;
+   struct folio *folio;
struct page *page;
 
page = vm_normal_page(vma, addr, ptent);
-   if (page)
-   folio = page_folio(page);
+   if (!page) {
+   /* We don't need up-to-date accessed/dirty bits. */
+   ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+   arch_check_zapped_pte(vma, ptent);
+   tlb_remove_tlb_entry(tlb, pte, addr);
+   VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+   ksm_might_unmap_zero_page(mm, ptent);
+   return;
+   }
 
+   folio = page_folio(page);
if (unlikely(!should_zap_folio(details, folio)))
return;
ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
-   if (unlikely(!page)) {
-   ksm_might_unmap_zero_page(mm, ptent);
-   return;
-   }
 
if (!folio_test_anon(folio)) {
if (pte_dirty(ptent)) {
-- 
2.43.0



[PATCH v2 01/10] mm/memory: factor out zapping of present pte into zap_present_pte()

2024-02-09 Thread David Hildenbrand
Let's prepare for further changes by factoring out processing of present
PTEs.

Signed-off-by: David Hildenbrand 
---
 mm/memory.c | 94 ++---
 1 file changed, 53 insertions(+), 41 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7c3ca41a7610..5b0dc33133a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1532,13 +1532,61 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct 
*vma,
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 }
 
+static inline void zap_present_pte(struct mmu_gather *tlb,
+   struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+   unsigned long addr, struct zap_details *details,
+   int *rss, bool *force_flush, bool *force_break)
+{
+   struct mm_struct *mm = tlb->mm;
+   struct folio *folio = NULL;
+   bool delay_rmap = false;
+   struct page *page;
+
+   page = vm_normal_page(vma, addr, ptent);
+   if (page)
+   folio = page_folio(page);
+
+   if (unlikely(!should_zap_folio(details, folio)))
+   return;
+   ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+   arch_check_zapped_pte(vma, ptent);
+   tlb_remove_tlb_entry(tlb, pte, addr);
+   zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+   if (unlikely(!page)) {
+   ksm_might_unmap_zero_page(mm, ptent);
+   return;
+   }
+
+   if (!folio_test_anon(folio)) {
+   if (pte_dirty(ptent)) {
+   folio_mark_dirty(folio);
+   if (tlb_delay_rmap(tlb)) {
+   delay_rmap = true;
+   *force_flush = true;
+   }
+   }
+   if (pte_young(ptent) && likely(vma_has_recency(vma)))
+   folio_mark_accessed(folio);
+   }
+   rss[mm_counter(folio)]--;
+   if (!delay_rmap) {
+   folio_remove_rmap_pte(folio, page, vma);
+   if (unlikely(page_mapcount(page) < 0))
+   print_bad_pte(vma, addr, ptent, page);
+   }
+   if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
+   *force_flush = true;
+   *force_break = true;
+   }
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
 {
+   bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
-   int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
@@ -1555,7 +1603,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = ptep_get(pte);
-   struct folio *folio = NULL;
+   struct folio *folio;
struct page *page;
 
if (pte_none(ptent))
@@ -1565,45 +1613,9 @@ static unsigned long zap_pte_range(struct mmu_gather 
*tlb,
break;
 
if (pte_present(ptent)) {
-   unsigned int delay_rmap;
-
-   page = vm_normal_page(vma, addr, ptent);
-   if (page)
-   folio = page_folio(page);
-
-   if (unlikely(!should_zap_folio(details, folio)))
-   continue;
-   ptent = ptep_get_and_clear_full(mm, addr, pte,
-   tlb->fullmm);
-   arch_check_zapped_pte(vma, ptent);
-   tlb_remove_tlb_entry(tlb, pte, addr);
-   zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
-   if (unlikely(!page)) {
-   ksm_might_unmap_zero_page(mm, ptent);
-   continue;
-   }
-
-   delay_rmap = 0;
-   if (!folio_test_anon(folio)) {
-   if (pte_dirty(ptent)) {
-   folio_mark_dirty(folio);
-   if (tlb_delay_rmap(tlb)) {
-   delay_rmap = 1;
-   force_flush = 1;
-   }
-   }
-   if (pte_young(ptent) && 
likely(vma_has_recency(vma)))
-   folio_mark_accessed(folio);
-   }
-   rss[mm_counter(folio)]--;
-   if (!delay_rmap) {
-   

[PATCH v2 00/10] mm/memory: optimize unmap/zap with PTE-mapped THP

2024-02-09 Thread David Hildenbrand
This series is based on [1]. Similar to what we did with fork(), let's
implement PTE batching during unmap/zap when processing PTE-mapped THPs.

We collect consecutive PTEs that map consecutive pages of the same large
folio, making sure that the other PTE bits are compatible, and (a) adjust
the refcount only once per batch, (b) call rmap handling functions only
once per batch, (c) perform batch PTE setting/updates and (d) perform TLB
entry removal once per batch.

Ryan was previously working on this in the context of cont-pte for
arm64, in the latest iteration [2] with a focus on arm64 with cont-pte only.
This series implements the optimization for all architectures, independent
of such PTE bits, teaches MMU gather/TLB code to be fully aware of such
large-folio-pages batches as well, and makes use of our new rmap batching
function when removing the rmap.

To achieve that, we have to enlighten MMU gather / page freeing code
(i.e., everything that consumes encoded_page) to process unmapping
of consecutive pages that all belong to the same large folio. I'm being
very careful to not degrade order-0 performance, and it looks like I
managed to achieve that.

While this series should -- similar to [1] -- be beneficial for adding
cont-pte support on arm64[2], it's one of the requirements for maintaining
a total mapcount[3] for large folios with minimal added overhead and
further changes[4] that build up on top of the total mapcount.

Independent of all that, this series results in a speedup during munmap()
and similar unmapping (process teardown, MADV_DONTNEED on larger ranges)
with PTE-mapped THP, which is the default with THPs that are smaller than
a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]).

On an Intel Xeon Silver 4210R CPU, munmap'ing a 1GiB VMA backed by
PTE-mapped folios of the same size (stddev < 1%) results in the following
runtimes for munmap() in seconds (shorter is better):

Folio Size | mm-unstable |  New | Change
-
  4KiB |0.058110 | 0.057715 |   - 1%
 16KiB |0.044198 | 0.035469 |   -20%
 32KiB |0.034216 | 0.023522 |   -31%
 64KiB |0.029207 | 0.018434 |   -37%
128KiB |0.026579 | 0.014026 |   -47%
256KiB |0.025130 | 0.011756 |   -53%
512KiB |0.024292 | 0.010703 |   -56%
   1024KiB |0.023812 | 0.010294 |   -57%
   2048KiB |0.023785 | 0.009910 |   -58%

CCing especially s390x folks, because they have a tlb freeing hooks that
needs adjustment. Only tested on x86-64 for now, will have to do some more
stress testing. Compile-tested on most other architectures. The PPC
change is negleglible and makes my cross-compiler happy.

[1] https://lkml.kernel.org/r/20240129124649.189745-1-da...@redhat.com
[2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.robe...@arm.com
[3] https://lkml.kernel.org/r/20230809083256.699513-1-da...@redhat.com
[4] https://lkml.kernel.org/r/20231124132626.235350-1-da...@redhat.com
[5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.robe...@arm.com

---

The performance numbers are from v1. I did a quick benchmark run of v2
and nothing significantly changed -- because nothing in the code
significantly changed. Sending this out ASAP, so Ryan can make progress
with cont-pte.

v1 -> v2:
* "mm/memory: factor out zapping of present pte into zap_present_pte()"
 -> Initialize "struct folio *folio" to NULL
* "mm/memory: handle !page case in zap_present_pte() separately"
 -> Extend description regarding arch_check_zapped_pte()
* "mm/mmu_gather: add __tlb_remove_folio_pages()"
 -> ENCODED_PAGE_BIT_NR_PAGES_NEXT
 -> Extend patch description regarding "batching more"
* "mm/mmu_gather: improve cond_resched() handling with large folios and
   expensive page freeing"
 -> Handle the (so far) theoretical case of possible soft lockups when
we zero/poison memory when freeing pages. Try to keep old behavior in
that corner case to be safe.
* "mm/memory: optimize unmap/zap with PTE-mapped THP"
 -> Clarify description of new ptep clearing functions regarding "present
PTEs"
 -> Extend patch description regarding relaxed mapcount sanity checks
 -> Improve zap_present_ptes() description
* Pick up RB's

Cc: Andrew Morton 
Cc: Matthew Wilcox (Oracle) 
Cc: Ryan Roberts 
Cc: Catalin Marinas 
Cc: Yin Fengwei 
Cc: Michal Hocko 
Cc: Will Deacon 
Cc: "Aneesh Kumar K.V" 
Cc: Nick Piggin 
Cc: Peter Zijlstra 
Cc: Michael Ellerman 
Cc: Christophe Leroy 
Cc: "Naveen N. Rao" 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Alexander Gordeev 
Cc: Christian Borntraeger 
Cc: Sven Schnelle 
Cc: Arnd Bergmann 
Cc: linux-a...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org

David Hildenbrand (10):
  mm/memory: factor out zapping of present pte into zap_present_pte()
  mm/memory: handle !page case in zap_present_pte() separately
  mm/memory: further separate anon and pagecache folio handling in
zap_present_pte()
  mm/memory: factor out zapping 

Re: [PATCH v3 4/6] bitmap: Introduce bitmap_off()

2024-02-09 Thread Jakub Kicinski
On Tue,  6 Feb 2024 15:07:14 +0100 Herve Codina wrote:
> The bitmap_onto() function translates one bitmap relative to another but
> no function are present to perform the reverse translation.
> 
> Introduce bitmap_off() to fill this hole.

Argh, Yury is not even CCed on this? I was about to ping him but then 
I realized his email is completely missing :o

Please repost this and CC the appropriate maintainers...
-- 
pw-bot: cr


Re: [PATCH v1] spi: spi-ppc4xx: include missing platform_device.h

2024-02-09 Thread Mark Brown
On Fri, 09 Feb 2024 15:59:07 +0100, Christian Lamparter wrote:
> the driver currently fails to compile on 6.8-rc3 due to:
> | spi-ppc4xx.c: In function ‘spi_ppc4xx_of_probe’:
> | spi-ppc4xx.c:346:36: error: invalid use of undefined type ‘struct platform_device’
> | 346 | struct device_node *np = op->dev.of_node;
> | |^~
> | ... (more similar errors)
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git for-next

Thanks!

[1/1] spi: spi-ppc4xx: include missing platform_device.h
  commit: 9f208e097801f9c2088eb339a1162fff81c08b4e

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark



Re: [PATCH v2] drivers/ps3: select VIDEO to provide cmdline functions

2024-02-09 Thread Thomas Zimmermann

Hi

Am 09.02.24 um 06:15 schrieb Michael Ellerman:

Thomas Zimmermann  writes:

Am 07.02.24 um 17:13 schrieb Randy Dunlap:

When VIDEO is not set, there is a build error. Fix that by selecting
VIDEO for PS3_PS3AV.

ERROR: modpost: ".video_get_options" [drivers/ps3/ps3av_mod.ko] undefined!

Fixes: dae7fbf43fd0 ("driver/ps3: Include  for mode parsing")
Fixes: a3b6792e990d ("video/cmdline: Introduce CONFIG_VIDEO for video= 
parameter")
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Aneesh Kumar K.V 
Cc: Naveen N. Rao 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Thomas Zimmermann 
Cc: Geoff Levand 
Acked-by: Geoff Levand 
Cc: linux-fb...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Signed-off-by: Randy Dunlap 

Reviewed-by: Thomas Zimmermann 

Can you take it via whatever tree the CONFIG_VIDEO patch is in?


The patch is now in drm-misc-next.

Best regards
Thomas



Acked-by: Michael Ellerman 

cheers


--
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)



Re: [kvm-unit-tests PATCH v4 6/8] migration: Add quiet migration support

2024-02-09 Thread Thomas Huth

On 09/02/2024 10.11, Nicholas Piggin wrote:

Console output required to support migration becomes quite noisy
when doing lots of migrations. Provide a migrate_quiet() call that
suppresses console output and doesn't log a message.

Signed-off-by: Nicholas Piggin 
---
  lib/migrate.c | 11 +++
  lib/migrate.h |  1 +
  scripts/arch-run.bash |  4 ++--
  3 files changed, 14 insertions(+), 2 deletions(-)


Reviewed-by: Thomas Huth 




Re: [kvm-unit-tests PATCH v4 4/8] migration: Support multiple migrations

2024-02-09 Thread Thomas Huth

On 09/02/2024 10.11, Nicholas Piggin wrote:

Support multiple migrations by flipping dest file/socket variables to
source after the migration is complete, ready to start again. A new
destination is created if the test outputs the migrate line again.
Test cases may now switch to calling migrate() one or more times.

Signed-off-by: Nicholas Piggin 
---
  lib/migrate.c |  8 ++--
  lib/migrate.h |  1 +
  scripts/arch-run.bash | 86 ---
  3 files changed, 77 insertions(+), 18 deletions(-)


Reviewed-by: Thomas Huth 




Re: [PATCH v4 00/10] devm_led_classdev_register() usage problem

2024-02-09 Thread Andy Shevchenko
On Thu, Dec 14, 2023 at 08:36:04PM +0300, George Stark wrote:
> This patch series fixes the problem of devm_led_classdev_register misusing.
> 
> The basic problem is described in [1]. Shortly when 
> devm_led_classdev_register()
> is used then led_classdev_unregister() called after driver's remove() 
> callback.
> led_classdev_unregister() calls driver's brightness_set callback and that 
> callback
> may use resources which were destroyed already in driver's remove().
> 
> After discussion with maintainers [2] [3] we decided:
> 1) don't touch led subsystem core code and don't remove led_set_brightness() 
> from it
> but fix drivers
> 2) don't use devm_led_classdev_unregister
> 
> So the solution is to use devm wrappers for all resources
> driver's brightness_set() depends on. And introduce dedicated devm wrapper
> for mutex as it's often used resource.
> 
> [1] 
> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/
> [2] 
> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mc132b9b350fa51931b4fcfe14705d9f06e91421f
> [3] 
> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mdbf572a85c33f869a553caf986b6228bb65c8383

Are you going to send an updated version with the amended second patch?

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH v4 00/10] devm_led_classdev_register() usage problem

2024-02-09 Thread Andy Shevchenko
On Thu, Dec 21, 2023 at 03:11:11PM +, Lee Jones wrote:
> On Thu, 14 Dec 2023, George Stark wrote:
> 
> > This patch series fixes the problem of devm_led_classdev_register misusing.
> > 
> > The basic problem is described in [1]. Shortly when 
> > devm_led_classdev_register()
> > is used then led_classdev_unregister() called after driver's remove() 
> > callback.
> > led_classdev_unregister() calls driver's brightness_set callback and that 
> > callback
> > may use resources which were destroyed already in driver's remove().
> > 
> > After discussion with maintainers [2] [3] we decided:
> > 1) don't touch led subsystem core code and don't remove led_set_brightness() 
> > from it
> > but fix drivers
> > 2) don't use devm_led_classdev_unregister
> > 
> > So the solution is to use devm wrappers for all resources
> > driver's brightness_set() depends on. And introduce dedicated devm wrapper
> > for mutex as it's often used resource.
> > 
> > [1] 
> > https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/
> > [2] 
> > https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mc132b9b350fa51931b4fcfe14705d9f06e91421f
> > [3] 
> > https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mdbf572a85c33f869a553caf986b6228bb65c8383

...

> FYI: I'll conduct my review once the locking side is settled.

To reduce burden can you apply the first one? It's a fix.

-- 
With Best Regards,
Andy Shevchenko




Re: [RFC PATCH 5/5] powerpc/smp: Remap boot CPU onto core 0 if >= nr_cpu_ids

2024-02-09 Thread Jiri Bohac
On Tue, Jan 02, 2024 at 10:16:04AM +0530, Aneesh Kumar K.V wrote:
> Michael Ellerman  writes:
> 
> 
> 
> >  #ifdef CONFIG_PPC64
> >  int boot_cpu_hwid = -1;
> > @@ -492,12 +493,26 @@ void __init smp_setup_cpu_maps(void)
> > avail = !of_property_match_string(dn,
> > "enable-method", "spin-table");
> >  
> > -   cpu = assign_threads(cpu, nthreads, avail, intserv);
> > +   if (boot_core_hwid >= 0) {
> > +   if (cpu == 0) {
> > +   pr_info("Skipping CPU node %pOF to allow for 
> > boot core.\n", dn);
> > +   cpu = nthreads;
> > +   continue;
> > +   }
> >  
> > -   if (cpu >= nr_cpu_ids) {
> > +   if (be32_to_cpu(intserv[0]) == boot_core_hwid) {
> > +   pr_info("Renumbered boot core %pOF to logical 
> > 0\n", dn);
> > +   assign_threads(0, nthreads, avail, intserv);
> > +   of_node_put(dn);
> > +   break;
> >
> 
> I was expecting a 'continue' here. Why 'break' the loop? The condition that
> should break the loop should be cpu >= nr_cpu_ids 

No, the patch seems correct:

We're in the "if (boot_core_hwid >= 0)" branch, meaning that it
was determined by early_init_dt_scan_cpus() that boot_cpuid >=
nr_cpu_ids. So we loop until we get to the boot CPU, so it can be
renumbered to 0. Once we do that we break, because we
know we are already past nr_cpu_ids - otherwise boot_core_hwid
would not be >= 0. 


> > +   }
> > +   } else if (cpu >= nr_cpu_ids) {
> > of_node_put(dn);
> > break;
> > }

Here is what you expected - in case the boot CPU was < nr_cpu_ids
we break as soon as nr_cpu_ids is reached.

> > +
> > +   if (cpu < nr_cpu_ids)

this ensures that CPUs between nr_cpu_ids and the boot CPU are
correctly ignored in case we're already past nr_cpu_ids and only
scanning further to find the boot CPU to be renumbered to 0

-- 
Jiri Bohac 
SUSE Labs, Prague, Czechia



Re: [kvm-unit-tests PATCH v4 2/8] arch-run: Clean up initrd cleanup

2024-02-09 Thread Thomas Huth

On 09/02/2024 10.11, Nicholas Piggin wrote:

Rather than put a big script into the trap handler, have it call
a function.

Signed-off-by: Nicholas Piggin 
---
  scripts/arch-run.bash | 13 -
  1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 11d47a85..c1dd67ab 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -269,10 +269,21 @@ search_qemu_binary ()
export PATH=$save_path
  }
  
+initrd_cleanup ()

+{
+   rm -f $KVM_UNIT_TESTS_ENV
+   if [ "$KVM_UNIT_TESTS_ENV_OLD" ]; then
+   export KVM_UNIT_TESTS_ENV="$KVM_UNIT_TESTS_ENV_OLD"
+   else
+   unset KVM_UNIT_TESTS_ENV
+   fi
+   unset KVM_UNIT_TESTS_ENV_OLD
+}
+
  initrd_create ()
  {
if [ "$ENVIRON_DEFAULT" = "yes" ]; then
-   trap_exit_push 'rm -f $KVM_UNIT_TESTS_ENV; [ "$KVM_UNIT_TESTS_ENV_OLD" ] 
&& export KVM_UNIT_TESTS_ENV="$KVM_UNIT_TESTS_ENV_OLD" || unset KVM_UNIT_TESTS_ENV; unset 
KVM_UNIT_TESTS_ENV_OLD'
+   trap_exit_push 'initrd_cleanup'
[ -f "$KVM_UNIT_TESTS_ENV" ] && export 
KVM_UNIT_TESTS_ENV_OLD="$KVM_UNIT_TESTS_ENV"
export KVM_UNIT_TESTS_ENV=$(mktemp)
env_params


Reviewed-by: Thomas Huth 



[PATCH v1] spi: spi-ppc4xx: include missing platform_device.h

2024-02-09 Thread Christian Lamparter
the driver currently fails to compile on 6.8-rc3 due to:
| spi-ppc4xx.c: In function ‘spi_ppc4xx_of_probe’:
| @346:36: error: invalid use of undefined type ‘struct platform_device’
| 346 | struct device_node *np = op->dev.of_node;
| |^~
| ... (more similar errors)

it was working with 6.7. Looks like it only needed the include
and its compiling fine!

Signed-off-by: Christian Lamparter 
---
 drivers/spi/spi-ppc4xx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/spi/spi-ppc4xx.c b/drivers/spi/spi-ppc4xx.c
index 03aab661be9d..412d6e678224 100644
--- a/drivers/spi/spi-ppc4xx.c
+++ b/drivers/spi/spi-ppc4xx.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.43.0



[kvm-unit-tests PATCH v4 8/8] migration: add a migration selftest

2024-02-09 Thread Nicholas Piggin
Add a selftest for migration support in  guest library and test harness
code. It performs migrations in a tight loop to irritate races and bugs
in the test harness code.

Include the test in arm, s390, powerpc.

Acked-by: Claudio Imbrenda  (s390x)
Reviewed-by: Thomas Huth 
Signed-off-by: Nicholas Piggin 
---
 arm/Makefile.common  |  1 +
 arm/selftest-migration.c |  1 +
 arm/unittests.cfg|  6 ++
 common/selftest-migration.c  | 34 ++
 powerpc/Makefile.common  |  1 +
 powerpc/selftest-migration.c |  1 +
 powerpc/unittests.cfg|  4 
 s390x/Makefile   |  1 +
 s390x/selftest-migration.c   |  1 +
 s390x/unittests.cfg  |  4 
 10 files changed, 54 insertions(+)
 create mode 12 arm/selftest-migration.c
 create mode 100644 common/selftest-migration.c
 create mode 12 powerpc/selftest-migration.c
 create mode 12 s390x/selftest-migration.c

diff --git a/arm/Makefile.common b/arm/Makefile.common
index f828dbe0..f107c478 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -5,6 +5,7 @@
 #
 
 tests-common  = $(TEST_DIR)/selftest.$(exe)
+tests-common += $(TEST_DIR)/selftest-migration.$(exe)
 tests-common += $(TEST_DIR)/spinlock-test.$(exe)
 tests-common += $(TEST_DIR)/pci-test.$(exe)
 tests-common += $(TEST_DIR)/pmu.$(exe)
diff --git a/arm/selftest-migration.c b/arm/selftest-migration.c
new file mode 12
index ..bd1eb266
--- /dev/null
+++ b/arm/selftest-migration.c
@@ -0,0 +1 @@
+../common/selftest-migration.c
\ No newline at end of file
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index fe601cbb..db0e4c9b 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -55,6 +55,12 @@ smp = $MAX_SMP
 extra_params = -append 'smp'
 groups = selftest
 
+# Test migration
+[selftest-migration]
+file = selftest-migration.flat
+groups = selftest migration
+arch = arm64
+
 # Test PCI emulation
 [pci-test]
 file = pci-test.flat
diff --git a/common/selftest-migration.c b/common/selftest-migration.c
new file mode 100644
index ..f70c505f
--- /dev/null
+++ b/common/selftest-migration.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine independent migration tests
+ *
+ * This is just a very simple test that is intended to stress the migration
+ * support in the test harness. This could be expanded to test more guest
+ * library code, but architecture-specific tests should be used to test
+ * migration of tricky machine state.
+ */
+#include 
+#include 
+
+#if defined(__arm__) || defined(__aarch64__)
+/* arm can only call getchar 15 times */
+#define NR_MIGRATIONS 15
+#else
+#define NR_MIGRATIONS 100
+#endif
+
+int main(int argc, char **argv)
+{
+   int i = 0;
+
+   report_prefix_push("migration");
+
+   for (i = 0; i < NR_MIGRATIONS; i++)
+   migrate_quiet();
+
+   report(true, "simple harness stress test");
+
+   report_prefix_pop();
+
+   return report_summary();
+}
diff --git a/powerpc/Makefile.common b/powerpc/Makefile.common
index eb88398d..da4a7bbb 100644
--- a/powerpc/Makefile.common
+++ b/powerpc/Makefile.common
@@ -6,6 +6,7 @@
 
 tests-common = \
$(TEST_DIR)/selftest.elf \
+   $(TEST_DIR)/selftest-migration.elf \
$(TEST_DIR)/spapr_hcall.elf \
$(TEST_DIR)/rtas.elf \
$(TEST_DIR)/emulator.elf \
diff --git a/powerpc/selftest-migration.c b/powerpc/selftest-migration.c
new file mode 12
index ..bd1eb266
--- /dev/null
+++ b/powerpc/selftest-migration.c
@@ -0,0 +1 @@
+../common/selftest-migration.c
\ No newline at end of file
diff --git a/powerpc/unittests.cfg b/powerpc/unittests.cfg
index e71140aa..7ce57de0 100644
--- a/powerpc/unittests.cfg
+++ b/powerpc/unittests.cfg
@@ -36,6 +36,10 @@ smp = 2
 extra_params = -m 256 -append 'setup smp=2 mem=256'
 groups = selftest
 
+[selftest-migration]
+file = selftest-migration.elf
+groups = selftest migration
+
 [spapr_hcall]
 file = spapr_hcall.elf
 
diff --git a/s390x/Makefile b/s390x/Makefile
index b72f7578..344d46d6 100644
--- a/s390x/Makefile
+++ b/s390x/Makefile
@@ -1,4 +1,5 @@
 tests = $(TEST_DIR)/selftest.elf
+tests += $(TEST_DIR)/selftest-migration.elf
 tests += $(TEST_DIR)/intercept.elf
 tests += $(TEST_DIR)/emulator.elf
 tests += $(TEST_DIR)/sieve.elf
diff --git a/s390x/selftest-migration.c b/s390x/selftest-migration.c
new file mode 12
index ..bd1eb266
--- /dev/null
+++ b/s390x/selftest-migration.c
@@ -0,0 +1 @@
+../common/selftest-migration.c
\ No newline at end of file
diff --git a/s390x/unittests.cfg b/s390x/unittests.cfg
index f5024b6e..a7ad522c 100644
--- a/s390x/unittests.cfg
+++ b/s390x/unittests.cfg
@@ -24,6 +24,10 @@ groups = selftest
 # please keep the kernel cmdline in sync with $(TEST_DIR)/selftest.parmfile
 extra_params = -append 'test 123'
 
+[selftest-migration]
+file = selftest-migration.elf
+groups = selftest migration
+
 [intercept]
 file = intercept.elf
 
-- 
2.42.0



[kvm-unit-tests PATCH v4 7/8] Add common/ directory for architecture-independent tests

2024-02-09 Thread Nicholas Piggin
x86/sieve.c is used by s390x, arm, and riscv via symbolic link. Make a
new directory common/ for architecture-independent tests and move
sieve.c here.

Reviewed-by: Thomas Huth 
Signed-off-by: Nicholas Piggin 
---
 arm/sieve.c|  2 +-
 common/sieve.c | 51 +
 riscv/sieve.c  |  2 +-
 s390x/sieve.c  |  2 +-
 x86/sieve.c| 52 +-
 5 files changed, 55 insertions(+), 54 deletions(-)
 create mode 100644 common/sieve.c
 mode change 100644 => 12 x86/sieve.c

diff --git a/arm/sieve.c b/arm/sieve.c
index 8f14a5c3..fe299f30 12
--- a/arm/sieve.c
+++ b/arm/sieve.c
@@ -1 +1 @@
-../x86/sieve.c
\ No newline at end of file
+../common/sieve.c
\ No newline at end of file
diff --git a/common/sieve.c b/common/sieve.c
new file mode 100644
index ..8150f2d9
--- /dev/null
+++ b/common/sieve.c
@@ -0,0 +1,51 @@
+#include "alloc.h"
+#include "libcflat.h"
+
+static int sieve(char* data, int size)
+{
+int i, j, r = 0;
+
+for (i = 0; i < size; ++i)
+   data[i] = 1;
+
+data[0] = data[1] = 0;
+
+for (i = 2; i < size; ++i)
+   if (data[i]) {
+   ++r;
+   for (j = i*2; j < size; j += i)
+   data[j] = 0;
+   }
+return r;
+}
+
+static void test_sieve(const char *msg, char *data, int size)
+{
+int r;
+
+printf("%s:", msg);
+r = sieve(data, size);
+printf("%d out of %d\n", r, size);
+}
+
+#define STATIC_SIZE 100
+#define VSIZE 1
+char static_data[STATIC_SIZE];
+
+int main(void)
+{
+void *v;
+int i;
+
+printf("starting sieve\n");
+test_sieve("static", static_data, STATIC_SIZE);
+setup_vm();
+test_sieve("mapped", static_data, STATIC_SIZE);
+for (i = 0; i < 3; ++i) {
+   v = malloc(VSIZE);
+   test_sieve("virtual", v, VSIZE);
+   free(v);
+}
+
+return 0;
+}
diff --git a/riscv/sieve.c b/riscv/sieve.c
index 8f14a5c3..fe299f30 12
--- a/riscv/sieve.c
+++ b/riscv/sieve.c
@@ -1 +1 @@
-../x86/sieve.c
\ No newline at end of file
+../common/sieve.c
\ No newline at end of file
diff --git a/s390x/sieve.c b/s390x/sieve.c
index 8f14a5c3..fe299f30 12
--- a/s390x/sieve.c
+++ b/s390x/sieve.c
@@ -1 +1 @@
-../x86/sieve.c
\ No newline at end of file
+../common/sieve.c
\ No newline at end of file
diff --git a/x86/sieve.c b/x86/sieve.c
deleted file mode 100644
index 8150f2d9..
--- a/x86/sieve.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "alloc.h"
-#include "libcflat.h"
-
-static int sieve(char* data, int size)
-{
-int i, j, r = 0;
-
-for (i = 0; i < size; ++i)
-   data[i] = 1;
-
-data[0] = data[1] = 0;
-
-for (i = 2; i < size; ++i)
-   if (data[i]) {
-   ++r;
-   for (j = i*2; j < size; j += i)
-   data[j] = 0;
-   }
-return r;
-}
-
-static void test_sieve(const char *msg, char *data, int size)
-{
-int r;
-
-printf("%s:", msg);
-r = sieve(data, size);
-printf("%d out of %d\n", r, size);
-}
-
-#define STATIC_SIZE 100
-#define VSIZE 1
-char static_data[STATIC_SIZE];
-
-int main(void)
-{
-void *v;
-int i;
-
-printf("starting sieve\n");
-test_sieve("static", static_data, STATIC_SIZE);
-setup_vm();
-test_sieve("mapped", static_data, STATIC_SIZE);
-for (i = 0; i < 3; ++i) {
-   v = malloc(VSIZE);
-   test_sieve("virtual", v, VSIZE);
-   free(v);
-}
-
-return 0;
-}
diff --git a/x86/sieve.c b/x86/sieve.c
new file mode 12
index ..fe299f30
--- /dev/null
+++ b/x86/sieve.c
@@ -0,0 +1 @@
+../common/sieve.c
\ No newline at end of file
-- 
2.42.0



[kvm-unit-tests PATCH v4 6/8] migration: Add quiet migration support

2024-02-09 Thread Nicholas Piggin
Console output required to support migration becomes quite noisy
when doing lots of migrations. Provide a migrate_quiet() call that
suppresses console output and doesn't log a message.

Signed-off-by: Nicholas Piggin 
---
 lib/migrate.c | 11 +++
 lib/migrate.h |  1 +
 scripts/arch-run.bash |  4 ++--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/lib/migrate.c b/lib/migrate.c
index b7721659..92d1d957 100644
--- a/lib/migrate.c
+++ b/lib/migrate.c
@@ -18,6 +18,17 @@ void migrate(void)
report_info("Migration complete");
 }
 
+/*
+ * Like migrate() but suppress output and logs, useful for intensive
+ * migration stress testing without polluting logs. Test cases should
+ * provide relevant information about migration in failure reports.
+ */
+void migrate_quiet(void)
+{
+   puts("Now migrate the VM (quiet)\n");
+   (void)getchar();
+}
+
 /*
  * Initiate migration and wait for it to complete.
  * If this function is called more than once, it is a no-op.
diff --git a/lib/migrate.h b/lib/migrate.h
index 2af06a72..95b9102b 100644
--- a/lib/migrate.h
+++ b/lib/migrate.h
@@ -7,4 +7,5 @@
  */
 
 void migrate(void);
+void migrate_quiet(void);
 void migrate_once(void);
diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index c98429e8..0a98e512 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -152,7 +152,7 @@ run_migration ()
-chardev socket,id=mon,path=${src_qmp},server=on,wait=off \
-mon chardev=mon,mode=control > ${src_outfifo} &
live_pid=$!
-   cat ${src_outfifo} | tee ${src_out} &
+   cat ${src_outfifo} | tee ${src_out} | grep -v "Now migrate the VM 
(quiet)" &
 
# Start the first destination QEMU machine in advance of the test
# reaching the migration point, since we expect at least one migration.
@@ -190,7 +190,7 @@ do_migration ()
-mon chardev=mon,mode=control -incoming unix:${dst_incoming} \
< <(cat ${dst_infifo}) > ${dst_outfifo} &
incoming_pid=$!
-   cat ${dst_outfifo} | tee ${dst_out} &
+   cat ${dst_outfifo} | tee ${dst_out} | grep -v "Now migrate the VM 
(quiet)" &
 
# The test must prompt the user to migrate, so wait for the
# "Now migrate VM" console message.
-- 
2.42.0



[kvm-unit-tests PATCH v4 5/8] arch-run: rename migration variables

2024-02-09 Thread Nicholas Piggin
Using 1 and 2 for source and destination is confusing, particularly
now with multiple migrations that flip between them. Do a rename
pass to 'src' and 'dst' to tidy things up.

Acked-by: Thomas Huth 
Signed-off-by: Nicholas Piggin 
---
 scripts/arch-run.bash | 111 +-
 1 file changed, 56 insertions(+), 55 deletions(-)

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index c2002d7a..c98429e8 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -132,27 +132,27 @@ run_migration ()
migcmdline=$@
 
trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migout2} ${migout_fifo1} ${migout_fifo2} 
${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
-
-   migsock=$(mktemp -u -t mig-helper-socket.XX)
-   migout1=$(mktemp -t mig-helper-stdout1.XX)
-   migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
-   migout2=$(mktemp -t mig-helper-stdout2.XX)
-   migout_fifo2=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
-   qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
-   qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
-   fifo=$(mktemp -u -t mig-helper-fifo.XX)
-   qmpout1=/dev/null
-   qmpout2=/dev/null
-
-   mkfifo ${migout_fifo1}
-   mkfifo ${migout_fifo2}
+   trap 'rm -f ${src_out} ${dst_out} ${src_outfifo} ${dst_outfifo} 
${dst_incoming} ${src_qmp} ${dst_qmp} ${dst_infifo}' RETURN EXIT
+
+   dst_incoming=$(mktemp -u -t mig-helper-socket-incoming.XX)
+   src_out=$(mktemp -t mig-helper-stdout1.XX)
+   src_outfifo=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
+   dst_out=$(mktemp -t mig-helper-stdout2.XX)
+   dst_outfifo=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
+   src_qmp=$(mktemp -u -t mig-helper-qmp1.XX)
+   dst_qmp=$(mktemp -u -t mig-helper-qmp2.XX)
+   dst_infifo=$(mktemp -u -t mig-helper-fifo-stdin.XX)
+   src_qmpout=/dev/null
+   dst_qmpout=/dev/null
+
+   mkfifo ${src_outfifo}
+   mkfifo ${dst_outfifo}
 
eval "$migcmdline" \
-   -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-   -mon chardev=mon1,mode=control > ${migout_fifo1} &
+   -chardev socket,id=mon,path=${src_qmp},server=on,wait=off \
+   -mon chardev=mon,mode=control > ${src_outfifo} &
live_pid=$!
-   cat ${migout_fifo1} | tee ${migout1} &
+   cat ${src_outfifo} | tee ${src_out} &
 
# Start the first destination QEMU machine in advance of the test
# reaching the migration point, since we expect at least one migration.
@@ -162,7 +162,7 @@ run_migration ()
 
while ps -p ${live_pid} > /dev/null ; do
# Wait for test exit or further migration messages.
-   if ! grep -q -i "Now migrate the VM" < ${migout1} ; then
+   if ! grep -q -i "Now migrate the VM" < ${src_out} ; then
sleep 0.1
else
do_migration || return $?
@@ -184,80 +184,81 @@ do_migration ()
# We have to use cat to open the named FIFO, because named FIFO's,
# unlike pipes, will block on open() until the other end is also
# opened, and that totally breaks QEMU...
-   mkfifo ${fifo}
+   mkfifo ${dst_infifo}
eval "$migcmdline" \
-   -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
-   -mon chardev=mon2,mode=control -incoming unix:${migsock} \
-   < <(cat ${fifo}) > ${migout_fifo2} &
+   -chardev socket,id=mon,path=${dst_qmp},server=on,wait=off \
+   -mon chardev=mon,mode=control -incoming unix:${dst_incoming} \
+   < <(cat ${dst_infifo}) > ${dst_outfifo} &
incoming_pid=$!
-   cat ${migout_fifo2} | tee ${migout2} &
+   cat ${dst_outfifo} | tee ${dst_out} &
 
# The test must prompt the user to migrate, so wait for the
# "Now migrate VM" console message.
-   while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
+   while ! grep -q -i "Now migrate the VM" < ${src_out} ; do
if ! ps -p ${live_pid} > /dev/null ; then
echo "ERROR: Test exit before migration point." >&2
-   echo > ${fifo}
-   qmp ${qmp1} '"quit"'> ${qmpout1} 2>/dev/null
-   qmp ${qmp2} '"quit"'> ${qmpout2} 2>/dev/null
+   echo > ${dst_infifo}
+   qmp ${src_qmp} '"quit"'> ${src_qmpout} 2>/dev/null
+   qmp ${dst_qmp} '"quit"'> ${dst_qmpout} 2>/dev/null
return 3
fi
sleep 0.1
done
 
# Wait until the destination has created the incoming and qmp sockets
-   while ! [ -S ${migsock} ] ; do sleep 0.1 ; done
-   while ! [ -S ${qmp2} ] ; do sleep 

[kvm-unit-tests PATCH v4 4/8] migration: Support multiple migrations

2024-02-09 Thread Nicholas Piggin
Support multiple migrations by flipping dest file/socket variables to
source after the migration is complete, ready to start again. A new
destination is created if the test outputs the migrate line again.
Test cases may now switch to calling migrate() one or more times.

Signed-off-by: Nicholas Piggin 
---
 lib/migrate.c |  8 ++--
 lib/migrate.h |  1 +
 scripts/arch-run.bash | 86 ---
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/lib/migrate.c b/lib/migrate.c
index 527e63ae..b7721659 100644
--- a/lib/migrate.c
+++ b/lib/migrate.c
@@ -8,8 +8,10 @@
 #include 
 #include "migrate.h"
 
-/* static for now since we only support migrating exactly once per test. */
-static void migrate(void)
+/*
+ * Initiate migration and wait for it to complete.
+ */
+void migrate(void)
 {
puts("Now migrate the VM, then press a key to continue...\n");
(void)getchar();
@@ -19,8 +21,6 @@ static void migrate(void)
 /*
  * Initiate migration and wait for it to complete.
  * If this function is called more than once, it is a no-op.
- * Since migrate_cmd can only migrate exactly once this function can
- * simplify the control flow, especially when skipping tests.
  */
 void migrate_once(void)
 {
diff --git a/lib/migrate.h b/lib/migrate.h
index 3c94e6af..2af06a72 100644
--- a/lib/migrate.h
+++ b/lib/migrate.h
@@ -6,4 +6,5 @@
  * Author: Nico Boehr 
  */
 
+void migrate(void);
 void migrate_once(void);
diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 9a5aaddc..c2002d7a 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -129,12 +129,16 @@ run_migration ()
return 77
fi
 
+   migcmdline=$@
+
trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migout_fifo1} ${migsock} ${qmp1} ${qmp2} 
${fifo}' RETURN EXIT
+   trap 'rm -f ${migout1} ${migout2} ${migout_fifo1} ${migout_fifo2} 
${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
 
migsock=$(mktemp -u -t mig-helper-socket.XX)
migout1=$(mktemp -t mig-helper-stdout1.XX)
migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
+   migout2=$(mktemp -t mig-helper-stdout2.XX)
+   migout_fifo2=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
fifo=$(mktemp -u -t mig-helper-fifo.XX)
@@ -142,20 +146,54 @@ run_migration ()
qmpout2=/dev/null
 
mkfifo ${migout_fifo1}
-   eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
+   mkfifo ${migout_fifo2}
+
+   eval "$migcmdline" \
+   -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-mon chardev=mon1,mode=control > ${migout_fifo1} &
live_pid=$!
cat ${migout_fifo1} | tee ${migout1} &
 
-   # We have to use cat to open the named FIFO, because named FIFO's, 
unlike
-   # pipes, will block on open() until the other end is also opened, and 
that
-   # totally breaks QEMU...
+   # Start the first destination QEMU machine in advance of the test
+   # reaching the migration point, since we expect at least one migration.
+   # Then destination machines are started after the test outputs
+   # subsequent "Now migrate the VM" messages.
+   do_migration || return $?
+
+   while ps -p ${live_pid} > /dev/null ; do
+   # Wait for test exit or further migration messages.
+   if ! grep -q -i "Now migrate the VM" < ${migout1} ; then
+   sleep 0.1
+   else
+   do_migration || return $?
+   fi
+   done
+
+   wait ${live_pid}
+   ret=$?
+
+   while (( $(jobs -r | wc -l) > 0 )); do
+   sleep 0.1
+   done
+
+   return $ret
+}
+
+do_migration ()
+{
+   # We have to use cat to open the named FIFO, because named FIFO's,
+   # unlike pipes, will block on open() until the other end is also
+   # opened, and that totally breaks QEMU...
mkfifo ${fifo}
-   eval "$@" -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
-   -mon chardev=mon2,mode=control -incoming unix:${migsock} < 
<(cat ${fifo}) &
+   eval "$migcmdline" \
+   -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
+   -mon chardev=mon2,mode=control -incoming unix:${migsock} \
+   < <(cat ${fifo}) > ${migout_fifo2} &
incoming_pid=$!
+   cat ${migout_fifo2} | tee ${migout2} &
 
-   # The test must prompt the user to migrate, so wait for the "migrate" 
keyword
+   # The test must prompt the user to migrate, so wait for the
+   # "Now migrate VM" console message.
while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
if ! ps -p ${live_pid} > /dev/null ; then

[kvm-unit-tests PATCH v4 3/8] migration: use a more robust way to wait for background job

2024-02-09 Thread Nicholas Piggin
Starting a pipeline of jobs in the background does not seem to have
a simple way to reliably find the pid of a particular process in the
pipeline (because not all processes are started when the shell
continues to execute).

The way PID of QEMU is derived can result in a failure waiting on a
PID that is not running. This is easier to hit with subsequent
multiple-migration support. Changing this to use $! by swapping the
pipeline for a fifo is more robust.

Reviewed-by: Thomas Huth 
Signed-off-by: Nicholas Piggin 
---
 scripts/arch-run.bash | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index c1dd67ab..9a5aaddc 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -130,19 +130,22 @@ run_migration ()
fi
 
trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
+   trap 'rm -f ${migout1} ${migout_fifo1} ${migsock} ${qmp1} ${qmp2} 
${fifo}' RETURN EXIT
 
migsock=$(mktemp -u -t mig-helper-socket.XX)
migout1=$(mktemp -t mig-helper-stdout1.XX)
+   migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
fifo=$(mktemp -u -t mig-helper-fifo.XX)
qmpout1=/dev/null
qmpout2=/dev/null
 
+   mkfifo ${migout_fifo1}
eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-   -mon chardev=mon1,mode=control | tee ${migout1} &
-   live_pid=`jobs -l %+ | grep "eval" | awk '{print$2}'`
+   -mon chardev=mon1,mode=control > ${migout_fifo1} &
+   live_pid=$!
+   cat ${migout_fifo1} | tee ${migout1} &
 
# We have to use cat to open the named FIFO, because named FIFO's, 
unlike
# pipes, will block on open() until the other end is also opened, and 
that
@@ -150,7 +153,7 @@ run_migration ()
mkfifo ${fifo}
eval "$@" -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
-mon chardev=mon2,mode=control -incoming unix:${migsock} < 
<(cat ${fifo}) &
-   incoming_pid=`jobs -l %+ | awk '{print$2}'`
+   incoming_pid=$!
 
# The test must prompt the user to migrate, so wait for the "migrate" 
keyword
while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
@@ -164,6 +167,10 @@ run_migration ()
sleep 1
done
 
+   # Wait until the destination has created the incoming and qmp sockets
+   while ! [ -S ${migsock} ] ; do sleep 0.1 ; done
+   while ! [ -S ${qmp2} ] ; do sleep 0.1 ; done
+
qmp ${qmp1} '"migrate", "arguments": { "uri": "unix:'${migsock}'" }' > 
${qmpout1}
 
# Wait for the migration to complete
-- 
2.42.0



[kvm-unit-tests PATCH v4 2/8] arch-run: Clean up initrd cleanup

2024-02-09 Thread Nicholas Piggin
Rather than put a big script into the trap handler, have it call
a function.

Signed-off-by: Nicholas Piggin 
---
 scripts/arch-run.bash | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 11d47a85..c1dd67ab 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -269,10 +269,21 @@ search_qemu_binary ()
export PATH=$save_path
 }
 
+initrd_cleanup ()
+{
+   rm -f $KVM_UNIT_TESTS_ENV
+   if [ "$KVM_UNIT_TESTS_ENV_OLD" ]; then
+   export KVM_UNIT_TESTS_ENV="$KVM_UNIT_TESTS_ENV_OLD"
+   else
+   unset KVM_UNIT_TESTS_ENV
+   fi
+   unset KVM_UNIT_TESTS_ENV_OLD
+}
+
 initrd_create ()
 {
if [ "$ENVIRON_DEFAULT" = "yes" ]; then
-   trap_exit_push 'rm -f $KVM_UNIT_TESTS_ENV; [ 
"$KVM_UNIT_TESTS_ENV_OLD" ] && export 
KVM_UNIT_TESTS_ENV="$KVM_UNIT_TESTS_ENV_OLD" || unset KVM_UNIT_TESTS_ENV; unset 
KVM_UNIT_TESTS_ENV_OLD'
+   trap_exit_push 'initrd_cleanup'
[ -f "$KVM_UNIT_TESTS_ENV" ] && export 
KVM_UNIT_TESTS_ENV_OLD="$KVM_UNIT_TESTS_ENV"
export KVM_UNIT_TESTS_ENV=$(mktemp)
env_params
-- 
2.42.0



[kvm-unit-tests PATCH v4 1/8] arch-run: Fix TRAP handler recursion to remove temporary files properly

2024-02-09 Thread Nicholas Piggin
Migration files were not being removed when the QEMU process is
interrupted (e.g., with ^C). This is because the SIGINT propagates to the
bash TRAP handler, which recursively TRAPs due to the 'kill 0' in the
handler. This eventually crashes bash.

This can be observed by interrupting a long-running test program that is
run with MIGRATION=yes, /tmp/mig-helper-* files remain afterwards.

Removing TRAP recursion solves this problem and allows the EXIT handler
to run and clean up the files.

This also moves the trap handler before temp file creation, which closes
the small race between creation and trap handler install.

Reviewed-by: Thomas Huth 
Signed-off-by: Nicholas Piggin 
---
 scripts/arch-run.bash | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index d0864360..11d47a85 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -129,6 +129,9 @@ run_migration ()
return 77
fi
 
+   trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
+   trap 'rm -f ${migout1} ${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
+
migsock=$(mktemp -u -t mig-helper-socket.XX)
migout1=$(mktemp -t mig-helper-stdout1.XX)
qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
@@ -137,9 +140,6 @@ run_migration ()
qmpout1=/dev/null
qmpout2=/dev/null
 
-   trap 'kill 0; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
-
eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-mon chardev=mon1,mode=control | tee ${migout1} &
live_pid=`jobs -l %+ | grep "eval" | awk '{print$2}'`
@@ -209,11 +209,11 @@ run_panic ()
return 77
fi
 
-   qmp=$(mktemp -u -t panic-qmp.XX)
-
-   trap 'kill 0; exit 2' INT TERM
+   trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
trap 'rm -f ${qmp}' RETURN EXIT
 
+   qmp=$(mktemp -u -t panic-qmp.XX)
+
# start VM stopped so we don't miss any events
eval "$@" -chardev socket,id=mon1,path=${qmp},server=on,wait=off \
-mon chardev=mon1,mode=control -S &
-- 
2.42.0



[kvm-unit-tests PATCH v4 0/8] Multi-migration support

2024-02-09 Thread Nicholas Piggin
Thanks for the detailed reviews. Hopefully this is the last one...

Since v3:
- Addressed Thomas's review comments:
- Patch 2 initrd cleanup unset the old variable in the correct place.
- Patch 4 multi migration removed the extra wait for "Now migrate the
  VM" message, and updated comments around it.
- Patch 6 fix typo and whitespace in quiet migration support.
- Patch 8 fix typo and whitespace in migration selftest.

Since v2:
- Rebase on riscv port and auxvinfo fix was merged.
- Clean up initrd cleanup moves more commands into the new cleanup
  function from the trap handler commands (suggested by Thomas).
- "arch-run: Clean up temporary files properly" patch is now renamed
  to "arch-run: Fix TRAP handler..."
- Fix TRAP handler patch has redone changelog to be more precise about
  the problem and including recipe to recreate it.
- Fix TRAP handler patch reworked slightly to remove the theoretical
  race rather than just adding a comment about it.
- Patch 3 was missing a couple of fixes that leaked into patch 4,
  those are moved into patch 3.

Thanks,
Nick

Nicholas Piggin (8):
  arch-run: Fix TRAP handler recursion to remove temporary files
properly
  arch-run: Clean up initrd cleanup
  migration: use a more robust way to wait for background job
  migration: Support multiple migrations
  arch-run: rename migration variables
  migration: Add quiet migration support
  Add common/ directory for architecture-independent tests
  migration: add a migration selftest

 arm/Makefile.common  |   1 +
 arm/selftest-migration.c |   1 +
 arm/sieve.c  |   2 +-
 arm/unittests.cfg|   6 ++
 common/selftest-migration.c  |  34 +++
 common/sieve.c   |  51 ++
 lib/migrate.c|  19 +++-
 lib/migrate.h|   2 +
 powerpc/Makefile.common  |   1 +
 powerpc/selftest-migration.c |   1 +
 powerpc/unittests.cfg|   4 +
 riscv/sieve.c|   2 +-
 s390x/Makefile   |   1 +
 s390x/selftest-migration.c   |   1 +
 s390x/sieve.c|   2 +-
 s390x/unittests.cfg  |   4 +
 scripts/arch-run.bash| 177 +--
 x86/sieve.c  |  52 +-
 18 files changed, 253 insertions(+), 108 deletions(-)
 create mode 120000 arm/selftest-migration.c
 create mode 100644 common/selftest-migration.c
 create mode 100644 common/sieve.c
 create mode 120000 powerpc/selftest-migration.c
 create mode 120000 s390x/selftest-migration.c
 mode change 100644 => 120000 x86/sieve.c

-- 
2.42.0



Re: [PATCH v5 00/25] Transparent Contiguous PTEs for User Mappings

2024-02-09 Thread Ryan Roberts
On 08/02/2024 17:34, Mark Rutland wrote:
> On Fri, Feb 02, 2024 at 08:07:31AM +, Ryan Roberts wrote:
>> Hi All,
> 
> Hi Ryan,
> 
> I assume this is the same as your 'features/granule_perf/contpte-lkml_v' 
> branch
> on https://gitlab.arm.com/linux-arm/linux-rr/

Yep - great detective work! features/granule_perf/contpte-lkml_v5 corresponds
exactly to what I posted with all the dependencies in place.

> 
> I've taken a quick look, and I have a few initial/superficial comments before
> digging into the detail on the important changes.

Thanks for doing this!

> 
>> Patch Layout
>> 
>>
>> In this version, I've split the patches to better show each optimization:
>>
>>   - 1-2:mm prep: misc code and docs cleanups
> 
> I'm not confident enough to comment on patch 2, but these look reasonable to
> me.

Thanks. David has acked patch 2 already so I think we are good there.

> 
>>   - 3-8:mm,arm,arm64,powerpc,x86 prep: Replace pte_next_pfn() with more
>> general pte_advance_pfn()
> 
> These look fine to me.

Thanks!

> 
>>   - 9-18:   arm64 prep: Refactor ptep helpers into new layer
> 
> The result of patches 9-17 looks good to me, but the intermediate stages where
> some functions are converted is a bit odd, and it's a bit painful for review
> since you need to skip ahead a few patches to see the end result to tell that
> the conversions are consistent and complete.
> 
> IMO it'd be easier for review if that were three patches:
> 
> 1) Convert READ_ONCE() -> ptep_get()
> 2) Convert set_pte_at() -> set_ptes()
> 3) All the "New layer" renames and addition of the trivial wrappers

Yep that makes sense. I'll start prepping that today. I'll hold off reposting
until I have your comments on 19-25. I'm also hoping that David will repost the
zap series today so that it can get into mm-unstable by mid-next week. Then I'll
repost on top of that, hopefully by end of next week, folding in all your
comments. This should give plenty of time to soak in linux-next.

Thanks,
Ryan

> 
> Patch 18 looks fine to me.
> 
>>   - 19: functional contpte implementation
>>   - 20-25:  various optimizations on top of the contpte implementation
> 
> I'll try to dig into these over the next few days.
> 
> Mark.



Re: [kvm-unit-tests PATCH v3 4/8] migration: Support multiple migrations

2024-02-09 Thread Thomas Huth

On 09/02/2024 09.39, Nicholas Piggin wrote:

On Fri Feb 9, 2024 at 6:19 PM AEST, Thomas Huth wrote:

On 09/02/2024 08.01, Nicholas Piggin wrote:

Support multiple migrations by flipping dest file/socket variables to
source after the migration is complete, ready to start again. A new
destination is created if the test outputs the migrate line again.
Test cases may now switch to calling migrate() one or more times.

Signed-off-by: Nicholas Piggin 
---

...

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 3689d7c2..a914ba17 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -129,12 +129,16 @@ run_migration ()
return 77
fi
   
+	migcmdline=$@

+
trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migout_fifo1} ${migsock} ${qmp1} ${qmp2} 
${fifo}' RETURN EXIT
+   trap 'rm -f ${migout1} ${migout2} ${migout_fifo1} ${migout_fifo2} 
${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
   
   	migsock=$(mktemp -u -t mig-helper-socket.XX)

migout1=$(mktemp -t mig-helper-stdout1.XX)
migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
+   migout2=$(mktemp -t mig-helper-stdout2.XX)
+   migout_fifo2=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
fifo=$(mktemp -u -t mig-helper-fifo.XX)
@@ -142,18 +146,61 @@ run_migration ()
qmpout2=/dev/null
   
   	mkfifo ${migout_fifo1}

-   eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
+   mkfifo ${migout_fifo2}
+
+   eval "$migcmdline" \
+   -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-mon chardev=mon1,mode=control > ${migout_fifo1} &
live_pid=$!
cat ${migout_fifo1} | tee ${migout1} &
   
-	# We have to use cat to open the named FIFO, because named FIFO's, unlike

-   # pipes, will block on open() until the other end is also opened, and 
that
-   # totally breaks QEMU...
+   # The test must prompt the user to migrate, so wait for the "migrate"
+   # keyword
+   while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
+   if ! ps -p ${live_pid} > /dev/null ; then
+   echo "ERROR: Test exit before migration point." >&2
+   qmp ${qmp1} '"quit"'> ${qmpout1} 2>/dev/null
+   return 3
+   fi
+   sleep 0.1
+   done
+
+   # This starts the first source QEMU in advance of the test reaching the
+   # migration point, since we expect at least one migration. Subsequent
+   # sources are started as the test hits migrate keywords.
+   do_migration || return $?
+
+   while ps -p ${live_pid} > /dev/null ; do
+   # Wait for EXIT or further migrations
+   if ! grep -q -i "Now migrate the VM" < ${migout1} ; then
+   sleep 0.1
+   else
+   do_migration || return $?
+   fi
+   done
+
+   wait ${live_pid}
+   ret=$?
+
+   while (( $(jobs -r | wc -l) > 0 )); do
+   sleep 0.1
+   done
+
+   return $ret
+}
+
+do_migration ()
+{
+   # We have to use cat to open the named FIFO, because named FIFO's,
+   # unlike pipes, will block on open() until the other end is also
+   # opened, and that totally breaks QEMU...
mkfifo ${fifo}
-   eval "$@" -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
-   -mon chardev=mon2,mode=control -incoming unix:${migsock} < <(cat 
${fifo}) &
+   eval "$migcmdline" \
+   -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
+   -mon chardev=mon2,mode=control -incoming unix:${migsock} \
+   < <(cat ${fifo}) > ${migout_fifo2} &
incoming_pid=$!
+   cat ${migout_fifo2} | tee ${migout2} &
   
   	# The test must prompt the user to migrate, so wait for the "migrate" keyword

while ! grep -q -i "Now migrate the VM" < ${migout1} ; do


So the old check for the "migrate" keyword is also still around?


It's just the comment is staleish, it only checks "Now migrate...".


Why do we
need to wait on two spots for the "Now mirgrate..." string now?


So that it ensures we do one migration; subsequent ones are
optional.

I was thinking we could just remove that, and possibly even
remove the MIGRATION=yes/no paths and always just use the same
code here. But that's for another time.

Actually there is some weirdness here. There are *three* spots
where it waits for migration.


Yes, that's what I meant (I considered your two new additions like one spot ;-))


The first one in run_migration
can be removed, because it can call do_migration right away
to start up the destination qemu process ahead of the first
migration message as-per comment. I'll respin with 

Re: [kvm-unit-tests PATCH v3 4/8] migration: Support multiple migrations

2024-02-09 Thread Nicholas Piggin
On Fri Feb 9, 2024 at 6:19 PM AEST, Thomas Huth wrote:
> On 09/02/2024 08.01, Nicholas Piggin wrote:
> > Support multiple migrations by flipping dest file/socket variables to
> > source after the migration is complete, ready to start again. A new
> > destination is created if the test outputs the migrate line again.
> > Test cases may now switch to calling migrate() one or more times.
> > 
> > Signed-off-by: Nicholas Piggin 
> > ---
> ...
> > diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
> > index 3689d7c2..a914ba17 100644
> > --- a/scripts/arch-run.bash
> > +++ b/scripts/arch-run.bash
> > @@ -129,12 +129,16 @@ run_migration ()
> > return 77
> > fi
> >   
> > +   migcmdline=$@
> > +
> > trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
> > -   trap 'rm -f ${migout1} ${migout_fifo1} ${migsock} ${qmp1} ${qmp2} 
> > ${fifo}' RETURN EXIT
> > +   trap 'rm -f ${migout1} ${migout2} ${migout_fifo1} ${migout_fifo2} 
> > ${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
> >   
> > migsock=$(mktemp -u -t mig-helper-socket.XX)
> > migout1=$(mktemp -t mig-helper-stdout1.XX)
> > migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
> > +   migout2=$(mktemp -t mig-helper-stdout2.XX)
> > +   migout_fifo2=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
> > qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
> > qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
> > fifo=$(mktemp -u -t mig-helper-fifo.XX)
> > @@ -142,18 +146,61 @@ run_migration ()
> > qmpout2=/dev/null
> >   
> > mkfifo ${migout_fifo1}
> > -   eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
> > +   mkfifo ${migout_fifo2}
> > +
> > +   eval "$migcmdline" \
> > +   -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
> > -mon chardev=mon1,mode=control > ${migout_fifo1} &
> > live_pid=$!
> > cat ${migout_fifo1} | tee ${migout1} &
> >   
> > -   # We have to use cat to open the named FIFO, because named FIFO's, 
> > unlike
> > -   # pipes, will block on open() until the other end is also opened, and 
> > that
> > -   # totally breaks QEMU...
> > +   # The test must prompt the user to migrate, so wait for the "migrate"
> > +   # keyword
> > +   while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
> > +   if ! ps -p ${live_pid} > /dev/null ; then
> > +   echo "ERROR: Test exit before migration point." >&2
> > +   qmp ${qmp1} '"quit"'> ${qmpout1} 2>/dev/null
> > +   return 3
> > +   fi
> > +   sleep 0.1
> > +   done
> > +
> > +   # This starts the first source QEMU in advance of the test reaching the
> > +   # migration point, since we expect at least one migration. Subsequent
> > +   # sources are started as the test hits migrate keywords.
> > +   do_migration || return $?
> > +
> > +   while ps -p ${live_pid} > /dev/null ; do
> > +   # Wait for EXIT or further migrations
> > +   if ! grep -q -i "Now migrate the VM" < ${migout1} ; then
> > +   sleep 0.1
> > +   else
> > +   do_migration || return $?
> > +   fi
> > +   done
> > +
> > +   wait ${live_pid}
> > +   ret=$?
> > +
> > +   while (( $(jobs -r | wc -l) > 0 )); do
> > +   sleep 0.1
> > +   done
> > +
> > +   return $ret
> > +}
> > +
> > +do_migration ()
> > +{
> > +   # We have to use cat to open the named FIFO, because named FIFO's,
> > +   # unlike pipes, will block on open() until the other end is also
> > +   # opened, and that totally breaks QEMU...
> > mkfifo ${fifo}
> > -   eval "$@" -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
> > -   -mon chardev=mon2,mode=control -incoming unix:${migsock} < 
> > <(cat ${fifo}) &
> > +   eval "$migcmdline" \
> > +   -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
> > +   -mon chardev=mon2,mode=control -incoming unix:${migsock} \
> > +   < <(cat ${fifo}) > ${migout_fifo2} &
> > incoming_pid=$!
> > +   cat ${migout_fifo2} | tee ${migout2} &
> >   
> > # The test must prompt the user to migrate, so wait for the "migrate" 
> > keyword
> > while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
>
> So the old check for the "migrate" keyword is also still around?

It's just the comment is staleish, it only checks "Now migrate...".

> Why do we 
> need to wait on two spots for the "Now mirgrate..." string now?

So that it ensures we do one migration; subsequent ones are
optional.

I was thinking we could just remove that, and possibly even
remove the MIGRATION=yes/no paths and always just use the same
code here. But that's for another time.

Actually there is some weirdness here. There are *three* spots
where it waits for migration. The first one in run_migration
can be removed, because it can call do_migration right away
to start up the destination qemu process ahead of the first
migration message 

Re: [kvm-unit-tests PATCH v3 8/8] migration: add a migration selftest

2024-02-09 Thread Thomas Huth

On 09/02/2024 08.01, Nicholas Piggin wrote:

Add a selftest for migration support in  guest library and test harness
code. It performs migrations a tight loop to irritate races and bugs in


"*in* a tight loop" ?


the test harness code.

Include the test in arm, s390, powerpc.

Acked-by: Claudio Imbrenda  (s390x)
Signed-off-by: Nicholas Piggin 
---
This has flushed out several bugs in developing the multi migration test
harness code already.

Thanks,
Nick

  arm/Makefile.common  |  1 +
  arm/selftest-migration.c |  1 +
  arm/unittests.cfg|  6 ++
  common/selftest-migration.c  | 34 ++
  powerpc/Makefile.common  |  1 +
  powerpc/selftest-migration.c |  1 +
  powerpc/unittests.cfg|  4 
  s390x/Makefile   |  1 +
  s390x/selftest-migration.c   |  1 +
  s390x/unittests.cfg  |  4 
  10 files changed, 54 insertions(+)
  create mode 120000 arm/selftest-migration.c
  create mode 100644 common/selftest-migration.c
  create mode 120000 powerpc/selftest-migration.c
  create mode 120000 s390x/selftest-migration.c

diff --git a/arm/Makefile.common b/arm/Makefile.common
index f828dbe0..f107c478 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -5,6 +5,7 @@
  #
  
  tests-common  = $(TEST_DIR)/selftest.$(exe)

+tests-common += $(TEST_DIR)/selftest-migration.$(exe)
  tests-common += $(TEST_DIR)/spinlock-test.$(exe)
  tests-common += $(TEST_DIR)/pci-test.$(exe)
  tests-common += $(TEST_DIR)/pmu.$(exe)
diff --git a/arm/selftest-migration.c b/arm/selftest-migration.c
new file mode 120000
index ..bd1eb266
--- /dev/null
+++ b/arm/selftest-migration.c
@@ -0,0 +1 @@
+../common/selftest-migration.c
\ No newline at end of file
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index fe601cbb..1ffd9a82 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -55,6 +55,12 @@ smp = $MAX_SMP
  extra_params = -append 'smp'
  groups = selftest
  
+# Test migration

+[selftest-migration]
+file = selftest-migration.flat
+groups = selftest migration
+
+arch = arm64


Please swap the last two lines!


  # Test PCI emulation
  [pci-test]
  file = pci-test.flat


With the nits fixed:
Reviewed-by: Thomas Huth 



Re: [kvm-unit-tests PATCH v3 7/8] Add common/ directory for architecture-independent tests

2024-02-09 Thread Thomas Huth

On 09/02/2024 08.01, Nicholas Piggin wrote:

x86/sieve.c is used by s390x, arm, and riscv via symbolic link. Make a
new directory common/ for architecture-independent tests and move
sieve.c here.

Signed-off-by: Nicholas Piggin 
---
  arm/sieve.c|  2 +-
  common/sieve.c | 51 +
  riscv/sieve.c  |  2 +-
  s390x/sieve.c  |  2 +-
  x86/sieve.c| 52 +-
  5 files changed, 55 insertions(+), 54 deletions(-)
  create mode 100644 common/sieve.c
  mode change 100644 => 120000 x86/sieve.c



Reviewed-by: Thomas Huth 



Re: [kvm-unit-tests PATCH v3 5/8] arch-run: rename migration variables

2024-02-09 Thread Thomas Huth

On 09/02/2024 08.01, Nicholas Piggin wrote:

Using 1 and 2 for source and destination is confusing, particularly
now with multiple migrations that flip between them. Do a rename
pass to tidy things up.

Signed-off-by: Nicholas Piggin 
---
  scripts/arch-run.bash | 115 +-
  1 file changed, 58 insertions(+), 57 deletions(-)



Acked-by: Thomas Huth 



Re: [kvm-unit-tests PATCH v3 2/8] arch-run: Clean up initrd cleanup

2024-02-09 Thread Nicholas Piggin
On Fri Feb 9, 2024 at 5:32 PM AEST, Thomas Huth wrote:
> On 09/02/2024 08.01, Nicholas Piggin wrote:
> > Rather than put a big script into the trap handler, have it call
> > a function.
> > 
> > Signed-off-by: Nicholas Piggin 
> > ---
> >   scripts/arch-run.bash | 13 -
> >   1 file changed, 12 insertions(+), 1 deletion(-)
> > 
> > diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
> > index 11d47a85..1e903e83 100644
> > --- a/scripts/arch-run.bash
> > +++ b/scripts/arch-run.bash
> > @@ -269,10 +269,21 @@ search_qemu_binary ()
> > export PATH=$save_path
> >   }
> >   
> > +initrd_cleanup ()
> > +{
> > +   rm -f $KVM_UNIT_TESTS_ENV
> > +   if [ "$KVM_UNIT_TESTS_ENV_OLD" ]; then
> > +   export KVM_UNIT_TESTS_ENV="$KVM_UNIT_TESTS_ENV_OLD"
> > +   else
> > +   unset KVM_UNIT_TESTS_ENV
> > +   unset KVM_UNIT_TESTS_ENV_OLD
> > +   fi
> > +}
>
> Looking at the original code below, shouldn't this rather unset 
> KVM_UNIT_TESTS_ENV_OLD after the "fi" statement?

Yes good catch.

Thanks,
Nick


Re: [kvm-unit-tests PATCH v3 4/8] migration: Support multiple migrations

2024-02-09 Thread Thomas Huth

On 09/02/2024 08.01, Nicholas Piggin wrote:

Support multiple migrations by flipping dest file/socket variables to
source after the migration is complete, ready to start again. A new
destination is created if the test outputs the migrate line again.
Test cases may now switch to calling migrate() one or more times.

Signed-off-by: Nicholas Piggin 
---

...

diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 3689d7c2..a914ba17 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -129,12 +129,16 @@ run_migration ()
return 77
fi
  
+	migcmdline=$@

+
trap 'trap - TERM ; kill 0 ; exit 2' INT TERM
-   trap 'rm -f ${migout1} ${migout_fifo1} ${migsock} ${qmp1} ${qmp2} 
${fifo}' RETURN EXIT
+   trap 'rm -f ${migout1} ${migout2} ${migout_fifo1} ${migout_fifo2} 
${migsock} ${qmp1} ${qmp2} ${fifo}' RETURN EXIT
  
  	migsock=$(mktemp -u -t mig-helper-socket.XX)

migout1=$(mktemp -t mig-helper-stdout1.XX)
migout_fifo1=$(mktemp -u -t mig-helper-fifo-stdout1.XX)
+   migout2=$(mktemp -t mig-helper-stdout2.XX)
+   migout_fifo2=$(mktemp -u -t mig-helper-fifo-stdout2.XX)
qmp1=$(mktemp -u -t mig-helper-qmp1.XX)
qmp2=$(mktemp -u -t mig-helper-qmp2.XX)
fifo=$(mktemp -u -t mig-helper-fifo.XX)
@@ -142,18 +146,61 @@ run_migration ()
qmpout2=/dev/null
  
  	mkfifo ${migout_fifo1}

-   eval "$@" -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
+   mkfifo ${migout_fifo2}
+
+   eval "$migcmdline" \
+   -chardev socket,id=mon1,path=${qmp1},server=on,wait=off \
-mon chardev=mon1,mode=control > ${migout_fifo1} &
live_pid=$!
cat ${migout_fifo1} | tee ${migout1} &
  
-	# We have to use cat to open the named FIFO, because named FIFO's, unlike

-   # pipes, will block on open() until the other end is also opened, and 
that
-   # totally breaks QEMU...
+   # The test must prompt the user to migrate, so wait for the "migrate"
+   # keyword
+   while ! grep -q -i "Now migrate the VM" < ${migout1} ; do
+   if ! ps -p ${live_pid} > /dev/null ; then
+   echo "ERROR: Test exit before migration point." >&2
+   qmp ${qmp1} '"quit"'> ${qmpout1} 2>/dev/null
+   return 3
+   fi
+   sleep 0.1
+   done
+
+   # This starts the first source QEMU in advance of the test reaching the
+   # migration point, since we expect at least one migration. Subsequent
+   # sources are started as the test hits migrate keywords.
+   do_migration || return $?
+
+   while ps -p ${live_pid} > /dev/null ; do
+   # Wait for EXIT or further migrations
+   if ! grep -q -i "Now migrate the VM" < ${migout1} ; then
+   sleep 0.1
+   else
+   do_migration || return $?
+   fi
+   done
+
+   wait ${live_pid}
+   ret=$?
+
+   while (( $(jobs -r | wc -l) > 0 )); do
+   sleep 0.1
+   done
+
+   return $ret
+}
+
+do_migration ()
+{
+   # We have to use cat to open the named FIFO, because named FIFO's,
+   # unlike pipes, will block on open() until the other end is also
+   # opened, and that totally breaks QEMU...
mkfifo ${fifo}
-   eval "$@" -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
-   -mon chardev=mon2,mode=control -incoming unix:${migsock} < <(cat 
${fifo}) &
+   eval "$migcmdline" \
+   -chardev socket,id=mon2,path=${qmp2},server=on,wait=off \
+   -mon chardev=mon2,mode=control -incoming unix:${migsock} \
+   < <(cat ${fifo}) > ${migout_fifo2} &
incoming_pid=$!
+   cat ${migout_fifo2} | tee ${migout2} &
  
  	# The test must prompt the user to migrate, so wait for the "migrate" keyword

while ! grep -q -i "Now migrate the VM" < ${migout1} ; do


So the old check for the "migrate" keyword is also still around? Why do we 
need to wait on two spots for the "Now migrate..." string now?


 Thomas



@@ -164,7 +211,7 @@ run_migration ()
qmp ${qmp2} '"quit"'> ${qmpout2} 2>/dev/null
return 3
fi
-   sleep 1
+   sleep 0.1
done
  
  	# Wait until the destination has created the incoming and qmp sockets

@@ -176,7 +223,7 @@ run_migration ()
# Wait for the migration to complete
migstatus=`qmp ${qmp1} '"query-migrate"' | grep return`
while ! grep -q '"completed"' <<<"$migstatus" ; do
-   sleep 1
+   sleep 0.1
if ! migstatus=`qmp ${qmp1} '"query-migrate"'`; then
echo "ERROR: Querying migration state failed." >&2
echo > ${fifo}
@@ -192,14 +239,34 @@ run_migration ()
 

[PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections

2024-02-09 Thread Naveen N Rao
Michael reported that we are seeing ftrace bug on bootup when KASAN is
enabled, and if we are using -fpatchable-function-entry:

ftrace: allocating 47780 entries in 18 pages
ftrace-powerpc: 0xc20b3d5c: No module provided for non-kernel 
address
[ ftrace bug ]
ftrace faulted on modifying
[] 0xc20b3d5c
Initializing ftrace call sites
ftrace record flags: 0
 (0)
 expected tramp: c008cef4
[ cut here ]
WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 
0xf05 of:SLOF,HEAD hv:linux,kvm pSeries
NIP:  c03aa81c LR: c03aa818 CTR: 
REGS: c33cfab0 TRAP: 0700   Not tainted  
(6.5.0-rc3-00120-g0f71dcfb4aef)
MSR:  82021033   CR: 28028240  XER: 
CFAR: c02781a8 IRQMASK: 3
...
NIP [c03aa81c] ftrace_bug+0x3c0/0x424
LR [c03aa818] ftrace_bug+0x3bc/0x424
Call Trace:
 ftrace_bug+0x3bc/0x424 (unreliable)
 ftrace_process_locs+0x5f4/0x8a0
 ftrace_init+0xc0/0x1d0
 start_kernel+0x1d8/0x484

With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
CONFIG_KASAN=y, compiler emits nops in functions that it generates for
registering and unregistering global variables (unlike with -pg and
-mprofile-kernel where calls to _mcount() are not generated in those
functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
respectively. We don't expect to see any profiled functions in
EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
in the core kernel text belongs to a module. Since these functions do
not match that criteria, we see the above bug.

Address this by having ftrace ignore all locations in the text exit
sections of vmlinux.

Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for 
-fpatchable-function-entry")
Cc: sta...@vger.kernel.org
Reported-by: Michael Ellerman 
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/include/asm/ftrace.h   |  9 +
 arch/powerpc/include/asm/sections.h |  1 +
 arch/powerpc/kernel/trace/ftrace.c  | 12 
 arch/powerpc/kernel/vmlinux.lds.S   |  2 ++
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index 1ebd2ca97f12..d6babd083202 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -20,14 +20,7 @@
 #ifndef __ASSEMBLY__
 extern void _mcount(void);
 
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
-   if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
-   addr += MCOUNT_INSN_SIZE;
-
-   return addr;
-}
-
+unsigned long ftrace_call_adjust(unsigned long addr);
 unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
unsigned long sp);
 
diff --git a/arch/powerpc/include/asm/sections.h 
b/arch/powerpc/include/asm/sections.h
index ea26665f82cf..d389dcecdb0b 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
 
 extern char __head_end[];
 extern char __srwx_boundary[];
+extern char _sexittext[], _eexittext[];
 
 /* Patch sites */
 extern s32 patch__call_flush_branch_caches1;
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 82010629cf88..b5efd8d7bc01 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -27,10 +27,22 @@
 #include 
 #include 
 #include 
+#include 
 
 #defineNUM_FTRACE_TRAMPS   2
 static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
 
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+   if (addr >= (unsigned long)_sexittext && addr < (unsigned 
long)_eexittext)
+   return 0;
+
+   if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
+   addr += MCOUNT_INSN_SIZE;
+
+   return addr;
+}
+
 static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long 
addr, int link)
 {
ppc_inst_t op;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S 
b/arch/powerpc/kernel/vmlinux.lds.S
index 1c5970df3233..9c376ae6857d 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -281,7 +281,9 @@ SECTIONS
 * to deal with references from __bug_table
 */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+   _sexittext = .;
EXIT_TEXT
+   _eexittext = .;
}
 
. = ALIGN(PAGE_SIZE);

base-commit: 4ef8376c466ae8b03e632dd8eca1e44315f7dd61
-- 
2.43.0



Re: [kvm-unit-tests PATCH v3 6/8] migration: Add quiet migration support

2024-02-09 Thread Thomas Huth

On 09/02/2024 08.01, Nicholas Piggin wrote:

Console output required to support migration becomes quite noisy
when doing lots of migrations. Provide a migrate_quiet() call that
suppresses console output and doesn't log a message.

Signed-off-by: Nicholas Piggin 
---
  lib/migrate.c | 12 
  lib/migrate.h |  1 +
  scripts/arch-run.bash |  4 ++--
  3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/lib/migrate.c b/lib/migrate.c
index b7721659..4e0ab516 100644
--- a/lib/migrate.c
+++ b/lib/migrate.c
@@ -18,6 +18,18 @@ void migrate(void)
report_info("Migration complete");
  }
  
+/*

+ * Like migrate() but supporess output and logs, useful for intensive


s/supporess/suppress/


+ * migration stress testing without polluting logs. Test cases should
+ * provide relevant information about migration in failure reports.
+ */
+void migrate_quiet(void)
+{
+   puts("Now migrate the VM (quiet)\n");
+   (void)getchar();
+}
+
+


Remove one empty line, please!


  /*
   * Initiate migration and wait for it to complete.
   * If this function is called more than once, it is a no-op.
diff --git a/lib/migrate.h b/lib/migrate.h
index 2af06a72..95b9102b 100644
--- a/lib/migrate.h
+++ b/lib/migrate.h
@@ -7,4 +7,5 @@
   */
  
  void migrate(void);

+void migrate_quiet(void);
  void migrate_once(void);
diff --git a/scripts/arch-run.bash b/scripts/arch-run.bash
index 0b45eb61..29cf9b0c 100644
--- a/scripts/arch-run.bash
+++ b/scripts/arch-run.bash
@@ -152,7 +152,7 @@ run_migration ()
-chardev socket,id=mon,path=${src_qmp},server=on,wait=off \
-mon chardev=mon,mode=control > ${src_outfifo} &
live_pid=$!
-   cat ${src_outfifo} | tee ${src_out} &
+   cat ${src_outfifo} | tee ${src_out} | grep -v "Now migrate the VM (quiet)" 
&
  
  	# The test must prompt the user to migrate, so wait for the "migrate"

# keyword
@@ -200,7 +200,7 @@ do_migration ()
-mon chardev=mon,mode=control -incoming unix:${dst_incoming} \
< <(cat ${dst_infifo}) > ${dst_outfifo} &
incoming_pid=$!
-   cat ${dst_outfifo} | tee ${dst_out} &
+   cat ${dst_outfifo} | tee ${dst_out} | grep -v "Now migrate the VM (quiet)" 
&
  
  	# The test must prompt the user to migrate, so wait for the "migrate" keyword

while ! grep -q -i "Now migrate the VM" < ${src_out} ; do


 Thomas