Re: [PATCH 2/4] powerpc/64s: Add POWER10 store sync mnemonics

2023-06-12 Thread Joel Stanley
On Fri, 9 Jun 2023 at 10:01, Nicholas Piggin  wrote:
>
> ISA v3.1 introduces new sync types for store ordering.
>
>   stncisync
>   stcisync
>   stsync
>
> Add ppc-opcode defines for these. This changes PPC_RAW_SYNC to take
> L,SC parameters and adds a PPC_RAW_HWSYNC for callers that want the
> plain old sync (aka hwsync).

I checked these against the ISA and they seem correct.

Did you consider changing LWSYNC to be defined in terms of your new
PPC_RAW_SYNC?
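
For reference, lwsync is the L=1 form of sync, so presumably that would
just be something like this (a sketch, not something from this patch):

    /* hypothetical follow-up: lwsync is sync with L=1, SC=0 */
    #define PPC_RAW_LWSYNC()    PPC_RAW_SYNC(1, 0)    /* = 0x7c2004ac */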

Reviewed-by: Joel Stanley 

>
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/ppc-opcode.h | 19 ++-
>  arch/powerpc/kernel/traps.c   |  2 +-
>  arch/powerpc/lib/feature-fixups.c |  6 +++---
>  arch/powerpc/net/bpf_jit_comp64.c |  2 +-
>  4 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
> index ca5a0da7df4e..7bc8bbcd4adb 100644
> --- a/arch/powerpc/include/asm/ppc-opcode.h
> +++ b/arch/powerpc/include/asm/ppc-opcode.h
> @@ -326,6 +326,8 @@
>  #define ___PPC_R(r)        (((r) & 0x1) << 16)
>  #define ___PPC_PRS(prs)    (((prs) & 0x1) << 17)
>  #define ___PPC_RIC(ric)    (((ric) & 0x3) << 18)
> +#define ___PPC_L(l)        (((l) & 0x7) << 21)
> +#define ___PPC_SC(sc)      (((sc) & 0x3) << 16)
>  #define __PPC_RA(a)        ___PPC_RA(__REG_##a)
>  #define __PPC_RA0(a)       ___PPC_RA(__REGA0_##a)
>  #define __PPC_RB(b)        ___PPC_RB(__REG_##b)
> @@ -378,8 +380,6 @@
>  #define PPC_RAW_LQARX(t, a, b, eh)  (0x7c000228 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | __PPC_EH(eh))
>  #define PPC_RAW_LDARX(t, a, b, eh)  (0x7c0000a8 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | __PPC_EH(eh))
>  #define PPC_RAW_LWARX(t, a, b, eh)  (0x7c000028 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | __PPC_EH(eh))
> -#define PPC_RAW_PHWSYNC             (0x7c8004ac)
> -#define PPC_RAW_PLWSYNC             (0x7ca004ac)
>  #define PPC_RAW_STQCX(t, a, b)      (0x7c00016d | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b))
>  #define PPC_RAW_MADDHD(t, a, b, c)  (0x10000030 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | ___PPC_RC(c))
>  #define PPC_RAW_MADDHDU(t, a, b, c) (0x10000031 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | ___PPC_RC(c))
> @@ -396,6 +396,13 @@
>  #define PPC_RAW_RFCI                (0x4c000066)
>  #define PPC_RAW_RFDI                (0x4c00004e)
>  #define PPC_RAW_RFMCI               (0x4c00004c)
> +#define PPC_RAW_SYNC(l, sc)         (0x7c0004ac | ___PPC_L(l) | ___PPC_SC(sc))
> +#define PPC_RAW_HWSYNC()            PPC_RAW_SYNC(0, 0)
> +#define PPC_RAW_STNCISYNC()         PPC_RAW_SYNC(1, 1)
> +#define PPC_RAW_STCISYNC()          PPC_RAW_SYNC(0, 2)
> +#define PPC_RAW_STSYNC()            PPC_RAW_SYNC(0, 3)
> +#define PPC_RAW_PHWSYNC()           PPC_RAW_SYNC(4, 0)
> +#define PPC_RAW_PLWSYNC()           PPC_RAW_SYNC(5, 0)
>  #define PPC_RAW_TLBILX(t, a, b)     (0x7c000024 | __PPC_T_TLB(t) | __PPC_RA0(a) | __PPC_RB(b))
>  #define PPC_RAW_WAIT_v203           (0x7c00007c)
>  #define PPC_RAW_WAIT(w, p)          (0x7c00003c | __PPC_WC(w) | __PPC_PL(p))
> @@ -421,7 +428,6 @@
>  #define PPC_RAW_DCBFPS(a, b)        (0x7c0000ac | ___PPC_RA(a) | ___PPC_RB(b) | (4 << 21))
>  #define PPC_RAW_DCBSTPS(a, b)       (0x7c0000ac | ___PPC_RA(a) | ___PPC_RB(b) | (6 << 21))
>  #define PPC_RAW_SC()                (0x44000002)
> -#define PPC_RAW_SYNC()              (0x7c0004ac)
>  #define PPC_RAW_ISYNC()             (0x4c00012c)
>
>  /*
> @@ -641,8 +647,11 @@
>  #define STBCIX(s, a, b)     stringify_in_c(.long PPC_RAW_STBCIX(s, a, b))
>  #define PPC_DCBFPS(a, b)    stringify_in_c(.long PPC_RAW_DCBFPS(a, b))
>  #define PPC_DCBSTPS(a, b)   stringify_in_c(.long PPC_RAW_DCBSTPS(a, b))
> -#define PPC_PHWSYNC         stringify_in_c(.long PPC_RAW_PHWSYNC)
> -#define PPC_PLWSYNC         stringify_in_c(.long PPC_RAW_PLWSYNC)
> +#define PPC_STNCISYNC       stringify_in_c(.long PPC_RAW_STNCISYNC())
> +#define PPC_STCISYNC        stringify_in_c(.long PPC_RAW_STCISYNC())
> +#define PPC_STSYNC          stringify_in_c(.long PPC_RAW_STSYNC())
> +#define PPC_PHWSYNC         stringify_in_c(.long PPC_RAW_PHWSYNC())
> +#define PPC_PLWSYNC         stringify_in_c(.long PPC_RAW_PLWSYNC())
>  #define STXVD2X(s, a, b)    stringify_in_c(.long PPC_RAW_STXVD2X(s, a, b))
>  #define LXVD2X(s, a, b)     stringify_in_c(.long PPC_RAW_LXVD2X(s, a, b))
>  #define MFVRD(a, t)         stringify_in_c(.long PPC_RAW_MFVRD(a, t))
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 9bdd79aa51cf..4b216c208f41 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -550,7 +550,7 @@ static inline int check_io_access(struct pt_regs *regs)
> nip -= 2;
> else if (*nip == PPC_RAW_ISYNC())
> 

[RFC PATCH 3/3] powernv/pci: Remove last IODA1 defines

2023-06-12 Thread Joel Stanley
Signed-off-by: Joel Stanley 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 5 +++--
 arch/powerpc/platforms/powernv/pci.h  | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c2af5a55a434..cb637827bc58 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -45,7 +45,8 @@
 #include "pci.h"
 #include "../../../../drivers/pci/pci.h"
 
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" };
+/* This array is indexed with enum pnv_phb_type */
+static const char * const pnv_phb_names[] = { "IODA2", "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 static void pnv_pci_configure_bus(struct pci_bus *bus);
@@ -359,7 +360,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
const __be32 *r;
u64 pci_addr;
 
-   if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
+   if (phb->type != PNV_PHB_IODA2) {
pr_info("  Not support M64 window\n");
return;
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 3353db882e35..957f2b47a3c0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -10,7 +10,6 @@
 struct pci_dn;
 
 enum pnv_phb_type {
-   PNV_PHB_IODA1,
PNV_PHB_IODA2,
PNV_PHB_NPU_OCAPI,
 };
-- 
2.39.2



[RFC PATCH 2/3] powerpc/pci: Remove MVE code

2023-06-12 Thread Joel Stanley
With IODA1 support gone the OPAL calls to set MVE are dead code. Remove
them.

TODO: Do we have rules for removing unused OPAL APIs? Should we leave it
in opal.h? opal-call.c?

Signed-off-by: Joel Stanley 
---
 arch/powerpc/include/asm/opal.h|  3 ---
 arch/powerpc/platforms/powernv/opal-call.c |  2 --
 arch/powerpc/platforms/powernv/pci-ioda.c  | 23 +-
 3 files changed, 1 insertion(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726125a534de..a9b31cc258fc 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -112,9 +112,6 @@ int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number, uint64_t bus_dev_fu
 uint8_t pe_action);
 int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe, uint32_t child_pe,
    uint8_t state);
-int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number, uint32_t pe_number);
-int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number,
-   uint32_t state);
 int64_t opal_pci_get_xive_reissue(uint64_t phb_id, uint32_t xive_number,
   uint8_t *p_bit, uint8_t *q_bit);
 int64_t opal_pci_set_xive_reissue(uint64_t phb_id, uint32_t xive_number,
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index f812c74c61e5..021b0ec29e24 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -167,8 +167,6 @@ OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
 OPAL_CALL(opal_pci_set_phb_table_memory,   OPAL_PCI_SET_PHB_TABLE_MEMORY);
 OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
 OPAL_CALL(opal_pci_set_peltv,  OPAL_PCI_SET_PELTV);
-OPAL_CALL(opal_pci_set_mve,OPAL_PCI_SET_MVE);
-OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
 OPAL_CALL(opal_pci_get_xive_reissue,   OPAL_PCI_GET_XIVE_REISSUE);
 OPAL_CALL(opal_pci_set_xive_reissue,   OPAL_PCI_SET_XIVE_REISSUE);
 OPAL_CALL(opal_pci_set_xive_pe,OPAL_PCI_SET_XIVE_PE);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2c4e842c2749..c2af5a55a434 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -865,29 +865,8 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
for (rid = pe->rid; rid < rid_end; rid++)
phb->ioda.pe_rmap[rid] = pe->pe_number;
 
-   /* Setup one MVTs on IODA1 */
-   if (phb->type != PNV_PHB_IODA1) {
-   pe->mve_number = 0;
-   goto out;
-   }
+   pe->mve_number = 0;
 
-   pe->mve_number = pe->pe_number;
-   rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
-   if (rc != OPAL_SUCCESS) {
-   pe_err(pe, "OPAL error %ld setting up MVE %x\n",
-  rc, pe->mve_number);
-   pe->mve_number = -1;
-   } else {
-   rc = opal_pci_set_mve_enable(phb->opal_id,
-pe->mve_number, OPAL_ENABLE_MVE);
-   if (rc) {
-   pe_err(pe, "OPAL error %ld enabling MVE %x\n",
-  rc, pe->mve_number);
-   pe->mve_number = -1;
-   }
-   }
-
-out:
return 0;
 }
 
-- 
2.39.2



[RFC PATCH 1/3] powernv/pci: Remove ioda1 support

2023-06-12 Thread Joel Stanley
The final "VPL" Power7 boxes that were used for powernv bringup have
been scrapped, meaning there are no machines with ioda1 left.

This patch removes the obviously unused code.

Signed-off-by: Joel Stanley 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 448 +-
 arch/powerpc/platforms/powernv/pci.c  |   5 -
 arch/powerpc/platforms/powernv/pci.h  |   4 -
 3 files changed, 2 insertions(+), 455 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index a02e9cdb5b5d..2c4e842c2749 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -45,10 +45,6 @@
 #include "pci.h"
 #include "../../../../drivers/pci/pci.h"
 
-#define PNV_IODA1_M64_NUM  16  /* Number of M64 BARs   */
-#define PNV_IODA1_M64_SEGS 8   /* Segments per M64 BAR */
-#define PNV_IODA1_DMA32_SEGSIZE0x1000
-
 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
@@ -280,86 +276,6 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
}
 }
 
-static int pnv_ioda1_init_m64(struct pnv_phb *phb)
-{
-   struct resource *r;
-   int index;
-
-   /*
-* There are 16 M64 BARs, each of which has 8 segments. So
-* there are as many M64 segments as the maximum number of
-* PEs, which is 128.
-*/
-   for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
-   unsigned long base, segsz = phb->ioda.m64_segsize;
-   int64_t rc;
-
-   base = phb->ioda.m64_base +
-  index * PNV_IODA1_M64_SEGS * segsz;
-   rc = opal_pci_set_phb_mem_window(phb->opal_id,
-   OPAL_M64_WINDOW_TYPE, index, base, 0,
-   PNV_IODA1_M64_SEGS * segsz);
-   if (rc != OPAL_SUCCESS) {
-   pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
-   rc, phb->hose->global_number, index);
-   goto fail;
-   }
-
-   rc = opal_pci_phb_mmio_enable(phb->opal_id,
-   OPAL_M64_WINDOW_TYPE, index,
-   OPAL_ENABLE_M64_SPLIT);
-   if (rc != OPAL_SUCCESS) {
-   pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
-   rc, phb->hose->global_number, index);
-   goto fail;
-   }
-   }
-
-   for (index = 0; index < phb->ioda.total_pe_num; index++) {
-   int64_t rc;
-
-   /*
-* P7IOC supports M64DT, which helps mapping M64 segment
-* to one particular PE#. However, PHB3 has fixed mapping
-* between M64 segment and PE#. In order to have same logic
-* for P7IOC and PHB3, we enforce fixed mapping between M64
-* segment and PE# on P7IOC.
-*/
-   rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-   index, OPAL_M64_WINDOW_TYPE,
-   index / PNV_IODA1_M64_SEGS,
-   index % PNV_IODA1_M64_SEGS);
-   if (rc != OPAL_SUCCESS) {
-   pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
-   __func__, rc, phb->hose->global_number,
-   index);
-   goto fail;
-   }
-   }
-
-   /*
-* Exclude the segments for reserved and root bus PE, which
-* are first or last two PEs.
-*/
-   r = &phb->hose->mem_resources[1];
-   if (phb->ioda.reserved_pe_idx == 0)
-   r->start += (2 * phb->ioda.m64_segsize);
-   else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
-   r->end -= (2 * phb->ioda.m64_segsize);
-   else
-   WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
-phb->ioda.reserved_pe_idx, phb->hose->global_number);
-
-   return 0;
-
-fail:
-   for ( ; index >= 0; index--)
-   opal_pci_phb_mmio_enable(phb->opal_id,
-   OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
-
-   return -EIO;
-}
-
 static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
unsigned long *pe_bitmap,
bool all)
@@ -518,10 +434,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
 * Setup init functions for M64 based on IODA version, IODA3 uses
 * the IODA2 code.
 */
-   if (phb->type == PNV_PHB_IODA1)
-   phb->init_m64 = pnv_ioda1_init_m64;
-   else
-   phb->init_m64 = pnv_ioda2_init_m64;
+   phb->init_m64 = pnv_ioda2_init_m64;
 }
 
 static void 

[RFC PATCH 0/3] powernv/pci: Remove unused IODA1 support

2023-06-12 Thread Joel Stanley
Oliver mentioned this code was only kept around to support the VPL
Power7 boxes. Now that they are all gone, remove the code.

Build and boot tested in qemu only.

Joel Stanley (3):
  powernv/pci: Remove ioda1 support
  powerpc/pci: Remove MVE code
  powernv/pci: Remove last IODA1 defines

 arch/powerpc/include/asm/opal.h|   3 -
 arch/powerpc/platforms/powernv/opal-call.c |   2 -
 arch/powerpc/platforms/powernv/pci-ioda.c  | 476 +
 arch/powerpc/platforms/powernv/pci.c   |   5 -
 arch/powerpc/platforms/powernv/pci.h   |   5 -
 5 files changed, 6 insertions(+), 485 deletions(-)

-- 
2.39.2



[PATCH] powerpc/xmon: Fix comparing pointer

2023-06-12 Thread wuyonggang001


Fix the following coccicheck warning:

arch/powerpc/xmon/spu-dis.c:51:34-35: WARNING comparing pointer to 0
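
The warning is about comparing a pointer against the literal 0; the fix
is to test against NULL instead, for example (this mirrors the pattern
the patch applies below):

    /* before: pointer compared to 0 */
    if (spu_disassemble_table[0] == 0)
        init_spu_disassemble();

    /* after: pointer compared to NULL */
    if (spu_disassemble_table[0] == NULL)
        init_spu_disassemble();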

Signed-off-by: Yonggang Wu 
---
 arch/powerpc/xmon/spu-dis.c | 384 ++--
 1 file changed, 193 insertions(+), 191 deletions(-)

diff --git a/arch/powerpc/xmon/spu-dis.c b/arch/powerpc/xmon/spu-dis.c
index 4b0a4e640f08..f48a2ddd7440 100644
--- a/arch/powerpc/xmon/spu-dis.c
+++ b/arch/powerpc/xmon/spu-dis.c
@@ -22,216 +22,218 @@ extern const int spu_num_opcodes;
 #define SPU_DISASM_TBL_SIZE (1 << 11)
static const struct spu_opcode *spu_disassemble_table[SPU_DISASM_TBL_SIZE];


-static void
-init_spu_disassemble (void)
+static void init_spu_disassemble(void)
 {
-  int i;
-
-  /* If two instructions have the same opcode then we prefer the first
-   * one.  In most cases it is just an alternate mnemonic. */
-  for (i = 0; i < spu_num_opcodes; i++)
-{
-  int o = spu_opcodes[i].opcode;
-  if (o >= SPU_DISASM_TBL_SIZE)
-continue; /* abort (); */
-  if (spu_disassemble_table[o] == 0)
-spu_disassemble_table[o] = &spu_opcodes[i];
-}
+int i;
+
+/*
+ * If two instructions have the same opcode then we prefer the first
+ * one.  In most cases it is just an alternate mnemonic.
+ */
+for (i = 0; i < spu_num_opcodes; i++) {
+int o = spu_opcodes[i].opcode;
+
+if (o >= SPU_DISASM_TBL_SIZE)
+continue; /* abort(); */
+if (spu_disassemble_table[o] == NULL)
+spu_disassemble_table[o] = &spu_opcodes[i];
+}
 }

 /* Determine the instruction from the 10 least significant bits. */
-static const struct spu_opcode *
-get_index_for_opcode (unsigned int insn)
+static const struct spu_opcode *get_index_for_opcode(unsigned int insn)
 {
-  const struct spu_opcode *index;
-  unsigned int opcode = insn >> (32-11);
-
-  /* Init the table.  This assumes that element 0/opcode 0 (currently
-   * NOP) is always used */
-  if (spu_disassemble_table[0] == 0)
-init_spu_disassemble ();
-
-  if ((index = spu_disassemble_table[opcode & 0x780]) != 0
-  && index->insn_type == RRR)
-return index;
-
-  if ((index = spu_disassemble_table[opcode & 0x7f0]) != 0
-  && (index->insn_type == RI18 || index->insn_type == LBT))
-return index;
-
-  if ((index = spu_disassemble_table[opcode & 0x7f8]) != 0
-  && index->insn_type == RI10)
-return index;
-
-  if ((index = spu_disassemble_table[opcode & 0x7fc]) != 0
-  && (index->insn_type == RI16))
-return index;
-
-  if ((index = spu_disassemble_table[opcode & 0x7fe]) != 0
-  && (index->insn_type == RI8))
-return index;
-
-  if ((index = spu_disassemble_table[opcode & 0x7ff]) != 0)
-return index;
-
-  return NULL;
+const struct spu_opcode *index;
+unsigned int opcode = insn >> (32-11);
+
+/*
+ * Init the table.  This assumes that element 0/opcode 0 (currently
+ * NOP) is always used
+ */
+if (spu_disassemble_table[0] == NULL)
+init_spu_disassemble();
+
+index = spu_disassemble_table[opcode & 0x780];
+if (index != NULL && index->insn_type == RRR)
+return index;
+
+index = spu_disassemble_table[opcode & 0x7f0];
+if (index != NULL
+  && (index->insn_type == RI18 || index->insn_type == LBT))
+return index;
+
+index = spu_disassemble_table[opcode & 0x7f8];
+if (index != NULL
+  && index->insn_type == RI10)
+return index;
+
+index = spu_disassemble_table[opcode & 0x7fc];
+if (index != NULL && (index->insn_type == RI16))
+return index;
+
+index = spu_disassemble_table[opcode & 0x7fe];
+if (index != NULL && (index->insn_type == RI8))
+return index;
+
+index = spu_disassemble_table[opcode & 0x7ff];
+if (index != NULL)
+return index;
+
+return NULL;
 }

 /* Print a Spu instruction.  */

-int
-print_insn_spu (unsigned long insn, unsigned long memaddr)
+int print_insn_spu(unsigned long insn, unsigned long memaddr)
 {
-  int value;
-  int hex_value;
-  const struct spu_opcode *index;
-  enum spu_insns tag;
+int value;
+int hex_value;
+const struct spu_opcode *index;
+enum spu_insns tag;

-  index = get_index_for_opcode (insn);
+index = get_index_for_opcode(insn);

-  if (index == 0)
-{
-  printf(".long 0x%lx", insn);
-}
-  else
-{
-  int i;
-  int paren = 0;
-  tag = (enum spu_insns)(index - spu_opcodes);
-  printf("%s", index->mnemonic);
-  if (tag == M_BI || tag == M_BISL || tag == M_IRET || tag == M_BISLED
-  || tag == M_BIHNZ || tag == M_BIHZ || tag == M_BINZ || tag == M_BIZ
-  || tag == M_SYNC || tag == M_HBR)
+if (index == NULL)
 {
-  int fb = (insn >> (32-18)) & 0x7f;
-  if (fb & 0x40)
-printf(tag == M_SYNC ? "c" : "p");
-  if (fb & 0x20)
-printf("d");
-  if (fb & 0x10)
-printf("e");
-}
-  if (index->arg[0] != 0)
-printf("\t");
-  hex_value = 0;
-  for (i = 1;  i <= 

Re: [RFC 0/3] Asynchronous EEH recovery

2023-06-12 Thread Oliver O'Halloran
On Tue, Jun 13, 2023 at 11:44 AM Ganesh Goudar  wrote:
>
> Hi,
>
> EEH recovery is currently serialized, and these patches shorten
> the time taken for EEH recovery by allowing recovery to run in
> parallel. The original author of these patches is Sam Bobroff;
> I have rebased and tested these patches.
>
> On powervm with 64 VFs from the same PHB, I see approximately 48%
> reduction in the time taken for EEH recovery.
>
> On powernv with 9 network cards, where 2 cards are installed on one
> PHB and 1 card on each of the remaining PHBs, providing 20 PFs in
> total, I see approximately 33% reduction in the time taken for EEH
> recovery.
>
> These patches were originally posted as separate RFCs by Sam, and I
> rebased and posted them almost a year ago. I stopped pursuing them
> as I was not able to test on powernv due to issues in the drivers
> of the cards I was testing on, which are now resolved. Since I am
> re-posting this after a long time, I am posting this as a fresh
> RFC. Please comment.

What changes have you made since the last time you posted this series?
If the patches are the same then the comments I posted last time still
apply.


[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Give a unique ID to each recovery event, to ease log parsing
and prepare for parallel recovery.

Also add some new messages with a very simple format that may
be useful to log-parsers.
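
The ID allocation itself can be very simple; roughly (a sketch only --
the relevant hunk is truncated in this archive, so the names here are
assumptions):

    /* sketch: a monotonic ID assigned as each event is queued, so
     * every log line from one recovery can be correlated */
    static atomic_t eeh_event_id = ATOMIC_INIT(0);

    int eeh_send_failure_event(struct eeh_pe *pe)
    {
        struct eeh_event *event = kzalloc(sizeof(*event), GFP_ATOMIC);

        if (!event)
            return -ENOMEM;
        event->id = (unsigned int)atomic_inc_return(&eeh_event_id);
        event->pe = pe;
        /* ... queue the event as before ... */
        return 0;
    }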

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh_event.h |   3 +-
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c|  42 +++---
 arch/powerpc/kernel/eeh_driver.c | 189 +++
 arch/powerpc/kernel/eeh_event.c  |  12 +-
 include/linux/mmzone.h   |   2 +-
 6 files changed, 147 insertions(+), 103 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..a1fe736bc4cf 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -17,13 +17,14 @@
 struct eeh_event {
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
+   unsigned intid; /* Event ID */
 };
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
-void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index d9fcff575027..5b82e76dbd19 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 void eeh_addr_cache_insert_dev(struct pci_dev *dev);
 void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
-void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
+void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 2c90c37524ed..148d5df0e606 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked);
  * for the indicated PCI device, and puts them into a buffer
  * for RTAS error logging.
  */
-static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
+static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev,
+  char *buf, size_t len)
 {
u32 cfg;
int cap, i;
@@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
-   pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
+   pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n",
+   event_id,
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, );
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-   pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg);
 
eeh_ops->read_config(edev, PCI_COMMAND, 4, );
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-   pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, );
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-   pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge secondary status: %04x\n",
+   event_id, cfg);
 
eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, );
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-   pr_warn("EEH: Bridge control: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg);
}
 
/* Dump out the PCI-X command and status regs */
@@ -238,18 +241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
if (cap) {
eeh_ops->read_config(edev, cap, 4, );
n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
-   pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI-X cmd: %08x\n", event_id, cfg);
 

[RFC 3/3] powerpc/eeh: Asynchronous recovery

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Currently, EEH recovery is entirely serialized and takes place within
a single kernel thread. This can cause recovery to take a long time
when there are many devices.

To shorten recovery time, this change allows recovery to proceed in
parallel in two ways:
- Each PHB is given its own recovery event queue and can be recovered
independently of other PHBs.
- Driver handlers are called in parallel, but with the constraint that
handlers higher up (closer to the PHB) in the PE hierarchy must be
called before those lower down.

To maintain the constraint above, the driver handlers are called by
traversing the tree of affected PEs from the top, stopping to call
handlers (in parallel) when a PE with devices is discovered. When the
calls for that PE are complete, traversal continues at each child PE.
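
Roughly, the traversal constraint looks like this (a sketch only; the
shape follows the description above and is not the patch's actual code,
though the field names follow the existing struct eeh_pe):

    /*
     * Sketch: walk the affected PE tree from the top. Where a PE has
     * devices, run its driver handlers in parallel and wait for all
     * of them before descending, so a parent's handlers always
     * complete before any child's handlers start.
     */
    static void eeh_report_pe_tree(struct eeh_pe *pe)
    {
        struct eeh_pe *child;

        if (!list_empty(&pe->edevs)) {
            /* queue one work item per device handler... */
            /* ...then wait for all of them here */
        }
        list_for_each_entry(child, &pe->child_list, child)
            eeh_report_pe_tree(child);
    }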

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h|   1 +
 arch/powerpc/include/asm/eeh_event.h  |   7 +
 arch/powerpc/include/asm/pci-bridge.h |   3 +
 arch/powerpc/kernel/eeh_driver.c  | 323 +++---
 arch/powerpc/kernel/eeh_event.c   |  65 +++---
 arch/powerpc/kernel/eeh_pe.c  |   3 +
 6 files changed, 288 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d0f09e691498..06d7dabdccfe 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */
 #define EEH_DEV_SYSFS  (1 << 9)/* Sysfs created*/
 #define EEH_DEV_REMOVED(1 << 10)   /* Removed permanently  
*/
+#define EEH_DEV_RECOVERING (1 << 11)   /* Recovering   */
 
 struct eeh_dev {
int mode;   /* EEH mode */
diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index a1fe736bc4cf..b21f49e87b7b 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,16 +17,21 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
unsigned intid; /* Event ID */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..55a5ff9ae30b 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -138,6 +138,9 @@ struct pci_controller {
 
/* iommu_ops support */
struct iommu_device iommu;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index cdf2de0eba57..a484d6ef33a1 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -12,12 +12,17 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 
+static atomic_t eeh_wu_id = ATOMIC_INIT(0);
+
 struct eeh_rmv_data {
struct list_head removed_vf_list;
int removed_dev_count;
@@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id,
+unsigned int id,
 struct pci_dev *,
 struct pci_driver *);
 static void eeh_pe_report_pdev(unsigned int event_id,
-  struct pci_dev *pdev, eeh_report_fn fn,
+  unsigned int id,
+  struct pci_dev *pdev,
+  const char *fn_name, eeh_report_fn fn,
   enum pci_ers_result *result,
-  const char *handler_name)
+  bool late, bool removed, bool passed)
 {
-   struct eeh_dev *edev;
struct pci_driver *driver;
-   bool 

[RFC 1/3] powerpc/eeh: Synchronization for safety

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

There is currently little synchronization between EEH error detection
(eeh_dev_check_failure()), EEH error recovery
(eeh_handle_{normal,special}_event()) and the PCI subsystem (device
addition and removal), and so there are race conditions that lead to
crashes (often access to free'd memory or LIST_POISON).

However, a solution must consider:
- EEH error detection can occur in interrupt context, which prevents
the use of a mutex.
- EEH recovery may need to sleep, which prevents the use of a spinlock.
- EEH recovery uses PCI operations that may require the PCI
rescan/remove lock and/or device lock to be held
- PCI operations may hold the rescan/remove and/or device lock when
calling into EEH functions.
- Device driver callbacks may perform arbitrary PCI operations
during recovery, including device removal.

In this patch the existing mutex and spinlock are combined with the
EEH_PE_RECOVERING flag to provide some assurances that are then used
to reduce the race conditions.

The fields to be protected are the ones that provide the structure
of the trees of struct eeh_pe that are held for each PHB: the parent
pointer and child lists and the list of struct eeh_dev, as well as
the pe and pdev pointers within struct eeh_dev.

The existing way of using EEH_PE_RECOVERING is kept and slightly
extended: No struct eeh_pe will be removed while it has the flag set
on it. Additionally, when adding new PEs, they are marked
EEH_PE_RECOVERING if their parent PE is marked: this allows the
recovery thread to assume that all PEs underneath the one it's
processing will continue to exist during recovery.

Both the mutex and spinlock are held while any protected field is
changed or a PE is deleted, so holding either of them (elsewhere) will
keep them stable and safe to access. Additionally, if
EEH_PE_RECOVERING is set on a PE then the locks can be released and
re-acquired safely, as long as the protected fields aren't used while
no locks are held. This is used during recovery to release locks
for long sleeps (i.e. during eeh_wait_state() when we may sleep up to
5 minutes), or to maintain lock ordering.

The spinlock is used in error detection (which cannot use a mutex, see
above) and also where it's possible that the mutex is already held.
The mutex is used in areas that don't have that restriction, and where
blocking may be required. Care must be taken when ordering these locks
against the PCI rescan/remove lock and the device locks to avoid
deadlocking.
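
A minimal sketch of the resulting interface (assumed from the
description above; the patch body below is truncated in this archive):

    /* sketch: mutex for sleepable recovery paths; the existing
     * confirm_error_lock spinlock stays for detection paths */
    static DEFINE_MUTEX(eeh_dev_mutex);

    void eeh_recovery_lock(void)
    {
        mutex_lock(&eeh_dev_mutex);
    }

    void eeh_recovery_unlock(void)
    {
        mutex_unlock(&eeh_dev_mutex);
    }

    void eeh_recovery_must_be_locked(void)
    {
        WARN_ON_ONCE(!mutex_is_locked(&eeh_dev_mutex));
    }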

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h   |   6 +-
 arch/powerpc/kernel/eeh.c| 112 ++--
 arch/powerpc/kernel/eeh_driver.c | 288 ++-
 arch/powerpc/kernel/eeh_pe.c |  30 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 10 files changed, 365 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 514dd056c2c8..d0f09e691498 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state)
== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
+void eeh_recovery_lock(void);
+void eeh_recovery_unlock(void);
+void eeh_recovery_must_be_locked(void);
+
 typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
-int eeh_wait_state(struct eeh_pe *pe, int max_wait);
+int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ab316e155ea9..2c90c37524ed 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover;
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
-/* Lock to avoid races due to multiple reports of an error */
+/*
+ * confirm_error_lock and eeh_dev_mutex are used together to provide
+ * safety during EEH operations.
+ *
+ * Generally, the spinlock is used in error detection where it's not possible
+ * to use a mutex or where there is potential to deadlock with the mutex, and
+ * the mutex is used during recovery and other PCI related operations. One must
+ * be held when reading and both must be held when making changes to the
+ * protected fields: eeh_pe.parent, 

[RFC 0/3] Asynchronous EEH recovery

2023-06-12 Thread Ganesh Goudar
Hi,

EEH recovery is currently serialized, and these patches shorten
the time taken for EEH recovery by allowing recovery to run in
parallel. The original author of these patches is Sam Bobroff;
I have rebased and tested these patches.

On powervm with 64 VFs from the same PHB, I see approximately 48%
reduction in the time taken for EEH recovery.

On powernv with 9 network cards, where 2 cards are installed on one
PHB and 1 card on each of the remaining PHBs, providing 20 PFs in
total, I see approximately 33% reduction in the time taken for EEH
recovery.

These patches were originally posted as separate RFCs by Sam, and I
rebased and posted them almost a year ago. I stopped pursuing them
as I was not able to test on powernv due to issues in the drivers
of the cards I was testing on, which are now resolved. Since I am
re-posting this after a long time, I am posting this as a fresh
RFC. Please comment.

Thanks.

Ganesh Goudar (3):
  powerpc/eeh: Synchronization for safety
  powerpc/eeh: Provide a unique ID for each EEH recovery
  powerpc/eeh: Asynchronous recovery

 arch/powerpc/include/asm/eeh.h   |   7 +-
 arch/powerpc/include/asm/eeh_event.h |  10 +-
 arch/powerpc/include/asm/pci-bridge.h|   3 +
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c| 154 +++--
 arch/powerpc/kernel/eeh_driver.c | 580 +++
 arch/powerpc/kernel/eeh_event.c  |  71 ++-
 arch/powerpc/kernel/eeh_pe.c |  33 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 include/linux/mmzone.h   |   2 +-
 15 files changed, 687 insertions(+), 214 deletions(-)

-- 
2.40.1



[PATCH v9 01/42] mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

2023-06-12 Thread Rick Edgecombe
The x86 Shadow stack feature includes a new type of memory called shadow
stack. This shadow stack memory has some unusual properties, which
require some core mm changes to function properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite(). Future patches will make pte_mkwrite() take a VMA, so
that the x86 implementation of it can know whether to create regular
writable memory or shadow stack memory.

But there are a couple of challenges to this. Modifying the signatures of
each arch pte_mkwrite() implementation would be error prone because some
are generated with macros and would need to be re-implemented. Also, some
pte_mkwrite() callers operate on kernel memory without a VMA.

So this can be done in a three step process. First pte_mkwrite() can be
renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite()
added that just calls pte_mkwrite_novma(). Next callers without a VMA can
be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers
can be changed to take/pass a VMA.

Start the process by renaming pte_mkwrite() to pte_mkwrite_novma() and
adding the pte_mkwrite() wrapper in linux/pgtable.h. Apply the same
pattern for pmd_mkwrite(). Since not all archs have a pmd_mkwrite_novma(),
create a new arch config HAS_HUGE_PAGE that can be used to tell if
pmd_mkwrite() should be defined. Otherwise in the !HAS_HUGE_PAGE cases the
compiler would not be able to find pmd_mkwrite_novma().

No functional change.
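
The generic wrapper amounts to something like the following (a sketch
of the intent described above, not the exact hunk):

    /* sketch, in include/linux/pgtable.h: fall back to the novma
     * variant until callers are converted to pass a VMA */
    #ifndef pte_mkwrite
    static inline pte_t pte_mkwrite(pte_t pte)
    {
        return pte_mkwrite_novma(pte);
    }
    #endif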

Cc: linux-...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: linux-al...@vger.kernel.org
Cc: linux-snps-...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-c...@vger.kernel.org
Cc: linux-hexa...@vger.kernel.org
Cc: linux-i...@vger.kernel.org
Cc: loonga...@lists.linux.dev
Cc: linux-m...@lists.linux-m68k.org
Cc: Michal Simek 
Cc: Dinh Nguyen 
Cc: linux-m...@vger.kernel.org
Cc: openr...@lists.librecores.org
Cc: linux-par...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-ri...@lists.infradead.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linux...@lists.infradead.org
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Suggested-by: Linus Torvalds 
Signed-off-by: Rick Edgecombe 
Link: https://lore.kernel.org/lkml/CAHk-=wizjsu7c9sfyzb3q04108stghff2wfbokgccgw7riz...@mail.gmail.com/
---
Hi Non-x86 Arch’s,

x86 has a feature that allows for the creation of a special type of
writable memory (shadow stack) that is only writable in limited specific
ways. Previously, changes were proposed to core MM code to teach it to
decide when to create normally writable memory or the special shadow stack
writable memory, but David Hildenbrand suggested[0] to change
pXX_mkwrite() to take a VMA, so awareness of shadow stack memory can be
moved into x86 code. Later Linus suggested a less error-prone way[1] to go
about this after the first attempt had a bug.

Since pXX_mkwrite() is defined in every arch, it requires some tree-wide
changes. So that is why you are seeing some patches out of a big x86
series pop up in your arch mailing list. There is no functional change.
After this refactor, the shadow stack series goes on to use the arch
helpers to push arch memory details inside arch/x86 and other arch's
with upcoming shadow stack features.

Testing was just 0-day build testing.

Hopefully that is enough context. Thanks!

[0] https://lore.kernel.org/lkml/0e29a2d0-08d8-bcd6-ff26-4bea0e403...@redhat.com/
[1] https://lore.kernel.org/lkml/CAHk-=wizjsu7c9sfyzb3q04108stghff2wfbokgccgw7riz...@mail.gmail.com/
---
 Documentation/mm/arch_pgtable_helpers.rst|  6 ++
 arch/Kconfig |  3 +++
 arch/alpha/include/asm/pgtable.h |  2 +-
 arch/arc/include/asm/hugepage.h  |  2 +-
 arch/arc/include/asm/pgtable-bits-arcv2.h|  2 +-
 arch/arm/include/asm/pgtable-3level.h|  2 +-
 arch/arm/include/asm/pgtable.h   |  2 +-
 arch/arm64/include/asm/pgtable.h |  4 ++--
 arch/csky/include/asm/pgtable.h  |  2 +-
 arch/hexagon/include/asm/pgtable.h   |  2 +-
 arch/ia64/include/asm/pgtable.h  |  2 +-
 arch/loongarch/include/asm/pgtable.h |  4 ++--
 arch/m68k/include/asm/mcf_pgtable.h  |  2 +-
 arch/m68k/include/asm/motorola_pgtable.h |  2 +-
 arch/m68k/include/asm/sun3_pgtable.h |  2 +-
 arch/microblaze/include/asm/pgtable.h|  2 +-
 arch/mips/include/asm/pgtable.h  |  6 +++---
 arch/nios2/include/asm/pgtable.h |  2 +-
 arch/openrisc/include/asm/pgtable.h  |  2 +-
 arch/parisc/include/asm/pgtable.h|  2 +-
 arch/powerpc/include/asm/book3s/32/pgtable.h |  2 +-
 

RE: [PATCH v5 25/26] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-06-12 Thread Dan Williams
Terry Bowman wrote:
> From: Robert Richter 
> 
> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> RCiEP, but CXL downstream and upstream ports are not enumerated and
> not visible in the PCIe hierarchy. [1] Protocol and link errors from
> these non-enumerated ports are signaled as internal AER errors, either
> Uncorrectable Internal Error (UIE) or Corrected Internal Errors (CIE)
> via an RCEC.
> 
> Restricted CXL host (RCH) downstream port-detected errors have the
> Requester ID of the RCEC set in the RCEC's AER Error Source ID
> register. A CXL handler must then inspect the error status in various
> CXL registers residing in the dport's component register space (CXL
> RAS capability) or the dport's RCRB (PCIe AER extended
> capability). [2]
> 
> Errors showing up in the RCEC's error handler must be handled and
> connected to the CXL subsystem. Implement this by forwarding the error
> to all CXL devices below the RCEC. Since the entire CXL device is
> controlled only using PCIe Configuration Space of device 0, function
> 0, only pass it there [3]. The error handling is limited to currently
> supported devices with the Memory Device class code set (CXL Type 3
> Device, PCI_CLASS_MEMORY_CXL, 502h), handle downstream port errors in
> the device's cxl_pci driver. Support for other CXL Device Types
> (e.g. a CXL.cache Device) can be added later.
> 
> To handle downstream port errors in addition to errors directed to the
> CXL endpoint device, a handler must also inspect the CXL RAS and PCIe
> AER capabilities of the CXL downstream port the device is connected
> to.
> 
> Since CXL downstream port errors are signaled using internal errors,
> the handler requires those errors to be unmasked. This is subject of a
> follow-on patch.
> 
> The reason for choosing this implementation is that the AER service
> driver claims the RCEC device, but does not allow it to register a
> custom specific handler to support CXL. Connecting the RCEC hard-wired
> with a CXL handler does not work, as the CXL subsystem might not be
> present all the time. The alternative to add an implementation to the
> portdrv to allow the registration of a custom RCEC error handler isn't
> worth doing it as CXL would be its only user. Instead, just check for
> an CXL RCEC and pass it down to the connected CXL device's error
> handler. With this approach the code can entirely be implemented in
> the PCIe AER driver and is independent of the CXL subsystem. The CXL
> driver only provides the handler.
> 
> [1] CXL 3.0 spec: 9.11.8 CXL Devices Attached to an RCH
> [2] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> [3] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> 
> Co-developed-by: Terry Bowman 
> Signed-off-by: Terry Bowman 
> Signed-off-by: Robert Richter 
> Cc: "Oliver O'Halloran" 
> Cc: Bjorn Helgaas 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-...@vger.kernel.org
> Acked-by: Bjorn Helgaas 
> Reviewed-by: Jonathan Cameron 
> ---
>  drivers/pci/pcie/Kconfig | 12 +
>  drivers/pci/pcie/aer.c   | 96 +++-
>  2 files changed, 106 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 228652a59f27..4f0e70fafe2d 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -49,6 +49,18 @@ config PCIEAER_INJECT
> gotten from:
>
> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
>  
> +config PCIEAER_CXL
> + bool "PCI Express CXL RAS support for Restricted Hosts (RCH)"
> + default y
> + depends on PCIEAER && CXL_PCI
> + help
> +   Enables error handling of downstream ports of a CXL host
> +   that is operating in RCD mode (Restricted CXL Host, RCH).
> +   The downstream port reports AER errors to a given RCEC.
> +   Errors are handled by the CXL memory device driver.
> +
> +   If unsure, say Y.
> +
>  #
>  # PCI Express ECRC
>  #
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index d3344fcf1f79..c354ca5e8f2b 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -946,14 +946,100 @@ static bool find_source_device(struct pci_dev *parent,
>   return true;
>  }
>  
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static bool is_cxl_mem_dev(struct pci_dev *dev)
> +{
> + /*
> +  * The capability, status, and control fields in Device 0,
> +  * Function 0 DVSEC control the CXL functionality of the
> +  * entire device (CXL 3.0, 8.1.3).
> +  */
> + if (dev->devfn != PCI_DEVFN(0, 0))
> + return false;
> +
> + /*
> +  * CXL Memory Devices must have the 502h class code set (CXL
> +  * 3.0, 8.1.12.1).
> +  */
> + if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL)
> + return false;

I think this is ok for now, but I expect it will want to be something
like dev->is_cxl in the future where that is the cached result of:

dev->is_cxl = 

Re: [PATCH 00/13] mm: jit/text allocator

2023-06-12 Thread Mike Rapoport
On Fri, Jun 09, 2023 at 10:02:16AM -0700, Song Liu wrote:
> On Thu, Jun 8, 2023 at 11:41 AM Mike Rapoport  wrote:
> >
> > On Tue, Jun 06, 2023 at 11:21:59AM -0700, Song Liu wrote:
> > > On Mon, Jun 5, 2023 at 3:09 AM Mark Rutland  wrote:
> > >
> > > [...]
> > >
> > > > > > > Can you give more detail on what parameters you need? If the only
> > > > > > > extra parameter is just "does this allocation need to live close
> > > > > > > to kernel text", that's not that big of a deal.
> > > > > >
> > > > > > My thinking was that we at least need the start + end for each
> > > > > > caller. That might be it, tbh.
> > > > >
> > > > > Do you mean that modules will have something like
> > > > >
> > > > >   jit_text_alloc(size, MODULES_START, MODULES_END);
> > > > >
> > > > > and kprobes will have
> > > > >
> > > > >   jit_text_alloc(size, KPROBES_START, KPROBES_END);
> > > > > ?
> > > >
> > > > Yes.
> > >
> > > How about we start with two APIs:
> > >  jit_text_alloc(size);
> > >  jit_text_alloc_range(size, start, end);
> > >
> > > AFAICT, arm64 is the only arch that requires the latter API. And TBH, I am
> > > not quite convinced it is needed.
> >
> > Right now arm64 and riscv override bpf and kprobes allocations to use the
> > entire vmalloc address space, but having the ability to allocate generated
> > code outside of modules area may be useful for other architectures.
> >
> > Still the start + end for the callers feels backwards to me because the
> > callers do not define the ranges, but rather the architectures, so we still
> > need a way for architectures to define how they want allocate memory for
> > the generated code.
> 
> Yeah, this makes sense.
> 
> >
> > > > > It sill can be achieved with a single jit_alloc_arch_params(), just by
> > > > > adding enum jit_type parameter to jit_text_alloc().
> > > >
> > > > That feels backwards to me; it centralizes a bunch of information
> > > > about distinct users to be able to shove that into a static array,
> > > > when the callsites can pass that information.
> > >
> > > I think we have only two types of users: module and everything else
> > > (ftrace, kprobe, bpf stuff). The key differences are:
> > >
> > >   1. module uses text and data; while everything else only uses text.
> > >   2. module code is generated by the compiler, and thus has stronger
> > >   requirements in address ranges; everything else are generated via some
> > >   JIT or manual written assembly, so they are more flexible with address
> > >   ranges (in JIT, we can avoid using instructions that requires a specific
> > >   address range).
> > >
> > > The next question is, can we have the two types of users share the same
> > > address ranges? If not, we can reserve the preferred range for modules,
> > > and let everything else use the other range. I don't see reasons to
> > > further separate users in the "everything else" group.
> >
> > I agree that we can define only two types: modules and everything else and
> > let the architectures define if they need different ranges for these two
> > types, or want the same range for everything.
> >
> > With only two types we can have two API calls for alloc, and a single
> > structure that defines the ranges etc from the architecture side rather
> > than spread all over.
> >
> > Like something along these lines:
> >
> > struct execmem_range {
> > unsigned long   start;
> > unsigned long   end;
> > unsigned long   fallback_start;
> > unsigned long   fallback_end;
> > pgprot_tpgprot;
> > unsigned intalignment;
> > };
> >
> > struct execmem_modules_range {
> > enum execmem_module_flags flags;
> > struct execmem_range text;
> > struct execmem_range data;
> > };
> >
> > struct execmem_jit_range {
> > struct execmem_range text;
> > };
> >
> > struct execmem_params {
> > struct execmem_modules_rangemodules;
> > struct execmem_jit_rangejit;
> > };
> >
> > struct execmem_params *execmem_arch_params(void);
> >
> > void *execmem_text_alloc(size_t size);
> > void *execmem_data_alloc(size_t size);
> > void execmem_free(void *ptr);
> 
> With the jit variation, maybe we can just call these
> module_[text|data]_alloc()?

I was thinking about "execmem_*_alloc()" for allocations that must be
close to the kernel image, like modules, ftrace on x86 and s390, and
maybe something else in the future.

And jit_text_alloc() for allocations that can reside anywhere.
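
To make that concrete, an architecture might then supply something like
this (illustrative only -- it just follows the structs quoted above,
and the range values are placeholders):

    /* sketch of an arch-side definition using the proposed
     * structures; MODULES_VADDR/MODULES_END stand in for whatever
     * range the architecture actually wants */
    static struct execmem_params execmem_params = {
        .modules = {
            .text = {
                .start     = MODULES_VADDR,
                .end       = MODULES_END,
                .pgprot    = PAGE_KERNEL_EXEC,
                .alignment = 1,
            },
        },
    };

    struct execmem_params *execmem_arch_params(void)
    {
        return &execmem_params;
    }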

I tried to find a different name for 'struct execmem_modules_range' but
couldn't think of anything better than 'struct execmem_close_to_kernel', so
I've left modules in the name.
 
> btw: Depending on the implementation of the allocator, we 

[PATCH v4 33/34] um: Convert {pmd, pte}_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents. Also cleans up some spacing issues.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/um/include/asm/pgalloc.h | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 8ec7cd46dd96..de5e31c64793 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -25,19 +25,19 @@
  */
 extern pgd_t *pgd_alloc(struct mm_struct *);
 
-#define __pte_free_tlb(tlb,pte, address)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb),(pte));   \
+#define __pte_free_tlb(tlb, pte, address)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #ifdef CONFIG_3_LEVEL_PGTABLES
 
-#define __pmd_free_tlb(tlb, pmd, address)  \
-do {   \
-   pgtable_pmd_page_dtor(virt_to_page(pmd));   \
-   tlb_remove_page((tlb),virt_to_page(pmd));   \
-} while (0)\
+#define __pmd_free_tlb(tlb, pmd, address)  \
+do {   \
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));\
+   tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \
+} while (0)
 
 #endif
 
-- 
2.40.1



[PATCH v4 34/34] mm: Remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers

2023-06-12 Thread Vishal Moola (Oracle)
These functions are no longer necessary. Remove them and cleanup
Documentation referencing them.

Signed-off-by: Vishal Moola (Oracle) 
---
 Documentation/mm/split_page_table_lock.rst| 12 +--
 .../zh_CN/mm/split_page_table_lock.rst| 14 ++---
 include/linux/mm.h| 20 ---
 3 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
index 50ee0dfc95be..4bffec728340 100644
--- a/Documentation/mm/split_page_table_lock.rst
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -53,7 +53,7 @@ Support of split page table lock by an architecture
 ===
 
 There's no need in special enabling of PTE split page table lock: everything
-required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which
 must be called on PTE table allocation / freeing.
 
 Make sure the architecture doesn't use slab allocator for page table
@@ -63,8 +63,8 @@ This field shares storage with page->ptl.
 PMD split lock only makes sense if you have more than two page table
 levels.
 
-PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
-allocation and pgtable_pmd_page_dtor() on freeing.
+PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table
+allocation and pagetable_pmd_dtor() on freeing.
 
 Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
 pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
@@ -72,7 +72,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
 
 With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
 
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+NOTE: pagetable_pte_ctor() and pagetable_pmd_ctor() can fail -- it must
 be handled properly.
 
 page->ptl
@@ -92,7 +92,7 @@ trick:
split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
one more cache line for indirect access;
 
-The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
-pgtable_pmd_page_ctor() for PMD table.
+The spinlock_t allocated in pagetable_pte_ctor() for PTE table and in
+pagetable_pmd_ctor() for PMD table.
 
 Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
index 4fb7aa666037..a2c288670a24 100644
--- a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -56,16 +56,16 @@ Hugetlb特定的辅助函数:
 架构对分页表锁的支持
 
 
-没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。
+没有必要特别启用PTE分页表锁:所有需要的东西都由pagetable_pte_ctor()
+和pagetable_pte_dtor()完成,它们必须在PTE表分配/释放时被调用。
 
 确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页
 面。这个区域与page->ptl共享存储。
 
 PMD分页锁只有在你有两个以上的页表级别时才有意义。
 
-启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调
-用pgtable_pmd_page_dtor()。
+启用PMD分页锁需要在PMD表分配时调用pagetable_pmd_ctor(),在释放时调
+用pagetable_pmd_dtor()。
 
 分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb()
 中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先
@@ -73,7 +73,7 @@ PMD分页锁只有在你有两个以上的页表级别时才有意义。
 
 一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
 
-注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
+注意:pagetable_pte_ctor()和pagetable_pmd_ctor()可能失败--必
 须正确处理。
 
 page->ptl
@@ -90,7 +90,7 @@ page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struc
的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
情况下使用分页锁,但由于间接访问而多花了一个缓存行。
 
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
+PTE表的spinlock_t分配在pagetable_pte_ctor()中,PMD表的spinlock_t
+分配在pagetable_pmd_ctor()中。
 
 请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc211c43610b..6d83483cf186 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2897,11 +2897,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
return true;
 }
 
-static inline bool pgtable_pte_page_ctor(struct page *page)
-{
-   return pagetable_pte_ctor(page_ptdesc(page));
-}
-
 static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
 {
struct folio *folio = ptdesc_folio(ptdesc);
@@ -2911,11 +2906,6 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
lruvec_stat_sub_folio(folio, NR_PAGETABLE);
 }
 
-static inline void pgtable_pte_page_dtor(struct page *page)
-{
-   pagetable_pte_dtor(page_ptdesc(page));
-}
-
 #define pte_offset_map_lock(mm, pmd, address, ptlp)\
 ({ \
spinlock_t *__ptl = pte_lockptr(mm, pmd);   \
@@ -3006,11 +2996,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
return true;
 }
 

[PATCH v4 32/34] sparc: Convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable pte constructor/destructors with
ptdesc equivalents.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/sparc/mm/srmmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index 13f027afc875..8393faa3e596 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm)
return NULL;
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(&mm->page_table_lock);
-   if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) {
+   if (page_ref_inc_return(page) == 2 &&
+   !pagetable_pte_ctor(page_ptdesc(page))) {
page_ref_dec(page);
ptep = NULL;
}
@@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep)
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
	spin_lock(&mm->page_table_lock);
if (page_ref_dec_return(page) == 1)
-   pgtable_pte_page_dtor(page);
+   pagetable_pte_dtor(page_ptdesc(page));
spin_unlock(>page_table_lock);
 
srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE);
-- 
2.40.1



[PATCH v4 30/34] sh: Convert pte_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents. Also cleans up some spacing issues.
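
For reference, roughly what the converted macro body does, written out
as a function (illustrative only):

	static inline void pte_free_tlb_sketch(struct mmu_gather *tlb,
					       pgtable_t pte)
	{
		struct ptdesc *ptdesc = page_ptdesc(pte);

		/* tear down ptlock and page-table accounting first */
		pagetable_pte_dtor(ptdesc);
		/* then queue the backing page; freed after the TLB flush */
		tlb_remove_page_ptdesc(tlb, ptdesc);
	}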

Signed-off-by: Vishal Moola (Oracle) 
Reviewed-by: Geert Uytterhoeven 
Acked-by: John Paul Adrian Glaubitz 
---
 arch/sh/include/asm/pgalloc.h | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index a9e98233c4d4..5d8577ab1591 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,6 +2,7 @@
 #ifndef __ASM_SH_PGALLOC_H
 #define __ASM_SH_PGALLOC_H
 
+#include 
 #include 
 
 #define __HAVE_ARCH_PMD_ALLOC_ONE
@@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t 
*pmd,
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
 }
 
-#define __pte_free_tlb(tlb,pte,addr)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif /* __ASM_SH_PGALLOC_H */
-- 
2.40.1



[PATCH v4 31/34] sparc64: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.
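
For reference, the lookup helpers pair up as follows (a sketch, not part
of the patch): page_ptdesc()/ptdesc_page() convert between the two views
of the same memmap entry, while virt_to_ptdesc() is the ptdesc analogue
of virt_to_page() for a table's kernel virtual address.

	static void free_pte_table_sketch(pte_t *table)
	{
		struct ptdesc *ptdesc = virt_to_ptdesc(table);

		pagetable_pte_dtor(ptdesc);	/* drop ptlock and NR_PAGETABLE count */
		pagetable_free(ptdesc);		/* release the backing page */
	}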

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/sparc/mm/init_64.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 04f9db0c3111..105915cd2eee 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2893,14 +2893,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 
 pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-   struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-   if (!page)
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(page)) {
-   __free_page(page);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
-   return (pte_t *) page_address(page);
+   return ptdesc_address(ptdesc);
 }
 
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
@@ -2910,10 +2911,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 
 static void __pte_free(pgtable_t pte)
 {
-   struct page *page = virt_to_page(pte);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pte);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 void pte_free(struct mm_struct *mm, pgtable_t pte)
-- 
2.40.1



[PATCH v4 29/34] riscv: Convert alloc_{pmd, pte}_late() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
Acked-by: Palmer Dabbelt 
---
 arch/riscv/include/asm/pgalloc.h |  8 
 arch/riscv/mm/init.c | 16 ++--
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 59dc12b5b7e8..d169a4f41a2e 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-#define __pte_free_tlb(tlb, pte, buf)   \
-do {\
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, buf)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 #endif /* CONFIG_MMU */
 
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 3d689ffb2072..6bfeec80bf4e 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -354,12 +354,10 @@ static inline phys_addr_t __init 
alloc_pte_fixmap(uintptr_t va)
 
 static phys_addr_t __init alloc_pte_late(uintptr_t va)
 {
-   unsigned long vaddr;
-
-   vaddr = __get_free_page(GFP_KERNEL);
-   BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr)));
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 
-   return __pa(vaddr);
+   BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc));
+   return __pa((pte_t *)ptdesc_address(ptdesc));
 }
 
 static void __init create_pte_mapping(pte_t *ptep,
@@ -437,12 +435,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
 
 static phys_addr_t __init alloc_pmd_late(uintptr_t va)
 {
-   unsigned long vaddr;
-
-   vaddr = __get_free_page(GFP_KERNEL);
-   BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr)));
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 
-   return __pa(vaddr);
+   BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc));
+   return __pa((pmd_t *)ptdesc_address(ptdesc));
 }
 
 static void __init create_pmd_mapping(pmd_t *pmdp,
-- 
2.40.1



[PATCH v4 28/34] openrisc: Convert __pte_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/openrisc/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/openrisc/include/asm/pgalloc.h 
b/arch/openrisc/include/asm/pgalloc.h
index b7b2b8d16fad..c6a73772a546 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
-#define __pte_free_tlb(tlb, pte, addr) \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif
-- 
2.40.1



[PATCH v4 27/34] nios2: Convert __pte_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/nios2/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index ecd1657bb2ce..ce6bb8e74271 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t 
*pmd,
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
-#define __pte_free_tlb(tlb, pte, addr) \
-   do {\
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+   do {\
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
} while (0)
 
 #endif /* _ASM_NIOS2_PGALLOC_H */
-- 
2.40.1



[PATCH v4 26/34] mips: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.
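
One detail worth noting, sketched below under the assumption that
pagetable_alloc() hands back a compound page (as the helper introduced
earlier in this series does): the allocation order travels with the
page, so the free side needs no PGD_TABLE_ORDER bookkeeping of its own.

	static pgd_t *alloc_pgd_sketch(void)
	{
		struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, PGD_TABLE_ORDER);

		return ptdesc ? ptdesc_address(ptdesc) : NULL;
	}

	static void free_pgd_sketch(pgd_t *pgd)
	{
		/* the order is recovered from the compound page itself */
		pagetable_free(virt_to_ptdesc(pgd));
	}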

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/mips/include/asm/pgalloc.h | 31 +--
 arch/mips/mm/pgtable.c  |  7 ---
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index f72e737dda21..6940e5536664 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-   free_pages((unsigned long)pgd, PGD_TABLE_ORDER);
+   pagetable_free(virt_to_ptdesc(pgd));
 }
 
-#define __pte_free_tlb(tlb,pte,address)\
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, address)  \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -65,18 +65,18 @@ do {
\
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pmd_t *pmd;
-   struct page *pg;
+   struct ptdesc *ptdesc;
 
-   pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
-   if (!pg)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER);
+   if (!ptdesc)
return NULL;
 
-   if (!pgtable_pmd_page_ctor(pg)) {
-   __free_pages(pg, PMD_TABLE_ORDER);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pmd = (pmd_t *)page_address(pg);
+   pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
 }
@@ -90,10 +90,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long address)
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pud_t *pud;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, PUD_TABLE_ORDER);
 
-   pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER);
-   if (pud)
-   pud_init(pud);
+   if (!ptdesc)
+   return NULL;
+   pud = ptdesc_address(ptdesc);
+
+   pud_init(pud);
return pud;
 }
 
diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c
index b13314be5d0e..729258ff4e3b 100644
--- a/arch/mips/mm/pgtable.c
+++ b/arch/mips/mm/pgtable.c
@@ -10,10 +10,11 @@
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-   pgd_t *ret, *init;
+   pgd_t *init, *ret = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, PGD_TABLE_ORDER);
 
-   ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
-   if (ret) {
+   if (ptdesc) {
+   ret = ptdesc_address(ptdesc);
		init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
-- 
2.40.1



[PATCH v4 25/34] m68k: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.
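
Two zeroing styles appear below; a combined sketch (illustrative only):
request zeroed memory with __GFP_ZERO at allocation time, or clear the
table afterwards with pagetable_clear(), which wraps clear_page().

	static pte_t *pte_alloc_sketch(bool zero_up_front)
	{
		gfp_t gfp = GFP_DMA | (zero_up_front ? __GFP_ZERO : 0);
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			return NULL;
		if (!zero_up_front)
			pagetable_clear(ptdesc_address(ptdesc));
		return ptdesc_address(ptdesc);
	}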

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/m68k/include/asm/mcf_pgalloc.h  | 41 ++--
 arch/m68k/include/asm/sun3_pgalloc.h |  8 +++---
 arch/m68k/mm/motorola.c  |  4 +--
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/arch/m68k/include/asm/mcf_pgalloc.h 
b/arch/m68k/include/asm/mcf_pgalloc.h
index 5c2c0a864524..857949ac9431 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -7,20 +7,19 @@
 
 extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-   free_page((unsigned long) pte);
+   pagetable_free(virt_to_ptdesc(pte));
 }
 
 extern const char bad_pmd_string[];
 
 extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
-   unsigned long page = __get_free_page(GFP_DMA);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0);
 
-   if (!page)
+   if (!ptdesc)
return NULL;
 
-   memset((void *)page, 0, PAGE_SIZE);
-   return (pte_t *) (page);
+   return ptdesc_address(ptdesc);
 }
 
 extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address)
@@ -35,36 +34,36 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned 
long address)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
  unsigned long address)
 {
-   struct page *page = virt_to_page(pgtable);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
-   struct page *page = alloc_pages(GFP_DMA, 0);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA, 0);
pte_t *pte;
 
-   if (!page)
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(page)) {
-   __free_page(page);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pte = page_address(page);
-   clear_page(pte);
+   pte = ptdesc_address(ptdesc);
+   pagetable_clear(pte);
 
return pte;
 }
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
 {
-   struct page *page = virt_to_page(pgtable);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 /*
@@ -75,16 +74,18 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t 
pgtable)
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-   free_page((unsigned long) pgd);
+   pagetable_free(virt_to_ptdesc(pgd));
 }
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
pgd_t *new_pgd;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_NOWARN, 0);
 
-   new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN);
-   if (!new_pgd)
+   if (!ptdesc)
return NULL;
+   new_pgd = ptdesc_address(ptdesc);
+
memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT);
return new_pgd;
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h 
b/arch/m68k/include/asm/sun3_pgalloc.h
index 198036aff519..ff48573db2c0 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -17,10 +17,10 @@
 
 extern const char bad_pmd_string[];
 
-#define __pte_free_tlb(tlb,pte,addr)   \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t 
*pte)
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index c75984e2d86b..594575a0780c 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -161,7 +161,7 @@ void *get_pointer_table(int type)
 * m68k doesn't have SPLIT_PTE_PTLOCKS for not having
 * SMP.
 */
-   pgtable_pte_page_ctor(virt_to_page(page));
+   pagetable_pte_ctor(virt_to_ptdesc(page));

[PATCH v4 24/34] loongarch: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/loongarch/include/asm/pgalloc.h | 27 +++
 arch/loongarch/mm/pgtable.c  |  7 ---
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/arch/loongarch/include/asm/pgalloc.h 
b/arch/loongarch/include/asm/pgalloc.h
index af1d1e4a6965..70bb3bdd201e 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -45,9 +45,9 @@ extern void pagetable_init(void);
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 
 #define __pte_free_tlb(tlb, pte, address)  \
-do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page((tlb), pte);\
+do {   \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
 } while (0)
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -55,18 +55,18 @@ do {
\
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pmd_t *pmd;
-   struct page *pg;
+   struct ptdesc *ptdesc;
 
-   pg = alloc_page(GFP_KERNEL_ACCOUNT);
-   if (!pg)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0);
+   if (!ptdesc)
return NULL;
 
-   if (!pgtable_pmd_page_ctor(pg)) {
-   __free_page(pg);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   pmd = (pmd_t *)page_address(pg);
+   pmd = ptdesc_address(ptdesc);
pmd_init(pmd);
return pmd;
 }
@@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long address)
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
pud_t *pud;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 
-   pud = (pud_t *) __get_free_page(GFP_KERNEL);
-   if (pud)
-   pud_init(pud);
+   if (!ptdesc)
+   return NULL;
+   pud = ptdesc_address(ptdesc);
+
+   pud_init(pud);
return pud;
 }
 
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index 36a6dc0148ae..cdba10ffc0df 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -11,10 +11,11 @@
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-   pgd_t *ret, *init;
+   pgd_t *init, *ret = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 
-   ret = (pgd_t *) __get_free_page(GFP_KERNEL);
-   if (ret) {
+   if (ptdesc) {
+   ret = (pgd_t *)ptdesc_address(ptdesc);
		init = pgd_offset(&init_mm, 0UL);
pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
-- 
2.40.1



[PATCH v4 23/34] hexagon: Convert __pte_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/hexagon/include/asm/pgalloc.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/hexagon/include/asm/pgalloc.h 
b/arch/hexagon/include/asm/pgalloc.h
index f0c47e6a7427..55988625e6fb 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct 
*mm, pmd_t *pmd,
max_kernel_seg = pmdindex;
 }
 
-#define __pte_free_tlb(tlb, pte, addr) \
-do {   \
-   pgtable_pte_page_dtor((pte));   \
-   tlb_remove_page((tlb), (pte));  \
+#define __pte_free_tlb(tlb, pte, addr) \
+do {   \
+   pagetable_pte_dtor((page_ptdesc(pte))); \
+   tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte)));  \
 } while (0)
 
 #endif
-- 
2.40.1



[PATCH v4 22/34] csky: Convert __pte_free_tlb() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
Part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents.

Signed-off-by: Vishal Moola (Oracle) 
Acked-by: Guo Ren 
---
 arch/csky/include/asm/pgalloc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 7d57e5da0914..9c84c9012e53 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 #define __pte_free_tlb(tlb, pte, address)  \
 do {   \
-   pgtable_pte_page_dtor(pte); \
-   tlb_remove_page(tlb, pte);  \
+   pagetable_pte_dtor(page_ptdesc(pte));   \
+   tlb_remove_page_ptdesc(tlb, page_ptdesc(pte));  \
 } while (0)
 
 extern void pagetable_init(void);
-- 
2.40.1



[PATCH v4 21/34] arm64: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.
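
The tlb_remove_table() calls become tlb_remove_ptdesc(); for reference,
the shape of the result at a level with no constructor state (a sketch
mirroring the pud case below):

	static inline void pud_free_tlb_sketch(struct mmu_gather *tlb,
					       pud_t *pudp)
	{
		/* no dtor here: pud tables carry no ptlock or accounting */
		tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp));
	}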

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/arm64/include/asm/tlb.h | 14 --
 arch/arm64/mm/mmu.c  |  7 ---
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..2c29239d05c3 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
  unsigned long addr)
 {
-   pgtable_pte_page_dtor(pte);
-   tlb_remove_table(tlb, pte);
+   struct ptdesc *ptdesc = page_ptdesc(pte);
+
+   pagetable_pte_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
  unsigned long addr)
 {
-   struct page *page = virt_to_page(pmdp);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
 
-   pgtable_pmd_page_dtor(page);
-   tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 #endif
 
@@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, 
pmd_t *pmdp,
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
  unsigned long addr)
 {
-   tlb_remove_table(tlb, virt_to_page(pudp));
+   tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp));
 }
 #endif
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index af6bc8403ee4..5867a0e917b9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
 static phys_addr_t pgd_pgtable_alloc(int shift)
 {
phys_addr_t pa = __pgd_pgtable_alloc(shift);
+   struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
 
/*
 * Call proper page table ctor in case later we need to
@@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
 * this pre-allocated page table.
 *
 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
-* folded, and if so pgtable_pmd_page_ctor() becomes nop.
+* folded, and if so pagetable_pmd_ctor() becomes nop.
 */
if (shift == PAGE_SHIFT)
-   BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
+   BUG_ON(!pagetable_pte_ctor(ptdesc));
else if (shift == PMD_SHIFT)
-   BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
+   BUG_ON(!pagetable_pmd_ctor(ptdesc));
 
return pa;
 }
-- 
2.40.1



[PATCH v4 20/34] arm: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

late_alloc() also uses the __get_free_pages() helper function. Convert
this to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/arm/include/asm/tlb.h | 12 +++-
 arch/arm/mm/mmu.c  |  6 +++---
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..f40d06ad5d2a 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table)
 static inline void
 __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
 {
-   pgtable_pte_page_dtor(pte);
+   struct ptdesc *ptdesc = page_ptdesc(pte);
+
+   pagetable_pte_dtor(ptdesc);
 
 #ifndef CONFIG_ARM_LPAE
/*
@@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, 
unsigned long addr)
__tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE);
 #endif
 
-   tlb_remove_table(tlb, pte);
+   tlb_remove_ptdesc(tlb, ptdesc);
 }
 
 static inline void
 __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
 {
 #ifdef CONFIG_ARM_LPAE
-   struct page *page = virt_to_page(pmdp);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmdp);
 
-   pgtable_pmd_page_dtor(page);
-   tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   tlb_remove_ptdesc(tlb, ptdesc);
 #endif
 }
 
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 22292cf3381c..294518fd0240 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -737,11 +737,11 @@ static void __init *early_alloc(unsigned long sz)
 
 static void *__init late_alloc(unsigned long sz)
 {
-   void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz));
+   void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, get_order(sz));
 
-   if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr)))
+   if (!ptdesc || !pagetable_pte_ctor(ptdesc))
BUG();
-   return ptr;
+   return ptdesc_address(ptdesc);
 }
 
 static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr,
-- 
2.40.1



[PATCH v4 18/34] mm: Remove page table members from struct page

2023-06-12 Thread Vishal Moola (Oracle)
The page table members are now split out into their own ptdesc struct.
Remove them from struct page.
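
The TABLE_MATCH assertions that remain below keep the overlay honest;
roughly, the macro is an offset check of the form (sketch):

	#define TABLE_MATCH(pg, pt)					\
		static_assert(offsetof(struct page, pg) ==		\
			      offsetof(struct ptdesc, pt))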

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm_types.h | 14 --
 include/linux/pgtable.h  |  3 ---
 2 files changed, 17 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6161fe1ae5b8..31ffa1be21d0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -141,20 +141,6 @@ struct page {
struct {/* Tail pages of compound page */
unsigned long compound_head;/* Bit zero is set */
};
-   struct {/* Page table pages */
-   unsigned long _pt_pad_1;/* compound_head */
-   pgtable_t pmd_huge_pte; /* protected by page->ptl */
-   unsigned long _pt_s390_gaddr;   /* mapping */
-   union {
-   struct mm_struct *pt_mm; /* x86 pgds only */
-   atomic_t pt_frag_refcount; /* powerpc */
-   };
-#if ALLOC_SPLIT_PTLOCKS
-   spinlock_t *ptl;
-#else
-   spinlock_t ptl;
-#endif
-   };
struct {/* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c405f74d3875..33cc19d752b3 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1019,10 +1019,7 @@ struct ptdesc {
 TABLE_MATCH(flags, __page_flags);
 TABLE_MATCH(compound_head, pt_list);
 TABLE_MATCH(compound_head, _pt_pad_1);
-TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
 TABLE_MATCH(mapping, _pt_s390_gaddr);
-TABLE_MATCH(pt_mm, pt_mm);
-TABLE_MATCH(ptl, ptl);
 #undef TABLE_MATCH
 static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
 
-- 
2.40.1



[PATCH v4 19/34] pgalloc: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.
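
As a usage note: an architecture with special allocation needs keeps
calling __pte_alloc_one() with its own flags, now backed by
pagetable_alloc(). A sketch (the flag combination is illustrative,
modeled on the highmem user-PTE case):

	static pgtable_t arch_pte_alloc_one_sketch(struct mm_struct *mm)
	{
		return __pte_alloc_one(mm, GFP_PGTABLE_USER | __GFP_HIGHMEM);
	}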

Signed-off-by: Vishal Moola (Oracle) 
---
 include/asm-generic/pgalloc.h | 62 +--
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index a7cf825befae..3fd6ce79e654 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -18,7 +18,11 @@
  */
 static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
 {
-   return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, 0);
+
+   if (!ptdesc)
+   return NULL;
+   return ptdesc_address(ptdesc);
 }
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
@@ -41,7 +45,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct 
*mm)
  */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-   free_page((unsigned long)pte);
+   pagetable_free(virt_to_ptdesc(pte));
 }
 
 /**
@@ -49,7 +53,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, 
pte_t *pte)
  * @mm: the mm_struct of the current context
  * @gfp: GFP flags to use for the allocation
  *
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocates a ptdesc and runs the pagetable_pte_ctor().
  *
  * This function is intended for architectures that need
  * anything beyond simple page allocation or must have custom GFP flags.
@@ -58,17 +62,17 @@ static inline void pte_free_kernel(struct mm_struct *mm, 
pte_t *pte)
  */
 static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
 {
-   struct page *pte;
+   struct ptdesc *ptdesc;
 
-   pte = alloc_page(gfp);
-   if (!pte)
+   ptdesc = pagetable_alloc(gfp, 0);
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pte_page_ctor(pte)) {
-   __free_page(pte);
+   if (!pagetable_pte_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   return pte;
+   return ptdesc_page(ptdesc);
 }
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE
@@ -76,7 +80,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, 
gfp_t gfp)
  * pte_alloc_one - allocate a page for PTE-level user page table
  * @mm: the mm_struct of the current context
  *
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocates a ptdesc and runs the pagetable_pte_ctor().
  *
  * Return: `struct page` initialized as page table or %NULL on error
  */
@@ -98,8 +102,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
  */
 static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
 {
-   pgtable_pte_page_dtor(pte_page);
-   __free_page(pte_page);
+   struct ptdesc *ptdesc = page_ptdesc(pte_page);
+
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 
 
@@ -110,7 +116,7 @@ static inline void pte_free(struct mm_struct *mm, struct 
page *pte_page)
  * pmd_alloc_one - allocate a page for PMD-level page table
  * @mm: the mm_struct of the current context
  *
- * Allocates a page and runs the pgtable_pmd_page_ctor().
+ * Allocates a ptdesc and runs the pagetable_pmd_ctor().
  * Allocations use %GFP_PGTABLE_USER in user context and
  * %GFP_PGTABLE_KERNEL in kernel context.
  *
@@ -118,28 +124,30 @@ static inline void pte_free(struct mm_struct *mm, struct 
page *pte_page)
  */
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-   struct page *page;
+   struct ptdesc *ptdesc;
gfp_t gfp = GFP_PGTABLE_USER;
 
	if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
-   page = alloc_page(gfp);
-   if (!page)
+   ptdesc = pagetable_alloc(gfp, 0);
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pmd_page_ctor(page)) {
-   __free_page(page);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
-   return (pmd_t *)page_address(page);
+   return ptdesc_address(ptdesc);
 }
 #endif
 
 #ifndef __HAVE_ARCH_PMD_FREE
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
-   free_page((unsigned long)pmd);
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
 }
 #endif
 
@@ -149,11 +157,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t 
*pmd)
 
 static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-   gfp_t gfp = GFP_PGTABLE_USER;
+   gfp_t gfp = GFP_PGTABLE_USER 

[PATCH v4 17/34] s390: Convert various pgalloc functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
As part of the conversions to replace pgtable constructor/destructors with
ptdesc equivalents, convert various page table functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.
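
One structural point: the table lists previously threaded through
page->lru now use the descriptor's own list node. A sketch of the idiom
(assuming s390's mm->context fields, as used below):

	static void stash_fragment_sketch(struct mm_struct *mm,
					  struct ptdesc *ptdesc)
	{
		spin_lock_bh(&mm->context.lock);
		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}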

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/s390/include/asm/pgalloc.h |   4 +-
 arch/s390/include/asm/tlb.h |   4 +-
 arch/s390/mm/pgalloc.c  | 108 
 3 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 17eb618f1348..00ad9b88fda9 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, 
unsigned long vmaddr)
if (!table)
return NULL;
crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
-   if (!pgtable_pmd_page_ctor(virt_to_page(table))) {
+   if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) {
crst_table_free(mm, table);
return NULL;
}
@@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
if (mm_pmd_folded(mm))
return;
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
crst_table_free(mm, (unsigned long *) pmd);
 }
 
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index b91f4a9b044c..383b1f91442c 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, 
pmd_t *pmd,
 {
if (mm_pmd_folded(tlb->mm))
return;
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
__tlb_adjust_range(tlb, address, PAGE_SIZE);
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_puds = 1;
-   tlb_remove_table(tlb, pmd);
+   tlb_remove_ptdesc(tlb, pmd);
 }
 
 /*
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 6b99932abc66..eeb7c95b98cf 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl);
 
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
-   struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+   struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
 
-   if (!page)
+   if (!ptdesc)
return NULL;
-   arch_set_page_dat(page, CRST_ALLOC_ORDER);
-   return (unsigned long *) page_to_virt(page);
+   arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
+   return (unsigned long *) ptdesc_to_virt(ptdesc);
 }
 
 void crst_table_free(struct mm_struct *mm, unsigned long *table)
 {
-   free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+   pagetable_free(virt_to_ptdesc(table));
 }
 
 static void __crst_table_upgrade(void *arg)
@@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, 
unsigned int bits)
 
 struct page *page_table_alloc_pgste(struct mm_struct *mm)
 {
-   struct page *page;
+   struct ptdesc *ptdesc;
u64 *table;
 
-   page = alloc_page(GFP_KERNEL);
-   if (page) {
-   table = (u64 *)page_to_virt(page);
+   ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+   if (ptdesc) {
+   table = (u64 *)ptdesc_to_virt(ptdesc);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
-   return page;
+   return ptdesc_page(ptdesc);
 }
 
 void page_table_free_pgste(struct page *page)
 {
-   __free_page(page);
+   pagetable_free(page_ptdesc(page));
 }
 
 #endif /* CONFIG_PGSTE */
@@ -230,7 +230,7 @@ void page_table_free_pgste(struct page *page)
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
unsigned long *table;
-   struct page *page;
+   struct ptdesc *ptdesc;
unsigned int mask, bit;
 
/* Try to get a fragment of a 4K page as a 2K page table */
@@ -238,9 +238,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = NULL;
	spin_lock_bh(&mm->context.lock);
	if (!list_empty(&mm->context.pgtable_list)) {
-   page = list_first_entry(&mm->context.pgtable_list,
-   struct page, lru);
-   mask = atomic_read(&page->pt_frag_refcount);
+   ptdesc = list_first_entry(&mm->context.pgtable_list,
+   struct ptdesc, pt_list);
+   mask = atomic_read(&ptdesc->pt_frag_refcount);
/*
 * The pending removal bits must also be checked.
 * Failure to do so might 

[PATCH v4 16/34] s390: Convert various gmap functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
In order to split struct ptdesc from struct page, convert various
functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/s390/mm/gmap.c | 230 
 1 file changed, 128 insertions(+), 102 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 81c683426b49..010e87df7299 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -34,7 +34,7 @@
 static struct gmap *gmap_alloc(unsigned long limit)
 {
struct gmap *gmap;
-   struct page *page;
+   struct ptdesc *ptdesc;
unsigned long *table;
unsigned long etype, atype;
 
@@ -67,12 +67,12 @@ static struct gmap *gmap_alloc(unsigned long limit)
	spin_lock_init(&gmap->guest_table_lock);
	spin_lock_init(&gmap->shadow_lock);
	refcount_set(&gmap->ref_count, 1);
-   page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
-   if (!page)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+   if (!ptdesc)
goto out_free;
-   page->_pt_s390_gaddr = 0;
-   list_add(&page->lru, &gmap->crst_list);
-   table = page_to_virt(page);
+   ptdesc->_pt_s390_gaddr = 0;
+   list_add(&ptdesc->pt_list, &gmap->crst_list);
+   table = ptdesc_to_virt(ptdesc);
crst_table_init(table, etype);
gmap->table = table;
gmap->asce = atype | _ASCE_TABLE_LENGTH |
@@ -181,25 +181,25 @@ static void gmap_rmap_radix_tree_free(struct 
radix_tree_root *root)
  */
 static void gmap_free(struct gmap *gmap)
 {
-   struct page *page, *next;
+   struct ptdesc *ptdesc, *next;
 
/* Flush tlb of all gmaps (if not already done for shadows) */
if (!(gmap_is_shadow(gmap) && gmap->removed))
gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
-   list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
-   page->_pt_s390_gaddr = 0;
-   __free_pages(page, CRST_ALLOC_ORDER);
+   list_for_each_entry_safe(ptdesc, next, &gmap->crst_list, pt_list) {
+   ptdesc->_pt_s390_gaddr = 0;
+   pagetable_free(ptdesc);
	}
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
 
/* Free additional data for a shadow gmap */
if (gmap_is_shadow(gmap)) {
-   /* Free all page tables. */
-   list_for_each_entry_safe(page, next, &gmap->pt_list, lru) {
-   page->_pt_s390_gaddr = 0;
-   page_table_free_pgste(page);
+   /* Free all ptdesc tables. */
+   list_for_each_entry_safe(ptdesc, next, &gmap->pt_list, pt_list) {
+   ptdesc->_pt_s390_gaddr = 0;
+   page_table_free_pgste(ptdesc_page(ptdesc));
	}
	gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
/* Release reference to the parent */
@@ -308,27 +308,27 @@ EXPORT_SYMBOL_GPL(gmap_get_enabled);
 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long init, unsigned long gaddr)
 {
-   struct page *page;
+   struct ptdesc *ptdesc;
unsigned long *new;
 
/* since we dont free the gmap table until gmap_free we can unlock */
-   page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
-   if (!page)
+   ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+   if (!ptdesc)
return -ENOMEM;
-   new = page_to_virt(page);
+   new = ptdesc_to_virt(ptdesc);
crst_table_init(new, init);
	spin_lock(&gmap->guest_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
-   list_add(&page->lru, &gmap->crst_list);
+   list_add(&ptdesc->pt_list, &gmap->crst_list);
	*table = __pa(new) | _REGION_ENTRY_LENGTH |
	(*table & _REGION_ENTRY_TYPE_MASK);
-   page->_pt_s390_gaddr = gaddr;
-   page = NULL;
+   ptdesc->_pt_s390_gaddr = gaddr;
+   ptdesc = NULL;
	}
	spin_unlock(&gmap->guest_table_lock);
-   if (page) {
-   page->_pt_s390_gaddr = 0;
-   __free_pages(page, CRST_ALLOC_ORDER);
+   if (ptdesc) {
+   ptdesc->_pt_s390_gaddr = 0;
+   pagetable_free(ptdesc);
}
return 0;
 }
@@ -341,15 +341,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned 
long *table,
  */
 static unsigned long __gmap_segment_gaddr(unsigned long *entry)
 {
-   struct page *page;
+   struct ptdesc *ptdesc;
unsigned long offset, mask;
 
offset = (unsigned long) entry / sizeof(unsigned long);
offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
-   page = virt_to_page((void *)((unsigned long) 

[PATCH v4 15/34] x86: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
In order to split struct ptdesc from struct page, convert various
functions to use ptdescs.

Some of the functions use the *get*page*() helper functions. Convert
these to use pagetable_alloc() and ptdesc_address() instead to help
standardize page tables further.
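
The x86-private back-pointer from a pgd page to its mm moves into
ptdesc->pt_mm; sketched accessors (illustrative names):

	static void pgd_set_mm_sketch(pgd_t *pgd, struct mm_struct *mm)
	{
		virt_to_ptdesc(pgd)->pt_mm = mm;
	}

	static struct mm_struct *pgd_get_mm_sketch(pgd_t *pgd)
	{
		return virt_to_ptdesc(pgd)->pt_mm;
	}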

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/x86/mm/pgtable.c | 46 +--
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 15a8009a4480..6da7fd5d4782 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -52,7 +52,7 @@ early_param("userpte", setup_userpte);
 
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
-   pgtable_pte_page_dtor(pte);
+   pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
paravirt_tlb_remove_table(tlb, pte);
 }
@@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 #if CONFIG_PGTABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-   struct page *page = virt_to_page(pmd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
 * NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
 #endif
-   pgtable_pmd_page_dtor(page);
-   paravirt_tlb_remove_table(tlb, page);
+   pagetable_pmd_dtor(ptdesc);
+   paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
@@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 
 static inline void pgd_list_add(pgd_t *pgd)
 {
-   struct page *page = virt_to_page(pgd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
 
-   list_add(&page->lru, &pgd_list);
+   list_add(&ptdesc->pt_list, &pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-   struct page *page = virt_to_page(pgd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
 
-   list_del(&page->lru);
+   list_del(&ptdesc->pt_list);
 }
 
 #define UNSHARED_PTRS_PER_PGD  \
@@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd)
 
 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 {
-   virt_to_page(pgd)->pt_mm = mm;
+   virt_to_ptdesc(pgd)->pt_mm = mm;
 }
 
 struct mm_struct *pgd_page_get_mm(struct page *page)
 {
-   return page->pt_mm;
+   return page_ptdesc(page)->pt_mm;
 }
 
 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
@@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, 
pmd_t *pmd)
 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
 {
int i;
+   struct ptdesc *ptdesc;
 
for (i = 0; i < count; i++)
if (pmds[i]) {
-   pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
-   free_page((unsigned long)pmds[i]);
+   ptdesc = virt_to_ptdesc(pmds[i]);
+
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
mm_dec_nr_pmds(mm);
}
 }
@@ -232,16 +235,21 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t 
*pmds[], int count)
gfp &= ~__GFP_ACCOUNT;
 
for (i = 0; i < count; i++) {
-   pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
-   if (!pmd)
+   pmd_t *pmd = NULL;
+   struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
+
+   if (!ptdesc)
failed = true;
-   if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
-   free_page((unsigned long)pmd);
-   pmd = NULL;
+   if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
+   ptdesc = NULL;
failed = true;
}
-   if (pmd)
+   if (ptdesc) {
mm_inc_nr_pmds(mm);
+   pmd = ptdesc_address(ptdesc);
+   }
+
pmds[i] = pmd;
}
 
@@ -830,7 +838,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 
free_page((unsigned long)pmd_sv);
 
-   pgtable_pmd_page_dtor(virt_to_page(pmd));
+   pagetable_pmd_dtor(virt_to_ptdesc(pmd));
free_page((unsigned long)pmd);
 
return 1;
-- 
2.40.1



[PATCH v4 14/34] powerpc: Convert various functions to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
In order to split struct ptdesc from struct page, convert various
functions to use ptdescs.
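
The mechanical part aside, the fragment scheme deserves a note: one page
backs several sub-page tables, and ptdesc->pt_frag_refcount counts the
live fragments. A sketch of the put side (illustrative; it mirrors
pmd_fragment_free() below):

	static void put_pmd_fragment_sketch(pmd_t *frag)
	{
		struct ptdesc *ptdesc = virt_to_ptdesc(frag);

		BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
		if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
			/* last fragment gone: tear down and free the page */
			pagetable_pmd_dtor(ptdesc);
			pagetable_free(ptdesc);
		}
	}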

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/powerpc/mm/book3s64/mmu_context.c | 10 +++---
 arch/powerpc/mm/book3s64/pgtable.c | 32 +-
 arch/powerpc/mm/pgtable-frag.c | 46 +-
 3 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/mmu_context.c 
b/arch/powerpc/mm/book3s64/mmu_context.c
index c766e4c26e42..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx)
 static void pmd_frag_destroy(void *pmd_frag)
 {
int count;
-   struct page *page;
+   struct ptdesc *ptdesc;
 
-   page = virt_to_page(pmd_frag);
+   ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
-   if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
-   pgtable_pmd_page_dtor(page);
-   __free_page(page);
+   if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 85c84e89e3ea..1212deeabe15 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -306,22 +306,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
 static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
 {
void *ret = NULL;
-   struct page *page;
+   struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
 
	if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
-   page = alloc_page(gfp);
-   if (!page)
+   ptdesc = pagetable_alloc(gfp, 0);
+   if (!ptdesc)
return NULL;
-   if (!pgtable_pmd_page_ctor(page)) {
-   __free_pages(page, 0);
+   if (!pagetable_pmd_ctor(ptdesc)) {
+   pagetable_free(ptdesc);
return NULL;
}
 
-   atomic_set(&page->pt_frag_refcount, 1);
+   atomic_set(&ptdesc->pt_frag_refcount, 1);
 
-   ret = page_address(page);
+   ret = ptdesc_address(ptdesc);
/*
 * if we support only one fragment just return the
 * allocated page.
@@ -331,12 +331,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
 
	spin_lock(&mm->page_table_lock);
	/*
-	 * If we find pgtable_page set, we return
+	 * If we find ptdesc_page set, we return
	 * the allocated page with single fragment
	 * count.
	 */
	if (likely(!mm->context.pmd_frag)) {
-   atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+   atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
		mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);
@@ -357,15 +357,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned 
long vmaddr)
 
 void pmd_fragment_free(unsigned long *pmd)
 {
-   struct page *page = virt_to_page(pmd);
+   struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
 
-   if (PageReserved(page))
-   return free_reserved_page(page);
+   if (pagetable_is_reserved(ptdesc))
+   return free_reserved_ptdesc(ptdesc);
 
-   BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
-   if (atomic_dec_and_test(&page->pt_frag_refcount)) {
-   pgtable_pmd_page_dtor(page);
-   __free_page(page);
+   BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+   if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+   pagetable_pmd_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e..8961f1540209 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -18,15 +18,15 @@
 void pte_frag_destroy(void *pte_frag)
 {
int count;
-   struct page *page;
+   struct ptdesc *ptdesc;
 
-   page = virt_to_page(pte_frag);
+   ptdesc = virt_to_ptdesc(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
-   if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+   if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+   pagetable_pte_dtor(ptdesc);
+   pagetable_free(ptdesc);
}
 }
 
@@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
 static pte_t *__alloc_for_ptecache(struct 

[PATCH v4 13/34] mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}

2023-06-12 Thread Vishal Moola (Oracle)
Creates pagetable_pte_ctor(), pagetable_pmd_ctor(), pagetable_pte_dtor(),
and pagetable_pmd_dtor() and make the original pgtable
constructor/destructors wrappers.
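
The caller-visible effect (sketch): both spellings work during the
transition, the old page-based name simply forwarding to the new one.

	static bool ctor_via_old_name_sketch(struct page *page)
	{
		/* equivalent to pagetable_pte_ctor(page_ptdesc(page)) */
		return pgtable_pte_page_ctor(page);
	}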

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 56 ++
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a1af7983e1bd..dc211c43610b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2886,20 +2886,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { 
return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
 {
-   if (!ptlock_init(page_ptdesc(page)))
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   if (!ptlock_init(ptdesc))
return false;
-   __SetPageTable(page);
-   inc_lruvec_page_state(page, NR_PAGETABLE);
+   __folio_set_table(folio);
+   lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
 }
 
+static inline bool pgtable_pte_page_ctor(struct page *page)
+{
+   return pagetable_pte_ctor(page_ptdesc(page));
+}
+
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+{
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   ptlock_free(ptdesc);
+   __folio_clear_table(folio);
+   lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 static inline void pgtable_pte_page_dtor(struct page *page)
 {
-   ptlock_free(page_ptdesc(page));
-   __ClearPageTable(page);
-   dec_lruvec_page_state(page, NR_PAGETABLE);
+   pagetable_pte_dtor(page_ptdesc(page));
 }
 
 #define pte_offset_map_lock(mm, pmd, address, ptlp)\
@@ -2981,20 +2995,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct 
*mm, pmd_t *pmd)
return ptl;
 }
 
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
 {
-   if (!pmd_ptlock_init(page_ptdesc(page)))
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   if (!pmd_ptlock_init(ptdesc))
return false;
-   __SetPageTable(page);
-   inc_lruvec_page_state(page, NR_PAGETABLE);
+   __folio_set_table(folio);
+   lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
 }
 
+static inline bool pgtable_pmd_page_ctor(struct page *page)
+{
+   return pagetable_pmd_ctor(page_ptdesc(page));
+}
+
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
+{
+   struct folio *folio = ptdesc_folio(ptdesc);
+
+   pmd_ptlock_free(ptdesc);
+   __folio_clear_table(folio);
+   lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
 static inline void pgtable_pmd_page_dtor(struct page *page)
 {
-   pmd_ptlock_free(page_ptdesc(page));
-   __ClearPageTable(page);
-   dec_lruvec_page_state(page, NR_PAGETABLE);
+   pagetable_pmd_dtor(page_ptdesc(page));
 }
 
 /*
-- 
2.40.1



[PATCH v4 12/34] mm: Convert ptlock_free() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.
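
For context, the two ptlock flavours that the ALLOC_SPLIT_PTLOCKS
branches in this patch sit under (a sketch of the descriptor field,
matching the union removed from struct page elsewhere in the series):

	#if ALLOC_SPLIT_PTLOCKS
		spinlock_t *ptl;	/* from page_ptl_cachep; freed here */
	#else
		spinlock_t ptl;		/* embedded; ptlock_free() is then a no-op */
	#endif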

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 mm/memory.c|  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b54bb4c9753..a1af7983e1bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2826,7 +2826,7 @@ static inline void pagetable_clear(void *x)
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
-extern void ptlock_free(struct page *page);
+void ptlock_free(struct ptdesc *ptdesc);
 
 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
@@ -2842,7 +2842,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc)
return true;
 }
 
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
 {
 }
 
@@ -2883,7 +2883,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
 }
 static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static inline bool pgtable_pte_page_ctor(struct page *page)
@@ -2897,7 +2897,7 @@ static inline bool pgtable_pte_page_ctor(struct page 
*page)
 
 static inline void pgtable_pte_page_dtor(struct page *page)
 {
-   ptlock_free(page);
+   ptlock_free(page_ptdesc(page));
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
 }
@@ -2955,7 +2955,7 @@ static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
 #endif
-   ptlock_free(ptdesc_page(ptdesc));
+   ptlock_free(ptdesc);
 }
 
 #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
diff --git a/mm/memory.c b/mm/memory.c
index ba9579117686..d4d2ea5cf0fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5945,8 +5945,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
return true;
 }
 
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
 {
-   kmem_cache_free(page_ptl_cachep, page->ptl);
+   kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
-- 
2.40.1



[PATCH v4 11/34] mm: Convert pmd_ptlock_free() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f48e626d9c98..3b54bb4c9753 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2950,12 +2950,12 @@ static inline bool pmd_ptlock_init(struct ptdesc 
*ptdesc)
return ptlock_init(ptdesc);
 }
 
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+   VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
 #endif
-   ptlock_free(page);
+   ptlock_free(ptdesc_page(ptdesc));
 }
 
 #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
@@ -2968,7 +2968,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
 }
 
 static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
 
 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
 
@@ -2992,7 +2992,7 @@ static inline bool pgtable_pmd_page_ctor(struct page 
*page)
 
 static inline void pgtable_pmd_page_dtor(struct page *page)
 {
-   pmd_ptlock_free(page);
+   pmd_ptlock_free(page_ptdesc(page));
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
 }
-- 
2.40.1



[PATCH v4 10/34] mm: Convert ptlock_init() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index daecf1db6cf1..f48e626d9c98 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2857,7 +2857,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
 }
 
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
 {
/*
 * prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2866,10 +2866,10 @@ static inline bool ptlock_init(struct page *page)
 * It can happen if arch try to use slab for page table allocation:
 * slab code uses page->slab_cache, which share storage with page->ptl.
 */
-   VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
-   if (!ptlock_alloc(page_ptdesc(page)))
+   VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
+   if (!ptlock_alloc(ptdesc))
return false;
-   spin_lock_init(ptlock_ptr(page_ptdesc(page)));
+   spin_lock_init(ptlock_ptr(ptdesc));
return true;
 }
 
@@ -2882,13 +2882,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
	return &mm->page_table_lock;
 }
 static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
 static inline bool pgtable_pte_page_ctor(struct page *page)
 {
-   if (!ptlock_init(page))
+   if (!ptlock_init(page_ptdesc(page)))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
@@ -2947,7 +2947,7 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptdesc->pmd_huge_pte = NULL;
 #endif
-   return ptlock_init(ptdesc_page(ptdesc));
+   return ptlock_init(ptdesc);
 }
 
 static inline void pmd_ptlock_free(struct page *page)
-- 
2.40.1
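
As context for the conversion above: pgtable_pte_page_ctor() is what arch
page-table allocators call when setting up a PTE page, so ptlock_init()
failures propagate out through it. A minimal sketch of that calling
pattern, with alloc_pte_page() as a hypothetical helper name rather than
anything from this series:

static struct page *alloc_pte_page(gfp_t gfp)
{
    struct page *page = alloc_page(gfp | __GFP_ZERO);

    if (!page)
        return NULL;
    /* ptlock_init() can fail on ALLOC_SPLIT_PTLOCKS configurations */
    if (!pgtable_pte_page_ctor(page)) {
        __free_page(page);
        return NULL;
    }
    return page;
}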



[PATCH v4 09/34] mm: Convert pmd_ptlock_init() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb934d51390f..daecf1db6cf1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2942,12 +2942,12 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return ptlock_ptr(pmd_ptdesc(pmd));
 }
 
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   page->pmd_huge_pte = NULL;
+   ptdesc->pmd_huge_pte = NULL;
 #endif
-   return ptlock_init(page);
+   return ptlock_init(ptdesc_page(ptdesc));
 }
 
 static inline void pmd_ptlock_free(struct page *page)
@@ -2967,7 +2967,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct 
*mm, pmd_t *pmd)
return &mm->page_table_lock;
 }
 
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void pmd_ptlock_free(struct page *page) {}
 
 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -2983,7 +2983,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, 
pmd_t *pmd)
 
 static inline bool pgtable_pmd_page_ctor(struct page *page)
 {
-   if (!pmd_ptlock_init(page))
+   if (!pmd_ptlock_init(page_ptdesc(page)))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
-- 
2.40.1



[PATCH v4 08/34] mm: Convert ptlock_ptr() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/x86/xen/mmu_pv.c |  2 +-
 include/linux/mm.h| 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index b3b8d289b9ab..f469862e3ef4 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -651,7 +651,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct 
mm_struct *mm)
spinlock_t *ptl = NULL;
 
 #if USE_SPLIT_PTE_PTLOCKS
-   ptl = ptlock_ptr(page);
+   ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, >page_table_lock);
 #endif
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e6f1be2a405e..bb934d51390f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2828,9 +2828,9 @@ void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
 extern void ptlock_free(struct page *page);
 
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
-   return page->ptl;
+   return ptdesc->ptl;
 }
 #else /* ALLOC_SPLIT_PTLOCKS */
 static inline void ptlock_cache_init(void)
@@ -2846,15 +2846,15 @@ static inline void ptlock_free(struct page *page)
 {
 }
 
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 {
-   return &page->ptl;
+   return &ptdesc->ptl;
 }
 #endif /* ALLOC_SPLIT_PTLOCKS */
 
 static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(pmd_page(*pmd));
+   return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
 }
 
 static inline bool ptlock_init(struct page *page)
@@ -2869,7 +2869,7 @@ static inline bool ptlock_init(struct page *page)
VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
if (!ptlock_alloc(page_ptdesc(page)))
return false;
-   spin_lock_init(ptlock_ptr(page));
+   spin_lock_init(ptlock_ptr(page_ptdesc(page)));
return true;
 }
 
@@ -2939,7 +2939,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
 
 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd)));
+   return ptlock_ptr(pmd_ptdesc(pmd));
 }
 
 static inline bool pmd_ptlock_init(struct page *page)
-- 
2.40.1
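
pte_lockptr() remains the lookup helper callers use to take the split PTE
lock; only its internals now go through page_ptdesc(). A hedged sketch of
the usual locking pattern (walk_one_pte() is an illustrative name, not a
function from this series):

static void walk_one_pte(struct mm_struct *mm, pmd_t *pmd)
{
    spinlock_t *ptl = pte_lockptr(mm, pmd);

    spin_lock(ptl);
    /* ... inspect or modify a PTE in the table that *pmd points to ... */
    spin_unlock(ptl);
}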



[PATCH v4 07/34] mm: Convert ptlock_alloc() to use ptdescs

2023-06-12 Thread Vishal Moola (Oracle)
This removes some direct accesses to struct page, working towards
splitting out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 6 +++---
 mm/memory.c| 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 088b7664f897..e6f1be2a405e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2825,7 +2825,7 @@ static inline void pagetable_clear(void *x)
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
 extern void ptlock_free(struct page *page);
 
 static inline spinlock_t *ptlock_ptr(struct page *page)
@@ -2837,7 +2837,7 @@ static inline void ptlock_cache_init(void)
 {
 }
 
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
 {
return true;
 }
@@ -2867,7 +2867,7 @@ static inline bool ptlock_init(struct page *page)
 * slab code uses page->slab_cache, which share storage with page->ptl.
 */
VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
-   if (!ptlock_alloc(page))
+   if (!ptlock_alloc(page_ptdesc(page)))
return false;
spin_lock_init(ptlock_ptr(page));
return true;
diff --git a/mm/memory.c b/mm/memory.c
index 80ce9dda2779..ba9579117686 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5934,14 +5934,14 @@ void __init ptlock_cache_init(void)
SLAB_PANIC, NULL);
 }
 
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
 {
spinlock_t *ptl;
 
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
-   page->ptl = ptl;
+   ptdesc->ptl = ptl;
return true;
 }
 
-- 
2.40.1
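
Once the later ptlock_ptr() and ptlock_free() conversions in this series
are applied as well, the whole split-lock lifecycle on ALLOC_SPLIT_PTLOCKS
configurations operates on ptdescs. A condensed sketch of that end state
(error handling elided; not a literal excerpt from the patches):

    struct ptdesc *ptdesc = page_ptdesc(page);

    if (!ptlock_alloc(ptdesc))              /* kmem_cache_alloc() of the lock */
        return false;
    spin_lock_init(ptlock_ptr(ptdesc));     /* ptdesc->ptl now points at it */
    /* ... page table in use ... */
    ptlock_free(ptdesc);                    /* kmem_cache_free() of the lock */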



[PATCH v4 04/34] pgtable: Create struct ptdesc

2023-06-12 Thread Vishal Moola (Oracle)
Currently, page table information is stored within struct page. As part
of simplifying struct page, create struct ptdesc for page table
information.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/pgtable.h | 51 +
 1 file changed, 51 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c5a51481bbb9..330de96ebfd6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct 
vm_area_struct *vma,
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
 #endif /* CONFIG_MMU */
 
+
+/**
+ * struct ptdesc - Memory descriptor for page tables.
+ * @__page_flags: Same as page flags. Unused for page tables.
+ * @pt_list: List of used page tables. Used for s390 and x86.
+ * @_pt_pad_1: Padding that aliases with page's compound head.
+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
+ * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only.
+ * @pt_mm: Used for x86 pgds.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 
only.
+ * @ptl: Lock for the page table.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues.
+ */
+struct ptdesc {
+   unsigned long __page_flags;
+
+   union {
+   struct list_head pt_list;
+   struct {
+   unsigned long _pt_pad_1;
+   pgtable_t pmd_huge_pte;
+   };
+   };
+   unsigned long _pt_s390_gaddr;
+
+   union {
+   struct mm_struct *pt_mm;
+   atomic_t pt_frag_refcount;
+   };
+
+#if ALLOC_SPLIT_PTLOCKS
+   spinlock_t *ptl;
+#else
+   spinlock_t ptl;
+#endif
+};
+
+#define TABLE_MATCH(pg, pt)\
+   static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
+TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(compound_head, pt_list);
+TABLE_MATCH(compound_head, _pt_pad_1);
+TABLE_MATCH(pmd_huge_pte, pmd_huge_pte);
+TABLE_MATCH(mapping, _pt_s390_gaddr);
+TABLE_MATCH(pt_mm, pt_mm);
+TABLE_MATCH(ptl, ptl);
+#undef TABLE_MATCH
+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
+
 /*
  * No-op macros that just return the current protection value. Defined here
  * because these macros can be used even if CONFIG_MMU is not defined.
-- 
2.40.1
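
The TABLE_MATCH() asserts are what make the overlay safe: each ptdesc
field must land at the same offset as the struct page field it aliases,
and the descriptor must not outgrow struct page. A self-contained
user-space sketch of the same pattern, using hypothetical stand-in types
(foo/foodesc) instead of the kernel structures:

#include <assert.h>
#include <stddef.h>

struct foo {                    /* stand-in for struct page */
    unsigned long flags;
    void *head;
    void *priv;
};

struct foodesc {                /* stand-in for struct ptdesc */
    unsigned long __flags;      /* aliases foo.flags */
    void *list;                 /* aliases foo.head */
    void *data;                 /* aliases foo.priv */
};

#define MATCH(fa, fb) \
    static_assert(offsetof(struct foo, fa) == offsetof(struct foodesc, fb))
MATCH(flags, __flags);
MATCH(head, list);
MATCH(priv, data);
#undef MATCH
static_assert(sizeof(struct foodesc) <= sizeof(struct foo));

int main(void)
{
    struct foo f = { .priv = &f };
    /* The kernel builds with -fno-strict-aliasing, which is what makes
     * this kind of cast-based overlay tenable in practice. */
    struct foodesc *d = (struct foodesc *)&f;

    assert(d->data == &f);
    return 0;
}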



[PATCH v4 06/34] mm: Convert pmd_pgtable_page() to pmd_ptdesc()

2023-06-12 Thread Vishal Moola (Oracle)
Converts pmd_pgtable_page() to pmd_ptdesc() and all its callers. This
removes some direct accesses to struct page, working towards splitting
out struct ptdesc from struct page.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/mm.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f184f1eba85d..088b7664f897 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2931,15 +2931,15 @@ static inline void pgtable_pte_page_dtor(struct page 
*page)
 
 #if USE_SPLIT_PMD_PTLOCKS
 
-static inline struct page *pmd_pgtable_page(pmd_t *pmd)
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
 {
unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
-   return virt_to_page((void *)((unsigned long) pmd & mask));
+   return virt_to_ptdesc((void *)((unsigned long) pmd & mask));
 }
 
 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
-   return ptlock_ptr(pmd_pgtable_page(pmd));
+   return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd)));
 }
 
 static inline bool pmd_ptlock_init(struct page *page)
@@ -2958,7 +2958,7 @@ static inline void pmd_ptlock_free(struct page *page)
ptlock_free(page);
 }
 
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
 
 #else
 
-- 
2.40.1
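
The mask computation in pmd_ptdesc() rounds a pointer to any pmd_t entry
down to the base of the page holding its table: with a typical 64-bit
PTRS_PER_PMD of 512 and 8-byte entries, the mask works out to ~0xfffUL.
A user-space sketch of just that arithmetic (types and constants here are
simplified stand-ins, not the kernel definitions):

#include <stdint.h>
#include <stdio.h>

#define PTRS_PER_PMD 512        /* assumed typical 64-bit value */
typedef uint64_t pmd_t;         /* simplified stand-in */

static _Alignas(4096) pmd_t table[PTRS_PER_PMD];

int main(void)
{
    pmd_t *entry = &table[123]; /* any entry within the table */
    uintptr_t mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);

    /* both lines print the same address */
    printf("base via mask: %p\n", (void *)((uintptr_t)entry & mask));
    printf("actual base:   %p\n", (void *)table);
    return 0;
}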



[PATCH v4 05/34] mm: add utility functions for ptdesc

2023-06-12 Thread Vishal Moola (Oracle)
Introduce utility functions setting the foundation for ptdescs. These
will also assist in the splitting out of ptdesc from struct page.

Functions that focus on the descriptor are prefixed with ptdesc_* while
functions that focus on the pagetable are prefixed with pagetable_*.

pagetable_alloc() is defined to allocate new ptdesc pages as compound
pages. This is to standardize ptdescs by allowing for one allocation
and one free function, in contrast to 2 allocation and 2 free functions.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/asm-generic/tlb.h | 11 +++
 include/linux/mm.h| 61 +++
 include/linux/pgtable.h   | 12 
 3 files changed, 84 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..6bade9e0e799 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -481,6 +481,17 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, 
struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
 
+static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
+{
+   tlb_remove_table(tlb, pt);
+}
+
+/* Like tlb_remove_ptdesc, but for page-like page directories. */
+static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct 
ptdesc *pt)
+{
+   tlb_remove_page(tlb, ptdesc_page(pt));
+}
+
 static inline void tlb_change_page_size(struct mmu_gather *tlb,
 unsigned int page_size)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0db09639dd2d..f184f1eba85d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2766,6 +2766,62 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, 
pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU */
 
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+   return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+   return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+   return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+   return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp:GFP flags
+ * @order:  desired pagetable order
+ *
+ * pagetable_alloc allocates a page table descriptor as well as all pages
+ * described by it.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+   struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+   return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt:The page table descriptor
+ *
+ * pagetable_free frees a page table descriptor as well as all page
+ * tables described by said ptdesc.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+   struct page *page = ptdesc_page(pt);
+
+   __free_pages(page, compound_order(page));
+}
+
+static inline void pagetable_clear(void *x)
+{
+   clear_page(x);
+}
+
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
@@ -2992,6 +3048,11 @@ static inline void mark_page_reserved(struct page *page)
adjust_managed_page_count(page, -1);
 }
 
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+   free_reserved_page(ptdesc_page(pt));
+}
+
 /*
  * Default method to free all the __init memory into the buddy system.
  * The freed pages will be poisoned with pattern "poison" if it's within
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 330de96ebfd6..c405f74d3875 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1026,6 +1026,18 @@ TABLE_MATCH(ptl, ptl);
 #undef TABLE_MATCH
 static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
 
+#define ptdesc_page(pt)(_Generic((pt), 
\
+   const struct ptdesc *:  (const struct page *)(pt),  \
+   struct ptdesc *:(struct page *)(pt)))
+
+#define ptdesc_folio(pt)   (_Generic((pt), \
+   const struct ptdesc *:  (const struct folio *)(pt), \
+   struct ptdesc *:(struct folio *)(pt)))
+
+#define page_ptdesc(p) (_Generic((p),  \
+   const struct page *:(const struct ptdesc *)(p), \
+   struct page *:  (struct ptdesc *)(p)))
+
 /*
  * No-op macros that just return the current protection value. Defined here
  * because these macros can be used even if CONFIG_MMU is not defined.
-- 
2.40.1
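
A hedged sketch of how an arch allocator might sit on top of these
helpers once converted; my_pte_alloc()/my_pte_free() are illustrative
names, not functions introduced by this series:

static pte_t *my_pte_alloc(void)
{
    struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

    if (!ptdesc)
        return NULL;
    return ptdesc_address(ptdesc);      /* virtual address of the table */
}

static void my_pte_free(pte_t *pte)
{
    pagetable_free(virt_to_ptdesc(pte));
}

Note that pagetable_alloc() ORs in __GFP_COMP itself, so multi-order
allocations come back as compound pages and pagetable_free() can recover
the order via compound_order().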



[PATCH v4 03/34] s390: Use pt_frag_refcount for pagetables

2023-06-12 Thread Vishal Moola (Oracle)
s390 currently uses _refcount to identify fragmented page tables.
The page table struct already has a member pt_frag_refcount used by
powerpc, so have s390 use that instead of the _refcount field as well.
This improves the safety of _refcount and the page table tracking.

This also allows us to simplify the tracking since we can once again use
the lower byte of pt_frag_refcount instead of the upper byte of _refcount.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/s390/mm/pgalloc.c | 38 +++---
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 66ab68db9842..6b99932abc66 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -182,20 +182,17 @@ void page_table_free_pgste(struct page *page)
  * As follows from the above, no unallocated or fully allocated parent
  * pages are contained in mm_context_t::pgtable_list.
  *
- * The upper byte (bits 24-31) of the parent page _refcount is used
+ * The lower byte (bits 0-7) of the parent page pt_frag_refcount is used
  * for tracking contained 2KB-pgtables and has the following format:
  *
  *   PP  AA
- * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ * 01234567    lower byte (bits 0-7) of struct page::pt_frag_refcount
  *   ||  ||
  *   ||  |+--- upper 2KB-pgtable is allocated
  *   ||  + lower 2KB-pgtable is allocated
  *   |+--- upper 2KB-pgtable is pending for removal
  *   + lower 2KB-pgtable is pending for removal
  *
- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
- * using _refcount is possible).
- *
  * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
  * The parent page is either:
  *   - added to mm_context_t::pgtable_list in case the second half of the
@@ -243,11 +240,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
-   mask = atomic_read(&page->_refcount) >> 24;
+   mask = atomic_read(&page->pt_frag_refcount);
/*
 * The pending removal bits must also be checked.
 * Failure to do so might lead to an impossible
-* value of (i.e 0x13 or 0x23) written to _refcount.
+* value of (i.e 0x13 or 0x23) written to
+* pt_frag_refcount.
 * Such values violate the assumption that pending and
 * allocation bits are mutually exclusive, and the rest
 * of the code unrails as result. That could lead to
@@ -259,8 +257,8 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
-   atomic_xor_bits(&page->_refcount,
-   0x01U << (bit + 24));
+   atomic_xor_bits(&page->pt_frag_refcount,
+   0x01U << bit);
list_del(&page->lru);
}
}
@@ -281,12 +279,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = (unsigned long *) page_to_virt(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
-   atomic_xor_bits(&page->_refcount, 0x03U << 24);
+   atomic_xor_bits(&page->pt_frag_refcount, 0x03U);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
-   atomic_xor_bits(&page->_refcount, 0x01U << 24);
+   atomic_xor_bits(&page->pt_frag_refcount, 0x01U);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@@ -323,22 +321,19 @@ void page_table_free(struct mm_struct *mm, unsigned long 
*table)
 * will happen outside of the critical section from this
 * function or from __tlb_remove_table()
 */
-   mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
-   mask >>= 24;
+   mask = atomic_xor_bits(&page->pt_frag_refcount, 0x11U << bit);
if (mask & 0x03U)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
-   mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
-   
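
A small user-space model of the state byte described above; the XOR
transitions mirror the atomic_xor_bits() calls in the diff (the constants
are taken from the patch, but the demo itself is of course not kernel
code):

#include <assert.h>

int main(void)
{
    unsigned int refcnt = 0;    /* models the low byte of pt_frag_refcount */
    unsigned int bit = 1;       /* operate on the upper 2K half */

    refcnt ^= 0x01U << bit;     /* allocate:     0x00 -> 0x02 (AA bit set) */
    assert(refcnt == 0x02);
    refcnt ^= 0x11U << bit;     /* free:         0x02 -> 0x20 (AA clear, PP set) */
    assert(refcnt == 0x20);
    refcnt ^= 0x10U << bit;     /* removal done: 0x20 -> 0x00 (PP clear) */
    assert(refcnt == 0x00);
    return 0;
}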

[PATCH v4 02/34] s390: Use _pt_s390_gaddr for gmap address tracking

2023-06-12 Thread Vishal Moola (Oracle)
s390 uses page->index to keep track of page tables for the guest address
space. In an attempt to consolidate the usage of page fields in s390,
replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap.

This will help with the splitting of struct ptdesc from struct page, as
well as allow s390 to use _pt_frag_refcount for fragmented page table
tracking.

Since page->_pt_s390_gaddr aliases with mapping, ensure it's set to NULL
before freeing the pages as well.

This also reverts commit 7e25de77bc5ea ("s390/mm: use pmd_pgtable_page()
helper in __gmap_segment_gaddr()") which had s390 use
pmd_pgtable_page() to get a gmap page table, as pmd_pgtable_page()
should be used for more generic process page tables.

Signed-off-by: Vishal Moola (Oracle) 
---
 arch/s390/mm/gmap.c  | 56 +++-
 include/linux/mm_types.h |  2 +-
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index dc90d1eb0d55..81c683426b49 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -70,7 +70,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
goto out_free;
-   page->index = 0;
+   page->_pt_s390_gaddr = 0;
list_add(&page->lru, &gmap->crst_list);
table = page_to_virt(page);
crst_table_init(table, etype);
@@ -187,16 +187,20 @@ static void gmap_free(struct gmap *gmap)
if (!(gmap_is_shadow(gmap) && gmap->removed))
gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
-   list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
+   list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
+   page->_pt_s390_gaddr = 0;
__free_pages(page, CRST_ALLOC_ORDER);
+   }
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
 
/* Free additional data for a shadow gmap */
if (gmap_is_shadow(gmap)) {
/* Free all page tables. */
-   list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+   list_for_each_entry_safe(page, next, &gmap->pt_list, lru) {
+   page->_pt_s390_gaddr = 0;
page_table_free_pgste(page);
+   }
gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
/* Release reference to the parent */
gmap_put(gmap->parent);
@@ -318,12 +322,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned 
long *table,
list_add(&page->lru, &gmap->crst_list);
*table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
-   page->index = gaddr;
+   page->_pt_s390_gaddr = gaddr;
page = NULL;
}
spin_unlock(&gmap->guest_table_lock);
-   if (page)
+   if (page) {
+   page->_pt_s390_gaddr = 0;
__free_pages(page, CRST_ALLOC_ORDER);
+   }
return 0;
 }
 
@@ -336,12 +342,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned 
long *table,
 static unsigned long __gmap_segment_gaddr(unsigned long *entry)
 {
struct page *page;
-   unsigned long offset;
+   unsigned long offset, mask;
 
offset = (unsigned long) entry / sizeof(unsigned long);
offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
-   page = pmd_pgtable_page((pmd_t *) entry);
-   return page->index + offset;
+   mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+   page = virt_to_page((void *)((unsigned long) entry & mask));
+
+   return page->_pt_s390_gaddr + offset;
 }
 
 /**
@@ -1351,6 +1359,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned 
long raddr)
/* Free page table */
page = phys_to_page(pgt);
list_del(&page->lru);
+   page->_pt_s390_gaddr = 0;
page_table_free_pgste(page);
 }
 
@@ -1379,6 +1388,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned 
long raddr,
/* Free page table */
page = phys_to_page(pgt);
list_del(&page->lru);
+   page->_pt_s390_gaddr = 0;
page_table_free_pgste(page);
}
 }
@@ -1409,6 +1419,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned 
long raddr)
/* Free segment table */
page = phys_to_page(sgt);
list_del(&page->lru);
+   page->_pt_s390_gaddr = 0;
__free_pages(page, CRST_ALLOC_ORDER);
 }
 
@@ -1437,6 +1448,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned 
long raddr,
/* Free segment table */
page = phys_to_page(sgt);
list_del(&page->lru);
+   page->_pt_s390_gaddr = 0;
__free_pages(page, CRST_ALLOC_ORDER);
}
 }
@@ -1467,6 +1479,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned 
long raddr)
/* Free region 3 table */
  

[PATCH v4 01/34] mm: Add PAGE_TYPE_OP folio functions

2023-06-12 Thread Vishal Moola (Oracle)
No folio equivalents for page type operations have been defined, so
define them for later folio conversions.

Also changes the Page##uname macros to take in const struct page* since
we only read the memory here.

Signed-off-by: Vishal Moola (Oracle) 
---
 include/linux/page-flags.h | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92a2063a0a23..e99a616b9bcd 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page)
 
 #define PageType(page, flag)   \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+#define folio_test_type(folio, flag)   \
+   ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
 
 static inline int page_type_has_type(unsigned int page_type)
 {
@@ -920,20 +922,34 @@ static inline int page_has_type(struct page *page)
 }
 
 #define PAGE_TYPE_OPS(uname, lname)\
-static __always_inline int Page##uname(struct page *page)  \
+static __always_inline int Page##uname(const struct page *page)
\
 {  \
return PageType(page, PG_##lname);  \
 }  \
+static __always_inline int folio_test_##lname(const struct folio *folio)\
+{  \
+   return folio_test_type(folio, PG_##lname);  \
+}  \
 static __always_inline void __SetPage##uname(struct page *page)
\
 {  \
VM_BUG_ON_PAGE(!PageType(page, 0), page);   \
page->page_type &= ~PG_##lname; \
 }  \
+static __always_inline void __folio_set_##lname(struct folio *folio)   \
+{  \
+   VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
+   folio->page.page_type &= ~PG_##lname;   \
+}  \
 static __always_inline void __ClearPage##uname(struct page *page)  \
 {  \
VM_BUG_ON_PAGE(!Page##uname(page), page);   \
page->page_type |= PG_##lname;  \
-}
+}  \
+static __always_inline void __folio_clear_##lname(struct folio *folio) \
+{  \
+   VM_BUG_ON_FOLIO(!folio_test_##lname(folio), folio); \
+   folio->page.page_type |= PG_##lname;\
+}  \
 
 /*
  * PageBuddy() indicates that the page is free and in the buddy system
-- 
2.40.1
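
For a concrete picture, PAGE_TYPE_OPS(Buddy, buddy) is one existing user;
after this patch that invocation additionally generates (roughly) the
following folio variants alongside the page ones:

static __always_inline int folio_test_buddy(const struct folio *folio)
{
    return folio_test_type(folio, PG_buddy);
}

static __always_inline void __folio_set_buddy(struct folio *folio)
{
    VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);
    folio->page.page_type &= ~PG_buddy;
}

static __always_inline void __folio_clear_buddy(struct folio *folio)
{
    VM_BUG_ON_FOLIO(!folio_test_buddy(folio), folio);
    folio->page.page_type |= PG_buddy;
}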



[PATCH v4 00/34] Split ptdesc from struct page

2023-06-12 Thread Vishal Moola (Oracle)
The MM subsystem is trying to shrink struct page. This patchset
introduces a memory descriptor for page table tracking - struct ptdesc.

This patchset introduces ptdesc, splits ptdesc from struct page, and
converts many callers of page table constructor/destructors to use ptdescs.

Ptdesc is a foundation to further standardize page tables, and eventually
allow for dynamic allocation of page tables independent of struct page.
However, the use of pages for page table tracking is quite deeply
ingrained and varied across archictectures, so there is still a lot of
work to be done before that can happen.

This is rebased on next-20230609.

v4:
  Got more Acked-bys
  Fixed m68k compilation issue
  Dropped unnecessary casts
  Cleanup some fields in struct ptdesc

Vishal Moola (Oracle) (34):
  mm: Add PAGE_TYPE_OP folio functions
  s390: Use _pt_s390_gaddr for gmap address tracking
  s390: Use pt_frag_refcount for pagetables
  pgtable: Create struct ptdesc
  mm: add utility functions for ptdesc
  mm: Convert pmd_pgtable_page() to pmd_ptdesc()
  mm: Convert ptlock_alloc() to use ptdescs
  mm: Convert ptlock_ptr() to use ptdescs
  mm: Convert pmd_ptlock_init() to use ptdescs
  mm: Convert ptlock_init() to use ptdescs
  mm: Convert pmd_ptlock_free() to use ptdescs
  mm: Convert ptlock_free() to use ptdescs
  mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}
  powerpc: Convert various functions to use ptdescs
  x86: Convert various functions to use ptdescs
  s390: Convert various gmap functions to use ptdescs
  s390: Convert various pgalloc functions to use ptdescs
  mm: Remove page table members from struct page
  pgalloc: Convert various functions to use ptdescs
  arm: Convert various functions to use ptdescs
  arm64: Convert various functions to use ptdescs
  csky: Convert __pte_free_tlb() to use ptdescs
  hexagon: Convert __pte_free_tlb() to use ptdescs
  loongarch: Convert various functions to use ptdescs
  m68k: Convert various functions to use ptdescs
  mips: Convert various functions to use ptdescs
  nios2: Convert __pte_free_tlb() to use ptdescs
  openrisc: Convert __pte_free_tlb() to use ptdescs
  riscv: Convert alloc_{pmd, pte}_late() to use ptdescs
  sh: Convert pte_free_tlb() to use ptdescs
  sparc64: Convert various functions to use ptdescs
  sparc: Convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents
  um: Convert {pmd, pte}_free_tlb() to use ptdescs
  mm: Remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers

 Documentation/mm/split_page_table_lock.rst|  12 +-
 .../zh_CN/mm/split_page_table_lock.rst|  14 +-
 arch/arm/include/asm/tlb.h|  12 +-
 arch/arm/mm/mmu.c |   6 +-
 arch/arm64/include/asm/tlb.h  |  14 +-
 arch/arm64/mm/mmu.c   |   7 +-
 arch/csky/include/asm/pgalloc.h   |   4 +-
 arch/hexagon/include/asm/pgalloc.h|   8 +-
 arch/loongarch/include/asm/pgalloc.h  |  27 ++-
 arch/loongarch/mm/pgtable.c   |   7 +-
 arch/m68k/include/asm/mcf_pgalloc.h   |  41 ++--
 arch/m68k/include/asm/sun3_pgalloc.h  |   8 +-
 arch/m68k/mm/motorola.c   |   4 +-
 arch/mips/include/asm/pgalloc.h   |  31 +--
 arch/mips/mm/pgtable.c|   7 +-
 arch/nios2/include/asm/pgalloc.h  |   8 +-
 arch/openrisc/include/asm/pgalloc.h   |   8 +-
 arch/powerpc/mm/book3s64/mmu_context.c|  10 +-
 arch/powerpc/mm/book3s64/pgtable.c|  32 +--
 arch/powerpc/mm/pgtable-frag.c|  46 ++--
 arch/riscv/include/asm/pgalloc.h  |   8 +-
 arch/riscv/mm/init.c  |  16 +-
 arch/s390/include/asm/pgalloc.h   |   4 +-
 arch/s390/include/asm/tlb.h   |   4 +-
 arch/s390/mm/gmap.c   | 222 +++---
 arch/s390/mm/pgalloc.c| 126 +-
 arch/sh/include/asm/pgalloc.h |   9 +-
 arch/sparc/mm/init_64.c   |  17 +-
 arch/sparc/mm/srmmu.c |   5 +-
 arch/um/include/asm/pgalloc.h |  18 +-
 arch/x86/mm/pgtable.c |  46 ++--
 arch/x86/xen/mmu_pv.c |   2 +-
 include/asm-generic/pgalloc.h |  62 +++--
 include/asm-generic/tlb.h |  11 +
 include/linux/mm.h| 155 
 include/linux/mm_types.h  |  14 --
 include/linux/page-flags.h|  20 +-
 include/linux/pgtable.h   |  60 +
 mm/memory.c   |   8 +-
 39 files changed, 664 insertions(+), 449 deletions(-)

-- 
2.40.1



Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-06-12 Thread Sean Anderson
On 6/12/23 12:33, Vladimir Oltean wrote:
> On Mon, Jun 12, 2023 at 10:35:21AM -0400, Sean Anderson wrote:
>> > And if SERDES protocol switching was not physically possible, would this
>> > patch set still have value?

More to the point, did you make any progress in this area?

>> Yes. To e.g. set up SGMII25 or to fix the clocking situation.
> 
> Let me analyze the reasons one by one.
> 
> The clocking situation only exists due to a hope that protocol changing
> would be possible. If you were sure it wasn't possible, your board would
> have had refclks set up for protocol  via the supported PLL mapping .
> In essence, I consider this almost a non-argument, as it fixes a
> self-inflicted problem.

 also does not work, as outlined above.

The clocking is incompatible, and must be fixed by software.

The only thing self-inflicted is NXP's conflicting design.

> Have you actually tested changing individual lanes from SGMII to SGMII 2.5?
> I know it works on LS1028A, but I don't know if that's also true on LS1046A.
> Your comments do say "LYNX_PROTO_SGMII25, /* Not tested */".

Not yet. 

This driver would also be a good place to add the KR link training which
NXP tried to upstream a few years ago.

> Assuming a start from SERDES protocol  with PLL mapping ,
> this protocol change implies powering on PLL 1 (reset state machine
> wants it to be disabled) and moving those lanes from PLL 2 to PLL 1.
> 
> At first sight you might appear to have a point related to the fact that
> PLL register writes are necessary, and thus this whole shebang is necessary.
> But this can all be done using PBI commands, with the added benefit that
> U-Boot can also use those SERDES networking ports, and not just Linux.
> You can use the RCW+PBL specific to your board to inform the SoC that
> your platform's refclk 1 is 156 MHz (something which the reset state
> machine seems unable to learn, with protocol 0x). You don't have to
> put that in the device tree. You don't have to push code to any open
> source project to expose your platform specific details. Then, just like
> in the case of the Lynx 28G driver on LX2160, the SERDES driver could
> just treat the PLL configuration as read-only, which would greatly
> simplify things and eliminate the need for a clk driver.
> 
> Here is an illustrative example (sorry, I don't have a board with the
> right refclk on that PLL, to verify all the way):
> 
> ... snip ...

(which of course complicates the process of building the PBIs...)

> In short, I believe the reasons you have cited do not justify the
> complexity of the solution that you propose.

The work is done, and it is certainly more flexible than what you
propose. E.g. imagine a clock which needs to be configured before it has
the correct frequency. Such a thing is difficult to do in a PBI solution,
but is trivial in Linux.

--Sean


Re: [PATCH v1 07/21] m68k/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Geert Uytterhoeven
On Mon, Jun 12, 2023 at 7:29 PM Eric DeVolder  wrote:
> The kexec and crash kernel options are provided in the common
> kernel/Kconfig.kexec. Utilize the common options and provide
> the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
> equivalent set of KEXEC and CRASH options.
>
> Signed-off-by: Eric DeVolder 

Reviewed-by: Geert Uytterhoeven 
Acked-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


[PATCH v1 01/21] kexec: consolidate kexec and crash options into kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The config options for kexec and crash features are consolidated
into new file kernel/Kconfig.kexec. Under the "General Setup" submenu
is a new submenu "Kexec and crash handling" where all the kexec and
crash options that were once in the arch-dependent submenu "Processor
type and features" are now consolidated.

The following options are impacted:

 - KEXEC
 - KEXEC_FILE
 - KEXEC_SIG
 - KEXEC_SIG_FORCE
 - KEXEC_BZIMAGE_VERIFY_SIG
 - KEXEC_JUMP
 - CRASH_DUMP

The three main options are KEXEC, KEXEC_FILE and CRASH_DUMP.

Architectures specify support of certain KEXEC and CRASH features with
similarly named new ARCH_HAS_<option> config options.

Architectures can utilize the new ARCH_SUPPORTS_<option> config
options to specify additional components when <option> is enabled.

To summarize, the ARCH_HAS_<option> permits the <option> to be
enabled, and the ARCH_SUPPORTS_<option> handles side effects (i.e.
select statements).

Signed-off-by: Eric DeVolder 
---
 arch/Kconfig |  13 --
 init/Kconfig |   2 +
 kernel/Kconfig.kexec | 103 +++
 3 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 kernel/Kconfig.kexec

diff --git a/arch/Kconfig b/arch/Kconfig
index 205fd23e0cad..a37730679730 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -11,19 +11,6 @@ source "arch/$(SRCARCH)/Kconfig"
 
 menu "General architecture-dependent options"
 
-config CRASH_CORE
-   bool
-
-config KEXEC_CORE
-   select CRASH_CORE
-   bool
-
-config KEXEC_ELF
-   bool
-
-config HAVE_IMA_KEXEC
-   bool
-
 config ARCH_HAS_SUBPAGE_FAULTS
bool
help
diff --git a/init/Kconfig b/init/Kconfig
index 32c24950c4ce..4424447e23a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1917,6 +1917,8 @@ config BINDGEN_VERSION_TEXT
 config TRACEPOINTS
bool
 
+source "kernel/Kconfig.kexec"
+
 endmenu# General setup
 
 source "arch/Kconfig"
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index ..660048099865
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_CORE
+   bool
+
+config KEXEC_CORE
+   select CRASH_CORE
+   bool
+
+config KEXEC_ELF
+   bool
+
+config HAVE_IMA_KEXEC
+   bool
+
+config KEXEC
+   bool "Enable kexec system call"
+   default ARCH_DEFAULT_KEXEC
+   depends on ARCH_HAS_KEXEC
+   select KEXEC_CORE
+   help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel.  It is like a reboot
+ but it is independent of the system firmware.   And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you.  As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+   bool "Enable kexec file based system call"
+   depends on ARCH_HAS_KEXEC_FILE
+   select KEXEC_CORE
+   help
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by previous system call.
+
+config KEXEC_SIG
+   bool "Verify kernel signature during kexec_file_load() syscall"
+   depends on KEXEC_FILE && MODULE_SIG_FORMAT
+   help
+
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image.  The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+   bool "Require a valid signature in kexec_file_load() syscall"
+   depends on KEXEC_SIG
+   help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+   bool "Enable bzImage signature verification support"
+   depends on KEXEC_SIG
+   depends on SIGNED_PE_FILE_VERIFICATION
+   select SYSTEM_TRUSTED_KEYRING
+   help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+   bool "kexec jump"
+   depends on KEXEC && HIBERNATION
+   depends on ARCH_HAS_KEXEC_JUMP
+   help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+

[PATCH v1 08/21] mips/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/mips/Kconfig | 32 +---
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 675a8660cb85..fcf4d8b0775e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2873,33 +2873,11 @@ config HZ
 config SCHED_HRTICK
def_bool HIGH_RES_TIMERS
 
-config KEXEC
-   bool "Kexec system call"
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
-
-config CRASH_DUMP
-   bool "Kernel crash dumps"
-   help
- Generate crash dump after being started by kexec.
- This should be normally only set in special crash dump kernels
- which are loaded in the main kernel with kexec-tools into
- a specially reserved region and then later executed after
- a crash by kdump/kexec. The crash dump kernel must be compiled
- to a memory address not used by the main kernel or firmware using
- PHYSICAL_START.
+config ARCH_HAS_KEXEC
+   def_bool y
+
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
 
 config PHYSICAL_START
hex "Physical address where the kernel is loaded"
-- 
2.31.1



[PATCH v1 12/21] s390/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/s390/Kconfig | 65 ++-
 1 file changed, 19 insertions(+), 46 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6dab9c1be508..2eda5a0e240f 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -243,6 +243,25 @@ config PGTABLE_LEVELS
 
 source "kernel/livepatch/Kconfig"
 
+config ARCH_DEFAULT_KEXEC
+   def_bool y
+
+config ARCH_HAS_KEXEC
+   def_bool y
+
+config ARCH_HAS_KEXEC_FILE
+   def_bool CRYPTO && CRYPTO_SHA256 && CRYPTO_SHA256_S390
+
+config ARCH_HAS_KEXEC_PURGATORY
+   def_bool KEXEC_FILE
+
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
+   help
+ Refer to  for more details on 
this.
+ This option also enables s390 zfcpdump.
+ See also 
+
 menu "Processor type and features"
 
 config HAVE_MARCH_Z10_FEATURES
@@ -481,36 +500,6 @@ config SCHED_TOPOLOGY
 
 source "kernel/Kconfig.hz"
 
-config KEXEC
-   def_bool y
-   select KEXEC_CORE
-
-config KEXEC_FILE
-   bool "kexec file based system call"
-   select KEXEC_CORE
-   depends on CRYPTO
-   depends on CRYPTO_SHA256
-   depends on CRYPTO_SHA256_S390
-   help
- Enable the kexec file based system call. In contrast to the normal
- kexec system call this system call takes file descriptors for the
- kernel and initramfs as arguments.
-
-config ARCH_HAS_KEXEC_PURGATORY
-   def_bool y
-   depends on KEXEC_FILE
-
-config KEXEC_SIG
-   bool "Verify kernel signature during kexec_file_load() syscall"
-   depends on KEXEC_FILE && MODULE_SIG_FORMAT
-   help
- This option makes kernel signature verification mandatory for
- the kexec_file_load() syscall.
-
- In addition to that option, you need to enable signature
- verification for the corresponding kernel image type being
- loaded in order for this to work.
-
 config KERNEL_NOBP
def_bool n
prompt "Enable modified branch prediction for the kernel by default"
@@ -732,22 +721,6 @@ config VFIO_AP
 
 endmenu
 
-menu "Dump support"
-
-config CRASH_DUMP
-   bool "kernel crash dumps"
-   select KEXEC
-   help
- Generate crash dump after being started by kexec.
- Crash dump kernels are loaded in the main kernel with kexec-tools
- into a specially reserved region and then later executed after
- a crash by kdump/kexec.
- Refer to  for more details on 
this.
- This option also enables s390 zfcpdump.
- See also 
-
-endmenu
-
 config CCW
def_bool y
 
-- 
2.31.1



[PATCH v1 11/21] riscv/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/riscv/Kconfig | 48 ++
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5966ad97c30c..bcaf49007237 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -585,48 +585,28 @@ config RISCV_BOOT_SPINWAIT
 
  If unsure what to do here, say N.
 
-config KEXEC
-   bool "Kexec system call"
-   depends on MMU
+config ARCH_HAS_KEXEC
+   def_bool MMU
+
+config ARCH_SUPPORTS_KEXEC
+   def_bool y
+   depends on KEXEC
select HOTPLUG_CPU if SMP
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel. It is like a reboot
- but it is independent of the system firmware. And like a reboot
- you can start any kernel with it, not just Linux.
 
- The name comes from the similarity to the exec system call.
+config ARCH_HAS_KEXEC_FILE
+   def_bool 64BIT && MMU && CRYPTO && CRYPTO_SHA256
 
-config KEXEC_FILE
-   bool "kexec file based systmem call"
-   depends on 64BIT && MMU
-   select HAVE_IMA_KEXEC if IMA
-   select KEXEC_CORE
+config ARCH_SUPPORTS_KEXEC_FILE
+   def_bool y
+   depends on KEXEC_FILE
select KEXEC_ELF
-   help
- This is new version of kexec system call. This system call is
- file based and takes file descriptors as system call argument
- for kernel and initramfs as opposed to list of segments as
- accepted by previous system call.
-
- If you don't know what to do here, say Y.
+   select HAVE_IMA_KEXEC if IMA
 
 config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
-   depends on CRYPTO=y
-   depends on CRYPTO_SHA256=y
 
-config CRASH_DUMP
-   bool "Build kdump crash kernel"
-   help
- Generate crash dump after being started by kexec. This should
- be normally only set in special crash dump kernels which are
- loaded in the main kernel with kexec-tools into a specially
- reserved region and then later executed after a crash by
- kdump/kexec.
-
- For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
 
 config COMPAT
bool "Kernel support for 32-bit U-mode"
-- 
2.31.1



[PATCH v1 06/21] loongarch/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/loongarch/Kconfig | 26 +++---
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index d38b066fc931..e836d3310060 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -481,28 +481,16 @@ config ARCH_STRICT_ALIGN
  to run kernel only on systems with h/w unaligned access support in
  order to optimise for performance.
 
-config KEXEC
-   bool "Kexec system call"
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
+config ARCH_HAS_KEXEC
+   def_bool y
 
- The name comes from the similarity to the exec system call.
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
 
-config CRASH_DUMP
-   bool "Build kdump crash kernel"
+config ARCH_SUPPORTS_CRASH_DUMP
+   def_bool y
+   depends on CRASH_DUMP
select RELOCATABLE
-   help
- Generate crash dump after being started by kexec. This should
- be normally only set in special crash dump kernels which are
- loaded in the main kernel with kexec-tools into a specially
- reserved region and then later executed after a crash by
- kdump/kexec.
-
- For more details see Documentation/admin-guide/kdump/kdump.rst
 
 config RELOCATABLE
bool "Relocatable kernel"
-- 
2.31.1



[PATCH v1 07/21] m68k/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/m68k/Kconfig | 19 ++-
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 40198a1ebe27..ec71199e75b4 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -88,23 +88,8 @@ config MMU_SUN3
bool
depends on MMU && !MMU_MOTOROLA && !MMU_COLDFIRE
 
-config KEXEC
-   bool "kexec system call"
-   depends on M68KCLASSIC && MMU
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
+config ARCH_HAS_KEXEC
+   def_bool M68KCLASSIC && MMU
 
 config BOOTINFO_PROC
bool "Export bootinfo in procfs"
-- 
2.31.1



[PATCH v1 04/21] ia64/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/ia64/Kconfig | 28 +---
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 21fa63ce5ffc..dbef97452839 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -360,31 +360,13 @@ config IA64_HP_AML_NFW
  the "force" module parameter, e.g., with the "aml_nfw.force"
  kernel command line option.
 
-config KEXEC
-   bool "kexec system call"
-   depends on !SMP || HOTPLUG_CPU
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
+endmenu
 
-config CRASH_DUMP
- bool "kernel crash dumps"
- depends on IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
- help
-   Generate crash dump after being started by kexec.
+config ARCH_HAS_KEXEC
+   def_bool !SMP || HOTPLUG_CPU
 
-endmenu
+config ARCH_HAS_CRASH_DUMP
+   def_bool IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
 
 menu "Power management and ACPI options"
 
-- 
2.31.1



[PATCH v1 09/21] parisc/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/parisc/Kconfig | 34 +++---
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 967bde65dd0e..36c139ce9f5a 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -348,29 +348,17 @@ config NR_CPUS
default "4" if 64BIT
default "16"
 
-config KEXEC
-   bool "Kexec system call"
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- It is an ongoing process to be certain the hardware in a machine
- shutdown, so do not be surprised if this code does not
- initially work for you.
-
-config KEXEC_FILE
-   bool "kexec file based system call"
-   select KEXEC_CORE
-   select KEXEC_ELF
-   help
- This enables the kexec_file_load() System call. This is
- file based and takes file descriptors as system call argument
- for kernel and initramfs as opposed to list of segments as
- accepted by previous system call.
-
 endmenu
 
+config ARCH_HAS_KEXEC
+   def_bool y
+
+config ARCH_HAS_KEXEC_FILE
+   def_bool y
+
+config ARCH_SUPPORTS_KEXEC_FILE
+   def_bool y
+   depends on KEXEC_FILE
+   select KEXEC_ELF
+
 source "drivers/parisc/Kconfig"
-- 
2.31.1



[PATCH v1 00/21] refactor Kconfig to consolidate KEXEC and CRASH options

2023-06-12 Thread Eric DeVolder
The Kconfig is refactored to consolidate KEXEC and CRASH options from
various arch/<arch>/Kconfig files into new file kernel/Kconfig.kexec.

The Kconfig.kexec is now a submenu titled "Kexec and crash features"
located under "General Setup".

The following options are impacted:

 - KEXEC
 - KEXEC_FILE
 - KEXEC_SIG
 - KEXEC_SIG_FORCE
 - KEXEC_BZIMAGE_VERIFY_SIG
 - KEXEC_JUMP
 - CRASH_DUMP

Over time, these options have been copied between Kconfig files and
are very similar to one another, but with slight differences.

The following architectures are impacted by the refactor (because of
use of one or more KEXEC/CRASH options):

 - arm
 - arm64
 - ia64
 - loongarch
 - m68k
 - mips
 - parisc
 - powerpc
 - riscv
 - s390
 - sh
 - x86 

More information:

In the patch series "crash: Kernel handling of CPU and memory hot
un/plug"

 https://lore.kernel.org/lkml/20230503224145.7405-1-eric.devol...@oracle.com/

the new kernel feature introduces the config option CRASH_HOTPLUG.

In reviewing, Thomas Gleixner requested that the new config option
not be placed in x86 Kconfig. Rather the option needs a generic/common
home. To Thomas' point, the KEXEC and CRASH options have largely been
duplicated in the various arch/<arch>/Kconfig files, with minor
differences. This kind of proliferation is to be avoided/stopped.

 https://lore.kernel.org/lkml/875y91yv63.ffs@tglx/

To that end, I have refactored the arch Kconfigs so as to consolidate
the various KEXEC and CRASH options. Generally speaking, this work has
the following themes:

- KEXEC and CRASH options are moved into new file kernel/Kconfig.kexec
  - These items from arch/Kconfig:
  CRASH_CORE KEXEC_CORE KEXEC_ELF HAVE_IMA_KEXEC
  - These items from arch/x86/Kconfig form the common options:
  KEXEC KEXEC_FILE KEXEC_SIG KEXEC_SIG_FORCE
  KEXEC_BZIMAGE_VERIFY_SIG KEXEC_JUMP CRASH_DUMP
  - The crash hotplug series appends CRASH_HOTPLUG to Kconfig.kexec
  NOTE: PHYSICAL_START could be argued to be included in this series.
- The Kconfig.kexec is now a submenu titled "Kexec and crash features"
- The Kconfig.kexec is now listed in "General Setup" submenu from
  init/Kconfig
- To control the main common options, new options ARCH_HAS_KEXEC,
  ARCH_HAS_KEXEC_FILE and ARCH_HAS_CRASH_DUMP are introduced.
  NOTE: I went with ARCH_HAS_ due to the existing ARCH_HAS_KEXEC_PURGATORY.
- To account for the slight differences, new options ARCH_SUPPORTS_KEXEC,
  ARCH_SUPPORTS_KEXEC_FILE and ARCH_SUPPORTS_CRASH_DUMP are used to
  elicit the same side effects as the original arch/<arch>/Kconfig
  files for KEXEC and CRASH options.
  NOTE: I'm open to a better name than 'ARCH_SUPPORTS', perhaps
  ARCH_CUSTOMIZE ?

An example, 'make menuconfig' illustrating the submenu:

  > General setup > Kexec and crash features
  [*] Enable kexec system call
  [*] Enable kexec file based system call
  [*]   Verify kernel signature during kexec_file_load() syscall
  [ ] Require a valid signature in kexec_file_load() syscall
  [ ] Enable bzImage signature verification support
  [*] kexec jump
  [*] kernel crash dumps
  [*]   Update the crash elfcorehdr on system configuration changes

The three main options are KEXEC, KEXEC_FILE and CRASH_DUMP. In the
process of consolidating these options, I encountered slight differences
in the coding of these options in several of the architectures. As a
result, I settled on the following solution:

- Each of the three main options has a 'depends on ARCH_HAS_<option>'
statement: ARCH_HAS_KEXEC, ARCH_HAS_KEXEC_FILE, ARCH_HAS_CRASH_DUMP.

  For example, the KEXEC_FILE option has a 'depends on
  ARCH_HAS_KEXEC_FILE' statement.

- The boolean ARCH_HAS_<option> in effect allows the arch to determine
  when the feature is allowed.  Archs which don't have the feature
  simply do not provide the corresponding ARCH_HAS_<option>.
  For each arch, where there previously were KEXEC and/or CRASH
  options, these have been replaced with the corresponding boolean
  ARCH_HAS_<option>, and an appropriate def_bool statement.

  For example, if the arch supports KEXEC_FILE, then the
  ARCH_HAS_KEXEC_FILE simply has a 'def_bool y'. This permits the
  KEXEC_FILE option to be available.

  If the arch has a 'depends on' statement in its original coding
  of the option, then that expression becomes part of the def_bool
  expression. For example, arm64 had:

  config KEXEC
depends on PM_SLEEP_SMP

  and in this solution, this converts to:

  config ARCH_HAS_KEXEC
def_bool PM_SLEEP_SMP


- In order to account for the differences in the config coding for
  the three common options, the ARCH_SUPPORTS_<option> is used.
  This option has a 'depends on <option>' statement to couple it
  to the main option, and from there can insert the differences
  from the common option and the arch original coding of that option.

  For example, a few archs enable CRYPTO and CRYPTO_SHA256 for
  KEXEC_FILE. These require an ARCH_SUPPORTS_KEXEC_FILE and
  'select CRYPTO' and 'select CRYPTO_SHA256' statements.

Illustrating the option relationships:

For KEXEC:
 

[PATCH v1 03/21] arm/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/arm/Kconfig | 29 -
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0fb4b218f665..1eda2523134f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1639,20 +1639,8 @@ config XIP_DEFLATED_DATA
  copied, saving some precious ROM space. A possible drawback is a
  slightly longer boot delay.
 
-config KEXEC
-   bool "Kexec system call (EXPERIMENTAL)"
-   depends on (!SMP || PM_SLEEP_SMP)
-   depends on MMU
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.
+config ARCH_HAS_KEXEC
+   def_bool (!SMP || PM_SLEEP_SMP) && MMU
 
 config ATAGS_PROC
bool "Export atags in procfs"
@@ -1662,17 +1650,8 @@ config ATAGS_PROC
  Should the atags used to boot the kernel be exported in an "atags"
  file in procfs. Useful with kexec.
 
-config CRASH_DUMP
-   bool "Build kdump crash kernel (EXPERIMENTAL)"
-   help
- Generate crash dump after being started by kexec. This should
- be normally only set in special crash dump kernels which are
- loaded in the main kernel with kexec-tools into a specially
- reserved region and then later executed after a crash by
- kdump/kexec. The crash dump kernel must be compiled to a
- memory address not used by the main kernel
-
- For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
 
 config AUTO_ZRELADDR
bool "Auto calculation of the decompressed kernel image address" if 
!ARCH_MULTIPLATFORM
-- 
2.31.1



[PATCH v1 02/21] x86/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/x86/Kconfig | 89 +++-
 1 file changed, 13 insertions(+), 76 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..7dff2481abe0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2043,88 +2043,25 @@ config EFI_RUNTIME_MAP
 
 source "kernel/Kconfig.hz"
 
-config KEXEC
-   bool "kexec system call"
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
-
-config KEXEC_FILE
-   bool "kexec file based system call"
-   select KEXEC_CORE
-   select HAVE_IMA_KEXEC if IMA
-   depends on X86_64
-   depends on CRYPTO=y
-   depends on CRYPTO_SHA256=y
-   help
- This is new version of kexec system call. This system call is
- file based and takes file descriptors as system call argument
- for kernel and initramfs as opposed to list of segments as
- accepted by previous system call.
+config ARCH_HAS_KEXEC
+   def_bool y
 
-config ARCH_HAS_KEXEC_PURGATORY
-   def_bool KEXEC_FILE
+config ARCH_HAS_KEXEC_FILE
+   def_bool X86_64 && CRYPTO && CRYPTO_SHA256
 
-config KEXEC_SIG
-   bool "Verify kernel signature during kexec_file_load() syscall"
+config ARCH_SUPPORTS_KEXEC_FILE
+   def_bool y
depends on KEXEC_FILE
-   help
-
- This option makes the kexec_file_load() syscall check for a valid
- signature of the kernel image.  The image can still be loaded without
- a valid signature unless you also enable KEXEC_SIG_FORCE, though if
- there's a signature that we can check, then it must be valid.
-
- In addition to this option, you need to enable signature
- verification for the corresponding kernel image type being
- loaded in order for this to work.
-
-config KEXEC_SIG_FORCE
-   bool "Require a valid signature in kexec_file_load() syscall"
-   depends on KEXEC_SIG
-   help
- This option makes kernel signature verification mandatory for
- the kexec_file_load() syscall.
+   select HAVE_IMA_KEXEC if IMA
 
-config KEXEC_BZIMAGE_VERIFY_SIG
-   bool "Enable bzImage signature verification support"
-   depends on KEXEC_SIG
-   depends on SIGNED_PE_FILE_VERIFICATION
-   select SYSTEM_TRUSTED_KEYRING
-   help
- Enable bzImage signature verification support.
+config ARCH_HAS_KEXEC_PURGATORY
+   def_bool KEXEC_FILE
 
-config CRASH_DUMP
-   bool "kernel crash dumps"
-   depends on X86_64 || (X86_32 && HIGHMEM)
-   help
- Generate crash dump after being started by kexec.
- This should be normally only set in special crash dump kernels
- which are loaded in the main kernel with kexec-tools into
- a specially reserved region and then later executed after
- a crash by kdump/kexec. The crash dump kernel must be compiled
- to a memory address not used by the main kernel or BIOS using
- PHYSICAL_START, or it must be built as a relocatable image
- (CONFIG_RELOCATABLE=y).
- For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARCH_HAS_KEXEC_JUMP
+   def_bool y
 
-config KEXEC_JUMP
-   bool "kexec jump"
-   depends on KEXEC && HIBERNATION
-   help
- Jump between original kernel and kexeced kernel and invoke
- code in physical address mode via KEXEC
+config ARCH_HAS_CRASH_DUMP
+   def_bool X86_64 || (X86_32 && HIGHMEM)
 
 config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || 
CRASH_DUMP)
-- 
2.31.1



[PATCH v1 05/21] arm64/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/arm64/Kconfig | 61 --
 1 file changed, 10 insertions(+), 51 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 343e1e1cae10..33552476a877 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1433,60 +1433,19 @@ config PARAVIRT_TIME_ACCOUNTING
 
  If in doubt, say N here.
 
-config KEXEC
-   depends on PM_SLEEP_SMP
-   select KEXEC_CORE
-   bool "kexec system call"
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
-config KEXEC_FILE
-   bool "kexec file based system call"
-   select KEXEC_CORE
-   select HAVE_IMA_KEXEC if IMA
-   help
- This is new version of kexec system call. This system call is
- file based and takes file descriptors as system call argument
- for kernel and initramfs as opposed to list of segments as
- accepted by previous system call.
-
-config KEXEC_SIG
-   bool "Verify kernel signature during kexec_file_load() syscall"
-   depends on KEXEC_FILE
-   help
- Select this option to verify a signature with loaded kernel
- image. If configured, any attempt of loading a image without
- valid signature will fail.
-
- In addition to that option, you need to enable signature
- verification for the corresponding kernel image type being
- loaded in order for this to work.
+config ARCH_HAS_KEXEC
+   def_bool PM_SLEEP_SMP
 
-config KEXEC_IMAGE_VERIFY_SIG
-   bool "Enable Image signature verification support"
-   default y
-   depends on KEXEC_SIG
-   depends on EFI && SIGNED_PE_FILE_VERIFICATION
-   help
- Enable Image signature verification support.
-
-comment "Support for PE file signature verification disabled"
-   depends on KEXEC_SIG
-   depends on !EFI || !SIGNED_PE_FILE_VERIFICATION
+config ARCH_HAS_KEXEC_FILE
+   def_bool y
 
-config CRASH_DUMP
-   bool "Build kdump crash kernel"
-   help
- Generate crash dump after being started by kexec. This should
- be normally only set in special crash dump kernels which are
- loaded in the main kernel with kexec-tools into a specially
- reserved region and then later executed after a crash by
- kdump/kexec.
+config ARCH_SUPPORTS_KEXEC_FILE
+   def_bool y
+   depends on KEXEC_FILE
+   select HAVE_IMA_KEXEC if IMA
 
- For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARCH_HAS_CRASH_DUMP
+   def_bool y
 
 config TRANS_TABLE
def_bool y
-- 
2.31.1



[PATCH v1 13/21] sh/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
---
 arch/sh/Kconfig | 46 --
 1 file changed, 8 insertions(+), 38 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 9652d367fc37..b9bfe5c3139c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -546,44 +546,14 @@ menu "Kernel features"
 
 source "kernel/Kconfig.hz"
 
-config KEXEC
-   bool "kexec system call (EXPERIMENTAL)"
-   depends on MMU
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.  And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
-
-config CRASH_DUMP
-   bool "kernel crash dumps (EXPERIMENTAL)"
-   depends on BROKEN_ON_SMP
-   help
- Generate crash dump after being started by kexec.
- This should be normally only set in special crash dump kernels
- which are loaded in the main kernel with kexec-tools into
- a specially reserved region and then later executed after
- a crash by kdump/kexec. The crash dump kernel must be compiled
- to a memory address not used by the main kernel using
- PHYSICAL_START.
-
- For more details see Documentation/admin-guide/kdump/kdump.rst
-
-config KEXEC_JUMP
-   bool "kexec jump (EXPERIMENTAL)"
-   depends on KEXEC && HIBERNATION
-   help
- Jump between original kernel and kexeced kernel and invoke
- code via KEXEC
+config ARCH_HAS_KEXEC
+   def_bool MMU
+
+config ARCH_HAS_CRASH_DUMP
+   def_bool BROKEN_ON_SMP
+
+config ARCH_HAS_KEXEC_JUMP
+   def_bool y
 
 config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || 
CRASH_DUMP)
-- 
2.31.1



[PATCH v1 10/21] powerpc/kexec: refactor for kernel/Kconfig.kexec

2023-06-12 Thread Eric DeVolder
The kexec and crash kernel options are provided in the common
kernel/Kconfig.kexec. Utilize the common options and provide
the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the
equivalent set of KEXEC and CRASH options.

Signed-off-by: Eric DeVolder 
Reviewed-by: Sourabh Jain 
---
 arch/powerpc/Kconfig | 55 ++--
 1 file changed, 17 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bff5820b7cda..36f2fe0cc8a5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -588,41 +588,21 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
default "y" if PPC_POWERNV
select ARCH_SUPPORTS_MEMORY_FAILURE
 
-config KEXEC
-   bool "kexec system call"
-   depends on PPC_BOOK3S || PPC_E500 || (44x && !SMP)
-   select KEXEC_CORE
-   help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel.  It is like a reboot
- but it is independent of the system firmware.   And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
-
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you.  As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
-
-config KEXEC_FILE
-   bool "kexec file based system call"
-   select KEXEC_CORE
-   select HAVE_IMA_KEXEC if IMA
-   select KEXEC_ELF
-   depends on PPC64
-   depends on CRYPTO=y
-   depends on CRYPTO_SHA256=y
-   help
- This is a new version of the kexec system call. This call is
- file based and takes in file descriptors as system call arguments
- for kernel and initramfs as opposed to a list of segments as is the
- case for the older kexec call.
+config ARCH_HAS_KEXEC
+   def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
+
+config ARCH_HAS_KEXEC_FILE
+   def_bool PPC64 && CRYPTO && CRYPTO_SHA256
 
 config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
 
+config ARCH_SUPPORTS_KEXEC_FILE
+   def_bool y
+   depends on KEXEC_FILE
+   select KEXEC_ELF
+   select HAVE_IMA_KEXEC if IMA
+
 config PPC64_BIG_ENDIAN_ELF_ABI_V2
bool "Build big-endian kernel using ELF ABI V2 (EXPERIMENTAL)"
depends on PPC64 && CPU_BIG_ENDIAN
@@ -682,14 +662,13 @@ config RELOCATABLE_TEST
  loaded at, which tends to be non-zero and therefore test the
  relocation code.
 
-config CRASH_DUMP
-   bool "Build a dump capture kernel"
-   depends on PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP)
+config ARCH_HAS_CRASH_DUMP
+   def_bool PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP)
+
+config ARCH_SUPPORTS_CRASH_DUMP
+   def_bool y
+   depends on CRASH_DUMP
select RELOCATABLE if PPC64 || 44x || PPC_85xx
-   help
- Build a kernel suitable for use as a dump capture kernel.
- The same kernel binary can be used as production kernel and dump
- capture kernel.
 
 config FA_DUMP
bool "Firmware-assisted dump"
-- 
2.31.1



Re: [PATCH v1 00/21] refactor Kconfig to consolidate KEXEC and CRASH options

2023-06-12 Thread Eric DeVolder
My apologies, but this patch series is 13 patches, not 21. The last patch is
"PATCH v1 13/21 sh/kexec: refactor for kernel/Kconfig.kexec".

I'll correct for v2.
eric


[RFC PATCH 1/1] powerpc: update ppc_save_regs to save current r1 in pt_regs

2023-06-12 Thread Aditya Gupta
ppc_save_regs() skips one stack frame while saving the CPU register states.
Instead of saving the current r1, it pulls the previous stack frame's pointer.

When vmcores caused by a direct panic call (such as `echo c >
/proc/sysrq-trigger`) are debugged with gdb, gdb fails to show the
backtrace correctly. On further analysis, it was found to be because of a
mismatch between SP (r1) and NIP.

GDB uses the NIP to get the current function symbol and uses that
function's debug info to unwind previous frames, but due to the
mismatched SP and NIP the unwinding does not work: it fails to unwind to
the 2nd frame and hence does not show the backtrace.

GDB backtrace with vmcore of kernel without this patch:

-
(gdb) bt
 #0  0xc02a53e8 in crash_setup_regs (oldregs=,
newregs=0xc4f8f8d8) at ./arch/powerpc/include/asm/kexec.h:69
 #1  __crash_kexec (regs=) at kernel/kexec_core.c:974
 #2  0x0063 in ?? ()
 #3  0xc3579320 in ?? ()
-

Further analysis revealed that the mismatch occurred because
"ppc_save_regs" was saving the previous stack's SP instead of the current
r1. This patch fixes this by storing current r1 in the saved pt_regs.

GDB backtrace with vmcore of patched kernel:


(gdb) bt
 #0  0xc02a53e8 in crash_setup_regs (oldregs=0x0, 
newregs=0xc670b8d8)
at ./arch/powerpc/include/asm/kexec.h:69
 #1  __crash_kexec (regs=regs@entry=0x0) at kernel/kexec_core.c:974
 #2  0xc0168918 in panic (fmt=fmt@entry=0xc1654a60 "sysrq 
triggered crash\n")
at kernel/panic.c:358
 #3  0xc0b735f8 in sysrq_handle_crash (key=) at 
drivers/tty/sysrq.c:155
 #4  0xc0b742cc in __handle_sysrq (key=key@entry=99, 
check_mask=check_mask@entry=false)
at drivers/tty/sysrq.c:602
 #5  0xc0b7506c in write_sysrq_trigger (file=, 
buf=,
count=2, ppos=) at drivers/tty/sysrq.c:1163
 #6  0xc069a7bc in pde_write (ppos=, count=,
buf=, file=, pde=0xc362cb40) at 
fs/proc/inode.c:340
 #7  proc_reg_write (file=, buf=, 
count=,
ppos=) at fs/proc/inode.c:352
 #8  0xc05b3bbc in vfs_write (file=file@entry=0xc6aa6b00,
buf=buf@entry=0x61f498b4f60 ,
count=count@entry=2, pos=pos@entry=0xc670bda0) at 
fs/read_write.c:582
 #9  0xc05b4264 in ksys_write (fd=,
buf=0x61f498b4f60 , 
count=2)
at fs/read_write.c:637
 #10 0xc002ea2c in system_call_exception (regs=0xc670be80, 
r0=)
at arch/powerpc/kernel/syscall.c:171
 #11 0xc000c270 in system_call_vectored_common ()
at arch/powerpc/kernel/interrupt_64.S:192


Signed-off-by: Aditya Gupta 
---

More information:

This problem with the gdb backtrace was discovered while working on a crash
tool enhancement to improve crash analysis using gdb passthrough, to be
able to print function arguments and local variables inside the crash tool.
gdb passthrough simply asks gdb to handle the backtrace printing, and it
was noticed that gdb could not print a correct backtrace for some vmcores.

The changes introduced here have an implication for xmon: it will show
one extra `xmon` frame in the backtrace. Looking at older commits, it
seems the function was originally introduced as xmon_save_regs(); it has
since been renamed to ppc_save_regs() and is used in a few other places
as well.

Is it still needed to skip one stack frame, except when the caller is
xmon()? If I am correct, the frame was skipped to avoid displaying the
xmon() stack frame in the backtrace. Please advise me if my understanding
is correct or not.

---
 arch/powerpc/kernel/ppc_save_regs.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/ppc_save_regs.S 
b/arch/powerpc/kernel/ppc_save_regs.S
index 49813f982468..0caae1c8cf99 100644
--- a/arch/powerpc/kernel/ppc_save_regs.S
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -31,8 +31,8 @@ _GLOBAL(ppc_save_regs)
lbz r0,PACAIRQSOFTMASK(r13)
PPC_STL r0,SOFTE(r3)
 #endif
-   /* go up one stack frame for SP */
-   PPC_LL  r4,0(r1)
+   /* store current SP */
+   mr  r4,r1
PPC_STL r4,GPR1(r3)
/* get caller's LR */
PPC_LL  r0,LRSAVE(r4)
-- 
2.40.1



Re: [PATCH 8/9] powerpc: Add HOTPLUG_SMT support

2023-06-12 Thread Laurent Dufour
On 10/06/2023 23:10:02, Thomas Gleixner wrote:
> On Thu, Jun 01 2023 at 18:19, Laurent Dufour wrote:
>> @@ -435,12 +435,17 @@ void __init cpu_smt_disable(bool force)
>>   * The decision whether SMT is supported can only be done after the full
>>   * CPU identification. Called from architecture code.
>>   */
>> -void __init cpu_smt_check_topology(unsigned int num_threads)
>> +void __init cpu_smt_check_topology(unsigned int num_threads,
>> +   unsigned int max_threads)
>>  {
>>  if (!topology_smt_supported())
>>  cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
>>  
>> -cpu_smt_max_threads = num_threads;
>> +cpu_smt_max_threads = max_threads;
>> +
>> +WARN_ON(num_threads > max_threads);
>> +if (num_threads > max_threads)
>> +num_threads = max_threads;
> 
> This does not work. The call site does:
> 
>> +cpu_smt_check_topology(smt_enabled_at_boot, threads_per_core);
> 
> smt_enabled_at_boot is 0 when 'smt-enabled=off', which is not what the
> hotplug core expects. If SMT is disabled it brings up the primary
> thread, which means cpu_smt_num_threads = 1.

Thanks, Thomas.
Definitely, a test against smt_enabled_at_boot == 0 is required here.
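
Something along these lines, I suppose (untested sketch; the exact
placement in cpu_smt_check_topology() is to be confirmed):

	/* smt-enabled=off yields num_threads == 0, but the hotplug core
	 * still brings up the primary thread, so treat that as 1.
	 */
	if (!num_threads)
		num_threads = 1;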

> This needs more thoughts to avoid a completely inconsistent duct tape
> mess.

Apart from the test against smt_enabled_at_boot mentioned above, I can't see
anything else to rework. Am I missing something?

> 
> Btw, the command line parser and the variable smt_enabled_at_boot being
> type int allow negative number of threads too... Maybe not what you want.

I do agree, it should be an unsigned type.

Thanks,
Laurent.

> Thanks,
> 
> tglx
> 
> 
> 
> 



Re: [PATCH 8/9] powerpc: Add HOTPLUG_SMT support

2023-06-12 Thread Thomas Gleixner
On Mon, Jun 12 2023 at 17:20, Laurent Dufour wrote:
> On 10/06/2023 23:10:02, Thomas Gleixner wrote:
>> This needs more thoughts to avoid a completely inconsistent duct tape
>> mess.
>
> Apart from the test against smt_enabled_at_boot mentioned above, I can't see
> anything else to rework. Am I missing something?

See my comments on the core code changes.

>> Btw, the command line parser and the variable smt_enabled_at_boot being
>> type int allow negative number of threads too... Maybe not what you want.
>
> I do agree, it should be an unsigned type.

I assume you'll fix that yourself. :)

Thanks,

tglx


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-06-12 Thread Vladimir Oltean
On Mon, Jun 12, 2023 at 10:35:21AM -0400, Sean Anderson wrote:
> > And if SERDES protocol switching was not physically possible, would this
> > patch set still have value?
> 
> Yes. To e.g. set up SGMII25 or to fix the clocking situation.

Let me analyze the reasons one by one.

The clocking situation only exists due to a hope that protocol changing
would be possible. If you were sure it wasn't possible, your board would
have had refclks set up for protocol  via the supported PLL mapping .
In essence, I consider this almost a non-argument, as it fixes a
self-inflicted problem.

Have you actually tested changing individual lanes from SGMII to SGMII 2.5?
I know it works on LS1028A, but I don't know if that's also true on LS1046A.
Your comments do say "LYNX_PROTO_SGMII25, /* Not tested */".

Assuming a start from SERDES protocol  with PLL mapping ,
this protocol change implies powering on PLL 1 (reset state machine
wants it to be disabled) and moving those lanes from PLL 2 to PLL 1.

At first sight you might appear to have a point related to the fact that
PLL register writes are necessary, and thus this whole shebang is necessary.
But this can all be done using PBI commands, with the added benefit that
U-Boot can also use those SERDES networking ports, and not just Linux.
You can use the RCW+PBL specific to your board to inform the SoC that
your platform's refclk 1 is 156 MHz (something which the reset state
machine seems unable to learn, with protocol 0x). You don't have to
put that in the device tree. You don't have to push code to any open
source project to expose your platform specific details. Then, just like
in the case of the Lynx 28G driver on LX2160, the SERDES driver could
just treat the PLL configuration as read-only, which would greatly
simplify things and eliminate the need for a clk driver.

Here is an illustrative example (sorry, I don't have a board with the
right refclk on that PLL, to verify all the way):

Add this to ./serdes_10g.rcw in the root of the qoriq-rcw repository:

/*
 * Registers for the Lynx 10G SERDES block.
 *
 * Must be included by an SoC-specific header that defines the
 * SRDS_BASE value.
 */

#define PLLnRSTCTL(n)   (SRDS_BASE + (0x20 * (n)))
#define PLLnCR0(n)  (SRDS_BASE + (0x20 * (n)) + 0x0004)
#define  POFF(x)(((x) << 31) & 0x8000)
#define  REFCLK_SEL(x)  (((x) << 28) & 0x7000)
#define  REFCLK_EN(x)   (((x) << 27) & 0x0800)
#define  FRATE_SEL(x)   (((x) << 16) & 0x000f)
#define  DLYDIV_SEL(x)  ((x) & 0x0003)
#define PCCR8   (SRDS_BASE + 0x0220)
#define  SGMIIA_KX(x)   (((x) << 31) & 0x8000)
#define  SGMIIA_CFG(x)  (((x) << 28) & 0x7000)
#define  SGMIIB_KX(x)   (((x) << 27) & 0x0800)
#define  SGMIIB_CFG(x)  (((x) << 24) & 0x0700)
#define  SGMIIC_KX(x)   (((x) << 23) & 0x0080)
#define  SGMIIC_CFG(x)  (((x) << 20) & 0x0070)
#define  SGMIID_KX(x)   (((x) << 19) & 0x0008)
#define  SGMIID_CFG(x)  (((x) << 16) & 0x0007)
#define  SGMIIE_KX(x)   (((x) << 15) & 0x8000)
#define  SGMIIE_CFG(x)  (((x) << 12) & 0x7000)
#define  SGMIIF_KX(x)   (((x) << 11) & 0x0800)
#define  SGMIIF_CFG(x)  (((x) << 8) & 0x0700)
#define  SGMIIG_KX(x)   (((x) << 7) & 0x0080)
#define  SGMIIG_CFG(x)  (((x) << 4) & 0x0070)
#define  SGMIIH_KX(x)   (((x) << 3) & 0x0008)
#define  SGMIIH_CFG(x)  ((x) & 0x0007)
#define PCCRB   (SRDS_BASE + 0x022c)
#define  XFIA_CFG(x)(((x) << 28) & 0x7000)
#define  XFIB_CFG(x)(((x) << 24) & 0x0700)
#define  XFIC_CFG(x)(((x) << 20) & 0x0070)
#define  XFID_CFG(x)(((x) << 16) & 0x0007)
#define  XFIE_CFG(x)(((x) << 12) & 0x7000)
#define  XFIF_CFG(x)(((x) << 8) & 0x0700)
#define  XFIG_CFG(x)(((x) << 4) & 0x0070)
#define  XFIH_CFG(x)((x) & 0x0007)
#define LNmGCR0(m)  (SRDS_BASE + (0x40 * (m)) + 0x0800)
#define  RPLL_LES(x)(((x) << 31) & 0x8000)
#define  RRAT_SEL(x)(((x) << 28) & 0x3000)
#define  TPLL_LES(x)(((x) << 27) & 0x0800)
#define  TRAT_SEL(x)(((x) << 24) & 0x0300)
#define  RRST_B(x)  (((x) << 22) & 0x0040)
#define  TRST_B(x)  (((x) << 21) & 0x0020)
#define  RX_PD(x)   (((x) << 20) & 0x0010)
#define  TX_PD(x)   (((x) << 19) & 0x0008)
#define  IF20BIT_EN(x)  (((x) << 18) & 0x0004)
#define  FIRST_LANE(x)  (((x) << 16) & 0x0001)
#define  GCR0_RSV   0x1000
#define  PROTS(x)   (((x) << 7) & 0x0f80)
#define LNmGCR1(m)  (SRDS_BASE + (0x40 * (m)) + 0x0804)
#define  RDAT_INV(x)(((x) << 31) & 0x8000)
#define  

Re: [PATCH v4 1/1] PCI: layerscape: Add the endpoint linkup notifier support

2023-06-12 Thread Frank Li
On Mon, May 15, 2023 at 11:10:49AM -0400, Frank Li wrote:
> Layerscape has PME interrupt, which can be used as linkup notifier.
> Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex
> when linkup detected.
> 
> Acked-by: Manivannan Sadhasivam 
> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Frank Li 
> ---

Ping, no comments for almost a month.

> Change from v3 to v4
>  - swap irq and big_endian
> Change from v2 to v3
>  - align 80 column
>  - clear irq firstly
>  - dev_info to dev_dbg
>  - remove double space
>  - update commit message
> 
> Change from v1 to v2
> - pme -> PME
> - irq -> IRQ
> - update dev_info message according to Bjorn's suggestion
> 
>  .../pci/controller/dwc/pci-layerscape-ep.c| 102 +-
>  1 file changed, 101 insertions(+), 1 deletion(-)
> 
> 


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-06-12 Thread Sean Anderson
On 6/10/23 18:21, Vladimir Oltean wrote:
> Hello Sean,
> 
> On Fri, Jun 09, 2023 at 03:19:22PM -0400, Sean Anderson wrote:
>> On 5/22/23 11:00, Vladimir Oltean wrote:
>> > On Mon, May 22, 2023 at 10:42:04AM -0400, Sean Anderson wrote:
>> >> Have you had a chance to review this driver?
>> > 
>> > Partially / too little (and no, I don't have an answer yet). I am
>> > debugging a SERDES protocol change procedure from XFI to SGMII.
>> 
>> I'd just like to reiterate that, like I said in the cover letter, I
>> believe this driver still has value even if it cannot yet perform
>> protocol switching.
>> 
>> Please send me your feedback, and I will try and incorporate it into the
>> next revision. Previously, you said you had major objections to the
>> contents of this series, but you still have not listed them.
> 
> And if SERDES protocol switching was not physically possible, would this
> patch set still have value?

Yes. To e.g. set up SGMII25 or to fix the clocking situation.

--Sean


Re: [PATCH v5 15/18] watchdog/perf: Add a weak function for an arch to detect if perf can use NMIs

2023-06-12 Thread Mark Rutland
On Mon, Jun 12, 2023 at 06:55:37AM -0700, Doug Anderson wrote:
> Mark,
> 
> On Mon, Jun 12, 2023 at 3:33 AM Mark Rutland  wrote:
> >
> > On Fri, May 19, 2023 at 10:18:39AM -0700, Douglas Anderson wrote:
> > > On arm64, NMI support needs to be detected at runtime. Add a weak
> > > function to the perf hardlockup detector so that an architecture can
> > > implement it to detect whether NMIs are available.
> > >
> > > Signed-off-by: Douglas Anderson 
> > > ---
> > > While I won't object to this patch landing, I consider it part of the
> > > arm64 perf hardlockup effort. I would be OK with the earlier patches
> > > in the series landing and then not landing ${SUBJECT} patch nor
> > > anything else later.
> >
> > FWIW, everything prior to this looks fine to me, so I reckon it'd be worth
> > splitting the series here and getting the buddy lockup detector in first, to
> > avoid a log-jam on all the subsequent NMI bits.
> 
> I think the whole series has already landed in Andrew's tree,
> including the arm64 "perf" lockup detector bits. I saw all the
> notifications from Andrew go through over the weekend that they were
> moved from an "unstable" branch to a "stable" one and I see them at:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/log/?h=mm-nonmm-stable
> 
> When I first saw Andrew land the arm64 perf lockup detector bits in
> his unstable branch several weeks ago, I sent a private message to the
> arm64 maintainers (yourself included) to make sure you were aware of
> it and that it hadn't been caught in mail filters. I got the
> impression that everything was OK. Is that not the case?

Sorry; I'm slowly catching up with a backlog of email, and I'm just behind.

Feel free to ignore this; sorry for the noise!

If we spot anything going wrong in testing we can look at fixing those up.

Thanks,
Mark.


Re: [PATCH v5 15/18] watchdog/perf: Add a weak function for an arch to detect if perf can use NMIs

2023-06-12 Thread Doug Anderson
Mark,

On Mon, Jun 12, 2023 at 3:33 AM Mark Rutland  wrote:
>
> On Fri, May 19, 2023 at 10:18:39AM -0700, Douglas Anderson wrote:
> > On arm64, NMI support needs to be detected at runtime. Add a weak
> > function to the perf hardlockup detector so that an architecture can
> > implement it to detect whether NMIs are available.
> >
> > Signed-off-by: Douglas Anderson 
> > ---
> > While I won't object to this patch landing, I consider it part of the
> > arm64 perf hardlockup effort. I would be OK with the earlier patches
> > in the series landing and then not landing ${SUBJECT} patch nor
> > anything else later.
>
> FWIW, everything prior to this looks fine to me, so I reckon it'd be worth
> splitting the series here and getting the buddy lockup detector in first, to
> avoid a log-jam on all the subsequent NMI bits.

I think the whole series has already landed in Andrew's tree,
including the arm64 "perf" lockup detector bits. I saw all the
notifications from Andrew go through over the weekend that they were
moved from an "unstable" branch to a "stable" one and I see them at:

https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/log/?h=mm-nonmm-stable

When I first saw Andrew land the arm64 perf lockup detector bits in
his unstable branch several weeks ago, I sent a private message to the
arm64 maintainers (yourself included) to make sure you were aware of
it and that it hadn't been caught in mail filters. I got the
impression that everything was OK. Is that not the case?


-Doug


Re: [PATCH v5 15/18] watchdog/perf: Add a weak function for an arch to detect if perf can use NMIs

2023-06-12 Thread Mark Rutland
On Fri, May 19, 2023 at 10:18:39AM -0700, Douglas Anderson wrote:
> On arm64, NMI support needs to be detected at runtime. Add a weak
> function to the perf hardlockup detector so that an architecture can
> implement it to detect whether NMIs are available.
> 
> Signed-off-by: Douglas Anderson 
> ---
> While I won't object to this patch landing, I consider it part of the
> arm64 perf hardlockup effort. I would be OK with the earlier patches
> in the series landing and then not landing ${SUBJECT} patch nor
> anything else later.

FWIW, everything prior to this looks fine to me, so I reckon it'd be worth
splitting the series here and getting the buddy lockup detector in first, to
avoid a log-jam on all the subsequent NMI bits.

Thanks,
Mark.

> I'll also note that, as an alternative to this, it would be nice if we
> could figure out how to make perf_event_create_kernel_counter() fail
> on arm64 if NMIs aren't available. Maybe we could add a "must_use_nmi"
> element to "struct perf_event_attr"?
> 
> (no changes since v4)
> 
> Changes in v4:
> - ("Add a weak function for an arch to detect ...") new for v4.
> 
>  include/linux/nmi.h|  1 +
>  kernel/watchdog_perf.c | 12 +++-
>  2 files changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 47db14e7da52..eb616fc07c85 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -210,6 +210,7 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
>  
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
>  u64 hw_nmi_get_sample_period(int watchdog_thresh);
> +bool arch_perf_nmi_is_available(void);
>  #endif
>  
>  #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
> diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
> index 349fcd4d2abc..8ea00c4a24b2 100644
> --- a/kernel/watchdog_perf.c
> +++ b/kernel/watchdog_perf.c
> @@ -234,12 +234,22 @@ void __init hardlockup_detector_perf_restart(void)
>   }
>  }
>  
> +bool __weak __init arch_perf_nmi_is_available(void)
> +{
> + return true;
> +}
> +
>  /**
>   * watchdog_hardlockup_probe - Probe whether NMI event is available at all
>   */
>  int __init watchdog_hardlockup_probe(void)
>  {
> - int ret = hardlockup_detector_event_create();
> + int ret;
> +
> + if (!arch_perf_nmi_is_available())
> + return -ENODEV;
> +
> + ret = hardlockup_detector_event_create();
>  
>   if (ret) {
>   pr_info("Perf NMI watchdog permanently disabled\n");
> -- 
> 2.40.1.698.g37aff9b760-goog
>
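
For reference, an arch opts in by overriding the weak function. On arm64 the
override would presumably be along these lines (sketch only; it assumes the
arm_pmu_irq_is_nmi() helper that the arm64 part of this effort relies on, so
treat the exact name and file as assumptions):

	/* arch/arm64/kernel/watchdog_hld.c (illustrative) */
	#include <linux/nmi.h>
	#include <linux/perf/arm_pmu.h>

	bool __init arch_perf_nmi_is_available(void)
	{
		/*
		 * PMU interrupts can only drive the hardlockup detector
		 * when they were requested as NMIs (pseudo-NMI enabled).
		 */
		return arm_pmu_irq_is_nmi();
	}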