[PATCH v2] PCI/AER: Handle Multi UnCorrectable/Correctable errors properly

2022-03-14 Thread Kuppuswamy Sathyanarayanan
Currently the aer_irq() handler returns IRQ_NONE unless at least one of
the PCI_ERR_ROOT_UNCOR_RCV or PCI_ERR_ROOT_COR_RCV bits is set in the
Root Error Status register. The assumption that no other case needs
handling is incorrect.

Consider a scenario where aer_irq() is triggered for a correctable
error, and while we process the error, but before we clear the error
status in the "Root Error Status" register, the same kind of error is
triggered again. Since aer_irq() only clears the events it saw, the
multi-bit error indication is left intact. This will cause the
interrupt to fire again, resulting in entering aer_irq() with just the
multi-bit error logged in the "Root Error Status" register.

Repeated AER recovery tests have revealed that this condition does
happen, and that it prevents any new interrupt from being triggered.
Allow the interrupt to be processed even if only the multiple ERR_COR
received bit (bit 1) or the multiple ERR_FATAL/NONFATAL received bit
(bit 3) is set.
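
For reference, the Root Error Status bits involved, as defined in
include/uapi/linux/pci_regs.h:

 #define PCI_ERR_ROOT_COR_RCV		0x00000001	/* ERR_COR Received */
 #define PCI_ERR_ROOT_MULTI_COR_RCV	0x00000002	/* Multiple ERR_COR */
 #define PCI_ERR_ROOT_UNCOR_RCV		0x00000004	/* ERR_FATAL/NONFATAL Received */
 #define PCI_ERR_ROOT_MULTI_UNCOR_RCV	0x00000008	/* Multiple ERR_FATAL/NONFATAL */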

This error can be reproduced by making the following changes to the
aer_irq() function and executing the given test commands.

 static irqreturn_t aer_irq(int irq, void *context)
 	struct aer_err_source e_src = {};

 	pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS,
 			      &e_src.status);
 +	pci_dbg(pdev->port, "Root Error Status: %04x\n",
 +		e_src.status);
 	if (!(e_src.status & AER_ERR_STATUS_MASK))
 		return IRQ_NONE;

 +	mdelay(5000);

 # Prep injection data for a correctable error.
 $ cd /sys/kernel/debug/apei/einj
 $ echo 0x0040 > error_type
 $ echo 0x4 > flags
 $ echo 0x891000 > param4

 # Root Error Status is initially clear
 $ setpci -s  ECAP0001+0x30.w
 0000

 # Inject one error
 $ echo 1 > error_inject

 # Interrupt received
 pcieport : AER: Root Error Status 0001

 # Inject another error (within 5 seconds)
 $ echo 1 > error_inject

 # No interrupt received, but "multiple ERR_COR" is now set
 $ setpci -s  ECAP0001+0x30.w
 0003

 # Wait for a while, then clear ERR_COR. A new interrupt immediately
 # fires.
 $ setpci -s  ECAP0001+0x30.w=0x1
 pcieport : AER: Root Error Status 0002

So far, the above issue has only been reproduced on the ICL server
platform.

[Eric: proposed reproducing steps]
Fixes: 4696b828ca37 ("PCI/AER: Hoist aerdrv.c, aer_inject.c up to drivers/pci/pcie/")
Reported-by: Eric Badger 
Reviewed-by: Ashok Raj 
Signed-off-by: Kuppuswamy Sathyanarayanan 

---

Changes since v1:
 * Added Fixes tag.
 * Included reproducing steps proposed by Eric.

 drivers/pci/pcie/aer.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9fa1f97e5b27..7952e5efd6cf 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -101,6 +101,11 @@ struct aer_stats {
 #define ERR_COR_ID(d)		(d & 0xffff)
 #define ERR_UNCOR_ID(d)	(d >> 16)
 
+#define AER_ERR_STATUS_MASK		(PCI_ERR_ROOT_UNCOR_RCV |	\
+					PCI_ERR_ROOT_COR_RCV |		\
+					PCI_ERR_ROOT_MULTI_COR_RCV |	\
+					PCI_ERR_ROOT_MULTI_UNCOR_RCV)
+
 static int pcie_aer_disable;
 static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
 
@@ -1196,7 +1201,7 @@ static irqreturn_t aer_irq(int irq, void *context)
struct aer_err_source e_src = {};
 
	pci_read_config_dword(rp, aer + PCI_ERR_ROOT_STATUS, &e_src.status);
-   if (!(e_src.status & (PCI_ERR_ROOT_UNCOR_RCV|PCI_ERR_ROOT_COR_RCV)))
+   if (!(e_src.status & AER_ERR_STATUS_MASK))
return IRQ_NONE;
 
	pci_read_config_dword(rp, aer + PCI_ERR_ROOT_ERR_SRC, &e_src.id);
-- 
2.25.1



linux-next: manual merge of the tip tree with the powerpc tree

2022-03-14 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the tip tree got a conflict in:

  arch/powerpc/include/asm/livepatch.h

between commit:

  a4520b252765 ("powerpc/ftrace: Add support for livepatch to PPC32")

from the powerpc tree and commit:

  a557abfd1a16 ("x86/livepatch: Validate __fentry__ location")

from the tip tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/include/asm/livepatch.h
index 6f10de6af6e3,7b9dcd51af32..
--- a/arch/powerpc/include/asm/livepatch.h
+++ b/arch/powerpc/include/asm/livepatch.h
@@@ -14,21 -14,11 +14,11 @@@
  #ifdef CONFIG_LIVEPATCH
  static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long 
ip)
  {
 -  struct pt_regs *regs = ftrace_get_regs(fregs);
 -
 -  regs_set_return_ip(regs, ip);
 +  ftrace_instruction_pointer_set(fregs, ip);
  }
- 
- #define klp_get_ftrace_location klp_get_ftrace_location
- static inline unsigned long klp_get_ftrace_location(unsigned long faddr)
- {
-   /*
-* Live patch works on PPC32 and only with -mprofile-kernel on PPC64. In
-* both cases, the ftrace location is always within the first 16 bytes.
-*/
-   return ftrace_location_range(faddr, faddr + 16);
- }
 +#endif /* CONFIG_LIVEPATCH */
  
 +#ifdef CONFIG_LIVEPATCH_64
  static inline void klp_init_thread_info(struct task_struct *p)
  {
/* + 1 to account for STACK_END_MAGIC */




Re: [PATCH 13/15] swiotlb: merge swiotlb-xen initialization into swiotlb

2022-03-14 Thread Stefano Stabellini
On Mon, 14 Mar 2022, Christoph Hellwig wrote:
> Reuse the generic swiotlb initialization for xen-swiotlb.  For ARM/ARM64
> this works trivially, while for x86 xen_swiotlb_fixup needs to be passed
> as the remap argument to swiotlb_init_remap/swiotlb_init_late.
> 
> Signed-off-by: Christoph Hellwig 

For arch/arm/xen and drivers/xen/swiotlb-xen.c

Reviewed-by: Stefano Stabellini 


Re: [PATCH 5/5] x86/pkeys: Standardize on u8 for pkey type

2022-03-14 Thread Dave Hansen
On 3/10/22 16:57, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> The number of pkeys supported on x86 and powerpc are much smaller than a
> u16 value can hold.  It is desirable to standardize on the type for
> pkeys.  powerpc currently supports the most pkeys at 32.  u8 is plenty
> large for that.
> 
> Standardize on the pkey types by changing u16 to u8.

How widely was this intended to "standardize" things?  Looks like it may
have missed a few spots.

Also, if we're worried about the type needing to change, or about the
wrong type being used, I guess we could just do a pkey_t typedef.
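
Something along these lines, as a purely illustrative sketch (pkey_t
and the helper below are hypothetical, not existing code):

	typedef u8 pkey_t;

	/* hypothetical: callers take and return pkey_t instead of u8/u16 */
	static inline bool pkey_is_valid(pkey_t pkey, u8 nr_pkeys)
	{
		return pkey < nr_pkeys;
	}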


Re: [PATCH 14/15] swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl

2022-03-14 Thread Boris Ostrovsky



On 3/14/22 3:31 AM, Christoph Hellwig wrote:

@@ -314,6 +293,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
  int swiotlb_init_late(size_t size, gfp_t gfp_mask,
int (*remap)(void *tlb, unsigned long nslabs))
  {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
unsigned char *vstart = NULL;
@@ -355,33 +335,28 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
(PAGE_SIZE << order) >> 20);
nslabs = SLABS_PER_PAGE << order;
}
-   rc = swiotlb_late_init_with_tbl(vstart, nslabs);
-   if (rc)
-   free_pages((unsigned long)vstart, order);
-
-   return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-   unsigned long bytes = nslabs << IO_TLB_SHIFT;
  
-	if (swiotlb_force_disable)
-		return 0;
+   if (remap)
+   rc = remap(vstart, nslabs);
+   if (rc) {
+   free_pages((unsigned long)vstart, order);
  
-	/* protect against double initialization */
-	if (WARN_ON_ONCE(mem->nslabs))
-   return -ENOMEM;
+   /* Min is 2MB */
+   if (nslabs <= 1024)
+   return rc;
+   nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
+   goto retry;
+   }



We now end up with two attempts to remap. I think this second one is
what we want, since it solves the problem I pointed out in the previous
patch.


-boris




Re: [PATCH 13/15] swiotlb: merge swiotlb-xen initialization into swiotlb

2022-03-14 Thread Boris Ostrovsky



On 3/14/22 3:31 AM, Christoph Hellwig wrote:

-
  static void __init pci_xen_swiotlb_init(void)
  {
if (!xen_initial_domain() && !x86_swiotlb_enable)
return;
x86_swiotlb_enable = true;
-   xen_swiotlb = true;
-   xen_swiotlb_init_early();
+   swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);



I think we need to have SWIOTLB_ANY set in x86_swiotlb_flags here.
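
i.e. something like this (an illustrative sketch only, assuming
SWIOTLB_ANY is the right way to lift the low-memory restriction here,
since xen_swiotlb_fixup makes the buffer DMA-reachable anyway):

	swiotlb_init_remap(true, x86_swiotlb_flags | SWIOTLB_ANY,
			   xen_swiotlb_fixup);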




	dma_ops = &xen_swiotlb_dma_ops;
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
@@ -88,14 +85,16 @@ static void __init pci_xen_swiotlb_init(void)
  
  int pci_xen_swiotlb_init_late(void)

  {
-   int rc;
-
-   if (xen_swiotlb)
+	if (dma_ops == &xen_swiotlb_dma_ops)
return 0;
  
-	rc = xen_swiotlb_init();
-	if (rc)
-		return rc;
+   /* we can work with the default swiotlb */
+   if (!io_tlb_default_mem.nslabs) {
+   int rc = swiotlb_init_late(swiotlb_size_or_default(),
+  GFP_KERNEL, xen_swiotlb_fixup);



This may be a comment for the previous patch, but looking at swiotlb_init_late():


retry:
    order = get_order(nslabs << IO_TLB_SHIFT);
    nslabs = SLABS_PER_PAGE << order;
    bytes = nslabs << IO_TLB_SHIFT;

    while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
    vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
  order);
    if (vstart)
    break;
    order--;
    }

    if (!vstart)
    return -ENOMEM;
    if (remap)
    rc = remap(vstart, nslabs);
    if (rc) {
    free_pages((unsigned long)vstart, order);

    /* Min is 2MB */
    if (nslabs <= 1024)
    return rc;
    nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
    goto retry;
    }

    if (order != get_order(bytes)) {
    pr_warn("only able to allocate %ld MB\n",
    (PAGE_SIZE << order) >> 20);
    nslabs = SLABS_PER_PAGE << order; <===
    }

    rc = swiotlb_late_init_with_tbl(vstart, nslabs);

Notice that we don't do remap() after the final update to nslabs. We should.
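
Something along these lines, perhaps (illustrative sketch only, reusing
the function's existing retry path):

	if (order != get_order(bytes)) {
		pr_warn("only able to allocate %ld MB\n",
			(PAGE_SIZE << order) >> 20);
		nslabs = SLABS_PER_PAGE << order;
		/* redo the remap for the reduced buffer size */
		if (remap && remap(vstart, nslabs)) {
			free_pages((unsigned long)vstart, order);
			if (nslabs <= 1024)
				return -ENOMEM;
			nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
			goto retry;
		}
	}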



-boris


Re: [PATCH v7 3/5] powerpc: Rework and improve STRICT_KERNEL_RWX patching

2022-03-14 Thread Jordan Niethe
On Sat, Mar 12, 2022 at 6:30 PM Christophe Leroy
 wrote:
>
> Hi Jordan
>
> Le 10/11/2021 à 01:37, Jordan Niethe a écrit :
> > From: "Christopher M. Riedl" 
> >
> > Rework code-patching with STRICT_KERNEL_RWX to prepare for a later patch
> > which uses a temporary mm for patching under the Book3s64 Radix MMU.
> > Make improvements by adding a WARN_ON when the patchsite doesn't match
> > after patching and return the error from __patch_instruction() properly.
> >
> > Signed-off-by: Christopher M. Riedl 
> > Signed-off-by: Jordan Niethe 
> > ---
> > v7: still pass addr to map_patch_area()
>
>
> This patch doesn't apply, can you rebase the series?
Yep, will do.
>
> Thanks
> Christophe
>
> > ---
> >   arch/powerpc/lib/code-patching.c | 20 ++--
> >   1 file changed, 10 insertions(+), 10 deletions(-)
> >
> > diff --git a/arch/powerpc/lib/code-patching.c 
> > b/arch/powerpc/lib/code-patching.c
> > index 29a30c3068ff..d586bf9c7581 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -75,6 +75,7 @@ static inline void stop_using_temp_mm(struct 
> > temp_mm_state prev_state)
> >   }
> >
> >   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > +static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
> >
> >   static int text_area_cpu_up(unsigned int cpu)
> >   {
> > @@ -87,6 +88,7 @@ static int text_area_cpu_up(unsigned int cpu)
> >   return -1;
> >   }
> >   this_cpu_write(text_poke_area, area);
> > + this_cpu_write(cpu_patching_addr, (unsigned long)area->addr);
> >
> >   return 0;
> >   }
> > @@ -172,11 +174,10 @@ static inline int unmap_patch_area(unsigned long addr)
> >
> >   static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
> >   {
> > - int err;
> > + int err, rc = 0;
> >   u32 *patch_addr = NULL;
> >   unsigned long flags;
> >   unsigned long text_poke_addr;
> > - unsigned long kaddr = (unsigned long)addr;
> >
> >   /*
> >* During early early boot patch_instruction is called
> > @@ -188,15 +189,13 @@ static int do_patch_instruction(u32 *addr, struct 
> > ppc_inst instr)
> >
> >   local_irq_save(flags);
> >
> > - text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> > - if (map_patch_area(addr, text_poke_addr)) {
> > - err = -1;
> > + text_poke_addr = __this_cpu_read(cpu_patching_addr);
> > + err = map_patch_area(addr, text_poke_addr);
> > + if (err)
> >   goto out;
> > - }
> > -
> > - patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));
> >
> > - __patch_instruction(addr, instr, patch_addr);
> > + patch_addr = (u32 *)(text_poke_addr | offset_in_page(addr));
> > + rc = __patch_instruction(addr, instr, patch_addr);
> >
> >   err = unmap_patch_area(text_poke_addr);
> >   if (err)
> > @@ -204,8 +203,9 @@ static int do_patch_instruction(u32 *addr, struct 
> > ppc_inst instr)
> >
> >   out:
> >   local_irq_restore(flags);
> > + WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
> >
> > - return err;
> > + return rc ? rc : err;
> >   }
> >   #else /* !CONFIG_STRICT_KERNEL_RWX */
> >


Re: [PATCH 3/4] powerpc: Handle prefixed instructions in show_user_instructions()

2022-03-14 Thread Jordan Niethe
On Wed, Feb 23, 2022 at 1:34 AM Christophe Leroy
 wrote:
>
>
>
> Le 02/06/2020 à 07:27, Jordan Niethe a écrit :
> > Currently prefixed instructions are treated as two word instructions by
> > show_user_instructions(), treat them as a single instruction. '<' and
> > '>' are placed around the instruction at the NIP, and for prefixed
> > instructions this is placed around the prefix only. Make the '<' and '>'
> > wrap the prefix and suffix.
> >
> > Currently showing a prefixed instruction looks like:
> > fbe1fff8 3920 0600 a3e3 <0400> f7e4 ebe1fff8 4e800020
> >
> > Make it look like:
> > 0xfbe1fff8 0x3920 0x0600 0xa3e3 <0x0400 0xf7e4> 
> > 0xebe1fff8 0x4e800020 0x 0x
>
> Is it really needed to have the leading 0x ?
You are right, that is not consistent with how instructions are usually
dumped. That formatting comes from ppc_inst_as_str(); when mpe merged it
he removed the leading 0x.

>
> And is there a reason for that two 0x at the end of the new line
> that we don't have at the end of the old line ?
No, that is wrong.
>
> This is initially split into 8 instructions per line in order to fit in
> a 80 columns screen/terminal.
>
> Could you make it such that it still fits within 80 cols ?
Sure that makes sense.
>
> Same for patch 4 on show_user_instructions()
>
> Christophe


[RFC v2 PATCH 5/5] powerpc/crash hp: add crash hotplug support for kexec_load

2022-03-14 Thread Sourabh Jain
The kernel changes needed to add crash hotplug support for the
kexec_load system call are similar to those for kexec_file_load (which
has already been implemented in earlier patches). Since the kexec
segment array is prepared by the kexec tool in userspace, the kernel is
not aware of which index in the segment array holds the FDT segment.

The only change needed to enable crash hotplug support for kexec_load
is updating the crash hotplug handler to identify the FDT segment from
the kexec segment array.

Signed-off-by: Sourabh Jain 
---
 arch/powerpc/kexec/core_64.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index a4defb84720f..bb449f6e380f 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -480,7 +480,9 @@ int update_cpus_node(void *fdt)
 void arch_crash_hotplug_handler(struct kimage *image,
 unsigned int hp_action, unsigned long a, unsigned long b)
 {
-	void *fdt;
+	void *fdt, *ptr;
+	unsigned int n;
+	unsigned long mem, memsz;
 
 	/* No action needed for CPU hot-unplug */
 	if (hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
@@ -492,6 +494,23 @@ void arch_crash_hotplug_handler(struct kimage *image,
 		return;
 	}
 
+	/*
+	 * Since kexec segments for the kexec_load system call are prepared
+	 * by the kexec tool in userspace, we need to loop through all the
+	 * segments to find the segment index corresponding to the FDT
+	 * segment. In the case of kexec_file_load it is discovered during
+	 * the load itself.
+	 */
+	if (!image->fdt_index_valid) {
+		for (n = 0; n < image->nr_segments; n++) {
+			mem = image->segment[n].mem;
+			memsz = image->segment[n].memsz;
+			ptr = __va(mem);
+			if (ptr && fdt_magic(ptr) == FDT_MAGIC) {
+				image->fdt_index = n;
+				image->fdt_index_valid = true;
+			}
+		}
+	}
+
 	/* Must have valid FDT index */
 	if (!image->fdt_index_valid) {
 		pr_err("crash hp: unable to locate FDT segment");
-- 
2.35.1



[RFC v2 PATCH 4/5] powerpc/crash hp: add crash hotplug support for kexec_file_load

2022-03-14 Thread Sourabh Jain
Two major changes are done to enable the crash CPU hotplug handler.
Firstly, the kexec load path is updated to prepare the kimage for
hotplug changes, and secondly, the crash hotplug handler itself is
implemented.

On the kexec load path, the memsz allocation of the crash FDT segment
is updated to ensure that it has sufficient buffer space to accommodate
future hot-added CPUs, and the kimage members that track the FDT
segment are initialized.

The crash hotplug handler updates the cpus node of the crash FDT. While
the crash FDT is being updated, kexec_crash_image is marked invalid, and
it is restored after the FDT update to avoid a race.

Since crash memory hotplug support is not there yet, the crash hotplug
handler simply warns the user and returns.

Signed-off-by: Sourabh Jain 
---
 arch/powerpc/kexec/core_64.c | 46 
 arch/powerpc/kexec/elf_64.c  | 39 ++
 2 files changed, 85 insertions(+)

diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 249d2632526d..a4defb84720f 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -466,6 +466,52 @@ int update_cpus_node(void *fdt)
return ret;
 }
 
+#ifdef CONFIG_CRASH_HOTPLUG
+/**
+ * arch_crash_hotplug_handler() - Handle hotplug FDT changes
+ * @image: the active struct kimage
+ * @hp_action: the hot un/plug action being handled
+ * @a: first parameter dependent upon hp_action
+ * @b: second parameter dependent upon hp_action
+ *
+ * To accurately reflect CPU hot un/plug changes, the FDT
+ * must be updated with the new list of CPUs and memories.
+ */
+void arch_crash_hotplug_handler(struct kimage *image,
+unsigned int hp_action, unsigned long a, unsigned long b)
+{
+	void *fdt;
+
+   /* No action needed for CPU hot-unplug */
+   if (hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+   return;
+
+	/* crash update on memory hotplug is not supported yet */
+	if (hp_action == KEXEC_CRASH_HP_REMOVE_MEMORY ||
+	    hp_action == KEXEC_CRASH_HP_ADD_MEMORY) {
+		pr_err("crash hp: crash update is not supported with memory hotplug\n");
+		return;
+	}
+
+	/* Must have valid FDT index */
+	if (!image->fdt_index_valid) {
+		pr_err("crash hp: unable to locate FDT segment");
+		return;
+	}
+
+	fdt = __va((void *)image->segment[image->fdt_index].mem);
+
+	/* Temporarily invalidate the crash image while it is replaced */
+	xchg(&kexec_crash_image, NULL);
+
+	/* update FDT to reflect changes to CPU resources */
+	if (update_cpus_node(fdt))
+		pr_err("crash hp: failed to update crash FDT");
+
+	/* The crash image is now valid once again */
+	xchg(&kexec_crash_image, image);
+}
+#endif /* CONFIG_CRASH_HOTPLUG */
+
 #ifdef CONFIG_PPC_64S_HASH_MMU
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index eeb258002d1e..d522cbb19121 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -24,6 +24,32 @@
 #include 
 #include 
 
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#define MAX_CORE 256
+#define PER_CORE_NODE_SIZE 1500
+
+/**
+ * get_crash_fdt_mem_sz() - calculate mem size for crash kernel FDT
+ * @fdt: pointer to crash kernel FDT
+ *
+ * Calculate the buffer space needed to add more CPU nodes to the FDT
+ * after the capture kernel load, due to hot add events.
+ *
+ * Some assumptions are made to calculate the additional buffer size
+ * needed to accommodate future hot add CPUs in the crash FDT: the
+ * maximum core count in the system will not go beyond MAX_CORE, and
+ * the memory needed to store per-core data in the FDT is
+ * PER_CORE_NODE_SIZE.
+ *
+ * Certainly the MAX_CORE count could be replaced with the possible
+ * core count, and PER_CORE_NODE_SIZE with some standard value instead
+ * of a randomly observed per-core size value on a Power9 LPAR.
+ */
+static unsigned int get_crash_fdt_mem_sz(void *fdt)
+{
+	return fdt_totalsize(fdt) + (PER_CORE_NODE_SIZE * MAX_CORE);
+}
+#endif
+
 static void *elf64_load(struct kimage *image, char *kernel_buf,
unsigned long kernel_len, char *initrd,
unsigned long initrd_len, char *cmdline,
@@ -123,6 +149,19 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
kbuf.buf_align = PAGE_SIZE;
kbuf.top_down = true;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+   if (image->type == KEXEC_TYPE_CRASH) {
+   kbuf.memsz = get_crash_fdt_mem_sz(fdt);
+   fdt_set_totalsize(fdt, kbuf.memsz);
+   image->fdt_index = image->nr_segments;
+   image->fdt_index_valid = true;
+   } else
+#endif
+   {
+   kbuf.memsz = fdt_totalsize(fdt);
+   }
+
ret = kexec_add_buffer();
if (ret)
goto out_free_fdt;
-- 
2.35.1



[RFC v2 PATCH 2/5] powerpc/crash hp: introduce a new config option CRASH_HOTPLUG

2022-03-14 Thread Sourabh Jain
The option CRASH_HOTPLUG enables in-kernel updates of the kexec
segments on hotplug events.

All the updates needed on the capture kernel load path in the kernel,
for both the kexec_load and kexec_file_load system calls, will be kept
under this config.

Signed-off-by: Sourabh Jain 
---
 arch/powerpc/Kconfig | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b779603978e1..b816339ef8c7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -623,6 +623,17 @@ config FA_DUMP
  If unsure, say "y". Only special kernels like petitboot may
  need to say "N" here.
 
+config CRASH_HOTPLUG
+	bool "kernel updates of crash kexec segments"
+	depends on CRASH_DUMP && HOTPLUG_CPU && KEXEC_FILE
+	help
+	  An efficient way to keep the capture kernel up-to-date with CPU
+	  hotplug events. On a hotplug event (CPU/memory) the kexec segments
+	  of the capture kernel become stale and need to be updated with the
+	  latest CPU and memory regions. In this method the kernel performs
+	  a minimal update of only the relevant kexec segments on a CPU
+	  hotplug event, instead of triggering a full capture kernel reload
+	  from userspace using a udev rule.
+
 config PRESERVE_FA_DUMP
bool "Preserve Firmware-assisted dump"
depends on PPC64 && PPC_POWERNV && !FA_DUMP
-- 
2.35.1



[RFC v2 PATCH 3/5] powerpc/crash hp: update kimage struct

2022-03-14 Thread Sourabh Jain
Two new members, fdt_index and fdt_index_valid, are added to the kimage
struct to track the FDT kexec segment. These new members help the crash
hotplug handler to easily access the FDT segment from the kexec segment
array. Otherwise, we would have to loop through all kexec segments to
find the FDT segment.

Note: It is possible that we might need to update multiple kexec
segments for a hotplug event; in that case, tracking each segment with
an individual variable in the kimage struct is not efficient. How about
an array to track all the hotplug kexec segments?

struct hp_segment {
	int index;
	bool is_valid;
};

Signed-off-by: Sourabh Jain 
---
 include/linux/kexec.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e00c373c4095..e9038f2c75ee 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -306,6 +306,8 @@ struct kimage {
int offlinecpu;
bool elf_index_valid;
int elf_index;
+   int fdt_index;
+   bool fdt_index_valid;
 #endif
 #endif
 
-- 
2.35.1



[RFC v2 PATCH 0/5] In kernel handling of CPU hotplug events for crash kernel

2022-03-14 Thread Sourabh Jain
This patch series implements the crash hotplug handler on PowerPC,
introduced by the https://lkml.org/lkml/2022/2/9/1406 patch series.


The Problem:

Post hotplug/DLPAR events, the capture kernel holds stale information
about the system. Dump collection with a stale capture kernel might end
up in dump capture failure or an inaccurate dump.


Existing solution:
==
The existing solution to keep the capture kernel up-to-date is to
observe the hotplug event via a udev rule and trigger a full capture
kernel reload after each hotplug event.

Shortcomings:

- Leaves a window where a kernel crash might not lead to successful
  dump collection.
- Reloading all kexec components for each hotplug event is inefficient.
  Since only one or two kexec components need to be updated due to a
  hotplug event, reloading all of them is redundant.
- udev rules are prone to races if hotplug events are frequent.

More about the issues with the existing solution is posted here:
 - https://lkml.org/lkml/2020/12/14/532
 - https://lists.ozlabs.org/pipermail/linuxppc-dev/2022-February/240254.html

Proposed Solution:
==
Instead of reloading all kexec segments on a hotplug event, this patch
series focuses on updating only the relevant kexec segment. Once the
kexec segments are loaded in the kernel reserved area, an arch-specific
hotplug handler updates the relevant kexec segment based on the hotplug
event type.

As mentioned above, this patch series implements a PowerPC crash
hotplug handler for CPUs. A crash hotplug handler for memory is on our
TODO list.


A couple of minor changes are required to realize the benefit of the patch
series:

- disable the udev rule:

  comment out the below line in the kdump udev rule file:
  RHEL: /usr/lib/udev/rules.d/98-kexec.rules
  # SUBSYSTEM=="cpu", ACTION=="online", GOTO="kdump_reload_cpu"

- the kexec tool needs to be updated with the patch below for the
  kexec_load system call to work (not needed if the -s option is used
  during kexec panic load):

---
diff --git a/kexec/arch/ppc64/kexec-elf-ppc64.c 
b/kexec/arch/ppc64/kexec-elf-ppc64.c
index 695b8b0..1dc6490 100644
--- a/kexec/arch/ppc64/kexec-elf-ppc64.c
+++ b/kexec/arch/ppc64/kexec-elf-ppc64.c
@@ -45,6 +45,29 @@ uint64_t initrd_base, initrd_size;
 unsigned char reuse_initrd = 0;
 const char *ramdisk;
 
+#define MAX_CORE 256
+#define PER_CORE_NODE_SIZE 1500
+
+/**
+ * get_crash_fdt_mem_sz() - calculate mem size for crash kernel FDT
+ * @fdt: pointer to crash kernel FDT
+ *
+ * Calculate the buffer space needed to add more CPU nodes to the FDT
+ * after the capture kernel load, due to hot add events.
+ *
+ * Some assumptions are made to calculate the additional buffer size
+ * needed to accommodate future hot add CPUs in the crash FDT: the
+ * maximum core count in the system would not go beyond MAX_CORE, and
+ * the memory needed to store per-core data in the FDT is
+ * PER_CORE_NODE_SIZE.
+ *
+ * Certainly the MAX_CORE count could be replaced with the possible
+ * core count, and PER_CORE_NODE_SIZE with some standard value instead
+ * of a randomly observed per-core size value on a Power9 LPAR.
+ */
+static unsigned int get_crash_fdt_mem_sz(void *fdt)
+{
+	return fdt_totalsize(fdt) + (PER_CORE_NODE_SIZE * MAX_CORE);
+}
+
 int elf_ppc64_probe(const char *buf, off_t len)
 {
struct mem_ehdr ehdr;
@@ -179,6 +202,7 @@ int elf_ppc64_load(int argc, char **argv, const char *buf, 
off_t len,
uint64_t max_addr, hole_addr;
char *seg_buf = NULL;
off_t seg_size = 0;
+   unsigned int mem_sz = 0;
struct mem_phdr *phdr;
size_t size;
 #ifdef NEED_RESERVE_DTB
@@ -329,7 +353,13 @@ int elf_ppc64_load(int argc, char **argv, const char *buf, 
off_t len,
if (result < 0)
return result;
 
-   my_dt_offset = add_buffer(info, seg_buf, seg_size, seg_size,
+	if (info->kexec_flags & KEXEC_ON_CRASH) {
+		mem_sz = get_crash_fdt_mem_sz((void *)seg_buf);
+		fdt_set_totalsize(seg_buf, mem_sz);
+		info->fdt_index = info->nr_segments;
+	} else {
+		mem_sz = seg_size;
+	}
+
+   my_dt_offset = add_buffer(info, seg_buf, seg_size, mem_sz,
0, 0, max_addr, -1);
 
 #ifdef NEED_RESERVE_DTB
diff --git a/kexec/kexec.c b/kexec/kexec.c
index f63b36b..846b1a8 100644
--- a/kexec/kexec.c
+++ b/kexec/kexec.c
@@ -672,6 +672,9 @@ static void update_purgatory(struct kexec_info *info)
if (info->segment[i].mem == (void *)info->rhdr.rel_addr) {
continue;
}
+   if (info->fdt_index == i)
+   continue;
+
sha256_update(, info->segment[i].buf,
  info->segment[i].bufsz);
nullsz = info->segment[i].memsz - info->segment[i].bufsz;
diff --git a/kexec/kexec.h b/kexec/kexec.h
index 595dd68..0906a1b 100644
--- a/kexec/kexec.h
+++ b/kexec/kexec.h
@@ -169,6 +169,7 @@ struct kexec_info {
int command_line_len;
 
 

[RFC v2 PATCH 1/5] powerpc/kexec: make update_cpus_node non-static

2022-03-14 Thread Sourabh Jain
Make the update_cpus_node function non-static and export it for
usage in other kexec components.

The update_cpus_node definition is moved to core_64.c so that it
can be used with both kexec_load and kexec_file_load system calls.

Signed-off-by: Sourabh Jain 
---
 arch/powerpc/include/asm/kexec.h  |  1 +
 arch/powerpc/kexec/core_64.c  | 88 +++
 arch/powerpc/kexec/file_load_64.c | 87 --
 3 files changed, 89 insertions(+), 87 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 8ebdd23d987c..e1288826e22e 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -127,6 +127,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage 
*image);
 int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr,
unsigned long initrd_len, const char *cmdline);
+int update_cpus_node(void *fdt);
 #endif /* CONFIG_PPC64 */
 
 #endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 635b5fc30b53..249d2632526d 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -378,6 +379,93 @@ void default_machine_kexec(struct kimage *image)
/* NOTREACHED */
 }
 
+/**
+ * add_node_props - Reads node properties from device node structure and add
+ *  them to fdt.
+ * @fdt:		Flattened device tree of the kernel
+ * @node_offset:	offset of the node to add a property at
+ * @dn:			device node pointer
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_node_props(void *fdt, int node_offset, const struct device_node 
*dn)
+{
+   int ret = 0;
+   struct property *pp;
+
+   if (!dn)
+   return -EINVAL;
+
+   for_each_property_of_node(dn, pp) {
+   ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, 
pp->length);
+   if (ret < 0) {
+   pr_err("Unable to add %s property: %s\n", pp->name, 
fdt_strerror(ret));
+   return ret;
+   }
+   }
+   return ret;
+}
+
+/**
+ * update_cpus_node - Update cpus node of flattened device tree using of_root
+ *device node.
+ * @fdt:  Flattened device tree of the kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int update_cpus_node(void *fdt)
+{
+   struct device_node *cpus_node, *dn;
+   int cpus_offset, cpus_subnode_offset, ret = 0;
+
+   cpus_offset = fdt_path_offset(fdt, "/cpus");
+   if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) {
+   pr_err("Malformed device tree: error reading /cpus node: %s\n",
+  fdt_strerror(cpus_offset));
+   return cpus_offset;
+   }
+
+   if (cpus_offset > 0) {
+   ret = fdt_del_node(fdt, cpus_offset);
+   if (ret < 0) {
+   pr_err("Error deleting /cpus node: %s\n", 
fdt_strerror(ret));
+   return -EINVAL;
+   }
+   }
+
+   /* Add cpus node to fdt */
+   cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus");
+   if (cpus_offset < 0) {
+   pr_err("Error creating /cpus node: %s\n", 
fdt_strerror(cpus_offset));
+   return -EINVAL;
+   }
+
+   /* Add cpus node properties */
+   cpus_node = of_find_node_by_path("/cpus");
+   ret = add_node_props(fdt, cpus_offset, cpus_node);
+   of_node_put(cpus_node);
+   if (ret < 0)
+   return ret;
+
+   /* Loop through all subnodes of cpus and add them to fdt */
+   for_each_node_by_type(dn, "cpu") {
+   cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, 
dn->full_name);
+   if (cpus_subnode_offset < 0) {
+   pr_err("Unable to add %s subnode: %s\n", dn->full_name,
+  fdt_strerror(cpus_subnode_offset));
+   ret = cpus_subnode_offset;
+   goto out;
+   }
+
+   ret = add_node_props(fdt, cpus_subnode_offset, dn);
+   if (ret < 0)
+   goto out;
+   }
+out:
+   of_node_put(dn);
+   return ret;
+}
+
 #ifdef CONFIG_PPC_64S_HASH_MMU
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
diff --git a/arch/powerpc/kexec/file_load_64.c 
b/arch/powerpc/kexec/file_load_64.c
index 07da6bf1cf24..57f991b0a9da 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -951,93 +951,6 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage 
*image)
return (unsigned int)(usm_entries * sizeof(u64));
 }
 
-/**
- * add_node_props - Reads node 

Re: [PATCH 12/15] swiotlb: provide swiotlb_init variants that remap the buffer

2022-03-14 Thread Boris Ostrovsky



On 3/14/22 3:31 AM, Christoph Hellwig wrote:

-void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+   int (*remap)(void *tlb, unsigned long nslabs))
  {
-   size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+   unsigned long nslabs = default_nslabs;
+   size_t bytes;
void *tlb;
  
  	if (!addressing_limit && !swiotlb_force_bounce)

@@ -271,12 +273,24 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
 * allow to pick a location everywhere for hypervisors with guest
 * memory encryption.
 */
+retry:
+	bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
if (flags & SWIOTLB_ANY)
tlb = memblock_alloc(bytes, PAGE_SIZE);
else
tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb)
goto fail;
+   if (remap && remap(tlb, nslabs) < 0) {
+   memblock_free(tlb, PAGE_ALIGN(bytes));
+
+   /* Min is 2MB */
+   if (nslabs <= 1024)



This is IO_TLB_MIN_SLABS, isn't it? (The Xen code didn't say so, but
that's what it meant to say, I believe.)
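
i.e., spelled with the named constant (sketch only, assuming
IO_TLB_MIN_SLABS matches the 2MB minimum the comment refers to):

	if (nslabs <= IO_TLB_MIN_SLABS)
		panic("%s: Failed to remap %zu bytes\n", __func__, bytes);
	nslabs = max((unsigned long)IO_TLB_MIN_SLABS,
		     ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
	goto retry;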



+   panic("%s: Failed to remap %zu bytes\n",
+ __func__, bytes);
+   nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
+   goto retry;
+   }

@@ -303,6 +323,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
if (swiotlb_force_disable)
return 0;
  
+retry:

order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
bytes = nslabs << IO_TLB_SHIFT;
@@ -317,6 +338,17 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
  
  	if (!vstart)

return -ENOMEM;
+   if (remap)
+   rc = remap(vstart, nslabs);
+   if (rc) {
+   free_pages((unsigned long)vstart, order);
+
+   /* Min is 2MB */
+   if (nslabs <= 1024)



Same here.


-boris



+   return rc;
+   nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
+   goto retry;
+   }
  
  	if (order != get_order(bytes)) {

pr_warn("only able to allocate %ld MB\n",


Re: [PATCH 10/14] powerpc/rtas: replace rtas_call_unlocked with raw_rtas_call

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:43, Nicholas Piggin wrote:
> Use the same calling and rets return convention with the raw rtas
> call rather than requiring callers to load and byteswap return
> values out of rtas_args.
> 
> Signed-off-by: Nicholas Piggin 

Despite a minor comment below
Reviewed-by: Laurent Dufour 

> ---
>  arch/powerpc/include/asm/rtas.h  |  4 +-
>  arch/powerpc/kernel/rtas.c   | 53 +++-
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |  2 +-
>  arch/powerpc/platforms/pseries/ras.c |  7 +--
>  arch/powerpc/xmon/xmon.c |  2 +-
>  5 files changed, 33 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 82e5b055fa2a..1014ff140cf8 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -241,8 +241,8 @@ extern int rtas_token(const char *service);
>  extern int rtas_service_present(const char *service);
>  extern int rtas_call(int token, int, int, int *, ...);
>  int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...);
> -void rtas_call_unlocked(struct rtas_args *args, int token, int nargs,
> - int nret, ...);
> +int raw_rtas_call(struct rtas_args *args, int token,
> + int nargs, int nret, int *outputs, ...);

Minor, would it be better to keep the "unlocked" suffix to advise that the
rtas lock is not held here?

>  extern void __noreturn rtas_restart(char *cmd);
>  extern void rtas_power_off(void);
>  extern void __noreturn rtas_halt(void);
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index fece066115f0..751a20669669 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -123,7 +123,7 @@ static void call_rtas_display_status(unsigned char c)
>   return;
>  
>   s = lock_rtas();
> -	rtas_call_unlocked(&args, 10, 1, 1, NULL, c);
> +	raw_rtas_call(&args, 10, 1, 1, NULL, c);
>   unlock_rtas(s);
>  }
>  
> @@ -434,10 +434,9 @@ static char *__fetch_rtas_last_error(char *altbuf)
>  #define get_errorlog_buffer()	NULL
>  #endif
>  
> -
> -static void
> -va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
> -   va_list list)
> +static int notrace va_raw_rtas_call(struct rtas_args *args, int token,
> + int nargs, int nret, int *outputs,
> + va_list list)
>  {
>   int i;
>  
> @@ -453,21 +452,37 @@ va_rtas_call_unlocked(struct rtas_args *args, int 
> token, int nargs, int nret,
>   args->rets[i] = 0;
>  
>   do_enter_rtas(__pa(args));
> +
> + if (nret > 1 && outputs != NULL) {
> + for (i = 0; i < nret-1; ++i)
> + outputs[i] = be32_to_cpu(args->rets[i+1]);
> + }
> +
> + return (nret > 0) ? be32_to_cpu(args->rets[0]) : 0;
>  }
>  
> -void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int 
> nret, ...)
> +/*
> + * Like rtas_call but no kmalloc or printk etc in error handling, so
> + * error won't go through log_error. No tracing, may be called in real mode.
> + * rtas_args must be supplied, and appropriate synchronization for the rtas
> + * call being made has to be performed by the caller.
> + */
> +int notrace raw_rtas_call(struct rtas_args *args, int token,
> + int nargs, int nret, int *outputs, ...)
>  {
>   va_list list;
> + int ret;
>  
> - va_start(list, nret);
> - va_rtas_call_unlocked(args, token, nargs, nret, list);
> + va_start(list, outputs);
> + ret = va_raw_rtas_call(args, token, nargs, nret, outputs, list);
>   va_end(list);
> +
> + return ret;
>  }
>  
>  int rtas_call(int token, int nargs, int nret, int *outputs, ...)
>  {
>   va_list list;
> - int i;
>   unsigned long s;
>   struct rtas_args *rtas_args;
>   char *buff_copy = NULL;
> @@ -482,19 +497,14 @@ int rtas_call(int token, int nargs, int nret, int 
> *outputs, ...)
> 	rtas_args = &args;
>  
>   va_start(list, outputs);
> - va_rtas_call_unlocked(rtas_args, token, nargs, nret, list);
> + ret = va_raw_rtas_call(rtas_args, token, nargs, nret, outputs, list);
>   va_end(list);
>  
>   /* A -1 return code indicates that the last command couldn't
>  be completed due to a hardware error. */
> - if (be32_to_cpu(rtas_args->rets[0]) == -1)
> + if (ret == -1)
>   buff_copy = __fetch_rtas_last_error(NULL);
>  
> - if (nret > 1 && outputs != NULL)
> - for (i = 0; i < nret-1; ++i)
> - outputs[i] = be32_to_cpu(rtas_args->rets[i+1]);
> - ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0;
> -
>   unlock_rtas(s);
>  
>   if (buff_copy) {
> @@ -950,7 +960,7 @@ int rtas_call_reentrant(int token, int nargs, int nret, 
> int *outputs, ...)
>   va_list list;
>   struct rtas_args *args;
>   

Re: [PATCH 09/14] powerpc/rtas: Leave MSR[RI] enabled over RTAS call

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:42, Nicholas Piggin wrote:
> PAPR specifies that RTAS may be called with MSR[RI] enabled if the
> calling context is recoverable, and RTAS will manage RI as necessary.
> Call the rtas entry point with RI enabled, and add a check to ensure
> the caller has RI enabled.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/rtas.c   |  1 +
>  arch/powerpc/kernel/rtas_entry.S | 13 +++--
>  2 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 87ede1877816..fece066115f0 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -57,6 +57,7 @@ static noinline void do_enter_rtas(unsigned long args)
>   hard_irq_disable();
>  
>   msr = mfmsr();
> + BUG_ON(!(msr & MSR_RI));
>   mtmsr(msr & ~(MSR_IR|MSR_DR));
>   enter_rtas(args);
>   mtmsr(msr);
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 292551684bbd..72b27b14ccc9 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -95,7 +95,7 @@ _GLOBAL(enter_rtas)
>   clrldi  r4,r4,2 /* convert to realmode address */
>  	mtlr	r4
>  
> - LOAD_REG_IMMEDIATE(r6, MSR_ME)
> + LOAD_REG_IMMEDIATE(r6, MSR_ME|MSR_RI)
>  
>   LOAD_REG_ADDR(r4, rtas)
>   ld  r5,RTASENTRY(r4)/* get the rtas->entry value */
> @@ -113,15 +113,8 @@ _ASM_NOKPROBE_SYMBOL(enter_rtas)
>  rtas_return_loc:
>   FIXUP_ENDIAN
>  
> - /*
> -  * Clear RI and set SF before anything.
> -  */
> - mfmsr   r6
> - li  r0,MSR_RI
> -	andc	r6,r6,r0
> -	sldi	r0,r0,(MSR_SF_LG - MSR_RI_LG)
> - or  r6,r6,r0
> - sync
> + /* Set SF before anything. */
> + LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR|MSR_DR))

I guess we cannot directly load an MSR value stored in the PACA at that
point, can we?

>   mtmsrd  r6
>  
>   GET_PACA(r13)



Re: [PATCH 08/14] powerpc/rtas: call enter_rtas in real-mode on 64-bit

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:41, Nicholas Piggin wrote:
> This moves MSR save/restore and some real-mode juggling out of asm and
> into C code, simplifying things.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/rtas.c   | 15 ---
>  arch/powerpc/kernel/rtas_entry.S | 32 +---
>  2 files changed, 17 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 6b5892d6a56b..87ede1877816 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -47,13 +47,22 @@
>  /* This is here deliberately so it's only used in this file */
>  void enter_rtas(unsigned long);
>  
> -static inline void do_enter_rtas(unsigned long args)
> +static noinline void do_enter_rtas(unsigned long args)
>  {
>   BUG_ON(!irqs_disabled());
>  
> - hard_irq_disable(); /* Ensure MSR[EE] is disabled on PPC64 */
> + if (IS_ENABLED(CONFIG_PPC64)) {
> + unsigned long msr;
>  
> - enter_rtas(args);
> + hard_irq_disable();
> +
> + msr = mfmsr();
> + mtmsr(msr & ~(MSR_IR|MSR_DR));
> + enter_rtas(args);
> + mtmsr(msr);
> + } else {
> + enter_rtas(args);
> + }
>  
>   srr_regs_clobbered(); /* rtas uses SRRs, invalidate */
>  }
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 5f65ea4436c6..292551684bbd 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -84,14 +84,11 @@ _GLOBAL(enter_rtas)
>   li  r0,0
>  	mtcr	r0
>  
> - mfmsr   r6
> -
> - /* Unfortunately, the stack pointer and the MSR are also clobbered,
> -  * so they are saved in the PACA which allows us to restore
> -  * our original state after RTAS returns.
> + /*
> +  * The stack pointer is clobbered, so it is saved in the PACA which
> +  * allows us to restore our original state after RTAS returns.
>*/
>   std r1,PACAR1(r13)
> - std r6,PACASAVEDMSR(r13)
>  
>   /* Setup our real return addr */
>   LOAD_REG_ADDR(r4,rtas_return_loc)
> @@ -100,7 +97,6 @@ _GLOBAL(enter_rtas)
>  
>   LOAD_REG_IMMEDIATE(r6, MSR_ME)
>  
> -__enter_rtas:
>   LOAD_REG_ADDR(r4, rtas)
>  	ld	r5,RTASENTRY(r4)	/* get the rtas->entry value */
>   ld  r4,RTASBASE(r4) /* get the rtas->base value */
> @@ -112,6 +108,7 @@ __enter_rtas:
>   mtspr   SPRN_SRR1,r6
>   RFI_TO_KERNEL
>   b   .   /* prevent speculative execution */
> +_ASM_NOKPROBE_SYMBOL(enter_rtas)
>  
>  rtas_return_loc:
>   FIXUP_ENDIAN
> @@ -127,29 +124,10 @@ rtas_return_loc:
>   sync
>   mtmsrd  r6

Since MSR plumbing is still needed in the asm, what is the benefit of doing
the real mode switching in the C code?

What if the MSR is saved in the PACA before switching to real mode, and
restored in rtas_return_loc?

>  
> - /* relocation is off at this point */
>   GET_PACA(r13)
>  
> - bcl 20,31,$+4
> -0:	mflr	r3
> -	ld	r3,(1f-0b)(r3)	/* get &rtas_restore_regs */
> -
>   ld  r1,PACAR1(r13)  /* Restore our SP */
> - ld  r4,PACASAVEDMSR(r13)/* Restore our MSR */
>  
> - mtspr   SPRN_SRR0,r3
> - mtspr   SPRN_SRR1,r4
> - RFI_TO_KERNEL

rfid is no longer called to restore the MSR.
Noob question: is there any impact of using mtmsrd instead of rfid to
restore the MSR?

> - b   .   /* prevent speculative execution */
> -_ASM_NOKPROBE_SYMBOL(enter_rtas)
> -_ASM_NOKPROBE_SYMBOL(__enter_rtas)
> -_ASM_NOKPROBE_SYMBOL(rtas_return_loc)
> -
> - .align  3
> -1:   .8byte  rtas_restore_regs
> -
> -rtas_restore_regs:
> - /* relocation is on at this point */
>   REST_GPR(2, r1) /* Restore the TOC */
>   REST_NVGPRS(r1) /* Restore the non-volatiles */
>  
> @@ -169,5 +147,5 @@ rtas_restore_regs:
>  
>  	mtlr	r0
>   blr /* return to caller */
> -
> +_ASM_NOKPROBE_SYMBOL(rtas_return_loc)
>  #endif /* CONFIG_PPC32 */



Re: [PATCH 07/14] powerpc/rtas: PACA can be restored directly from SPRG

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:40, Nicholas Piggin wrote:
> On 64-bit, PACA is saved in a SPRG so it does not need to be saved on
> stack. We also don't need to mask off the top bits for real mode
> addresses because the architecture does this for us.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Laurent Dufour 

> ---
>  arch/powerpc/kernel/rtas_entry.S | 13 -
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 08eb731f08b8..5f65ea4436c6 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -62,10 +62,9 @@ _GLOBAL(enter_rtas)
>  
>   /* Because RTAS is running in 32b mode, it clobbers the high order half
>* of all registers that it saves.  We therefore save those registers
> -  * RTAS might touch to the stack.  (r0, r3-r13 are caller saved)
> +  * RTAS might touch to the stack.  (r0, r3-r12 are caller saved)
>*/
>   SAVE_GPR(2, r1) /* Save the TOC */
> -	SAVE_GPR(13, r1)	/* Save paca */
>   SAVE_NVGPRS(r1) /* Save the non-volatiles */
>  
>  	mfcr	r4
> @@ -129,15 +128,14 @@ rtas_return_loc:
>   mtmsrd  r6
>  
>   /* relocation is off at this point */
> - GET_PACA(r4)
> - clrldi  r4,r4,2 /* convert to realmode address */
> + GET_PACA(r13)
>  
>   bcl 20,31,$+4
>  0:	mflr	r3
>  	ld	r3,(1f-0b)(r3)	/* get &rtas_restore_regs */
>  
> - ld  r1,PACAR1(r4)   /* Restore our SP */
> - ld  r4,PACASAVEDMSR(r4) /* Restore our MSR */
> + ld  r1,PACAR1(r13)  /* Restore our SP */
> + ld  r4,PACASAVEDMSR(r13)/* Restore our MSR */
>  
>   mtspr   SPRN_SRR0,r3
>   mtspr   SPRN_SRR1,r4
> @@ -153,11 +151,8 @@ _ASM_NOKPROBE_SYMBOL(rtas_return_loc)
>  rtas_restore_regs:
>   /* relocation is on at this point */
>   REST_GPR(2, r1) /* Restore the TOC */
> -	REST_GPR(13, r1)	/* Restore paca */
>   REST_NVGPRS(r1) /* Restore the non-volatiles */
>  
> - GET_PACA(r13)
> -
>   ld  r4,_CCR(r1)
>  	mtcr	r4
>   ld  r5,_CTR(r1)



Re: [PATCH 06/14] powerpc/rtas: Load rtas entry MSR explicitly

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:39, Nicholas Piggin wrote:
> Rather than adjust the current MSR value to find the rtas entry
> MSR on 64-bit, load the explicit value we want as 32-bit does.
> 
> This prevents some facilities (e.g., VEC and VSX) from being left
> enabled which doesn't seem to cause a problem but it's more
> consistent to always use the same MSR and minimise facilities
> enabled.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Laurent Dufour 

> ---
>  arch/powerpc/kernel/rtas_entry.S | 9 +
>  1 file changed, 1 insertion(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 7b93687b9a10..08eb731f08b8 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -99,14 +99,7 @@ _GLOBAL(enter_rtas)
>   clrldi  r4,r4,2 /* convert to realmode address */
>  	mtlr	r4
>  
> - li  r0,0
> - ori r0,r0,MSR_EE|MSR_SE|MSR_BE|MSR_RI
> -	andc	r0,r6,r0
> -
> - li  r9,1
> - rldicr  r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
> - ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
> -	andc	r6,r0,r9
> + LOAD_REG_IMMEDIATE(r6, MSR_ME)
>  
>  __enter_rtas:
>  	ld	r5,RTASENTRY(r4)	/* get the rtas->entry value */



Re: [PATCH 05/14] powerpc/rtas: Modernise RI clearing on 64-bit

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:38, Nicholas Piggin wrote:
> mtmsrd L=1 can clear MSR[RI] without the previous MSR value; it does
> not require sync; it can be moved later to before SRRs are live.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Laurent Dufour 

> ---
>  arch/powerpc/kernel/rtas_entry.S | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 45fa661c2ff6..7b93687b9a10 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -109,13 +109,13 @@ _GLOBAL(enter_rtas)
>  	andc	r6,r0,r9
>  
>  __enter_rtas:
> -	sync	/* disable RI so SRR0/1 */
> - mtmsrd  r0  /* don't get trashed */
> -
>   LOAD_REG_ADDR(r4, rtas)
>  	ld	r5,RTASENTRY(r4)	/* get the rtas->entry value */
>   ld  r4,RTASBASE(r4) /* get the rtas->base value */
>  
> + li  r0,0
> +	mtmsrd	r0,1	/* disable RI before using SRR0/1 */
> +
>   mtspr   SPRN_SRR0,r5
>   mtspr   SPRN_SRR1,r6
>   RFI_TO_KERNEL



Re: [PATCH 04/14] powerpc/rtas: Call enter_rtas with MSR[EE] disabled

2022-03-14 Thread Laurent Dufour
On 08/03/2022, 14:50:37, Nicholas Piggin wrote:
> Disable MSR[EE] in C code rather than asm.
> 
> Signed-off-by: Nicholas Piggin 

FWIW,
Reviewed-by: Laurent Dufour 

> ---
>  arch/powerpc/kernel/rtas.c   |  4 
>  arch/powerpc/kernel/rtas_entry.S | 17 +
>  2 files changed, 5 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 733e6ef36758..6b5892d6a56b 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -49,6 +49,10 @@ void enter_rtas(unsigned long);
>  
>  static inline void do_enter_rtas(unsigned long args)
>  {
> + BUG_ON(!irqs_disabled());
> +
> + hard_irq_disable(); /* Ensure MSR[EE] is disabled on PPC64 */
> +
>   enter_rtas(args);
>  
>   srr_regs_clobbered(); /* rtas uses SRRs, invalidate */
> diff --git a/arch/powerpc/kernel/rtas_entry.S 
> b/arch/powerpc/kernel/rtas_entry.S
> index 6fa10eb49a9c..45fa661c2ff6 100644
> --- a/arch/powerpc/kernel/rtas_entry.S
> +++ b/arch/powerpc/kernel/rtas_entry.S
> @@ -24,8 +24,6 @@ _GLOBAL(enter_rtas)
>   lwz r4,RTASBASE(r4)
>   mfmsr   r9
>   stw r9,8(r1)
> - LOAD_REG_IMMEDIATE(r0,MSR_KERNEL)
> - mtmsr   r0  /* disable interrupts so SRR0/1 don't get trashed */
>   li  r9,MSR_KERNEL & ~(MSR_IR|MSR_DR)
>  	mtlr	r6
>   stw r1, THREAD + RTAS_SP(r2)
> @@ -87,20 +85,7 @@ _GLOBAL(enter_rtas)
>   li  r0,0
>  	mtcr	r0
>  
> -#ifdef CONFIG_BUG
> - /* There is no way it is acceptable to get here with interrupts enabled,
> -  * check it with the asm equivalent of WARN_ON
> -  */
> - lbz r0,PACAIRQSOFTMASK(r13)
> -1:   tdeqi   r0,IRQS_ENABLED
> - EMIT_WARN_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
> -#endif
> -
> - /* Hard-disable interrupts */
>   mfmsr   r6
> - rldicl  r7,r6,48,1
> - rotldi  r7,r7,16
> - mtmsrd  r7,1
>  
>   /* Unfortunately, the stack pointer and the MSR are also clobbered,
>* so they are saved in the PACA which allows us to restore
> @@ -124,7 +109,7 @@ _GLOBAL(enter_rtas)
>  	andc	r6,r0,r9
>  
>  __enter_rtas:
> -	sync	/* disable interrupts so SRR0/1 */
> +	sync	/* disable RI so SRR0/1 */
>   mtmsrd  r0  /* don't get trashed */
>  
>   LOAD_REG_ADDR(r4, rtas)



Re: [PATCH] powerpc/xive: fix return value of __setup handler

2022-03-14 Thread Cédric Le Goater

On 3/13/22 07:59, Randy Dunlap wrote:

__setup() handlers should return 1 to obsolete_checksetup() in
init/main.c to indicate that the boot option has been handled.
A return of 0 causes the boot option/value to be listed as an Unknown
kernel parameter and added to init's (limited) argument or environment
strings. Also, error return codes don't mean anything to
obsolete_checksetup() -- only non-zero (usually 1) or zero.
So return 1 from xive_off() and xive_store_eoi_cmdline().
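
For example, a minimal __setup() handler following this convention
looks like the below (illustrative only):

	static int __init foo_setup(char *arg)
	{
		/* consume "foo=..."; returning 1 tells
		 * obsolete_checksetup() the option was handled */
		return 1;
	}
	__setup("foo=", foo_setup);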

Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt 
controller")
Fixes: c21ee04f11ae ("powerpc/xive: Add a kernel parameter for StoreEOI")
Signed-off-by: Randy Dunlap 
From: Igor Zhbanov 
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0de...@omprussia.ru
Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Cédric Le Goater 
Cc: Paul Mackerras 


Reviewed-by: Cédric Le Goater 


Thanks,

C.



---
  arch/powerpc/sysdev/xive/common.c |6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

--- linux-next-20220310.orig/arch/powerpc/sysdev/xive/common.c
+++ linux-next-20220310/arch/powerpc/sysdev/xive/common.c
@@ -1708,20 +1708,20 @@ __be32 *xive_queue_page_alloc(unsigned i
  static int __init xive_off(char *arg)
  {
xive_cmdline_disabled = true;
-   return 0;
+   return 1;
  }
  __setup("xive=off", xive_off);
  
  static int __init xive_store_eoi_cmdline(char *arg)

  {
if (!arg)
-   return -EINVAL;
+   return 1;
  
  	if (strncmp(arg, "off", 3) == 0) {

pr_info("StoreEOI disabled on kernel command line\n");
xive_store_eoi = false;
}
-   return 0;
+   return 1;
  }
  __setup("xive.store-eoi=", xive_store_eoi_cmdline);
  




[PATCH v2] static_call: Don't make __static_call_return0 static

2022-03-14 Thread Christophe Leroy
System.map shows that vmlinux contains several instances of
__static_call_return0():

c0004fc0 t __static_call_return0
c0011518 t __static_call_return0
c00d8160 t __static_call_return0

arch_static_call_transform() uses the middle one to check whether we are
setting a call to __static_call_return0 or not:

c0011520 <arch_static_call_transform>:
c0011520:	3d 20 c0 01	lis	r9,-16383	<== r9 = 0xc001 << 16
c0011524:	39 29 15 18	addi	r9,r9,5400	<== r9 += 0x1518
c0011528:	7c 05 48 00	cmpw	r5,r9		<== r9 has value 0xc0011518 here

So if static_call_update() is called with one of the other instances of
__static_call_return0(), arch_static_call_transform() won't recognise
it.

In order to work properly, a single global instance of
__static_call_return0() is required.
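
For reference, the check in question is conceptually just a pointer
comparison (simplified sketch, not the exact powerpc source):

	extern long __static_call_return0(void);

	/* with several local copies of __static_call_return0(), this
	 * pointer comparison can miss the copy the caller passed in */
	static bool targets_ret0(void *func)
	{
		return func == (void *)__static_call_return0;
	}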

Fixes: 3f2a8fc4b15d ("static_call/x86: Add __static_call_return0()")
Signed-off-by: Christophe Leroy 
---
v2: Add linux/static_call.h to avoid "missing prototype - should it be static" 
warning.
---
 include/linux/static_call.h   |   5 +-
 kernel/Makefile   |   3 +-
 kernel/static_call.c  | 541 --
 .../{static_call.c => static_call_inline.c}   |   5 -
 4 files changed, 3 insertions(+), 551 deletions(-)
 copy kernel/{static_call.c => static_call_inline.c} (99%)

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 8ee175d233d7..df53bed9d71f 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -262,10 +262,7 @@ static inline int static_call_text_reserved(void *start, 
void *end)
return 0;
 }
 
-static inline long __static_call_return0(void)
-{
-   return 0;
-}
+extern long __static_call_return0(void);
 
 #define EXPORT_STATIC_CALL(name)   \
EXPORT_SYMBOL(STATIC_CALL_KEY(name));   \
diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..a18d169732d2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -113,7 +113,8 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
 obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
 obj-$(CONFIG_CFI_CLANG) += cfi.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..dae74642b3e1 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -1,548 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
 #include <linux/static_call.h>
-#include <linux/bug.h>
-#include <linux/smp.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/processor.h>
-#include <asm/sections.h>
-
-extern struct static_call_site __start_static_call_sites[],
-  __stop_static_call_sites[];
-extern struct static_call_tramp_key __start_static_call_tramp_key[],
-   __stop_static_call_tramp_key[];
-
-static bool static_call_initialized;
-
-/* mutex to protect key modules/sites */
-static DEFINE_MUTEX(static_call_mutex);
-
-static void static_call_lock(void)
-{
-	mutex_lock(&static_call_mutex);
-}
-
-static void static_call_unlock(void)
-{
-	mutex_unlock(&static_call_mutex);
-}
-
-static inline void *static_call_addr(struct static_call_site *site)
-{
-	return (void *)((long)site->addr + (long)&site->addr);
-}
-
-static inline unsigned long __static_call_key(const struct static_call_site *site)
-{
-   return (long)site->key + (long)&site->key;
-}
-
-static inline struct static_call_key *static_call_key(const struct static_call_site *site)
-{
-   return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
-}
-
-/* These assume the key is word-aligned. */
-static inline bool static_call_is_init(struct static_call_site *site)
-{
-   return __static_call_key(site) & STATIC_CALL_SITE_INIT;
-}
-
-static inline bool static_call_is_tail(struct static_call_site *site)
-{
-   return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
-}
-
-static inline void static_call_set_init(struct static_call_site *site)
-{
-   site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
-   (long)&site->key;
-}
-
-static int static_call_site_cmp(const void *_a, const void *_b)
-{
-   const struct static_call_site *a = _a;
-   const struct static_call_site *b = _b;
-   const struct static_call_key *key_a = static_call_key(a);
-   const struct static_call_key *key_b = static_call_key(b);
-
-   if (key_a < key_b)
-   return -1;
-
-   if (key_a > key_b)
-   return 1;
-
-   return 0;
-}
-
-static void static_call_site_swap(void *_a, void *_b, int size)
-{
-   long delta = (unsigned long)_a - (unsigned long)_b;
-   struct static_call_site *a = _a;
-   struct static_call_site *b = _b;
-   struct static_call_site tmp = *a;
-

[PATCH v1 2/2] static_call: Remove __DEFINE_STATIC_CALL macro

2022-03-14 Thread Christophe Leroy
Only DEFINE_STATIC_CALL() uses the __DEFINE_STATIC_CALL() macro now when
CONFIG_HAVE_STATIC_CALL is selected.

Only keep __DEFINE_STATIC_CALL() for the generic fallback, and
also use it to implement DEFINE_STATIC_CALL_NULL() in that case.
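
For illustration, both definers now expand through the same helper in
the generic fallback; a sketch with made-up names (my_func, my_call):

 static int my_func(int x) { return x + 1; }

 /* expands via __DEFINE_STATIC_CALL(my_call, my_func, my_func) */
 DEFINE_STATIC_CALL(my_call, my_func);

 /* expands via __DEFINE_STATIC_CALL(my_null_call, my_func, NULL) */
 DEFINE_STATIC_CALL_NULL(my_null_call, my_func);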

Signed-off-by: Christophe Leroy 
---
 include/linux/static_call.h | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 3c50b0fdda16..df53bed9d71f 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -180,13 +180,13 @@ extern int static_call_text_reserved(void *start, void 
*end);
 
 extern long __static_call_return0(void);
 
-#define __DEFINE_STATIC_CALL(name, _func, _func_init)  \
+#define DEFINE_STATIC_CALL(name, _func)				\
DECLARE_STATIC_CALL(name, _func);   \
struct static_call_key STATIC_CALL_KEY(name) = {\
-   .func = _func_init, \
+   .func = _func,  \
.type = 1,  \
};  \
-   ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
+   ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
 #define DEFINE_STATIC_CALL_NULL(name, _func)   \
DECLARE_STATIC_CALL(name, _func);   \
@@ -225,12 +225,12 @@ extern long __static_call_return0(void);
 
 static inline int static_call_init(void) { return 0; }
 
-#define __DEFINE_STATIC_CALL(name, _func, _func_init)  \
+#define DEFINE_STATIC_CALL(name, _func)				\
DECLARE_STATIC_CALL(name, _func);   \
struct static_call_key STATIC_CALL_KEY(name) = {\
-   .func = _func_init, \
+   .func = _func,  \
};  \
-   ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
+   ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
 #define DEFINE_STATIC_CALL_NULL(name, _func)   \
DECLARE_STATIC_CALL(name, _func);   \
@@ -292,11 +292,11 @@ static inline long __static_call_return0(void)
.func = _func_init, \
}
 
+#define DEFINE_STATIC_CALL(name, _func)				\
+   __DEFINE_STATIC_CALL(name, _func, _func)
+
 #define DEFINE_STATIC_CALL_NULL(name, _func)   \
-   DECLARE_STATIC_CALL(name, _func);   \
-   struct static_call_key STATIC_CALL_KEY(name) = {\
-   .func = NULL,   \
-   }
+   __DEFINE_STATIC_CALL(name, _func, NULL)
 
 #define DEFINE_STATIC_CALL_RET0(name, _func)   \
__DEFINE_STATIC_CALL(name, _func, __static_call_return0)
@@ -341,7 +341,4 @@ static inline int static_call_text_reserved(void *start, 
void *end)
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
-#define DEFINE_STATIC_CALL(name, _func)				\
-   __DEFINE_STATIC_CALL(name, _func, _func)
-
 #endif /* _LINUX_STATIC_CALL_H */
-- 
2.35.1



[PATCH v1 1/2] static_call: Properly initialise DEFINE_STATIC_CALL_RET0()

2022-03-14 Thread Christophe Leroy
When a static call is updated with __static_call_return0() as target,
arch_static_call_transform() sets it to use an optimised set of
instructions which are meant to lie in the same cacheline.

But when initialising a static call with DEFINE_STATIC_CALL_RET0(),
we get a branch to the real __static_call_return0() function instead
of getting the optimised setup:

c00d8120 <__SCT__perf_snapshot_branch_stack>:
c00d8120:   4b ff ff f4 b   c00d8114 <__static_call_return0>
c00d8124:   3d 80 c0 0e lis r12,-16370
c00d8128:   81 8c 81 3c lwz r12,-32452(r12)
c00d812c:   7d 89 03 a6 mtctr   r12
c00d8130:   4e 80 04 20 bctr
c00d8134:   38 60 00 00 li  r3,0
c00d8138:   4e 80 00 20 blr
c00d813c:   00 00 00 00 .long 0x0

Add ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(), defined by each architecture,
to set up the optimised configuration, and rework
DEFINE_STATIC_CALL_RET0() to call it:

c00d8120 <__SCT__perf_snapshot_branch_stack>:
c00d8120:   48 00 00 14 b   c00d8134 <__SCT__perf_snapshot_branch_stack+0x14>
c00d8124:   3d 80 c0 0e lis r12,-16370
c00d8128:   81 8c 81 3c lwz r12,-32452(r12)
c00d812c:   7d 89 03 a6 mtctr   r12
c00d8130:   4e 80 04 20 bctr
c00d8134:   38 60 00 00 li  r3,0
c00d8138:   4e 80 00 20 blr
c00d813c:   00 00 00 00 .long 0x0

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/static_call.h |  1 +
 arch/x86/include/asm/static_call.h |  2 ++
 include/linux/static_call.h| 20 +---
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/static_call.h 
b/arch/powerpc/include/asm/static_call.h
index 0a0bc79bd1fa..de1018cc522b 100644
--- a/arch/powerpc/include/asm/static_call.h
+++ b/arch/powerpc/include/asm/static_call.h
@@ -24,5 +24,6 @@
 
 #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)	__PPC_SCT(name, "b " #func)
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)	__PPC_SCT(name, "blr")
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)	__PPC_SCT(name, "b .+20")
 
 #endif /* _ASM_POWERPC_STATIC_CALL_H */
diff --git a/arch/x86/include/asm/static_call.h 
b/arch/x86/include/asm/static_call.h
index ed4f8bb6c2d9..2455d721503e 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -38,6 +38,8 @@
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)   \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
 
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)   \
+   ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
 
 #define ARCH_ADD_TRAMP_KEY(name)   \
asm(".pushsection .static_call_tramp_key, \"a\" \n" \
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index fcc5b48989b3..3c50b0fdda16 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -196,6 +196,14 @@ extern long __static_call_return0(void);
};  \
ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
+#define DEFINE_STATIC_CALL_RET0(name, _func)   \
+   DECLARE_STATIC_CALL(name, _func);   \
+   struct static_call_key STATIC_CALL_KEY(name) = {\
+   .func = __static_call_return0,  \
+   .type = 1,  \
+   };  \
+   ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)
+
 #define static_call_cond(name) (void)__static_call(name)
 
 #define EXPORT_STATIC_CALL(name)   \
@@ -231,6 +239,12 @@ static inline int static_call_init(void) { return 0; }
};  \
ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
+#define DEFINE_STATIC_CALL_RET0(name, _func)   \
+   DECLARE_STATIC_CALL(name, _func);   \
+   struct static_call_key STATIC_CALL_KEY(name) = {\
+   .func = __static_call_return0,  \
+   };  \
+   ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)
 
 #define static_call_cond(name) (void)__static_call(name)
 
@@ -284,6 +298,9 @@ static inline long __static_call_return0(void)
.func = NULL,   \
}
 
+#define DEFINE_STATIC_CALL_RET0(name, _func)   \
+   __DEFINE_STATIC_CALL(name, _func, __static_call_return0)
+
 static inline void 

[PATCH 15/15] x86: remove cruft from <asm/dma-mapping.h>

2022-03-14 Thread Christoph Hellwig
<asm/dma-mapping.h> gets pulled in by all drivers using the DMA API.
Remove x86 internal variables and unnecessary includes from it.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/include/asm/dma-mapping.h | 11 ---
 arch/x86/include/asm/iommu.h   |  2 ++
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/dma-mapping.h 
b/arch/x86/include/asm/dma-mapping.h
index 256fd8115223d..1c66708e30623 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -2,17 +2,6 @@
 #ifndef _ASM_X86_DMA_MAPPING_H
 #define _ASM_X86_DMA_MAPPING_H
 
-/*
- * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and
- * Documentation/core-api/dma-api.rst for documentation.
- */
-
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-
-extern int iommu_merge;
-extern int panic_on_overflow;
-
 extern const struct dma_map_ops *dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dba89ed40d38d..0bef44d30a278 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -8,6 +8,8 @@
 
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
+extern int iommu_merge;
+extern int panic_on_overflow;
 
 #ifdef CONFIG_SWIOTLB
 extern bool x86_swiotlb_enable;
-- 
2.30.2



[PATCH 14/15] swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl

2022-03-14 Thread Christoph Hellwig
No users left.

Signed-off-by: Christoph Hellwig 
---
 include/linux/swiotlb.h |  2 -
 kernel/dma/swiotlb.c| 85 +++--
 2 files changed, 30 insertions(+), 57 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7b50c82f84ce9..7ed35dd3de6e7 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -34,13 +34,11 @@ struct scatterlist;
 /* default to 64MB */
 #define IO_TLB_DEFAULT_SIZE (64UL<<20)
 
-int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags);
 unsigned long swiotlb_size_or_default(void);
 void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs));
 int swiotlb_init_late(size_t size, gfp_t gfp_mask,
int (*remap)(void *tlb, unsigned long nslabs));
-extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 88ea7b9bce6e9..d04bacdb0905b 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -225,33 +225,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem 
*mem, phys_addr_t start,
return;
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs,
-   unsigned int flags)
-{
-   struct io_tlb_mem *mem = &io_tlb_default_mem;
-   size_t alloc_size;
-
-   if (swiotlb_force_disable)
-   return 0;
-
-   /* protect against double initialization */
-   if (WARN_ON_ONCE(mem->nslabs))
-   return -ENOMEM;
-
-   alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
-   mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
-   if (!mem->slots)
-   panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
-
-   swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
-   mem->force_bounce = flags & SWIOTLB_FORCE;
-
-   if (flags & SWIOTLB_VERBOSE)
-   swiotlb_print_info();
-   return 0;
-}
-
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -259,7 +232,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long 
nslabs,
 void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
 {
+   struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs = default_nslabs;
+   size_t alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
size_t bytes;
void *tlb;
 
@@ -280,7 +255,8 @@ void __init swiotlb_init_remap(bool addressing_limit, 
unsigned int flags,
else
tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb)
-   goto fail;
+   panic("%s: failed to allocate tlb structure\n", __func__);
+
if (remap && remap(tlb, nslabs) < 0) {
memblock_free(tlb, PAGE_ALIGN(bytes));
 
@@ -291,14 +267,17 @@ void __init swiotlb_init_remap(bool addressing_limit, 
unsigned int flags,
nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
goto retry;
}
-   if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
-   goto fail_free_mem;
-   return;
 
-fail_free_mem:
-   memblock_free(tlb, bytes);
-fail:
-   pr_warn("Cannot allocate buffer");
+   mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+   if (!mem->slots)
+   panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
+   swiotlb_init_io_tlb_mem(mem, __pa(tlb), default_nslabs, false);
+   mem->force_bounce = flags & SWIOTLB_FORCE;
+
+   if (flags & SWIOTLB_VERBOSE)
+   swiotlb_print_info();
 }
 
 void __init swiotlb_init(bool addressing_limit, unsigned int flags)
@@ -314,6 +293,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
 int swiotlb_init_late(size_t size, gfp_t gfp_mask,
int (*remap)(void *tlb, unsigned long nslabs))
 {
+   struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
unsigned char *vstart = NULL;
@@ -355,33 +335,28 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
(PAGE_SIZE << order) >> 20);
nslabs = SLABS_PER_PAGE << order;
}
-   rc = swiotlb_late_init_with_tbl(vstart, nslabs);
-   if (rc)
-   free_pages((unsigned long)vstart, order);
-
-   return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
-   struct io_tlb_mem *mem = &io_tlb_default_mem;
-   

[PATCH 13/15] swiotlb: merge swiotlb-xen initialization into swiotlb

2022-03-14 Thread Christoph Hellwig
Reuse the generic swiotlb initialization for xen-swiotlb.  For ARM/ARM64
this works trivially, while for x86 xen_swiotlb_fixup needs to be passed
as the remap argument to swiotlb_init_remap/swiotlb_init_late.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/xen/mm.c   |  21 +++---
 arch/x86/include/asm/xen/page.h |   5 --
 arch/x86/kernel/pci-dma.c   |  19 +++--
 drivers/xen/swiotlb-xen.c   | 128 +---
 include/xen/arm/page.h  |   1 -
 include/xen/swiotlb-xen.h   |   8 +-
 6 files changed, 27 insertions(+), 155 deletions(-)

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index 28c2070602535..ff05a7899cb86 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -23,22 +23,20 @@
 #include 
 #include 
 
-unsigned long xen_get_swiotlb_free_pages(unsigned int order)
+static gfp_t xen_swiotlb_gfp(void)
 {
phys_addr_t base;
-   gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
u64 i;
 
for_each_mem_range(i, &base, NULL) {
if (base < (phys_addr_t)0xffffffff) {
if (IS_ENABLED(CONFIG_ZONE_DMA32))
-   flags |= __GFP_DMA32;
-   else
-   flags |= __GFP_DMA;
-   break;
+   return __GFP_DMA32;
+   return __GFP_DMA;
}
}
-   return __get_free_pages(flags, order);
+
+   return GFP_KERNEL;
 }
 
 static bool hypercall_cflush = false;
@@ -140,10 +138,13 @@ static int __init xen_mm_init(void)
if (!xen_swiotlb_detect())
return 0;
 
-   rc = xen_swiotlb_init();
/* we can work with the default swiotlb */
-   if (rc < 0 && rc != -EEXIST)
-   return rc;
+   if (!io_tlb_default_mem.nslabs) {
+   rc = swiotlb_init_late(swiotlb_size_or_default(),
+  xen_swiotlb_gfp(), NULL);
+   if (rc < 0)
+   return rc;
+   }
 
cflush.op = 0;
cflush.a.dev_bus_addr = 0;
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index e989bc2269f54..1fc67df500145 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -357,9 +357,4 @@ static inline bool xen_arch_need_swiotlb(struct device *dev,
return false;
 }
 
-static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
-{
-   return __get_free_pages(__GFP_NOWARN, order);
-}
-
 #endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a705a199bf8a3..dbb7b83fc3e48 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -72,15 +72,12 @@ static inline void __init pci_swiotlb_detect(void)
 #endif /* CONFIG_SWIOTLB */
 
 #ifdef CONFIG_SWIOTLB_XEN
-static bool xen_swiotlb;
-
 static void __init pci_xen_swiotlb_init(void)
 {
if (!xen_initial_domain() && !x86_swiotlb_enable)
return;
x86_swiotlb_enable = true;
-   xen_swiotlb = true;
-   xen_swiotlb_init_early();
+   swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
dma_ops = &xen_swiotlb_dma_ops;
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
@@ -88,14 +85,16 @@ static void __init pci_xen_swiotlb_init(void)
 
 int pci_xen_swiotlb_init_late(void)
 {
-   int rc;
-
-   if (xen_swiotlb)
+   if (dma_ops == &xen_swiotlb_dma_ops)
return 0;
 
-   rc = xen_swiotlb_init();
-   if (rc)
-   return rc;
+   /* we can work with the default swiotlb */
+   if (!io_tlb_default_mem.nslabs) {
+   int rc = swiotlb_init_late(swiotlb_size_or_default(),
+  GFP_KERNEL, xen_swiotlb_fixup);
+   if (rc < 0)
+   return rc;
+   }
 
/* XXX: this switches the dma ops under live devices! */
dma_ops = &xen_swiotlb_dma_ops;
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index c2da3eb4826e8..df8085b50df10 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -104,7 +104,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, 
dma_addr_t dma_addr)
return 0;
 }
 
-static int xen_swiotlb_fixup(void *buf, unsigned long nslabs)
+int xen_swiotlb_fixup(void *buf, unsigned long nslabs)
 {
int rc;
unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT);
@@ -130,132 +130,6 @@ static int xen_swiotlb_fixup(void *buf, unsigned long 
nslabs)
return 0;
 }
 
-enum xen_swiotlb_err {
-   XEN_SWIOTLB_UNKNOWN = 0,
-   XEN_SWIOTLB_ENOMEM,
-   XEN_SWIOTLB_EFIXUP
-};
-
-static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
-{
-   switch (err) {
-   case XEN_SWIOTLB_ENOMEM:
-   return "Cannot allocate Xen-SWIOTLB buffer\n";
-   case XEN_SWIOTLB_EFIXUP:
-   

[PATCH 12/15] swiotlb: provide swiotlb_init variants that remap the buffer

2022-03-14 Thread Christoph Hellwig
To share more code between swiotlb and xen-swiotlb, offer a
swiotlb_init_remap interface and add a remap callback to
swiotlb_init_late that will allow Xen to remap the buffer without
duplicating much of the logic.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/pci/sta2x11-fixup.c |  2 +-
 include/linux/swiotlb.h  |  5 -
 kernel/dma/swiotlb.c | 38 +---
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index c7e6faf59a861..7368afc039987 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_init_late(size, GFP_DMA))
+   if (swiotlb_init_late(size, GFP_DMA, NULL))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ee655f2e4d28b..7b50c82f84ce9 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -36,8 +36,11 @@ struct scatterlist;
 
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags);
 unsigned long swiotlb_size_or_default(void);
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+   int (*remap)(void *tlb, unsigned long nslabs));
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+   int (*remap)(void *tlb, unsigned long nslabs));
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
-int swiotlb_init_late(size_t size, gfp_t gfp_mask);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 79641c446d284..88ea7b9bce6e9 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -256,9 +256,11 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long 
nslabs,
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
  */
-void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+   int (*remap)(void *tlb, unsigned long nslabs))
 {
-   size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+   unsigned long nslabs = default_nslabs;
+   size_t bytes;
void *tlb;
 
if (!addressing_limit && !swiotlb_force_bounce)
@@ -271,12 +273,24 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
 * allow to pick a location everywhere for hypervisors with guest
 * memory encryption.
 */
+retry:
+   bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
if (flags & SWIOTLB_ANY)
tlb = memblock_alloc(bytes, PAGE_SIZE);
else
tlb = memblock_alloc_low(bytes, PAGE_SIZE);
if (!tlb)
goto fail;
+   if (remap && remap(tlb, nslabs) < 0) {
+   memblock_free(tlb, PAGE_ALIGN(bytes));
+
+   /* Min is 2MB */
+   if (nslabs <= 1024)
+   panic("%s: Failed to remap %zu bytes\n",
+ __func__, bytes);
+   nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
+   goto retry;
+   }
if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
goto fail_free_mem;
return;
@@ -287,12 +301,18 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
pr_warn("Cannot allocate buffer");
 }
 
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+   return swiotlb_init_remap(addressing_limit, flags, NULL);
+}
+
 /*
  * Systems with larger DMA zones (those that don't support ISA) can
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int swiotlb_init_late(size_t size, gfp_t gfp_mask)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+   int (*remap)(void *tlb, unsigned long nslabs))
 {
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
@@ -303,6 +323,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
if (swiotlb_force_disable)
return 0;
 
+retry:
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
bytes = nslabs << IO_TLB_SHIFT;
@@ -317,6 +338,17 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 
if (!vstart)
return -ENOMEM;
+   if (remap)
+   rc = 

[PATCH 11/15] swiotlb: pass a gfp_mask argument to swiotlb_init_late

2022-03-14 Thread Christoph Hellwig
Let the caller choose a zone to allocate from.  This will be used
later on by the xen-swiotlb initialization on arm.
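
A usage sketch of the two-argument form (the size and zone below are
made up):

 /* allocate a 64MB late buffer from ZONE_DMA32 instead of ZONE_DMA */
 if (swiotlb_init_late(64UL << 20, GFP_DMA32))
 	pr_warn("swiotlb late init failed\n");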

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/x86/pci/sta2x11-fixup.c | 2 +-
 include/linux/swiotlb.h  | 2 +-
 kernel/dma/swiotlb.c | 7 ++-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index e0c039a75b2db..c7e6faf59a861 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_init_late(size))
+   if (swiotlb_init_late(size, GFP_DMA))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index eabdd89987027..ee655f2e4d28b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -37,7 +37,7 @@ struct scatterlist;
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
-int swiotlb_init_late(size_t size);
+int swiotlb_init_late(size_t size, gfp_t gfp_mask);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 2ad12562c94fe..79641c446d284 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -292,7 +292,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int swiotlb_init_late(size_t size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 {
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
@@ -303,15 +303,12 @@ int swiotlb_init_late(size_t size)
if (swiotlb_force_disable)
return 0;
 
-   /*
-* Get IO TLB memory from the low pages
-*/
order = get_order(nslabs << IO_TLB_SHIFT);
nslabs = SLABS_PER_PAGE << order;
bytes = nslabs << IO_TLB_SHIFT;
 
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-   vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+   vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
  order);
if (vstart)
break;
-- 
2.30.2



[PATCH 10/15] swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction

2022-03-14 Thread Christoph Hellwig
Power SVM wants to allocate a swiotlb buffer that is not restricted to
low memory for the trusted hypervisor scheme.  Consolidate the support
for this into the swiotlb_init interface by adding a new flag.
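
A sketch of the intended use (hypothetical call site; the flag
combination mirrors the powerpc hunk below):

 /* no addressing limit: place the buffer anywhere, always bounce */
 swiotlb_init(true, SWIOTLB_ANY | SWIOTLB_FORCE | SWIOTLB_VERBOSE);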

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/svm.h   |  4 
 arch/powerpc/include/asm/swiotlb.h   |  1 +
 arch/powerpc/kernel/dma-swiotlb.c|  1 +
 arch/powerpc/mm/mem.c|  5 +
 arch/powerpc/platforms/pseries/svm.c | 26 +-
 include/linux/swiotlb.h  |  1 +
 kernel/dma/swiotlb.c | 11 +--
 7 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index 7546402d796af..85580b30aba48 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,8 +15,6 @@ static inline bool is_secure_guest(void)
return mfmsr() & MSR_S;
 }
 
-void __init svm_swiotlb_init(void);
-
 void dtl_cache_ctor(void *addr);
 #define get_dtl_cache_ctor()   (is_secure_guest() ? dtl_cache_ctor : NULL)
 
@@ -27,8 +25,6 @@ static inline bool is_secure_guest(void)
return false;
 }
 
-static inline void svm_swiotlb_init(void) {}
-
 #define get_dtl_cache_ctor() NULL
 
 #endif /* CONFIG_PPC_SVM */
diff --git a/arch/powerpc/include/asm/swiotlb.h 
b/arch/powerpc/include/asm/swiotlb.h
index 3c1a1cd161286..4203b5e0a88ed 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -9,6 +9,7 @@
 #include 
 
 extern unsigned int ppc_swiotlb_enable;
+extern unsigned int ppc_swiotlb_flags;
 
 #ifdef CONFIG_SWIOTLB
 void swiotlb_detect_4g(void);
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index fc7816126a401..ba256c37bcc0f 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -10,6 +10,7 @@
 #include 
 
 unsigned int ppc_swiotlb_enable;
+unsigned int ppc_swiotlb_flags;
 
 void __init swiotlb_detect_4g(void)
 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index e1519e2edc656..a4d65418c30a9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -249,10 +249,7 @@ void __init mem_init(void)
 * back to to-down.
 */
memblock_set_bottom_up(true);
-   if (is_secure_guest())
-   svm_swiotlb_init();
-   else
-   swiotlb_init(ppc_swiotlb_enable, 0);
+   swiotlb_init(ppc_swiotlb_enable, ppc_swiotlb_flags);
 #endif
 
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index c5228f4969eb2..3b4045d508ec8 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -28,7 +28,7 @@ static int __init init_svm(void)
 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
 * otherwise.
 */
-   swiotlb_force = SWIOTLB_FORCE;
+   ppc_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_FORCE;
 
/* Share the SWIOTLB buffer with the host. */
swiotlb_update_mem_attributes();
@@ -37,30 +37,6 @@ static int __init init_svm(void)
 }
 machine_early_initcall(pseries, init_svm);
 
-/*
- * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it
- * can allocate the buffer anywhere in memory. Since the hypervisor doesn't 
have
- * any addressing limitation, we don't need to allocate it in low addresses.
- */
-void __init svm_swiotlb_init(void)
-{
-   unsigned char *vstart;
-   unsigned long bytes, io_tlb_nslabs;
-
-   io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT);
-   io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
-
-   bytes = io_tlb_nslabs << IO_TLB_SHIFT;
-
-   vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
-   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
-   return;
-
-
-   memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
-   panic("SVM: Cannot allocate SWIOTLB buffer");
-}
-
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ae0407173e845..eabdd89987027 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -15,6 +15,7 @@ struct scatterlist;
 
 #define SWIOTLB_VERBOSE(1 << 0) /* verbose initialization */
 #define SWIOTLB_FORCE  (1 << 1) /* force bounce buffering */
+#define SWIOTLB_ANY(1 << 2) /* allow any memory for the buffer */
 
 /*
  * Maximum allowable number of contiguous slabs to map,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 5ac6e128d4279..2ad12562c94fe 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -266,8 +266,15 @@ void __init swiotlb_init(bool addressing_limit, unsigned 
int flags)
if (swiotlb_force_disable)
return;
 
-   /* Get IO TLB 

[PATCH 09/15] swiotlb: make the swiotlb_init interface more useful

2022-03-14 Thread Christoph Hellwig
Pass a bool to indicate whether swiotlb needs to be enabled based on
the addressing needs, and replace the verbose argument with a set of
flags, including one to force enable bounce buffering.

Note that this patch removes the possibility to force use of
xen-swiotlb with swiotlb=force on the command line on x86 (arm and
arm64 never supported that), but this interface will be restored
shortly.
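
A before/after sketch of the calling convention (hypothetical platform
code; dma_pfn_limit is made up):

 /* old: int "verbose" argument plus a writable global */
 if (max_pfn > dma_pfn_limit)
 	swiotlb_init(1);
 else
 	swiotlb_force = SWIOTLB_NO_FORCE;

 /* new: bool "is bounce buffering needed" plus flags */
 swiotlb_init(max_pfn > dma_pfn_limit, SWIOTLB_VERBOSE);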

Signed-off-by: Christoph Hellwig 
---
 arch/arm/mm/init.c |  6 +
 arch/arm64/mm/init.c   |  6 +
 arch/ia64/mm/init.c|  4 +--
 arch/mips/cavium-octeon/dma-octeon.c   |  2 +-
 arch/mips/loongson64/dma.c |  2 +-
 arch/mips/sibyte/common/dma.c  |  2 +-
 arch/powerpc/mm/mem.c  |  3 ++-
 arch/powerpc/platforms/pseries/setup.c |  3 ---
 arch/riscv/mm/init.c   |  8 +-
 arch/s390/mm/init.c|  3 +--
 arch/x86/kernel/pci-dma.c  | 15 ++-
 drivers/xen/swiotlb-xen.c  |  4 +--
 include/linux/swiotlb.h| 15 ++-
 include/trace/events/swiotlb.h | 29 -
 kernel/dma/swiotlb.c   | 35 ++
 15 files changed, 55 insertions(+), 82 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 6d0cb0f7bc54b..73f30d278b565 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -312,11 +312,7 @@ static void __init free_highpages(void)
 void __init mem_init(void)
 {
 #ifdef CONFIG_ARM_LPAE
-   if (swiotlb_force == SWIOTLB_FORCE ||
-   max_pfn > arm_dma_pfn_limit)
-   swiotlb_init(1);
-   else
-   swiotlb_force = SWIOTLB_NO_FORCE;
+   swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE);
 #endif
 
set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771a..52102adda3d28 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -373,11 +373,7 @@ void __init bootmem_init(void)
  */
 void __init mem_init(void)
 {
-   if (swiotlb_force == SWIOTLB_FORCE ||
-   max_pfn > PFN_DOWN(arm64_dma_phys_limit))
-   swiotlb_init(1);
-   else if (!xen_swiotlb_detect())
-   swiotlb_force = SWIOTLB_NO_FORCE;
+   swiotlb_init(max_pfn > PFN_DOWN(arm64_dma_phys_limit), SWIOTLB_VERBOSE);
 
/* this will put all unused low memory onto the freelists */
memblock_free_all();
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 5d165607bf354..3c3e15b22608f 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -437,9 +437,7 @@ mem_init (void)
if (iommu_detected)
break;
 #endif
-#ifdef CONFIG_SWIOTLB
-   swiotlb_init(1);
-#endif
+   swiotlb_init(true, SWIOTLB_VERBOSE);
} while (0);
 
 #ifdef CONFIG_FLATMEM
diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index fb7547e217263..9fbba6a8fa4c5 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -235,5 +235,5 @@ void __init plat_swiotlb_setup(void)
 #endif
 
swiotlb_adjust_size(swiotlbsize);
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/mips/loongson64/dma.c b/arch/mips/loongson64/dma.c
index 364f2f27c8723..8220a1bc0db64 100644
--- a/arch/mips/loongson64/dma.c
+++ b/arch/mips/loongson64/dma.c
@@ -24,5 +24,5 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 void __init plat_swiotlb_setup(void)
 {
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/mips/sibyte/common/dma.c b/arch/mips/sibyte/common/dma.c
index eb47a94f3583e..c5c2c782aff68 100644
--- a/arch/mips/sibyte/common/dma.c
+++ b/arch/mips/sibyte/common/dma.c
@@ -10,5 +10,5 @@
 
 void __init plat_swiotlb_setup(void)
 {
-   swiotlb_init(1);
+   swiotlb_init(true, SWIOTLB_VERBOSE);
 }
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8e301cd8925b2..e1519e2edc656 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -251,7 +252,7 @@ void __init mem_init(void)
if (is_secure_guest())
svm_swiotlb_init();
else
-   swiotlb_init(0);
+   swiotlb_init(ppc_swiotlb_enable, 0);
 #endif
 
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 83a04d967a59f..45d637ab58261 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -838,9 +838,6 @@ static void __init pSeries_setup_arch(void)
}
 
ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
-
-   if (swiotlb_force == SWIOTLB_FORCE)
-   ppc_swiotlb_enable 

[PATCH 08/15] x86: centralize setting SWIOTLB_FORCE when guest memory encryption is enabled

2022-03-14 Thread Christoph Hellwig
Move enabling SWIOTLB_FORCE for guest memory encryption into common code.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/kernel/cpu/mshyperv.c | 8 
 arch/x86/kernel/pci-dma.c  | 8 
 arch/x86/mm/mem_encrypt_amd.c  | 3 ---
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5a99f993e6392..568274917f1cd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -336,14 +336,6 @@ static void __init ms_hyperv_init_platform(void)
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
 #endif
}
-
-#ifdef CONFIG_SWIOTLB
-   /*
-* Enable swiotlb force mode in Isolation VM to
-* use swiotlb bounce buffer for dma transaction.
-*/
-   swiotlb_force = SWIOTLB_FORCE;
-#endif
}
 
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index df96926421be0..04140e20ef1a3 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -53,6 +53,14 @@ static void __init pci_swiotlb_detect(void)
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
x86_swiotlb_enable = true;
 
+   /*
+* Guests with guest memory encryption currently perform all DMA through
+* bounce buffers as the hypervisor can't access arbitrary VM memory
+* that is not explicitly shared with it.
+*/
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+   swiotlb_force = SWIOTLB_FORCE;
+
if (swiotlb_force == SWIOTLB_FORCE)
x86_swiotlb_enable = true;
 }
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 2b2d018ea3450..a72942d569cf9 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -191,9 +191,6 @@ void __init sme_early_init(void)
/* Update the protection map with memory encryption mask */
for (i = 0; i < ARRAY_SIZE(protection_map); i++)
protection_map[i] = pgprot_encrypted(protection_map[i]);
-
-   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-   swiotlb_force = SWIOTLB_FORCE;
 }
 
 void __init sev_setup_arch(void)
-- 
2.30.2



[PATCH 07/15] x86: remove the IOMMU table infrastructure

2022-03-14 Thread Christoph Hellwig
The IOMMU table tries to separate the different IOMMUs into different
backends, but actually requires various cross calls.

Rewrite the code to do the generic swiotlb/swiotlb-xen setup directly
in pci-dma.c and then just call into the IOMMU drivers.
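
A rough sketch of the resulting direct flow in pci-dma.c (simplified
and slightly paraphrased, not the exact patched code):

 void __init pci_iommu_alloc(void)
 {
 	if (xen_pv_domain()) {
 		pci_xen_swiotlb_init();
 		return;
 	}
 	pci_swiotlb_detect();
 	gart_iommu_hole_init();
 	amd_iommu_detect();
 	detect_intel_iommu();
 	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
 }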

Signed-off-by: Christoph Hellwig 
---
 arch/ia64/include/asm/iommu_table.h|   7 --
 arch/x86/include/asm/dma-mapping.h |   1 -
 arch/x86/include/asm/gart.h|   5 +-
 arch/x86/include/asm/iommu.h   |   6 ++
 arch/x86/include/asm/iommu_table.h | 102 ---
 arch/x86/include/asm/swiotlb.h |  30 ---
 arch/x86/include/asm/xen/swiotlb-xen.h |   2 -
 arch/x86/kernel/Makefile   |   2 -
 arch/x86/kernel/amd_gart_64.c  |   5 +-
 arch/x86/kernel/aperture_64.c  |  14 ++--
 arch/x86/kernel/pci-dma.c  | 107 -
 arch/x86/kernel/pci-iommu_table.c  |  77 --
 arch/x86/kernel/pci-swiotlb.c  |  77 --
 arch/x86/kernel/tboot.c|   1 -
 arch/x86/kernel/vmlinux.lds.S  |  12 ---
 arch/x86/xen/Makefile  |   2 -
 arch/x86/xen/pci-swiotlb-xen.c |  96 --
 drivers/iommu/amd/init.c   |   6 --
 drivers/iommu/amd/iommu.c  |   5 +-
 drivers/iommu/intel/dmar.c |   6 +-
 include/linux/dmar.h   |   6 +-
 21 files changed, 110 insertions(+), 459 deletions(-)
 delete mode 100644 arch/ia64/include/asm/iommu_table.h
 delete mode 100644 arch/x86/include/asm/iommu_table.h
 delete mode 100644 arch/x86/include/asm/swiotlb.h
 delete mode 100644 arch/x86/kernel/pci-iommu_table.c
 delete mode 100644 arch/x86/kernel/pci-swiotlb.c
 delete mode 100644 arch/x86/xen/pci-swiotlb-xen.c

diff --git a/arch/ia64/include/asm/iommu_table.h 
b/arch/ia64/include/asm/iommu_table.h
deleted file mode 100644
index cc96116ac276a..0
--- a/arch/ia64/include/asm/iommu_table.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IOMMU_TABLE_H
-#define _ASM_IA64_IOMMU_TABLE_H
-
-#define IOMMU_INIT_POST(_detect)
-
-#endif /* _ASM_IA64_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/dma-mapping.h 
b/arch/x86/include/asm/dma-mapping.h
index bb1654fe0ce74..256fd8115223d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -9,7 +9,6 @@
 
 #include <linux/scatterlist.h>
 #include <asm/io.h>
-#include <asm/swiotlb.h>
 
 extern int iommu_merge;
 extern int panic_on_overflow;
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 3185565743459..5af8088a10df6 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -38,7 +38,7 @@ extern int gart_iommu_aperture_disabled;
 extern void early_gart_iommu_check(void);
 extern int gart_iommu_init(void);
 extern void __init gart_parse_options(char *);
-extern int gart_iommu_hole_init(void);
+void gart_iommu_hole_init(void);
 
 #else
 #define gart_iommu_aperture0
@@ -51,9 +51,8 @@ static inline void early_gart_iommu_check(void)
 static inline void gart_parse_options(char *options)
 {
 }
-static inline int gart_iommu_hole_init(void)
+static inline void gart_iommu_hole_init(void)
 {
-   return -ENODEV;
 }
 #endif
 
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index bf1ed2ddc74bd..dba89ed40d38d 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -9,6 +9,12 @@
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 
+#ifdef CONFIG_SWIOTLB
+extern bool x86_swiotlb_enable;
+#else
+#define x86_swiotlb_enable false
+#endif
+
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
 
diff --git a/arch/x86/include/asm/iommu_table.h 
b/arch/x86/include/asm/iommu_table.h
deleted file mode 100644
index 1fb3fd1a83c25..0
--- a/arch/x86/include/asm/iommu_table.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IOMMU_TABLE_H
-#define _ASM_X86_IOMMU_TABLE_H
-
-#include 
-
-/*
- * History lesson:
- * The execution chain of IOMMUs in 2.6.36 looks as so:
- *
- *[xen-swiotlb]
- * |
- * +[swiotlb *]--+
- */ | \
- *   /  |  \
- *[GART] [Calgary]  [Intel VT-d]
- * /
- */
- * [AMD-Vi]
- *
- * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
- * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
- * Also it would surreptitiously initialize set the swiotlb=1 if there were
- * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
- * flag would be turned off by all IOMMUs except the Calgary one.
- *
- * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
- * to be built by defining who we depend on.
- *
- * And all that needs to be done is to use one of the macros in the IOMMU
- * and the pci-dma.c will take care of the rest.
- */
-
-struct 

[PATCH 06/15] MIPS/octeon: use swiotlb_init instead of open coding it

2022-03-14 Thread Christoph Hellwig
Use the generic swiotlb initialization helper instead of open coding it.

Signed-off-by: Christoph Hellwig 
Acked-by: Thomas Bogendoerfer 
---
 arch/mips/cavium-octeon/dma-octeon.c | 15 ++-
 arch/mips/pci/pci-octeon.c   |  2 +-
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index df70308db0e69..fb7547e217263 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -186,15 +186,12 @@ phys_addr_t dma_to_phys(struct device *dev, dma_addr_t 
daddr)
return daddr;
 }
 
-char *octeon_swiotlb;
-
 void __init plat_swiotlb_setup(void)
 {
phys_addr_t start, end;
phys_addr_t max_addr;
phys_addr_t addr_size;
size_t swiotlbsize;
-   unsigned long swiotlb_nslabs;
u64 i;
 
max_addr = 0;
@@ -236,15 +233,7 @@ void __init plat_swiotlb_setup(void)
if (OCTEON_IS_OCTEON2() && max_addr >= 0x100000000ul)
swiotlbsize = 64 * (1<<20);
 #endif
-   swiotlb_nslabs = swiotlbsize >> IO_TLB_SHIFT;
-   swiotlb_nslabs = ALIGN(swiotlb_nslabs, IO_TLB_SEGSIZE);
-   swiotlbsize = swiotlb_nslabs << IO_TLB_SHIFT;
-
-   octeon_swiotlb = memblock_alloc_low(swiotlbsize, PAGE_SIZE);
-   if (!octeon_swiotlb)
-   panic("%s: Failed to allocate %zu bytes align=%lx\n",
- __func__, swiotlbsize, PAGE_SIZE);
 
-   if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM)
-   panic("Cannot allocate SWIOTLB buffer");
+   swiotlb_adjust_size(swiotlbsize);
+   swiotlb_init(1);
 }
diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
index fc29b85cfa926..e457a18cbdc59 100644
--- a/arch/mips/pci/pci-octeon.c
+++ b/arch/mips/pci/pci-octeon.c
@@ -664,7 +664,7 @@ static int __init octeon_pci_setup(void)
 
/* BAR1 movable regions contiguous to cover the swiotlb */
octeon_bar1_pci_phys =
-   virt_to_phys(octeon_swiotlb) & ~((1ull << 22) - 1);
+   io_tlb_default_mem.start & ~((1ull << 22) - 1);
 
for (index = 0; index < 32; index++) {
union cvmx_pci_bar1_indexx bar1_index;
-- 
2.30.2



[PATCH 05/15] arm/xen: don't check for xen_initial_domain() in xen_create_contiguous_region

2022-03-14 Thread Christoph Hellwig
From: Stefano Stabellini 

It used to be that Linux enabled swiotlb-xen when running a dom0 on ARM.
Since f5079a9a2a31 "xen/arm: introduce XENFEAT_direct_mapped and
XENFEAT_not_direct_mapped", Linux detects whether to enable or disable
swiotlb-xen based on the new feature flags: XENFEAT_direct_mapped and
XENFEAT_not_direct_mapped.

However, there is still a leftover xen_initial_domain() check in
xen_create_contiguous_region. Remove the check as
xen_create_contiguous_region is only called by swiotlb-xen during
initialization. If xen_create_contiguous_region is called, we know Linux
is running 1:1 mapped so there is no need for additional checks.

Also update the in-code comment.
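
For reference, a sketch of the feature-flag detection this relies on
(shape assumed from f5079a9a2a31, simplified):

 static inline int xen_swiotlb_detect(void)
 {
 	if (!xen_domain())
 		return 0;
 	if (xen_feature(XENFEAT_direct_mapped))
 		return 1;
 	/* older Xen exposes neither flag; assume dom0 is 1:1 mapped */
 	if (!xen_feature(XENFEAT_not_direct_mapped) && xen_initial_domain())
 		return 1;
 	return 0;
 }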

Signed-off-by: Stefano Stabellini 
Signed-off-by: Christoph Hellwig 
---
 arch/arm/xen/mm.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index a7e54a087b802..28c2070602535 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -122,10 +122,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, 
unsigned int order,
 unsigned int address_bits,
 dma_addr_t *dma_handle)
 {
-   if (!xen_initial_domain())
-   return -EINVAL;
-
-   /* we assume that dom0 is mapped 1:1 for now */
+   /* the domain is 1:1 mapped to use swiotlb-xen */
*dma_handle = pstart;
return 0;
 }
-- 
2.30.2



[PATCH 04/15] swiotlb: rename swiotlb_late_init_with_default_size

2022-03-14 Thread Christoph Hellwig
swiotlb_late_init_with_default_size is an overly verbose name that
doesn't even catch what the function is doing, given that the size is
not just a default but the actual requested size.

Rename it to swiotlb_init_late.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/x86/pci/sta2x11-fixup.c | 2 +-
 include/linux/swiotlb.h  | 2 +-
 kernel/dma/swiotlb.c | 6 ++
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 101081ad64b6d..e0c039a75b2db 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance: register your own swiotlb area */
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
-   if (swiotlb_late_init_with_default_size(size))
+   if (swiotlb_init_late(size))
dev_emerg(&pdev->dev, "init swiotlb failed\n");
}
list_add(&instance->list, &sta2x11_instance_list);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 9fb3a568f0c51..b48b26bfa0edb 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -40,7 +40,7 @@ extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
-extern int swiotlb_late_init_with_default_size(size_t default_size);
+int swiotlb_init_late(size_t size);
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b75943c2a0a0e..14e08fa9621c2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -281,11 +281,9 @@ swiotlb_init(int verbose)
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size)
 {
-   unsigned long nslabs =
-   ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+   unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned long bytes;
unsigned char *vstart = NULL;
unsigned int order;
-- 
2.30.2



[PATCH 03/15] swiotlb: simplify swiotlb_max_segment

2022-03-14 Thread Christoph Hellwig
Remove the bogus Xen override that was usually larger than the actual
size and just calculate the value on demand.  Note that
swiotlb_max_segment still doesn't make sense as an interface and should
eventually be removed.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 drivers/xen/swiotlb-xen.c |  2 --
 include/linux/swiotlb.h   |  1 -
 kernel/dma/swiotlb.c  | 20 +++-
 3 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 47aebd98f52f5..485cd06ed39e7 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -202,7 +202,6 @@ int xen_swiotlb_init(void)
rc = swiotlb_late_init_with_tbl(start, nslabs);
if (rc)
return rc;
-   swiotlb_set_max_segment(PAGE_SIZE);
return 0;
 error:
if (nslabs > 1024 && repeat--) {
@@ -254,7 +253,6 @@ void __init xen_swiotlb_init_early(void)
 
if (swiotlb_init_with_tbl(start, nslabs, true))
panic("Cannot allocate SWIOTLB buffer");
-   swiotlb_set_max_segment(PAGE_SIZE);
 }
 #endif /* CONFIG_X86 */
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index f6c3638255d54..9fb3a568f0c51 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -164,7 +164,6 @@ static inline void swiotlb_adjust_size(unsigned long size)
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
-extern void swiotlb_set_max_segment(unsigned int);
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 struct page *swiotlb_alloc(struct device *dev, size_t size);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index af9d257501a64..b75943c2a0a0e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -68,12 +68,6 @@ struct io_tlb_mem io_tlb_default_mem;
 
 phys_addr_t swiotlb_unencrypted_base;
 
-/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
- */
-static unsigned int max_segment;
-
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
 
 static int __init
@@ -97,18 +91,12 @@ early_param("swiotlb", setup_io_tlb_npages);
 
 unsigned int swiotlb_max_segment(void)
 {
-   return io_tlb_default_mem.nslabs ? max_segment : 0;
+   if (!io_tlb_default_mem.nslabs)
+   return 0;
+   return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
 }
 EXPORT_SYMBOL_GPL(swiotlb_max_segment);
 
-void swiotlb_set_max_segment(unsigned int val)
-{
-   if (swiotlb_force == SWIOTLB_FORCE)
-   max_segment = 1;
-   else
-   max_segment = rounddown(val, PAGE_SIZE);
-}
-
 unsigned long swiotlb_size_or_default(void)
 {
return default_nslabs << IO_TLB_SHIFT;
@@ -258,7 +246,6 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long 
nslabs, int verbose)
 
if (verbose)
swiotlb_print_info();
-   swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
 }
 
@@ -359,7 +346,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
 
swiotlb_print_info();
-   swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
return 0;
 }
 
-- 
2.30.2



[PATCH 02/15] swiotlb: make swiotlb_exit a no-op if SWIOTLB_FORCE is set

2022-03-14 Thread Christoph Hellwig
If force bouncing is enabled we can't release the buffers.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 kernel/dma/swiotlb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 908eac2527cb1..af9d257501a64 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -369,6 +369,9 @@ void __init swiotlb_exit(void)
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
 
+   if (swiotlb_force == SWIOTLB_FORCE)
+   return;
+
if (!mem->nslabs)
return;
 
-- 
2.30.2



[PATCH 01/15] dma-direct: use is_swiotlb_active in dma_direct_map_page

2022-03-14 Thread Christoph Hellwig
Use the more specific is_swiotlb_active check instead of checking the
global swiotlb_force variable.
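
For context, is_swiotlb_active() is a per-device check, roughly of this
shape (sketch, simplified):

 static inline bool is_swiotlb_active(struct device *dev)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;

 	return mem && mem->nslabs;
 }

so the bounce decision follows the device's io-tlb assignment rather
than the global swiotlb_force setting.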

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 kernel/dma/direct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72eb..4dc16e08c7e1a 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device 
*dev,
return swiotlb_map(dev, phys, size, dir, attrs);
 
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-   if (swiotlb_force != SWIOTLB_NO_FORCE)
+   if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
 
dev_WARN_ONCE(dev, 1,
-- 
2.30.2



cleanup swiotlb initialization v5

2022-03-14 Thread Christoph Hellwig
Hi all,

this series tries to clean up the swiotlb initialization, including
that of swiotlb-xen.  To get there it also removes the x86 IOMMU table
infrastructure that massively obfuscates the initialization path.

Git tree:

git://git.infradead.org/users/hch/misc.git swiotlb-init-cleanup

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/swiotlb-init-cleanup

Changes since v4:
 - split a patch into three
 - fix setting x86_swiotlb_enable for Xen
 - fix a comment about forced bounce buffering for guest memory
   encryption
 - remove the xen_initial_domain check from
   xen_create_contiguous_region

Changes since v3:
 - fix a compilation issue on some powerpc configs
 - fix and cleanup how forced bounce buffering is enabled for
   guest memory encryption

Changes since v2:
 - make ppc_swiotlb_flags actually work again
 - also force enable swiotlb for guest encrypted memory to cater
   to hyperv which doesn't set the host encrypted memory flag

Changes since v1:
 - skip IOMMU initialization on Xen PV kernels
 - various small whitespace / typo fixes

Diffstat:
 arch/ia64/include/asm/iommu_table.h  |7 -
 arch/x86/include/asm/iommu_table.h   |  102 --
 arch/x86/include/asm/swiotlb.h   |   30 -
 arch/x86/kernel/pci-iommu_table.c|   77 -
 arch/x86/kernel/pci-swiotlb.c|   77 -
 arch/x86/xen/pci-swiotlb-xen.c   |   96 -
 b/arch/arm/mm/init.c |6 -
 b/arch/arm/xen/mm.c  |   26 ++--
 b/arch/arm64/mm/init.c   |6 -
 b/arch/ia64/mm/init.c|4 
 b/arch/mips/cavium-octeon/dma-octeon.c   |   15 --
 b/arch/mips/loongson64/dma.c |2 
 b/arch/mips/pci/pci-octeon.c |2 
 b/arch/mips/sibyte/common/dma.c  |2 
 b/arch/powerpc/include/asm/svm.h |4 
 b/arch/powerpc/include/asm/swiotlb.h |1 
 b/arch/powerpc/kernel/dma-swiotlb.c  |1 
 b/arch/powerpc/mm/mem.c  |6 -
 b/arch/powerpc/platforms/pseries/setup.c |3 
 b/arch/powerpc/platforms/pseries/svm.c   |   26 
 b/arch/riscv/mm/init.c   |8 -
 b/arch/s390/mm/init.c|3 
 b/arch/x86/include/asm/dma-mapping.h |   12 --
 b/arch/x86/include/asm/gart.h|5 
 b/arch/x86/include/asm/iommu.h   |8 +
 b/arch/x86/include/asm/xen/page.h|5 
 b/arch/x86/include/asm/xen/swiotlb-xen.h |2 
 b/arch/x86/kernel/Makefile   |2 
 b/arch/x86/kernel/amd_gart_64.c  |5 
 b/arch/x86/kernel/aperture_64.c  |   14 --
 b/arch/x86/kernel/cpu/mshyperv.c |8 -
 b/arch/x86/kernel/pci-dma.c  |  113 
 b/arch/x86/kernel/tboot.c|1 
 b/arch/x86/kernel/vmlinux.lds.S  |   12 --
 b/arch/x86/mm/mem_encrypt_amd.c  |3 
 b/arch/x86/pci/sta2x11-fixup.c   |2 
 b/arch/x86/xen/Makefile  |2 
 b/drivers/iommu/amd/init.c   |6 -
 b/drivers/iommu/amd/iommu.c  |5 
 b/drivers/iommu/intel/dmar.c |6 -
 b/drivers/xen/swiotlb-xen.c  |  132 ---
 b/include/linux/dmar.h   |6 -
 b/include/linux/swiotlb.h|   22 +--
 b/include/trace/events/swiotlb.h |   29 +
 b/include/xen/arm/page.h |1 
 b/include/xen/swiotlb-xen.h  |8 +
 b/kernel/dma/direct.h|2 
 b/kernel/dma/swiotlb.c   |  175 +++
 48 files changed, 264 insertions(+), 826 deletions(-)


[PATCH] static_call: Don't make __static_call_return0 static

2022-03-14 Thread Christophe Leroy
System.map shows that vmlinux contains several instances of
__static_call_return0():

c0004fc0 t __static_call_return0
c0011518 t __static_call_return0
c00d8160 t __static_call_return0

arch_static_call_transform() uses the middle one to check whether we are
setting a call to __static_call_return0 or not:

c0011520 <arch_static_call_transform>:
c0011520:   3d 20 c0 01     lis     r9,-16383       <== r9 = 0xc001 << 16
c0011524:   39 29 15 18     addi    r9,r9,5400      <== r9 += 0x1518
c0011528:   7c 05 48 00     cmpw    r5,r9           <== r9 has value 0xc0011518 here

So if static_call_update() is called with one of the other instances of
__static_call_return0(), arch_static_call_transform() won't recognise it.

In order to work properly, a single global instance of
__static_call_return0() is required.

Fixes: 3f2a8fc4b15d ("static_call/x86: Add __static_call_return0()")
Signed-off-by: Christophe Leroy 
---
 include/linux/static_call.h   |   5 +-
 kernel/Makefile   |   3 +-
 kernel/static_call.c  | 542 --
 .../{static_call.c => static_call_inline.c}   |   5 -
 4 files changed, 3 insertions(+), 552 deletions(-)
 copy kernel/{static_call.c => static_call_inline.c} (99%)

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 3e56a9751c06..fcc5b48989b3 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -248,10 +248,7 @@ static inline int static_call_text_reserved(void *start, 
void *end)
return 0;
 }
 
-static inline long __static_call_return0(void)
-{
-   return 0;
-}
+extern long __static_call_return0(void);
 
 #define EXPORT_STATIC_CALL(name)   \
EXPORT_SYMBOL(STATIC_CALL_KEY(name));   \
diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..a18d169732d2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -113,7 +113,8 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
 obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
 obj-$(CONFIG_CFI_CLANG) += cfi.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..43dbc438044e 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -1,548 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/static_call.h>
-#include <linux/bug.h>
-#include <linux/smp.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/processor.h>
-#include <asm/sections.h>
-
-extern struct static_call_site __start_static_call_sites[],
-  __stop_static_call_sites[];
-extern struct static_call_tramp_key __start_static_call_tramp_key[],
-   __stop_static_call_tramp_key[];
-
-static bool static_call_initialized;
-
-/* mutex to protect key modules/sites */
-static DEFINE_MUTEX(static_call_mutex);
-
-static void static_call_lock(void)
-{
-   mutex_lock(&static_call_mutex);
-}
-
-static void static_call_unlock(void)
-{
-   mutex_unlock(&static_call_mutex);
-}
-
-static inline void *static_call_addr(struct static_call_site *site)
-{
-   return (void *)((long)site->addr + (long)&site->addr);
-}
-
-static inline unsigned long __static_call_key(const struct static_call_site *site)
-{
-   return (long)site->key + (long)&site->key;
-}
-
-static inline struct static_call_key *static_call_key(const struct static_call_site *site)
-{
-   return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
-}
-
-/* These assume the key is word-aligned. */
-static inline bool static_call_is_init(struct static_call_site *site)
-{
-   return __static_call_key(site) & STATIC_CALL_SITE_INIT;
-}
-
-static inline bool static_call_is_tail(struct static_call_site *site)
-{
-   return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
-}
-
-static inline void static_call_set_init(struct static_call_site *site)
-{
-   site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
-   (long)&site->key;
-}
-
-static int static_call_site_cmp(const void *_a, const void *_b)
-{
-   const struct static_call_site *a = _a;
-   const struct static_call_site *b = _b;
-   const struct static_call_key *key_a = static_call_key(a);
-   const struct static_call_key *key_b = static_call_key(b);
-
-   if (key_a < key_b)
-   return -1;
-
-   if (key_a > key_b)
-   return 1;
-
-   return 0;
-}
-
-static void static_call_site_swap(void *_a, void *_b, int size)
-{
-   long delta = (unsigned long)_a - (unsigned long)_b;
-   struct static_call_site *a = _a;
-   struct static_call_site *b = _b;
-   struct static_call_site tmp = *a;
-
-   a->addr = b->addr  - delta;
-   a->key  = b->key   - delta;
-
-   b->addr =