[PATCH v4 11/11] riscv: Enable DAX VMEMMAP optimization

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Now that DAX is usable, enable the DAX VMEMMAP optimization as well.

Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8a49b5f4c017..1631bf568158 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -73,6 +73,7 @@ config RISCV
select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL
+   select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_NO_INSTR
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
-- 
2.43.0




[PATCH v4 10/11] riscv: mm: Add support for ZONE_DEVICE

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

ZONE_DEVICE pages need DEVMAP PTE support to function
(ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
in the PTE for the DEVMAP mark, add the corresponding helpers, and
enable ARCH_HAS_PTE_DEVMAP for riscv64.
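
For context, a small illustration (not part of this patch) of what the
new bit and helpers enable; the helper function name below is made up
for the example:

	/* Illustrative only: build a PTE for a device PFN and mark it devmap. */
	static pte_t example_devmap_pte(unsigned long pfn, pgprot_t prot)
	{
		pte_t pte = pfn_pte(pfn, prot);

		return pte_mkdevmap(pte);	/* sets the new _PAGE_DEVMAP RSW bit */
	}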

Reviewed-by: Alexandre Ghiti 
Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig|  1 +
 arch/riscv/include/asm/pgtable-64.h   | 20 
 arch/riscv/include/asm/pgtable-bits.h |  1 +
 arch/riscv/include/asm/pgtable.h  | 17 +
 4 files changed, 39 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8d860ad3b171..8a49b5f4c017 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -37,6 +37,7 @@ config RISCV
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
+   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 8c36a8818432..0897dd99ab8d 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -398,4 +398,24 @@ static inline struct page *pgd_page(pgd_t pgd)
 #define p4d_offset p4d_offset
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pte_devmap(pte_t pte);
+static inline pte_t pmd_pte(pmd_t pmd);
+
+static inline int pmd_devmap(pmd_t pmd)
+{
+   return pte_devmap(pmd_pte(pmd));
+}
+
+static inline int pud_devmap(pud_t pud)
+{
+   return 0;
+}
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+   return 0;
+}
+#endif
+
 #endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index 179bd4afece4..a8f5205cea54 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,6 +19,7 @@
 #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
 
 #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
+#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
 #define _PAGE_TABLE _PAGE_PRESENT
 
 /*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 41f1b2c6f949..dd4d52940106 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -390,6 +390,13 @@ static inline int pte_special(pte_t pte)
return pte_val(pte) & _PAGE_SPECIAL;
 }
 
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static inline int pte_devmap(pte_t pte)
+{
+   return pte_val(pte) & _PAGE_DEVMAP;
+}
+#endif
+
 /* static inline pte_t pte_rdprotect(pte_t pte) */
 
 static inline pte_t pte_wrprotect(pte_t pte)
@@ -431,6 +438,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL);
 }
 
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+   return __pte(pte_val(pte) | _PAGE_DEVMAP);
+}
+
 static inline pte_t pte_mkhuge(pte_t pte)
 {
return pte;
@@ -721,6 +733,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 }
 
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
+}
+
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
-- 
2.43.0




[PATCH v4 09/11] virtio-mem: Enable virtio-mem for RISC-V

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Now that RISC-V has memory hotplugging support, virtio-mem can be used
on the platform.

Acked-by: David Hildenbrand 
Signed-off-by: Björn Töpel 
---
 drivers/virtio/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 6284538a8184..42a48ac763ee 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -122,7 +122,7 @@ config VIRTIO_BALLOON
 
 config VIRTIO_MEM
tristate "Virtio mem driver"
-   depends on X86_64 || ARM64
+   depends on X86_64 || ARM64 || RISCV
depends on VIRTIO
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
-- 
2.43.0




[PATCH v4 05/11] riscv: mm: Add pfn_to_kaddr() implementation

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

The pfn_to_kaddr() function is used by KASAN's memory hotplugging
path. Add the missing function to the RISC-V port, so that it can be
built with MHP and CONFIG_KASAN.

Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/page.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 115ac98b8d72..235fd45d998d 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -188,6 +188,11 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
 
 unsigned long kaslr_offset(void);
 
+static __always_inline void *pfn_to_kaddr(unsigned long pfn)
+{
+   return __va(pfn << PAGE_SHIFT);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #define virt_addr_valid(vaddr) ({ \
-- 
2.43.0




[PATCH v4 08/11] riscv: Enable memory hotplugging for RISC-V

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for
RISC-V.

Reviewed-by: Alexandre Ghiti 
Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0525ee2d63c7..8d860ad3b171 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -16,6 +16,8 @@ config RISCV
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_DMA_DEFAULT_COHERENT
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+   select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP
+   select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
@@ -46,6 +48,7 @@ config RISCV
select ARCH_HAS_UBSAN
select ARCH_HAS_VDSO_DATA
select ARCH_KEEP_MEMBLOCK if ACPI
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
select ARCH_STACKWALK
-- 
2.43.0




[PATCH v4 07/11] riscv: mm: Take memory hotplug read-lock during kernel page table dump

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

During memory hot remove, the ptdump functionality can end up touching
stale data. Avoid any potential crashes (or worse) by holding the
memory hotplug read-lock while traversing the page table.

This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm:
Hold memory hotplug lock while walking for kernel page table dump").

Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/ptdump.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 1289cc6d3700..9d5f657a251b 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -370,7 +371,9 @@ bool ptdump_check_wx(void)
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
+   get_online_mems();
ptdump_walk(m, m->private);
+   put_online_mems();
 
return 0;
 }
-- 
2.43.0




[PATCH v4 06/11] riscv: mm: Add memory hotplugging support

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and for calling into the generic memory hotplugging code
  via __add_pages(), which adds the corresponding struct page entries
  and updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used: the
  vmem_altmap. This means that when the backing pages are cleared,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can map.
  Used by the MHP internals for sanity checks.

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.
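
As a rough sketch of the first two callbacks (heavily simplified; the
complete implementation is in the diff below, and remove_linear_mapping()
here just stands in for the teardown path built on remove_pgd_mapping()):

	int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
	{
		int ret;

		/* Put the new range into the direct map with the requested protections. */
		create_linear_mapping_range(start, start + size, 0, &params->pgprot);
		/* Let the generic MHP code create the struct pages and vmemmap. */
		ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
		if (ret)
			remove_linear_mapping(start, size);
		return ret;
	}

	void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
	{
		/* Inverse: drop the struct pages first, then the direct map. */
		__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
		remove_linear_mapping(start, size);
	}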

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 267 +++
 1 file changed, 267 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 1f7e7c223bec..bfa2dea95354 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1534,3 +1534,270 @@ struct execmem_info __init *execmem_arch_setup(void)
 }
 #endif /* CONFIG_MMU */
 #endif /* CONFIG_EXECMEM */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   struct page *page = pmd_page(*pmd);
+   struct ptdesc *ptdesc = page_ptdesc(page);
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   pagetable_pte_dtor(ptdesc);
+   if (PageReserved(page))
+   free_reserved_page(page);
+   else
+   pagetable_free(ptdesc);
+   pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   struct page *page = pud_page(*pud);
+   struct ptdesc *ptdesc = page_ptdesc(page);
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   pagetable_pmd_dtor(ptdesc);
+   if (PageReserved(page))
+   free_reserved_page(page);
+   else
+   pagetable_free(ptdesc);
+   pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+   struct page *page = p4d_page(*p4d);
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   if (PageReserved(page))
+   free_reserved_page(page);
+   else
+   free_pages((unsigned long)page_address(page), 0);
+   p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+  struct vmem_altmap *altmap)
+{
+   int order = get_order(size);
+
+   if (altmap) {
+   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+   return;
+   }
+
+   if (PageReserved(page)) {
+   unsigned int nr_pages = 1 << order;
+
+   while (nr_pages--)
+   free_reserved_page(page++);
+   return;
+   }
+
+   free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+   unsigned long next;
+   pte_t *ptep, pte;
+
+   for (; addr < end; addr = next) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   ptep = pte_base + pte_index(addr);
+   pte = ptep_get(ptep);
+   if (!pte_present(*ptep))
+   continue;
+
+   pte_clear(&init_mm, addr, ptep);
+   if (is_vmemmap)
+   free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+   }
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmdp, pmd;
+
+   for (; addr < end; addr = next) {
+  

[PATCH v4 04/11] riscv: mm: Refactor create_linear_mapping_range() for memory hot add

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Add a parameter to the direct map setup function, so that it can be
used in arch_add_memory() later.
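
A sketch of the intended caller, added later in the series (the call
below is illustrative of how the new pgprot argument is meant to be
used from arch_add_memory()):

	/* Map hot-added memory with the protections requested by the MHP core. */
	create_linear_mapping_range(start, start + size, 0, &params->pgprot);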

Reviewed-by: Alexandre Ghiti 
Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0dd04cedc0d2..1f7e7c223bec 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1232,7 +1232,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 }
 
 static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end,
- uintptr_t fixed_map_size)
+ uintptr_t fixed_map_size, const pgprot_t *pgprot)
 {
phys_addr_t pa;
uintptr_t va, map_size;
@@ -1243,7 +1243,7 @@ static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t
best_map_size(pa, va, end - pa);
 
create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
-  pgprot_from_va(va));
+  pgprot ? *pgprot : pgprot_from_va(va));
}
 }
 
@@ -1287,22 +1287,19 @@ static void __init create_linear_mapping_page_table(void)
if (end >= __pa(PAGE_OFFSET) + memory_limit)
end = __pa(PAGE_OFFSET) + memory_limit;
 
-   create_linear_mapping_range(start, end, 0);
+   create_linear_mapping_range(start, end, 0, NULL);
}
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0);
-   create_linear_mapping_range(krodata_start,
-   krodata_start + krodata_size, 0);
+   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL);
+   create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL);
 
memblock_clear_nomap(ktext_start,  ktext_size);
memblock_clear_nomap(krodata_start, krodata_size);
 #endif
 
 #ifdef CONFIG_KFENCE
-   create_linear_mapping_range(kfence_pool,
-   kfence_pool + KFENCE_POOL_SIZE,
-   PAGE_SIZE);
+   create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL);
 
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
 #endif
-- 
2.43.0




[PATCH v4 03/11] riscv: mm: Change attribute from __init to __meminit for page functions

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Prepare for memory hotplugging support by changing from __init to
__meminit for the page table functions that are used by the upcoming
architecture-specific callbacks.

Changing the __init attribute to __meminit prevents the functions from
being removed after init. The __meminit attribute makes sure the
functions are kept in the kernel text post init, but only if memory
hotplugging is enabled for the build.
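
As a rough illustration of the mechanism (heavily simplified; the real
definitions live in include/linux/init.h and the generic linker script,
and carry additional section/attribute annotations):

	/* Simplified sketch -- not the literal kernel definition. */
	#ifdef CONFIG_MEMORY_HOTPLUG
	#define __meminit		/* kept in regular kernel text */
	#else
	#define __meminit	__init	/* discarded after boot */
	#endif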

Reviewed-by: Alexandre Ghiti 
Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/mmu.h |  4 +--
 arch/riscv/include/asm/pgtable.h |  2 +-
 arch/riscv/mm/init.c | 56 ++--
 3 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 947fd60f9051..c9e03e9da3dc 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -31,8 +31,8 @@ typedef struct {
 #define cntx2asid(cntx)((cntx) & SATP_ASID_MASK)
 #define cntx2version(cntx) ((cntx) & ~SATP_ASID_MASK)
 
-void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa,
-  phys_addr_t sz, pgprot_t prot);
+void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+ pgprot_t prot);
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_MMU_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index aad8b8ca51f1..41f1b2c6f949 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -165,7 +165,7 @@ struct pt_alloc_ops {
 #endif
 };
 
-extern struct pt_alloc_ops pt_ops __initdata;
+extern struct pt_alloc_ops pt_ops __meminitdata;
 
 #ifdef CONFIG_MMU
 /* Number of PGD entries that a user-mode program can use */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index a5b3bc1f3b88..0dd04cedc0d2 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -297,7 +297,7 @@ static void __init setup_bootmem(void)
 }
 
 #ifdef CONFIG_MMU
-struct pt_alloc_ops pt_ops __initdata;
+struct pt_alloc_ops pt_ops __meminitdata;
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
@@ -359,7 +359,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa)
return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
 }
 
-static inline pte_t *__init get_pte_virt_late(phys_addr_t pa)
+static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa)
 {
return (pte_t *) __va(pa);
 }
@@ -378,7 +378,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pte_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -386,9 +386,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va)
return __pa((pte_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pte_mapping(pte_t *ptep,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+pgprot_t prot)
 {
uintptr_t pte_idx = pte_index(va);
 
@@ -442,7 +441,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa)
return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
 }
 
-static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
+static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa)
 {
return (pmd_t *) __va(pa);
 }
@@ -459,7 +458,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pmd_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pmd_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -467,9 +466,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va)
return __pa((pmd_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pmd_mapping(pmd_t *pmdp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pmd_mapping(pmd_t *pmdp,
+uintptr_t va, phys_addr_t pa,
+phys_addr_t sz, pgprot_t prot)
 {
pte_t *ptep;
phys_addr_t pte_phys;
@@ -505,7 +504,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
 }
 
-static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+static pud_t *__meminit get_pud_virt_late(phys_addr_t pa)
 

[PATCH v4 02/11] riscv: mm: Pre-allocate vmemmap/direct map/kasan PGD entries

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
all userland page tables, which means that if the PGD level table is
changed, the other page tables have to be updated as well.

Instead of having the PGD changes ripple out to all tables, the
synchronization can be avoided by pre-allocating the PGD entries/pages
at boot, avoiding the synchronization altogether.

This is currently done for the bpf/modules and vmalloc PGD regions.
Extend this scheme to the PGD regions touched by memory hotplugging.

Prepare the RISC-V port for memory hotplug by pre-allocating the
vmemmap/direct map/kasan entries at the PGD level. This will waste
roughly ~128 (plus 32 if KASAN is enabled) 4K pages when memory
hotplugging is enabled in the kernel configuration.
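
In other words (back-of-the-envelope): roughly 128 * 4 KiB = 512 KiB of
page-table pages up front, plus 32 * 4 KiB = 128 KiB more when KASAN is
enabled.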

Reviewed-by: Alexandre Ghiti 
Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/kasan.h | 4 ++--
 arch/riscv/mm/init.c   | 9 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index 0b85e363e778..e6a0071bdb56 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -6,8 +6,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_KASAN
-
 /*
  * The following comment was copied from arm64:
  * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
@@ -34,6 +32,8 @@
  */
 #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
 #define KASAN_SHADOW_END   MODULES_LOWEST_VADDR
+
+#ifdef CONFIG_KASAN
 #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 void kasan_init(void);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index fe5072f66c8c..a5b3bc1f3b88 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -28,6 +28,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1493,11 +1494,19 @@ static void __init preallocate_pgd_pages_range(unsigned long start, unsigned lon
panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
 }
 
+#define PAGE_END KASAN_SHADOW_START
+
 void __init pgtable_cache_init(void)
 {
preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
if (IS_ENABLED(CONFIG_MODULES))
preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules");
+   if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+   preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap");
+   preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map");
+   if (IS_ENABLED(CONFIG_KASAN))
+   preallocate_pgd_pages_range(KASAN_SHADOW_START, KASAN_SHADOW_END, "kasan");
+   }
 }
 #endif
 
-- 
2.43.0




[PATCH v4 00/11] riscv: Memory Hot(Un)Plug support

2024-06-05 Thread Björn Töpel
From: Björn Töpel 


Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port


(For the restless folks: change log at the bottom!)

Introduction


To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory
hot(un)plug allows for increasing and decreasing the size of physical
memory available to a machine at runtime."

This series adds memory hot(un)plugging, and ZONE_DEVICE support for
the RISC-V Linux port.

MM configuration


RISC-V MM has the following configuration:

 * Memory blocks are 128M, analogous to x86-64. It uses PMD
   ("hugepage") vmemmaps. From that it follows that 2M (PMD) worth of
   vmemmap spans 32768 pages á 4K, which gets us 128M (see the worked
   numbers below).

 * The pageblock size is the minimum virtio_mem size, and on RISC-V
   it's 2M (2^9 * 4K).
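
   Worked numbers for the 128M block size (assuming the common 64-byte
   struct page; this back-of-the-envelope is mine, not from the patches):

     2 MiB (one PMD-mapped vmemmap page) / 64 B per struct page = 32768 struct pages
     32768 struct pages * 4 KiB per page = 128 MiB of memory covered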

Implementation
==

The PGD table on RISC-V is shared/copied between all processes. To
avoid doing page table synchronization, the first patch (patch 1)
pre-allocates the PGD entries for vmemmap/direct map. By doing that
the init_mm PGD will be fixed at kernel init, and synchronization can
be avoided altogether.

The following two patches (patch 2-3) do some preparation, followed
by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem
are enabled (patch 6-7), and finally ZONE_DEVICE support is added
(patch 8).

MHP and locking
===

TL;DR: The MHP does not step on any toes, except for ptdump.
Additional locking is required for ptdump.

Long version: For v2 I spent some time digging into init_mm
synchronization/update. Here are my findings, and I'd love them to be
corrected if incorrect.

It's been a gnarly path...

The `init_mm` structure is a special mm (perhaps not a "real" one).
It's a "lazy context" that tracks kernel page table resources, e.g.,
the kernel page table (swapper_pg_dir), a kernel page_table_lock (more
about the usage below), mmap_lock, and such.

`init_mm` does not track/contain any VMAs. Having the `init_mm` is
convenient, so that the regular kernel page table walk/modify
functions can be used.

Now, `init_mm` being special means that the locking for kernel page
tables is special as well.

On RISC-V the PGD (top-level page table structure), similar to x86, is
shared (copied) with user processes. If the kernel PGD is modified, it
has to be synced to the user-mode processes' PGDs. This is avoided by
pre-populating the PGD, so it'll be fixed from boot.

The in-kernel pgd regions are documented in
`Documentation/arch/riscv/vm-layout.rst`.

The distinct regions are:
 * vmemmap
 * vmalloc/ioremap space
 * direct mapping of all physical memory
 * kasan
 * modules, BPF
 * kernel

Memory hotplug is the process of adding/removing memory to/from the
kernel.

Adding is done in two phases:
 1. Add the memory to the kernel
 2. Online memory, making it available to the page allocator.
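
For example, once a block has been added (phase 1), phase 2 is
typically driven from user space via sysfs (cf. the admin-guide
document quoted above), or automatically by a driver such as
virtio-mem:

  # online a hot-added 128M memory block (N is the block index)
  echo online > /sys/devices/system/memory/memoryN/state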

Step 1 is partially architecture dependent, and updates the init_mm
page table:
 * Update the direct map page tables. The direct map is a linear map,
   representing all physical memory: `virt = phys + PAGE_OFFSET`
 * Add a `struct page` for each added page of memory. Update the
   vmemmap (the virtual mapping to the `struct page`), so we can easily
   transform a kernel virtual address into a `struct page *` address.

From an MHP perspective, there are two regions of the PGD that are
updated:
 * vmemmap
 * direct mapping of all physical memory

The `struct mm_struct` has a couple of locks in play:
 * `spinlock_t page_table_lock` protects the page table, and some
   counters
 * `struct rw_semaphore mmap_lock` protects an mm's VMAs

Note again that `init_mm` does not contain any VMAs, but still uses
the mmap_lock in some places.

The `page_table_lock` was originally used to protect all page
tables, but more recently a split page table lock has been introduced.
The split lock has a per-table lock for the PTE and PMD tables. If
the split lock is disabled, all tables are guarded by
`mm->page_table_lock` (for user processes). Split page table locks are
not used for init_mm.

MHP operations are typically synchronized using
`DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`.
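
Readers that must not race with hot(un)plug bracket their access with
that lock; a minimal illustration (this is what the ptdump patch in
this series does):

	get_online_mems();	/* percpu_down_read(&mem_hotplug_lock) */
	/* ... walk/read the kernel page tables safely ... */
	put_online_mems();	/* percpu_up_read(&mem_hotplug_lock) */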

Actors
--

The following non-MHP actors in the kernel traverse (read) and/or
modify the kernel PGD.

 * `ptdump`

   Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the
   `mmap_write_lock(init_mm)` taken.

   Observation: ptdump can race with MHP, and needs additional locking
   to avoid crashes/races.

 * `set_direct_*` / `arch/riscv/mm/pageattr.c`

   The `set_direct_*` functionality is used to "synchronize" the
   direct map to other kernel mappings, e.g. modules/kernel text. The
   direct map is using "as large huge table mappings as possible",
   which means that the `set_direct_*` might need to split the direct

[PATCH v4 01/11] riscv: mm: Properly forward vmemmap_populate() altmap parameter

2024-06-05 Thread Björn Töpel
From: Björn Töpel 

Make sure that the altmap parameter is properly passed on to
vmemmap_populate_hugepages().

Reviewed-by: Alexandre Ghiti 
Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index e3405e4b99af..fe5072f66c8c 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1439,7 +1439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 * memory hotplug, we are not able to update all the page tables with
 * the new PMDs.
 */
-   return vmemmap_populate_hugepages(start, end, node, NULL);
+   return vmemmap_populate_hugepages(start, end, node, altmap);
 }
 #endif
 
-- 
2.43.0




Re: [PATCH] riscv: Fix early ftrace nop patching

2024-05-23 Thread Björn Töpel
Alexandre Ghiti  writes:

> Commit c97bf629963e ("riscv: Fix text patching when IPI are used")
> converted ftrace_make_nop() to use patch_insn_write() which does not
> emit any icache flush relying entirely on __ftrace_modify_code() to do
> that.
>
> But we missed that ftrace_make_nop() was called very early directly when
> converting mcount calls into nops (actually on riscv it converts 2B nops
> emitted by the compiler into 4B nops).
>
> This caused crashes on multiple HW as reported by Conor and Björn since
> the booting core could have half-patched instructions in its icache
> which would trigger an illegal instruction trap: fix this by emitting a
> local flush icache when early patching nops.
>
> Fixes: c97bf629963e ("riscv: Fix text patching when IPI are used")
> Signed-off-by: Alexandre Ghiti 

Nice!

I've managed to reproduce the crash on the VisionFive2 board (however
it only triggered when CONFIG_RELOCATABLE=y), and can verify that this
fix solves the issue.

Reviewed-by: Björn Töpel 
Tested-by: Björn Töpel 




Re: [PATCH v3 2/9] riscv: mm: Pre-allocate vmemmap/direct map PGD entries

2024-05-21 Thread Björn Töpel
Björn Töpel  writes:

> From: Björn Töpel 
>
> The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
> all userland page tables, which means that if the PGD level table is
> changed, other page tables has to be updated as well.
>
> Instead of having the PGD changes ripple out to all tables, the
> synchronization can be avoided by pre-allocating the PGD entries/pages
> at boot, avoiding the synchronization all together.
>
> This is currently done for the bpf/modules, and vmalloc PGD regions.
> Extend this scheme for the PGD regions touched by memory hotplugging.
>
> Prepare the RISC-V port for memory hotplug by pre-allocate
> vmemmap/direct map entries at the PGD level. This will roughly waste
> ~128 worth of 4K pages when memory hotplugging is enabled in the
> kernel configuration.
>
> Reviewed-by: Alexandre Ghiti 
> Signed-off-by: Björn Töpel 
> ---
>  arch/riscv/include/asm/kasan.h | 4 ++--
>  arch/riscv/mm/init.c   | 7 +++
>  2 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> index 0b85e363e778..e6a0071bdb56 100644
> --- a/arch/riscv/include/asm/kasan.h
> +++ b/arch/riscv/include/asm/kasan.h
> @@ -6,8 +6,6 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -#ifdef CONFIG_KASAN
> -
>  /*
>   * The following comment was copied from arm64:
>   * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
> @@ -34,6 +32,8 @@
>   */
>  #define KASAN_SHADOW_START   ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & 
> PGDIR_MASK)
>  #define KASAN_SHADOW_END MODULES_LOWEST_VADDR
> +
> +#ifdef CONFIG_KASAN
>  #define KASAN_SHADOW_OFFSET  _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>  
>  void kasan_init(void);
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index b66f846e7634..c98010ede810 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -27,6 +27,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1488,10 +1489,16 @@ static void __init 
> preallocate_pgd_pages_range(unsigned long start, unsigned lon
>   panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
>  }
>  
> +#define PAGE_END KASAN_SHADOW_START
> +
>  void __init pgtable_cache_init(void)
>  {
>   preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
>   if (IS_ENABLED(CONFIG_MODULES))
>   preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, 
> "bpf/modules");
> + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
> + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, 
> "vmemmap");
> + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct 
> map");

Alex pointed out that KASAN PGDs should be preallocated as well! I'll
address this in the next revision.


Björn



Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support

2024-05-21 Thread Björn Töpel
Alexandre Ghiti  writes:

> On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>>
>>  arch_add_memory()
>>   This callback is responsible for adding the physical memory into the
>>   direct map, and call into the memory hotplugging generic code via
>>   __add_pages() that adds the corresponding struct page entries, and
>>   updates the vmemmap mapping.
>>
>>  arch_remove_memory()
>>   This is the inverse of the callback above.
>>
>>  vmemmap_free()
>>   This function tears down the vmemmap mappings (if
>>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>>   backing vmemmap pages. Note that for persistent memory, an
>>   alternative allocator for the backing pages can be used; The
>>   vmem_altmap. This means that when the backing pages are cleared,
>>   extra care is needed so that the correct deallocation method is
>>   used.
>>
>>  arch_get_mappable_range()
>>   This functions returns the PA range that the direct map can map.
>>   Used by the MHP internals for sanity checks.
>>
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/mm/init.c | 261 +++
>>  1 file changed, 261 insertions(+)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..6693b742bf2f 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
>> }
>>  }
>>  #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
>> +{
>> +   unsigned int nr_pages = 1 << order;
>> +
>> +   /*
>> +* vmemmap/direct page tables can be reserved, if added at
>> +* boot.
>> +*/
>> +   if (PageReserved(page)) {
>> +   __ClearPageReserved(page);
>
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?

Indeed! x86 copy pasta (which uses bootmem info page that RV doesn't).

>> +   while (nr_pages--)
>> +   free_reserved_page(page++);
>> +   return;
>> +   }
>> +
>> +   free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +   pte_t *pte;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PTE; i++) {
>> +   pte = pte_start + i;
>> +   if (!pte_none(*pte))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(pmd_page(*pmd), 0);
>> +   pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +   pmd_t *pmd;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PMD; i++) {
>> +   pmd = pmd_start + i;
>> +   if (!pmd_none(*pmd))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(pud_page(*pud), 0);
>> +   pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> +   pud_t *pud;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PUD; i++) {
>> +   pud = pud_start + i;
>> +   if (!pud_none(*pud))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(p4d_page(*p4d), 0);
>> +   p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +  struct vmem_altmap *altmap)
>> +{
>> +   if (altmap)
>> +   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +   else
>> +   free_pagetable(page, get_order(size));
>> +}
>> +
>> +static v

Re: [PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE

2024-05-21 Thread Björn Töpel
Alexandre Ghiti  writes:

> On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> ZONE_DEVICE pages need DEVMAP PTEs support to function
>> (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
>> in the PTE for DEVMAP mark, add the corresponding helpers, and enable
>> ARCH_HAS_PTE_DEVMAP for riscv64.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/Kconfig|  1 +
>>  arch/riscv/include/asm/pgtable-64.h   | 20 
>>  arch/riscv/include/asm/pgtable-bits.h |  1 +
>>  arch/riscv/include/asm/pgtable.h  | 17 +
>>  4 files changed, 39 insertions(+)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 2724dc2af29f..0b74698c63c7 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -36,6 +36,7 @@ config RISCV
>> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
>> select ARCH_HAS_PMEM_API
>> select ARCH_HAS_PREPARE_SYNC_CORE_CMD
>> +   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
>> select ARCH_HAS_PTE_SPECIAL
>> select ARCH_HAS_SET_DIRECT_MAP if MMU
>> select ARCH_HAS_SET_MEMORY if MMU
>> diff --git a/arch/riscv/include/asm/pgtable-64.h 
>> b/arch/riscv/include/asm/pgtable-64.h
>> index 221a5c1ee287..c67a9bbfd010 100644
>> --- a/arch/riscv/include/asm/pgtable-64.h
>> +++ b/arch/riscv/include/asm/pgtable-64.h
>> @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd)
>>  #define p4d_offset p4d_offset
>>  p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
>>
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static inline int pte_devmap(pte_t pte);
>> +static inline pte_t pmd_pte(pmd_t pmd);
>> +
>> +static inline int pmd_devmap(pmd_t pmd)
>> +{
>> +   return pte_devmap(pmd_pte(pmd));
>> +}
>> +
>> +static inline int pud_devmap(pud_t pud)
>> +{
>> +   return 0;
>> +}
>> +
>> +static inline int pgd_devmap(pgd_t pgd)
>> +{
>> +   return 0;
>> +}
>> +#endif
>> +
>>  #endif /* _ASM_RISCV_PGTABLE_64_H */
>> diff --git a/arch/riscv/include/asm/pgtable-bits.h 
>> b/arch/riscv/include/asm/pgtable-bits.h
>> index 179bd4afece4..a8f5205cea54 100644
>> --- a/arch/riscv/include/asm/pgtable-bits.h
>> +++ b/arch/riscv/include/asm/pgtable-bits.h
>> @@ -19,6 +19,7 @@
>>  #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
>>
>>  #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
>> +#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
>>  #define _PAGE_TABLE _PAGE_PRESENT
>>
>>  /*
>> diff --git a/arch/riscv/include/asm/pgtable.h 
>> b/arch/riscv/include/asm/pgtable.h
>> index 7933f493db71..02fadc276064 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte)
>> return pte_val(pte) & _PAGE_SPECIAL;
>>  }
>>
>> +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
>> +static inline int pte_devmap(pte_t pte)
>> +{
>> +   return pte_val(pte) & _PAGE_DEVMAP;
>> +}
>> +#endif
>
> Not sure you need the #ifdef here.

W/o it 32b builds break (!defined(CONFIG_ARCH_HAS_PTE_DEVMAP) will have
a default implementation).. Maybe it's cleaner just to use that instead?

>> +
>>  /* static inline pte_t pte_rdprotect(pte_t pte) */
>>
>>  static inline pte_t pte_wrprotect(pte_t pte)
>> @@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
>> return __pte(pte_val(pte) | _PAGE_SPECIAL);
>>  }
>>
>> +static inline pte_t pte_mkdevmap(pte_t pte)
>> +{
>> +   return __pte(pte_val(pte) | _PAGE_DEVMAP);
>> +}
>> +
>>  static inline pte_t pte_mkhuge(pte_t pte)
>>  {
>> return pte;
>> @@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
>> return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
>>  }
>>
>> +static inline pmd_t pmd_mkdevmap(pmd_t pmd)
>> +{
>> +   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
>> +}
>> +
>>  static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>> pmd_t *pmdp, pmd_t pmd)
>>  {
>> --
>> 2.40.1
>>
>
> Otherwise, you can add:
>
> Reviewed-by: Alexandre Ghiti 

Thank you!


Björn



[PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

ZONE_DEVICE pages need DEVMAP PTEs support to function
(ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
in the PTE for DEVMAP mark, add the corresponding helpers, and enable
ARCH_HAS_PTE_DEVMAP for riscv64.

Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig|  1 +
 arch/riscv/include/asm/pgtable-64.h   | 20 
 arch/riscv/include/asm/pgtable-bits.h |  1 +
 arch/riscv/include/asm/pgtable.h  | 17 +
 4 files changed, 39 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 2724dc2af29f..0b74698c63c7 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -36,6 +36,7 @@ config RISCV
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
+   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
diff --git a/arch/riscv/include/asm/pgtable-64.h 
b/arch/riscv/include/asm/pgtable-64.h
index 221a5c1ee287..c67a9bbfd010 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd)
 #define p4d_offset p4d_offset
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pte_devmap(pte_t pte);
+static inline pte_t pmd_pte(pmd_t pmd);
+
+static inline int pmd_devmap(pmd_t pmd)
+{
+   return pte_devmap(pmd_pte(pmd));
+}
+
+static inline int pud_devmap(pud_t pud)
+{
+   return 0;
+}
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+   return 0;
+}
+#endif
+
 #endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable-bits.h 
b/arch/riscv/include/asm/pgtable-bits.h
index 179bd4afece4..a8f5205cea54 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,6 +19,7 @@
 #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
 
 #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
+#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
 #define _PAGE_TABLE _PAGE_PRESENT
 
 /*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7933f493db71..02fadc276064 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte)
return pte_val(pte) & _PAGE_SPECIAL;
 }
 
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static inline int pte_devmap(pte_t pte)
+{
+   return pte_val(pte) & _PAGE_DEVMAP;
+}
+#endif
+
 /* static inline pte_t pte_rdprotect(pte_t pte) */
 
 static inline pte_t pte_wrprotect(pte_t pte)
@@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL);
 }
 
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+   return __pte(pte_val(pte) | _PAGE_DEVMAP);
+}
+
 static inline pte_t pte_mkhuge(pte_t pte)
 {
return pte;
@@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 }
 
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
+}
+
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
-- 
2.40.1




[PATCH v3 8/9] virtio-mem: Enable virtio-mem for RISC-V

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

Now that RISC-V has memory hotplugging support, virtio-mem can be used
on the platform.

Acked-by: David Hildenbrand 
Signed-off-by: Björn Töpel 
---
 drivers/virtio/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c17193544268..4e5cebf1b82a 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -122,7 +122,7 @@ config VIRTIO_BALLOON
 
 config VIRTIO_MEM
tristate "Virtio mem driver"
-   depends on X86_64 || ARM64
+   depends on X86_64 || ARM64 || RISCV
depends on VIRTIO
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
-- 
2.40.1




[PATCH v3 7/9] riscv: Enable memory hotplugging for RISC-V

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for
RISC-V.

Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index fe5281398543..2724dc2af29f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -16,6 +16,8 @@ config RISCV
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_DMA_DEFAULT_COHERENT
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+   select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP && 64BIT && MMU
+   select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
-- 
2.40.1




[PATCH v3 6/9] riscv: mm: Take memory hotplug read-lock during kernel page table dump

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

During memory hot remove, the ptdump functionality can end up touching
stale data. Avoid any potential crashes (or worse), by holding the
memory hotplug read-lock while traversing the page table.

This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm:
Hold memory hotplug lock while walking for kernel page table dump").

Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/ptdump.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 1289cc6d3700..9d5f657a251b 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -370,7 +371,9 @@ bool ptdump_check_wx(void)
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
+   get_online_mems();
ptdump_walk(m, m->private);
+   put_online_mems();
 
return 0;
 }
-- 
2.40.1




[PATCH v3 5/9] riscv: mm: Add memory hotplugging support

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and for calling into the generic memory hotplugging code
  via __add_pages(), which adds the corresponding struct page entries
  and updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used: the
  vmem_altmap. This means that when the backing pages are cleared,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can map.
  Used by the MHP internals for sanity checks.

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 261 +++
 1 file changed, 261 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..6693b742bf2f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
}
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pagetable(struct page *page, int order)
+{
+   unsigned int nr_pages = 1 << order;
+
+   /*
+* vmemmap/direct page tables can be reserved, if added at
+* boot.
+*/
+   if (PageReserved(page)) {
+   __ClearPageReserved(page);
+   while (nr_pages--)
+   free_reserved_page(page++);
+   return;
+   }
+
+   free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   free_pagetable(pmd_page(*pmd), 0);
+   pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   free_pagetable(pud_page(*pud), 0);
+   pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   free_pagetable(p4d_page(*p4d), 0);
+   p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+  struct vmem_altmap *altmap)
+{
+   if (altmap)
+   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+   else
+   free_pagetable(page, get_order(size));
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+   unsigned long next;
+   pte_t *ptep, pte;
+
+   for (; addr < end; addr = next) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   ptep = pte_base + pte_index(addr);
+   pte = READ_ONCE(*ptep);
+
+   if (!pte_present(*ptep))
+   continue;
+
+   pte_clear(&init_mm, addr, ptep);
+   if (is_vmemmap)
+   free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+   }
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+					 bool is_vmemmap, struct vmem_altmap *altmap)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmdp, pmd;
+
+   for (; addr < end; addr = next) {
+   next = pmd_addr_end(addr, end);
+   pmdp = pmd_base + pmd_index(addr);
+   pmd = READ_ONCE(*pmdp);
+
+   if (!pmd_present(pmd))
+   continue;
+
+   if (pmd_leaf(pmd)) {
+   pmd_clear(pmdp);
+   if (is_vmemmap)
+   free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, 
altmap)

[PATCH v3 4/9] riscv: mm: Refactor create_linear_mapping_range() for memory hot add

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

Add a parameter to the direct map setup function, so it can be used in
arch_add_memory() later.

Reviewed-by: Alexandre Ghiti 
Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index c969427eab88..6f72b0b2b854 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1227,7 +1227,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 }
 
 static void __meminit create_linear_mapping_range(phys_addr_t start, 
phys_addr_t end,
- uintptr_t fixed_map_size)
+ uintptr_t fixed_map_size, 
const pgprot_t *pgprot)
 {
phys_addr_t pa;
uintptr_t va, map_size;
@@ -1238,7 +1238,7 @@ static void __meminit 
create_linear_mapping_range(phys_addr_t start, phys_addr_t
best_map_size(pa, va, end - pa);
 
create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
-  pgprot_from_va(va));
+  pgprot ? *pgprot : pgprot_from_va(va));
}
 }
 
@@ -1282,22 +1282,19 @@ static void __init 
create_linear_mapping_page_table(void)
if (end >= __pa(PAGE_OFFSET) + memory_limit)
end = __pa(PAGE_OFFSET) + memory_limit;
 
-   create_linear_mapping_range(start, end, 0);
+   create_linear_mapping_range(start, end, 0, NULL);
}
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0);
-   create_linear_mapping_range(krodata_start,
-   krodata_start + krodata_size, 0);
+   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, 
NULL);
+   create_linear_mapping_range(krodata_start, krodata_start + 
krodata_size, 0, NULL);
 
memblock_clear_nomap(ktext_start,  ktext_size);
memblock_clear_nomap(krodata_start, krodata_size);
 #endif
 
 #ifdef CONFIG_KFENCE
-   create_linear_mapping_range(kfence_pool,
-   kfence_pool + KFENCE_POOL_SIZE,
-   PAGE_SIZE);
+   create_linear_mapping_range(kfence_pool, kfence_pool + 
KFENCE_POOL_SIZE, PAGE_SIZE, NULL);
 
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
 #endif
-- 
2.40.1




[PATCH v3 3/9] riscv: mm: Change attribute from __init to __meminit for page functions

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

Prepare for memory hotplugging support by changing from __init to
__meminit for the page table functions that are used by the upcoming
architecture-specific callbacks.

Changing the __init attribute to __meminit prevents the functions from
being removed after init. The __meminit attribute makes sure the
functions are kept in the kernel text post init, but only if memory
hotplugging is enabled for the build.

Reviewed-by: David Hildenbrand 
Reviewed-by: Oscar Salvador 
Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/mmu.h |  4 +--
 arch/riscv/include/asm/pgtable.h |  2 +-
 arch/riscv/mm/init.c | 56 ++--
 3 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 947fd60f9051..c9e03e9da3dc 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -31,8 +31,8 @@ typedef struct {
 #define cntx2asid(cntx)((cntx) & SATP_ASID_MASK)
 #define cntx2version(cntx) ((cntx) & ~SATP_ASID_MASK)
 
-void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa,
-  phys_addr_t sz, pgprot_t prot);
+void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, 
phys_addr_t sz,
+ pgprot_t prot);
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_MMU_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 58fd7b70b903..7933f493db71 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -162,7 +162,7 @@ struct pt_alloc_ops {
 #endif
 };
 
-extern struct pt_alloc_ops pt_ops __initdata;
+extern struct pt_alloc_ops pt_ops __meminitdata;
 
 #ifdef CONFIG_MMU
 /* Number of PGD entries that a user-mode program can use */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index c98010ede810..c969427eab88 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -295,7 +295,7 @@ static void __init setup_bootmem(void)
 }
 
 #ifdef CONFIG_MMU
-struct pt_alloc_ops pt_ops __initdata;
+struct pt_alloc_ops pt_ops __meminitdata;
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
@@ -357,7 +357,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t 
pa)
return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
 }
 
-static inline pte_t *__init get_pte_virt_late(phys_addr_t pa)
+static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa)
 {
return (pte_t *) __va(pa);
 }
@@ -376,7 +376,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t 
va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pte_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va)
return __pa((pte_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pte_mapping(pte_t *ptep,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, 
phys_addr_t pa, phys_addr_t sz,
+pgprot_t prot)
 {
uintptr_t pte_idx = pte_index(va);
 
@@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa)
return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
 }
 
-static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
+static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa)
 {
return (pmd_t *) __va(pa);
 }
@@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pmd_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pmd_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -465,9 +464,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va)
return __pa((pmd_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pmd_mapping(pmd_t *pmdp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pmd_mapping(pmd_t *pmdp,
+uintptr_t va, phys_addr_t pa,
+phys_addr_t sz, pgprot_t prot)
 {
pte_t *ptep;
phys_addr_t pte_phys;
@@ -503,7 +502,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
 }
 
-static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+static pud_t *__meminit get_pud_virt_late(phys_addr_t pa)
 {
return (pud_t *)__va(p

[PATCH v3 2/9] riscv: mm: Pre-allocate vmemmap/direct map PGD entries

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
all userland page tables, which means that if the PGD level table is
changed, other page tables has to be updated as well.

Instead of having the PGD changes ripple out to all tables, the
synchronization can be avoided by pre-allocating the PGD entries/pages
at boot, avoiding the synchronization all together.

This is currently done for the bpf/modules, and vmalloc PGD regions.
Extend this scheme for the PGD regions touched by memory hotplugging.

Prepare the RISC-V port for memory hotplug by pre-allocating
vmemmap/direct map entries at the PGD level. This will waste roughly
128 4K pages when memory hotplugging is enabled in the kernel
configuration.

Reviewed-by: Alexandre Ghiti 
Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/kasan.h | 4 ++--
 arch/riscv/mm/init.c   | 7 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index 0b85e363e778..e6a0071bdb56 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -6,8 +6,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_KASAN
-
 /*
  * The following comment was copied from arm64:
  * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
@@ -34,6 +32,8 @@
  */
 #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & 
PGDIR_MASK)
 #define KASAN_SHADOW_END   MODULES_LOWEST_VADDR
+
+#ifdef CONFIG_KASAN
 #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 void kasan_init(void);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index b66f846e7634..c98010ede810 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -27,6 +27,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1488,10 +1489,16 @@ static void __init preallocate_pgd_pages_range(unsigned 
long start, unsigned lon
panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
 }
 
+#define PAGE_END KASAN_SHADOW_START
+
 void __init pgtable_cache_init(void)
 {
preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
if (IS_ENABLED(CONFIG_MODULES))
preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, 
"bpf/modules");
+   if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+   preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, 
"vmemmap");
+   preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct 
map");
+   }
 }
 #endif
-- 
2.40.1




[PATCH v3 1/9] riscv: mm: Properly forward vmemmap_populate() altmap parameter

2024-05-21 Thread Björn Töpel
From: Björn Töpel 

Make sure that the altmap parameter is properly passed on to
vmemmap_populate_hugepages().

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 2574f6a3b0e7..b66f846e7634 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1434,7 +1434,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
 * memory hotplug, we are not able to update all the page tables with
 * the new PMDs.
 */
-   return vmemmap_populate_hugepages(start, end, node, NULL);
+   return vmemmap_populate_hugepages(start, end, node, altmap);
 }
 #endif
 
-- 
2.40.1




[PATCH v3 0/9] riscv: Memory Hot(Un)Plug support

2024-05-21 Thread Björn Töpel
From: Björn Töpel 


Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port


Introduction


To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory
hot(un)plug allows for increasing and decreasing the size of physical
memory available to a machine at runtime."

This series adds memory hot(un)plugging, and ZONE_DEVICE support for
the RISC-V Linux port.

MM configuration


RISC-V MM has the following configuration:

 * Memory blocks are 128M, analogous to x86-64. It uses PMD
   ("hugepage") vmemmaps. From that follows that 2M (PMD) worth of
   vmemmap spans 32768 pages á 4K, which gets us 128M (see the quick
   arithmetic below).

 * The pageblock size is the minimum virtio_mem size, and on
   RISC-V it's 2M (2^9 * 4K).
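
(A quick back-of-the-envelope for the 128M figure, assuming the usual
64-byte struct page on 64-bit -- an assumption, not spelled out above:

  2M (one PMD of vmemmap) / 64 B per struct page = 32768 struct pages
  32768 struct pages * 4K per described page     = 128M of memory

i.e. one PMD-mapped chunk of vmemmap describes one 128M memory block.)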

Implementation
==============

The PGD table on RISC-V is shared/copied with all processes. To
avoid doing page table synchronization, the first patch (patch 1)
pre-allocates the PGD entries for vmemmap/direct map. By doing that,
the init_mm PGD will be fixed at kernel init, and synchronization can
be avoided altogether.

The following two patches (patch 2-3) do some preparation, followed
by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem
are enabled (patch 6-7), and finally ZONE_DEVICE support is added
(patch 8).

MHP and locking
===============

TL;DR: The MHP does not step on any toes, except for ptdump.
Additional locking is required for ptdump.

Long version: For v2 I spent some time digging into init_mm
synchronization/update. Here are my findings, and I'd love them to be
corrected if incorrect.

It's been a gnarly path...

The `init_mm` structure is a special mm (perhaps not a "real" one).
It's a "lazy context" that tracks kernel page table resources, e.g.,
the kernel page table (swapper_pg_dir), a kernel page_table_lock (more
about the usage below), mmap_lock, and such.

`init_mm` does not track/contain any VMAs. Having the `init_mm` is
convenient, so that the regular kernel page table walk/modify
functions can be used.

Now, `init_mm` being special means that the locking for kernel page
tables is special as well.

On RISC-V the PGD (top-level page table structure), similar to x86, is
shared (copied) with user processes. If the kernel PGD is modified, it
has to be synced to user-mode processes' PGDs. This is avoided by
pre-populating the PGD, so it'll be fixed from boot.

The in-kernel pgd regions are documented in
`Documentation/arch/riscv/vm-layout.rst`.

The distinct regions are:
 * vmemmap
 * vmalloc/ioremap space
 * direct mapping of all physical memory
 * kasan
 * modules, BPF
 * kernel

Memory hotplug is the process of adding/removing memory to/from the
kernel.

Adding is done in two phases:
 1. Add the memory to the kernel
 2. Online memory, making it available to the page allocator.

Step 1 is partially architecture dependent, and updates the init_mm
page table:
 * Update the direct map page tables. The direct map is a linear map,
   representing all physical memory: `virt = phys + PAGE_OFFSET`
 * Add a `struct page` for each added page of memory. Update the
   vmemmap (the virtual mapping to the `struct page`s), so we can easily
   transform a kernel virtual address to a `struct page *` address. A
   rough sketch of these two translations is included below.
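
For illustration, the two translations these regions provide could be
sketched like so (the helper names are made up for this sketch; the
kernel proper uses __va()/pfn_to_page(), and the riscv definitions
differ in the details):

	/* Direct map: linear translation, as described above. */
	static inline void *sketch_linear_va(phys_addr_t pa)
	{
		return (void *)(unsigned long)(pa + PAGE_OFFSET);
	}

	/* vmemmap: the struct page for a pfn sits at a fixed offset in the
	 * virtually mapped struct page array (the generic SPARSEMEM_VMEMMAP
	 * pfn_to_page() layout). */
	static inline struct page *sketch_pfn_to_page(unsigned long pfn)
	{
		return vmemmap + pfn;
	}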

From an MHP perspective, there are two regions of the PGD that are
updated:
 * vmemmap
 * direct mapping of all physical memory

The `struct mm_struct` has a couple of locks in play:
 * `spinlock_t page_table_lock` protects the page table, and some
counters
 * `struct rw_semaphore mmap_lock` protects an mm's VMAs

Note again that `init_mm` does not contain any VMAs, but still uses
the mmap_lock in some places.

The `page_table_lock` was originally used to protect all page
tables, but more recently a split page table lock has been introduced.
The split lock has a per-table lock for the PTE and PMD tables. If
split lock is disabled, all tables are guarded by
`mm->page_table_lock` (for user processes). Split page table locks are
not used for init_mm.

MHP operations are typically synchronized using
`DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`.

Actors
------

The following non-MHP actors in the kernel traverse (read) and/or
modify the kernel PGD.

 * `ptdump`

   Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the
   `mmap_write_lock(init_mm)` taken.

   Observation: ptdump can race with MHP, and needs additional locking
   to avoid crashes/races.

 * `set_direct_*` / `arch/riscv/mm/pageattr.c`

   The `set_direct_*` functionality is used to "synchronize" the
   direct map to other kernel mappings, e.g. modules/kernel text. The
   direct map is using "as large huge table mappings as possible",
   which means that the `set_direct_*` might need to split the direct
   map.

  The `set_direct_*` functions operates with

Re: [PATCH v2 8/8] riscv: mm: Add support for ZONE_DEVICE

2024-05-15 Thread Björn Töpel
Björn Töpel  writes:

> From: Björn Töpel 
>
> ZONE_DEVICE pages need DEVMAP PTEs support to function
> (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
> in the PTE for DEVMAP mark, add the corresponding helpers, and enable
> ARCH_HAS_PTE_DEVMAP for riscv64.

...and this patch has rv32 build issues. Will fix as well.



Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support

2024-05-14 Thread Björn Töpel
Oscar Salvador  writes:

> On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +   struct vmem_altmap *altmap)
>> +{
>> +if (altmap)
>> +vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +else
>> +free_pages((unsigned long)page_address(page), get_order(size));
>
> David already pointed this out, but can check
> arch/x86/mm/init_64.c:free_pagetable().
>
> You will see that we have to do some magic for bootmem memory (DIMMs
> which were not hotplugged but already present)

Thank you!

>> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
>> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct 
>> vmem_altmap *altmap)
>> +{
>> +remove_pgd_mapping(start, end, true, altmap);
>> +}
>> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>
> I will comment on the patch where you add support for hotplug and the
> dependency, but on a track in LSFMM today, we decided that most likely
> we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
> environments.
> So, since you are adding this plain fresh, please consider to tight the
> hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
> As a bonus, you will only have to maintain one flavour of functions.

Ah, yeah, I saw it mentioned on the LSF/MM/BPF topics. Less is
definitely more -- I'll make the next version depend on
SPARSEMEM_VMEMMAP.


Björn



Re: [PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions

2024-05-14 Thread Björn Töpel
Oscar Salvador  writes:

> On Tue, May 14, 2024 at 04:04:40PM +0200, Björn Töpel wrote:
>> From: Björn Töpel 
>> 
>> Prepare for memory hotplugging support by changing from __init to
>> __meminit for the page table functions that are used by the upcoming
>> architecture specific callbacks.
>> 
>> Changing the __init attribute to __meminit, avoids that the functions
>> are removed after init. The __meminit attribute makes sure the
>> functions are kept in the kernel text post init, but only if memory
>> hotplugging is enabled for the build.
>> 
>> Also, make sure that the altmap parameter is properly passed on to
>> vmemmap_populate_hugepages().
>> 
>> Signed-off-by: Björn Töpel 
>
> Reviewed-by: Oscar Salvador 
>
>> +static void __meminit create_linear_mapping_range(phys_addr_t start, 
>> phys_addr_t end,
>> +  uintptr_t fixed_map_size)
>>  {
>>  phys_addr_t pa;
>>  uintptr_t va, map_size;
>> @@ -1435,7 +1429,7 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node,
>>   * memory hotplug, we are not able to update all the page tables with
>>   * the new PMDs.
>>   */
>> -return vmemmap_populate_hugepages(start, end, node, NULL);
>> +return vmemmap_populate_hugepages(start, end, node, altmap);
>
> I would have put this into a separate patch.

Thanks for the review, Oscar!

I'll split this up (also suggested by Alex!).


Cheers,
Björn



Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support

2024-05-14 Thread Björn Töpel
Alexandre Ghiti  writes:

> On Tue, May 14, 2024 at 4:05 PM Björn Töpel  wrote:

>> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params 
>> *params)
>> +{
>> +   int ret;
>> +
>> +   create_linear_mapping_range(start, start + size, 0, &params->pgprot);
>> +   flush_tlb_all();
>> +   ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, 
>> params);
>> +   if (ret) {
>> +   remove_linear_mapping(start, size);
>> +   return ret;
>> +   }
>> +
>
> You need to flush the TLB here too since __add_pages() populates the
> page table with the new vmemmap mapping (only because riscv allows to
> cache invalid entries, I'll adapt this in my next version of Svvptc
> support).
>
>> +   max_pfn = PFN_UP(start + size);
>> +   max_low_pfn = max_pfn;
>> +   return 0;
>> +}
>> +
>> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap 
>> *altmap)
>> +{
>> +   __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
>> +   remove_linear_mapping(start, size);
>
> You need to flush the TLB here too.

I'll address all of the above in the next version. Thanks for reviewing
the series!
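
For reference, a minimal sketch of what arch_add_memory() could look
like with the extra flushes folded in (based on the hunk quoted above;
not the final v3 code):

	int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
	{
		int ret;

		create_linear_mapping_range(start, start + size, 0, &params->pgprot);
		flush_tlb_all();

		ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
		if (ret) {
			remove_linear_mapping(start, size);
			flush_tlb_all();	/* flush the torn-down linear map */
			return ret;
		}
		flush_tlb_all();		/* __add_pages() populated the vmemmap */

		max_pfn = PFN_UP(start + size);
		max_low_pfn = max_pfn;
		return 0;
	}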


Björn



Re: [PATCH v2 6/8] riscv: Enable memory hotplugging for RISC-V

2024-05-14 Thread Björn Töpel
Alexandre Ghiti  writes:

> On Tue, May 14, 2024 at 4:05 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for
>> RISC-V.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/Kconfig | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 6bec1bce6586..b9398b64bb69 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -16,6 +16,8 @@ config RISCV
>> select ACPI_REDUCED_HARDWARE_ONLY if ACPI
>> select ARCH_DMA_DEFAULT_COHERENT
>> select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>> +   select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && 64BIT && MMU
>
> I think this should be SPARSEMEM_VMEMMAP here.

Hmm, care to elaborate? I thought that was optional.
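
(For context, the suggested tightening would presumably look something
like the line below -- i.e. requiring vmemmap-backed sparsemem rather
than plain SPARSEMEM; a hypothetical hunk, not something posted here:)

	select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP && 64BIT && MMU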



Re: [PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions

2024-05-14 Thread Björn Töpel
Alexandre Ghiti  writes:

> On Tue, May 14, 2024 at 4:05 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> Prepare for memory hotplugging support by changing from __init to
>> __meminit for the page table functions that are used by the upcoming
>> architecture specific callbacks.
>>
>> Changing the __init attribute to __meminit, avoids that the functions
>> are removed after init. The __meminit attribute makes sure the
>> functions are kept in the kernel text post init, but only if memory
>> hotplugging is enabled for the build.
>>
>> Also, make sure that the altmap parameter is properly passed on to
>> vmemmap_populate_hugepages().
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/include/asm/mmu.h |  4 +--
>>  arch/riscv/include/asm/pgtable.h |  2 +-
>>  arch/riscv/mm/init.c | 58 ++--
>>  3 files changed, 29 insertions(+), 35 deletions(-)
>>
>> diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
>> index 60be458e94da..c09c3c79f496 100644
>> --- a/arch/riscv/include/asm/mmu.h
>> +++ b/arch/riscv/include/asm/mmu.h
>> @@ -28,8 +28,8 @@ typedef struct {
>>  #endif
>>  } mm_context_t;
>>
>> -void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa,
>> -  phys_addr_t sz, pgprot_t prot);
>> +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t 
>> pa, phys_addr_t sz,
>> + pgprot_t prot);
>>  #endif /* __ASSEMBLY__ */
>>
>>  #endif /* _ASM_RISCV_MMU_H */
>> diff --git a/arch/riscv/include/asm/pgtable.h 
>> b/arch/riscv/include/asm/pgtable.h
>> index 58fd7b70b903..7933f493db71 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -162,7 +162,7 @@ struct pt_alloc_ops {
>>  #endif
>>  };
>>
>> -extern struct pt_alloc_ops pt_ops __initdata;
>> +extern struct pt_alloc_ops pt_ops __meminitdata;
>>
>>  #ifdef CONFIG_MMU
>>  /* Number of PGD entries that a user-mode program can use */
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 5b8cdfafb52a..c969427eab88 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -295,7 +295,7 @@ static void __init setup_bootmem(void)
>>  }
>>
>>  #ifdef CONFIG_MMU
>> -struct pt_alloc_ops pt_ops __initdata;
>> +struct pt_alloc_ops pt_ops __meminitdata;
>>
>>  pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
>>  pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
>> @@ -357,7 +357,7 @@ static inline pte_t *__init 
>> get_pte_virt_fixmap(phys_addr_t pa)
>> return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
>>  }
>>
>> -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa)
>> +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa)
>>  {
>> return (pte_t *) __va(pa);
>>  }
>> @@ -376,7 +376,7 @@ static inline phys_addr_t __init 
>> alloc_pte_fixmap(uintptr_t va)
>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>>  }
>>
>> -static phys_addr_t __init alloc_pte_late(uintptr_t va)
>> +static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
>>  {
>> struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 
>> 0);
>>
>> @@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va)
>> return __pa((pte_t *)ptdesc_address(ptdesc));
>>  }
>>
>> -static void __init create_pte_mapping(pte_t *ptep,
>> - uintptr_t va, phys_addr_t pa,
>> - phys_addr_t sz, pgprot_t prot)
>> +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, 
>> phys_addr_t pa, phys_addr_t sz,
>> +pgprot_t prot)
>>  {
>> uintptr_t pte_idx = pte_index(va);
>>
>> @@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa)
>> return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
>>  }
>>
>> -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
>> +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa)
>>  {
>> return (pmd_t *) __va(pa);
>>  }
>> @@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>>  }
>>
>> -static phys_addr_t __init alloc_pmd_late(uintptr_t va)

Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support

2024-05-14 Thread Björn Töpel
David Hildenbrand  writes:

> On 14.05.24 16:04, Björn Töpel wrote:
>> From: Björn Töpel 
>> 
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>> 
>>   arch_add_memory()
>>This callback is responsible for adding the physical memory into the
>>direct map, and call into the memory hotplugging generic code via
>>__add_pages() that adds the corresponding struct page entries, and
>>updates the vmemmap mapping.
>> 
>>   arch_remove_memory()
>>This is the inverse of the callback above.
>> 
>>   vmemmap_free()
>>This function tears down the vmemmap mappings (if
>>CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>>backing vmemmap pages. Note that for persistent memory, an
>>alternative allocator for the backing pages can be used; The
>>vmem_altmap. This means that when the backing pages are cleared,
>>extra care is needed so that the correct deallocation method is
>>used.
>> 
>>   arch_get_mappable_range()
>>This functions returns the PA range that the direct map can map.
>>Used by the MHP internals for sanity checks.
>> 
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>> 
>> Signed-off-by: Björn Töpel 
>> ---
>>   arch/riscv/mm/init.c | 242 +++
>>   1 file changed, 242 insertions(+)
>> 
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..7f0b921a3d3a 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
>>  }
>>   }
>>   #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +pte_t *pte;
>> +int i;
>> +
>> +for (i = 0; i < PTRS_PER_PTE; i++) {
>> +pte = pte_start + i;
>> +if (!pte_none(*pte))
>> +return;
>> +}
>> +
>> +free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
>> +pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +pmd_t *pmd;
>> +int i;
>> +
>> +for (i = 0; i < PTRS_PER_PMD; i++) {
>> +pmd = pmd_start + i;
>> +if (!pmd_none(*pmd))
>> +return;
>> +}
>> +
>> +free_pages((unsigned long)page_address(pud_page(*pud)), 0);
>> +pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> +pud_t *pud;
>> +int i;
>> +
>> +for (i = 0; i < PTRS_PER_PUD; i++) {
>> +pud = pud_start + i;
>> +if (!pud_none(*pud))
>> +return;
>> +}
>> +
>> +free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
>> +p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +   struct vmem_altmap *altmap)
>> +{
>> +if (altmap)
>> +vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +else
>> +free_pages((unsigned long)page_address(page), get_order(size));
>
> If you unplug a DIMM that was added during boot (can happen on x86-64, 
> can it happen on riscv?), free_pages() would not be sufficient. You'd be 
> freeing a PG_reserved page that has to be freed differently.

I'd say if it can happen on x86-64, it probably can on RISC-V. I'll look
into this for the next spin!
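
Presumably something along the lines of x86's free_pagetable() will be
needed; a rough sketch only, assuming free_reserved_page() is the right
way to hand back bootmem-backed pages (as x86 does):

	static void __meminit free_vmemmap_storage(struct page *page, size_t size,
						   struct vmem_altmap *altmap)
	{
		int order = get_order(size);

		if (altmap) {
			vmem_altmap_free(altmap, size >> PAGE_SHIFT);
			return;
		}

		if (PageReserved(page)) {
			/* vmemmap backed by bootmem, i.e. memory present at boot. */
			unsigned int nr_pages = 1 << order;

			while (nr_pages--)
				free_reserved_page(page++);
			return;
		}

		free_pages((unsigned long)page_address(page), order);
	}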

Thanks for spending time on the series!


Cheers,
Björn



Re: [PATCH v2 1/8] riscv: mm: Pre-allocate vmemmap/direct map PGD entries

2024-05-14 Thread Björn Töpel
Alexandre Ghiti  writes:

> Hi Björn,
>
> On Tue, May 14, 2024 at 4:05 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
>> all userland page tables, which means that if the PGD level table is
>> changed, other page tables has to be updated as well.
>>
>> Instead of having the PGD changes ripple out to all tables, the
>> synchronization can be avoided by pre-allocating the PGD entries/pages
>> at boot, avoiding the synchronization all together.
>>
>> This is currently done for the bpf/modules, and vmalloc PGD regions.
>> Extend this scheme for the PGD regions touched by memory hotplugging.
>>
>> Prepare the RISC-V port for memory hotplug by pre-allocate
>> vmemmap/direct map entries at the PGD level. This will roughly waste
>> ~128 worth of 4K pages when memory hotplugging is enabled in the
>> kernel configuration.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/include/asm/kasan.h | 4 ++--
>>  arch/riscv/mm/init.c   | 7 +++
>>  2 files changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
>> index 0b85e363e778..e6a0071bdb56 100644
>> --- a/arch/riscv/include/asm/kasan.h
>> +++ b/arch/riscv/include/asm/kasan.h
>> @@ -6,8 +6,6 @@
>>
>>  #ifndef __ASSEMBLY__
>>
>> -#ifdef CONFIG_KASAN
>> -
>>  /*
>>   * The following comment was copied from arm64:
>>   * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
>> @@ -34,6 +32,8 @@
>>   */
>>  #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & 
>> PGDIR_MASK)
>>  #define KASAN_SHADOW_END   MODULES_LOWEST_VADDR
>> +
>> +#ifdef CONFIG_KASAN
>>  #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>>
>>  void kasan_init(void);
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 2574f6a3b0e7..5b8cdfafb52a 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -27,6 +27,7 @@
>>
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -1488,10 +1489,16 @@ static void __init 
>> preallocate_pgd_pages_range(unsigned long start, unsigned lon
>> panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
>>  }
>>
>> +#define PAGE_END KASAN_SHADOW_START
>> +
>>  void __init pgtable_cache_init(void)
>>  {
>> preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
>> if (IS_ENABLED(CONFIG_MODULES))
>> preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, 
>> "bpf/modules");
>> +   if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
>> +   preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, 
>> "vmemmap");
>> +   preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct 
>> map");
>> +   }
>>  }
>>  #endif
>> --
>> 2.40.1
>>
>
> As you asked, with
> https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexgh...@rivosinc.com/T/#u,
> you will be able to remove the usage of KASAN_SHADOW_START.

Very nice -- consistency! I'll need to respin, so I'll clean this up for
the next version.

> But anyhow, you can add:
>
> Reviewed-by: Alexandre Ghiti 


Thank you!
Björn



[PATCH v2 8/8] riscv: mm: Add support for ZONE_DEVICE

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

ZONE_DEVICE pages need DEVMAP PTEs support to function
(ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
in the PTE for DEVMAP mark, add the corresponding helpers, and enable
ARCH_HAS_PTE_DEVMAP for riscv64.

Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig|  1 +
 arch/riscv/include/asm/pgtable-64.h   | 20 
 arch/riscv/include/asm/pgtable-bits.h |  1 +
 arch/riscv/include/asm/pgtable.h  | 15 +++
 4 files changed, 37 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b9398b64bb69..6d426afdd904 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -36,6 +36,7 @@ config RISCV
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
+   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
diff --git a/arch/riscv/include/asm/pgtable-64.h 
b/arch/riscv/include/asm/pgtable-64.h
index 221a5c1ee287..c67a9bbfd010 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd)
 #define p4d_offset p4d_offset
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pte_devmap(pte_t pte);
+static inline pte_t pmd_pte(pmd_t pmd);
+
+static inline int pmd_devmap(pmd_t pmd)
+{
+   return pte_devmap(pmd_pte(pmd));
+}
+
+static inline int pud_devmap(pud_t pud)
+{
+   return 0;
+}
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+   return 0;
+}
+#endif
+
 #endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable-bits.h 
b/arch/riscv/include/asm/pgtable-bits.h
index 179bd4afece4..a8f5205cea54 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,6 +19,7 @@
 #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
 
 #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
+#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
 #define _PAGE_TABLE _PAGE_PRESENT
 
 /*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7933f493db71..216de1db3cd0 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -387,6 +387,11 @@ static inline int pte_special(pte_t pte)
return pte_val(pte) & _PAGE_SPECIAL;
 }
 
+static inline int pte_devmap(pte_t pte)
+{
+   return pte_val(pte) & _PAGE_DEVMAP;
+}
+
 /* static inline pte_t pte_rdprotect(pte_t pte) */
 
 static inline pte_t pte_wrprotect(pte_t pte)
@@ -428,6 +433,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL);
 }
 
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+   return __pte(pte_val(pte) | _PAGE_DEVMAP);
+}
+
 static inline pte_t pte_mkhuge(pte_t pte)
 {
return pte;
@@ -711,6 +721,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 }
 
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
+}
+
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
-- 
2.40.1




[PATCH v2 6/8] riscv: Enable memory hotplugging for RISC-V

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for
RISC-V.

Signed-off-by: Björn Töpel 
---
 arch/riscv/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 6bec1bce6586..b9398b64bb69 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -16,6 +16,8 @@ config RISCV
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_DMA_DEFAULT_COHERENT
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
+   select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && 64BIT && MMU
+   select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
-- 
2.40.1




[PATCH v2 7/8] virtio-mem: Enable virtio-mem for RISC-V

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

Now that RISC-V has memory hotplugging support, virtio-mem can be used
on the platform.

Signed-off-by: Björn Töpel 
---
 drivers/virtio/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c17193544268..4e5cebf1b82a 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -122,7 +122,7 @@ config VIRTIO_BALLOON
 
 config VIRTIO_MEM
tristate "Virtio mem driver"
-   depends on X86_64 || ARM64
+   depends on X86_64 || ARM64 || RISCV
depends on VIRTIO
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
-- 
2.40.1




[PATCH v2 4/8] riscv: mm: Add memory hotplugging support

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

For an architecture to support memory hotplugging, a couple of
callbacks need to be implemented:

 arch_add_memory()
  This callback is responsible for adding the physical memory into the
  direct map, and calls into the generic memory hotplugging code via
  __add_pages(), which adds the corresponding struct page entries and
  updates the vmemmap mapping.

 arch_remove_memory()
  This is the inverse of the callback above.

 vmemmap_free()
  This function tears down the vmemmap mappings (if
  CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
  backing vmemmap pages. Note that for persistent memory, an
  alternative allocator for the backing pages can be used; The
  vmem_altmap. This means that when the backing pages are cleared,
  extra care is needed so that the correct deallocation method is
  used.

 arch_get_mappable_range()
  This function returns the PA range that the direct map can map.
  Used by the MHP internals for sanity checks.

The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.
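
In other words, simplified (and assuming remove_linear_mapping() is a
thin wrapper that calls remove_pgd_mapping() with is_vmemmap=false):

	void __ref vmemmap_free(unsigned long start, unsigned long end,
				struct vmem_altmap *altmap)
	{
		/* Tear down the mapping and free the backing vmemmap pages. */
		remove_pgd_mapping(start, end, true, altmap);
	}

	void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
	{
		__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
		/* Tear down the direct map; the backing pages stay around. */
		remove_linear_mapping(start, size);
	}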

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 242 +++
 1 file changed, 242 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..7f0b921a3d3a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
}
 }
 #endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
+   pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   free_pages((unsigned long)page_address(pud_page(*pud)), 0);
+   pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
+   p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+  struct vmem_altmap *altmap)
+{
+   if (altmap)
+   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+   else
+   free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, 
unsigned long end,
+bool is_vmemmap, struct vmem_altmap 
*altmap)
+{
+   unsigned long next;
+   pte_t *ptep, pte;
+
+   for (; addr < end; addr = next) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   ptep = pte_base + pte_index(addr);
+   pte = READ_ONCE(*ptep);
+
+   if (!pte_present(*ptep))
+   continue;
+
+   pte_clear(&init_mm, addr, ptep);
+   if (is_vmemmap)
+   free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+   }
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, 
unsigned long end,
+bool is_vmemmap, struct vmem_altmap 
*altmap)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmdp, pmd;
+
+   for (; addr < end; addr = next) {
+   next = pmd_addr_end(addr, end);
+   pmdp = pmd_base + pmd_index(addr);
+   pmd = READ_ONCE(*pmdp);
+
+   if (!pmd_present(pmd))
+   continue;
+
+   if (pmd_leaf(pmd)) {
+   pmd_clear(pmdp);
+   if (is_vmemmap)
+   free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, 
altmap);
+   continue;
+   }
+
+   pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
+   remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+   free_pte_table(pte_base, pmdp);
+   }
+}
+
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, 
unsigned long end,
+bool is_vmemmap,

[PATCH v2 5/8] riscv: mm: Take memory hotplug read-lock during kernel page table dump

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

During memory hot remove, the ptdump functionality can end up touching
stale data. Avoid any potential crashes (or worse), by holding the
memory hotplug read-lock while traversing the page table.

This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm:
Hold memory hotplug lock while walking for kernel page table dump").

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/ptdump.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 1289cc6d3700..9d5f657a251b 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -370,7 +371,9 @@ bool ptdump_check_wx(void)
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
+   get_online_mems();
ptdump_walk(m, m->private);
+   put_online_mems();
 
return 0;
 }
-- 
2.40.1




[PATCH v2 3/8] riscv: mm: Refactor create_linear_mapping_range() for memory hot add

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

Add a parameter to the direct map setup function, so it can be used in
arch_add_memory() later.

Signed-off-by: Björn Töpel 
---
 arch/riscv/mm/init.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index c969427eab88..6f72b0b2b854 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1227,7 +1227,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 }
 
 static void __meminit create_linear_mapping_range(phys_addr_t start, 
phys_addr_t end,
- uintptr_t fixed_map_size)
+ uintptr_t fixed_map_size, 
const pgprot_t *pgprot)
 {
phys_addr_t pa;
uintptr_t va, map_size;
@@ -1238,7 +1238,7 @@ static void __meminit 
create_linear_mapping_range(phys_addr_t start, phys_addr_t
best_map_size(pa, va, end - pa);
 
create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
-  pgprot_from_va(va));
+  pgprot ? *pgprot : pgprot_from_va(va));
}
 }
 
@@ -1282,22 +1282,19 @@ static void __init 
create_linear_mapping_page_table(void)
if (end >= __pa(PAGE_OFFSET) + memory_limit)
end = __pa(PAGE_OFFSET) + memory_limit;
 
-   create_linear_mapping_range(start, end, 0);
+   create_linear_mapping_range(start, end, 0, NULL);
}
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
-   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0);
-   create_linear_mapping_range(krodata_start,
-   krodata_start + krodata_size, 0);
+   create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, 
NULL);
+   create_linear_mapping_range(krodata_start, krodata_start + 
krodata_size, 0, NULL);
 
memblock_clear_nomap(ktext_start,  ktext_size);
memblock_clear_nomap(krodata_start, krodata_size);
 #endif
 
 #ifdef CONFIG_KFENCE
-   create_linear_mapping_range(kfence_pool,
-   kfence_pool + KFENCE_POOL_SIZE,
-   PAGE_SIZE);
+   create_linear_mapping_range(kfence_pool, kfence_pool + 
KFENCE_POOL_SIZE, PAGE_SIZE, NULL);
 
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
 #endif
-- 
2.40.1




[PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

Prepare for memory hotplugging support by changing from __init to
__meminit for the page table functions that are used by the upcoming
architecture specific callbacks.

Changing the __init attribute to __meminit prevents the functions
from being removed after init. The __meminit attribute makes sure the
functions are kept in the kernel text post-init, but only if memory
hotplugging is enabled for the build.

Also, make sure that the altmap parameter is properly passed on to
vmemmap_populate_hugepages().

Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/mmu.h |  4 +--
 arch/riscv/include/asm/pgtable.h |  2 +-
 arch/riscv/mm/init.c | 58 ++--
 3 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 60be458e94da..c09c3c79f496 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -28,8 +28,8 @@ typedef struct {
 #endif
 } mm_context_t;
 
-void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa,
-  phys_addr_t sz, pgprot_t prot);
+void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, 
phys_addr_t sz,
+ pgprot_t prot);
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_MMU_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 58fd7b70b903..7933f493db71 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -162,7 +162,7 @@ struct pt_alloc_ops {
 #endif
 };
 
-extern struct pt_alloc_ops pt_ops __initdata;
+extern struct pt_alloc_ops pt_ops __meminitdata;
 
 #ifdef CONFIG_MMU
 /* Number of PGD entries that a user-mode program can use */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 5b8cdfafb52a..c969427eab88 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -295,7 +295,7 @@ static void __init setup_bootmem(void)
 }
 
 #ifdef CONFIG_MMU
-struct pt_alloc_ops pt_ops __initdata;
+struct pt_alloc_ops pt_ops __meminitdata;
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
@@ -357,7 +357,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t 
pa)
return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
 }
 
-static inline pte_t *__init get_pte_virt_late(phys_addr_t pa)
+static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa)
 {
return (pte_t *) __va(pa);
 }
@@ -376,7 +376,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t 
va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pte_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va)
return __pa((pte_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pte_mapping(pte_t *ptep,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, 
phys_addr_t pa, phys_addr_t sz,
+pgprot_t prot)
 {
uintptr_t pte_idx = pte_index(va);
 
@@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa)
return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
 }
 
-static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
+static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa)
 {
return (pmd_t *) __va(pa);
 }
@@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 }
 
-static phys_addr_t __init alloc_pmd_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pmd_late(uintptr_t va)
 {
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
 
@@ -465,9 +464,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va)
return __pa((pmd_t *)ptdesc_address(ptdesc));
 }
 
-static void __init create_pmd_mapping(pmd_t *pmdp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pmd_mapping(pmd_t *pmdp,
+uintptr_t va, phys_addr_t pa,
+phys_addr_t sz, pgprot_t prot)
 {
pte_t *ptep;
phys_addr_t pte_phys;
@@ -503,7 +502,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
 }
 
-static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+static pud_t *__meminit get_pud_virt_late(phys_addr_t pa)
 {
return (pud_t *)__va(pa);
 }
@@ -521,7 +520,7 @@ static phys_addr_t __init alloc_pud_fixmap(uint

[PATCH v2 1/8] riscv: mm: Pre-allocate vmemmap/direct map PGD entries

2024-05-14 Thread Björn Töpel
From: Björn Töpel 

The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
all userland page tables, which means that if the PGD level table is
changed, other page tables have to be updated as well.

Instead of having the PGD changes ripple out to all tables,
pre-allocate the PGD entries/pages at boot, avoiding the
synchronization altogether.

This is currently done for the bpf/modules, and vmalloc PGD regions.
Extend this scheme for the PGD regions touched by memory hotplugging.

Prepare the RISC-V port for memory hotplug by pre-allocating
vmemmap/direct map entries at the PGD level. This will waste roughly
128 4K pages when memory hotplugging is enabled in the kernel
configuration.

Signed-off-by: Björn Töpel 
---
 arch/riscv/include/asm/kasan.h | 4 ++--
 arch/riscv/mm/init.c   | 7 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index 0b85e363e778..e6a0071bdb56 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -6,8 +6,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_KASAN
-
 /*
  * The following comment was copied from arm64:
  * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
@@ -34,6 +32,8 @@
  */
 #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & 
PGDIR_MASK)
 #define KASAN_SHADOW_END   MODULES_LOWEST_VADDR
+
+#ifdef CONFIG_KASAN
 #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
 
 void kasan_init(void);
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 2574f6a3b0e7..5b8cdfafb52a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -27,6 +27,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1488,10 +1489,16 @@ static void __init preallocate_pgd_pages_range(unsigned 
long start, unsigned lon
panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
 }
 
+#define PAGE_END KASAN_SHADOW_START
+
 void __init pgtable_cache_init(void)
 {
preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
if (IS_ENABLED(CONFIG_MODULES))
preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, 
"bpf/modules");
+   if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+   preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, 
"vmemmap");
+   preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct 
map");
+   }
 }
 #endif
-- 
2.40.1




[PATCH v2 0/8] riscv: Memory Hot(Un)Plug support

2024-05-14 Thread Björn Töpel
From: Björn Töpel 


Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port


Introduction


To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory
hot(un)plug allows for increasing and decreasing the size of physical
memory available to a machine at runtime."

This series adds memory hot(un)plugging, and ZONE_DEVICE support for
the RISC-V Linux port.

I'm sending this series while LSF/MM/BPF is on-going, and with some
luck some MM person can review the series while zoning out on a talk.
;-)

MM configuration


RISC-V MM has the following configuration:

 * Memory blocks are 128M, analogous to x86-64. It uses PMD
   ("hugepage") vmemmaps. From that follows that 2M (PMD) worth of
   vmemmap spans 32768 pages á 4K which gets us 128M.

 * The pageblock size is the minimum virtio_mem size, and on
   RISC-V it's 2M (2^9 * 4K).

Implementation
==============

The PGD table on RISC-V is shared/copied with all processes. To
avoid doing page table synchronization, the first patch (patch 1)
pre-allocates the PGD entries for vmemmap/direct map. By doing that,
the init_mm PGD will be fixed at kernel init, and synchronization can
be avoided altogether.

The following two patches (patch 2-3) do some preparation, followed
by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem
are enabled (patch 6-7), and finally ZONE_DEVICE support is added
(patch 8).

MHP and locking
===============

TL;DR: The MHP does not step on any toes, except for ptdump.
Additional locking is required for ptdump.

Long version: For v2 I spent some time digging into init_mm
synchronization/update. Here are my findings, and I'd love them to be
corrected if incorrect.

It's been a gnarly path...

The `init_mm` structure is a special mm (perhaps not a "real" one).
It's a "lazy context" that tracks kernel page table resources, e.g.,
the kernel page table (swapper_pg_dir), a kernel page_table_lock (more
about the usage below), mmap_lock, and such.

`init_mm` does not track/contain any VMAs. Having the `init_mm` is
convenient, so that the regular kernel page table walk/modify
functions can be used.

Now, `init_mm` being special means that the locking for kernel page
tables is special as well.

On RISC-V the PGD (top-level page table structure), similar to x86, is
shared (copied) with user processes. If the kernel PGD is modified, it
has to be synced to user-mode processes' PGDs. This is avoided by
pre-populating the PGD, so it'll be fixed from boot.

The in-kernel pgd regions are documented in
`Documentation/arch/riscv/vm-layout.rst`.

The distinct regions are:
 * vmemmap
 * vmalloc/ioremap space
 * direct mapping of all physical memory
 * kasan
 * modules, BPF
 * kernel

Memory hotplug is the process of adding/removing memory to/from the
kernel.

Adding is done in two phases:
 1. Add the memory to the kernel
 2. Online memory, making it available to the page allocator.

Step 1 is partially architecture dependent, and updates the init_mm
page table:
 * Update the direct map page tables. The direct map is a linear map,
   representing all physical memory: `virt = phys + PAGE_OFFSET`
 * Add a `struct page` for each added page of memory. Update the
   vmemmap (the virtual mapping to the `struct page`s), so we can easily
   transform a kernel virtual address to a `struct page *` address.

From an MHP perspective, there are two regions of the PGD that are
updated:
 * vmemmap
 * direct mapping of all physical memory

The `struct mm_struct` has a couple of locks in play:
 * `spinlock_t page_table_lock` protects the page table, and some
counters
 * `struct rw_semaphore mmap_lock` protects an mm's VMAs

Note again that `init_mm` does not contain any VMAs, but still uses
the mmap_lock in some places.

The `page_table_lock` was originally used to protect all page
tables, but more recently a split page table lock has been introduced.
The split lock has a per-table lock for the PTE and PMD tables. If
split lock is disabled, all tables are guarded by
`mm->page_table_lock` (for user processes). Split page table locks are
not used for init_mm.

MHP operations are typically synchronized using
`DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`.

Actors
------

The following non-MHP actors in the kernel traverse (read) and/or
modify the kernel PGD.

 * `ptdump`

   Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the
   `mmap_write_lock(init_mm)` taken.

   Observation: ptdump can race with MHP, and needs additional locking
   to avoid crashes/races.

 * `set_direct_*` / `arch/riscv/mm/pageattr.c`

   The `set_direct_*` functionality is used to "synchronize" the
   direct map to other kernel mappings, e.g. modules/kernel text. The
   direct map is using "as large huge table m

Re: [PATCH] ftrace: riscv: move from REGS to ARGS

2024-04-24 Thread Björn Töpel
Puranjay Mohan  writes:

> This commit replaces riscv's support for FTRACE_WITH_REGS with support
> for FTRACE_WITH_ARGS. This is required for the ongoing effort to stop
> relying on stop_machine() for RISCV's implementation of ftrace.
>
> The main relevant benefit that this change will bring for the above
> use-case is that now we don't have separate ftrace_caller and
> ftrace_regs_caller trampolines. This will allow the callsite to call
> ftrace_caller by modifying a single instruction. Now the callsite can
> do something similar to:
>
> When not tracing:| When tracing:
>
> func:  func:
>   auipc t0, ftrace_caller_topauipc t0, ftrace_caller_top
>   nop  <==>  jalr  t0, ftrace_caller_bottom
>   [...]  [...]
>
> The above assumes that we are dropping the support of calling a direct
> trampoline from the callsite. We need to drop this as the callsite can't
> change the target address to call, it can only enable/disable a call to
> a preset target (ftrace_caller in the above diagram). We can later optimize
> this by calling an intermediate dispatcher trampoline before ftrace_caller.
>
> Currently, ftrace_regs_caller saves all CPU registers in the format of
> struct pt_regs and allows the tracer to modify them. We don't need to
> save all of the CPU registers because at function entry only a subset of
> pt_regs is live:
>
> |--+--+-|
> | Register | ABI Name | Description |
> |--+--+-|
> | x1   | ra   | Return address for traced function  |
> | x2   | sp   | Stack pointer   |
> | x5   | t0   | Return address for ftrace_caller trampoline |
> | x8   | s0/fp| Frame pointer   |
> | x10-11   | a0-1 | Function arguments/return values|
> | x12-17   | a2-7 | Function arguments  |
> |--+--+-|
>
> See RISCV calling convention[1] for the above table.
>
> Saving just the live registers decreases the amount of stack space
> required from 288 Bytes to 112 Bytes.
>
> Basic testing was done with this on the VisionFive 2 development board.
>
> Note:
>   - Moving from REGS to ARGS will mean that RISCV will stop supporting
> KPROBES_ON_FTRACE as it requires full pt_regs to be saved.
>   - KPROBES_ON_FTRACE will be supplanted by FPROBES see [2].

Puranjay,

Apologies for the slow response.

Restating from the RFC: this change is a prerequisite for moving towards
a "FTRACE_WITH_CALL_OPS" model on riscv, and finally getting rid of the
stop_machine() text patching.

I've tested this on QEMU, and on the VisionFive2. No regressions
(FTRACE_STARTUP_*, and kselftest), other than that KPROBES_ON_FTRACE no
longer works. (Which will be addressed by [2]).

Palmer, my preference is that this should go on for-next, rather than
being part of the upcoming text patching support (worked on by Andy),
but really up to you.

Tested-by: Björn Töpel 
Reviewed-by: Björn Töpel 



Re: [RFC PATCH] ftrace: riscv: move from REGS to ARGS

2024-04-02 Thread Björn Töpel
Puranjay Mohan  writes:

> This commit replaces riscv's support for FTRACE_WITH_REGS with support
> for FTRACE_WITH_ARGS. This is required for the ongoing effort to stop
> relying on stop_machine() for RISCV's implementation of ftrace.
>
> The main relevant benefit that this change will bring for the above
> use-case is that now we don't have separate ftrace_caller and
> ftrace_regs_caller trampolines.  This will allow the callsite to call
> ftrace_caller by modifying a single instruction. Now the callsite can
> do something similar to:
>
> When not tracing:| When tracing:
>
> func:  func:
>   auipc t0, ftrace_caller_topauipc t0, ftrace_caller_top
>   nop  <==>  jalr  t0, ftrace_caller_bottom
>   [...]  [...]
>
> The above assumes that we are dropping the support of calling a direct
> trampoline from the callsite. We need to drop this as the callsite can't
> change the target address to call, it can only enable/disable a call to
> a preset target (ftrace_caller in the above diagram).
>
> Currently, ftrace_regs_caller saves all CPU registers in the format of
> struct pt_regs and allows the tracer to modify them. We don't need to
> save all of the CPU registers because at function entry only a subset of
> pt_regs is live:
>
> |--+--+-|
> | Register | ABI Name | Description |
> |--+--+-|
> | x1   | ra   | Return address for traced function  |
> | x2   | sp   | Stack pointer   |
> | x5   | t0   | Return address for ftrace_caller trampoline |
> | x8   | s0/fp| Frame pointer   |
> | x10-11   | a0-1 | Function arguments/return values|
> | x12-17   | a2-7 | Function arguments  |
> |--+--+-|
>
> See RISCV calling convention[1] for the above table.
>
> Saving just the live registers decreases the amount of stack space
> required from 288 Bytes to 112 Bytes.
>
> Basic testing was done with this on the VisionFive 2 development board.
>
> Note:
>   - Moving from REGS to ARGS will mean that RISCV will stop supporting
> KPROBES_ON_FTRACE as it requires full pt_regs to be saved.

...and FPROBES, but Masami is (still?) working on a series to address
that [1].

My perspective: this follows the work Mark et al have done for
arm64, and it makes sense for RISC-V as well. I'm in favor of having
this change as part of the bigger call_ops/text patching change for RISC-V.

[...]

> diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
> index 1276d7d9ca8b..1e95bf4ded6c 100644
> --- a/arch/riscv/include/asm/ftrace.h
> +++ b/arch/riscv/include/asm/ftrace.h
> @@ -124,20 +124,87 @@ struct dyn_ftrace;
>  int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
>  #define ftrace_init_nop ftrace_init_nop
>  
> -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
> +#define arch_ftrace_get_regs(regs) NULL
>  struct ftrace_ops;
> -struct ftrace_regs;
> +struct ftrace_regs {
> + unsigned long epc;
> + unsigned long ra;
> + unsigned long sp;
> + unsigned long s0;
> + unsigned long t1;
> + union {
> + unsigned long args[8];
> + struct {
> + unsigned long a0;
> + unsigned long a1;
> + unsigned long a2;
> + unsigned long a3;
> + unsigned long a4;
> + unsigned long a5;
> + unsigned long a6;
> + unsigned long a7;
> + };
> + };
> +};
> +
> +static __always_inline unsigned long
> +ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs)
> +{
> + return fregs->epc;
> +}
> +
> +static __always_inline void
> +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
> + unsigned long pc)
> +{
> + fregs->epc = pc;
> +}
> +
> +static __always_inline unsigned long
> +ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs)
> +{
> + return fregs->sp;
> +}
> +
> +static __always_inline unsigned long
> +ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n)
> +{
> + if (n < 8)
> + return fregs->args[n];
> + return 0;
> +}
> +
> +static __always_inline unsigned long
> +ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
> +{
> + return fregs->a0;
> +}
> +
> +static __always_inline void
> +ftrace_regs_set_return_value(struct ftrace_regs *fregs,
> +  unsigned long ret)
> +{
> + fregs->a0 = ret;
> +}
> +
> +static __always_inline void
> 

Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-21 Thread Björn Töpel
Andy Chiu  writes:

> On Thu, Mar 21, 2024 at 4:48 PM Björn Töpel  wrote:
>>
>> Andy,
>>
>> Pulling out the A option:
>>
>> >> > A) Use auipc/jalr, only patch jalr to take us to a common
>> >> >dispatcher/trampoline
>> >> >
>> >> >  |  # probably on a data cache-line != func 
>> >> > .text to avoid ping-pong
>> >> >  | ...
>> >> >  | func:
>> >> >  |   ...make sure ra isn't messed up...
>> >> >  |   aupic
>> >> >  |   nop <=> jalr # Text patch point -> common_dispatch
>> >> >  |   ACTUAL_FUNC
>> >> >  |
>> >> >  | common_dispatch:
>> >> >  |   load  based on ra
>> >> >  |   jalr
>> >> >  |   ...
>> >> >
>> >> > The auipc is never touched, and will be overhead. Also, we need a mv to
>> >> > store ra in a scratch register as well -- like Arm. We'll have two insn
>> >> > per-caller overhead for a disabled caller.
>> >
>> > My patch series takes a similar "in-function dispatch" approach. A
>> > difference is that the  is
>> > embedded within each function entry. I'd like to have it moved to a
>> > run-time allocated array to reduce total text size.
>>
>> This is what arm64 has as well. It's a 8B + 1-2 dirt cheap movish like
>> instructions (save ra, prepare jump with auipc). I think that's a
>> reasonable overhead.
>>
>> > Another difference is that my series changes the first instruction to
>> > "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
>> > architecture guarantees the atomicity of the first instruction, then
>> > we are safe. For example, we are safe if the first instruction could
>> > only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
>> > always valid, we can fix "mv + jalr" down so we don't have to
>> > play with the exception handler trick. The guarantee from arch would
>> > require ziccif (in RVA22) though, but I think it is the same for us
>> > (unless with stop_machine). For ziccif, I would rather call that out
>> > during boot than blindly assume.
>>
>> I'm maybe biased, but I'd prefer the A) over your version with the
>> unconditional jump. A) has the overhead of two, I'd say, free
>> instructions (again "Meten is Weten!" ;-)).
>
> Yes, I'd also prefer A for less overall patch size. We can also
> optimize the overhead with a direct jump if that makes sense. Though,
> we need to sort out a way to map functions to corresponding
> trampolines. A direct way I could image is CALL_OPS'ish patching
> style, if the ftrace destination has to be patched in a per-function
> manner. For example:
>
> 
> func_symbol:
> auipc t0, common_dispatch:high <=> j actual_func:
> jalr t0, common_dispatch:low(t0)
>
> common_dispatch:
> load t1, index + dispatch-list
> ld t1, 0(t1)
> jr t1

Yup, exactly like that (but I'd put the actual target ptr in there,
instead of an additional indirection. Exactly what Mark does for arm64).

When we enter the common_dispatch, the ptr 
would be -12(t0).
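
(To make that concrete -- a sketch only, with the 12-byte offset and the
helper name assumed, not actual kernel code:)

/* common_dispatch side: t0 holds the return address written by the
 * patched jalr, so the per-call-site pointer sits 12 bytes before it. */
static inline unsigned long callsite_target(unsigned long t0)
{
	return *(unsigned long *)(t0 - 12);
}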

As for patching auipc or jalr, I guess we need to measure what's best.
My knee-jerk would be always auipc is better than jump -- but let's
measure. ;-)


Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-21 Thread Björn Töpel
Mark,

Mark Rutland  writes:

>> A) Use auipc/jalr, only patch jalr to take us to a common
>>dispatcher/trampoline
>>   
>>  |  # probably on a data cache-line != func .text 
>> to avoid ping-pong
>>  | ...
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   aupic
>>  |   nop <=> jalr # Text patch point -> common_dispatch
>>  |   ACTUAL_FUNC
>>  | 
>>  | common_dispatch:
>>  |   load  based on ra
>>  |   jalr
>>  |   ...
>> 
>> The auipc is never touched, and will be overhead. Also, we need a mv to
>> store ra in a scratch register as well -- like Arm. We'll have two insn
>> per-caller overhead for a disabled caller.
>
> Is the AUIPC a significant overhead? IIUC that's similar to Arm's ADRP, and 
> I'd
> have expected that to be pretty cheap.

No, reg-to-reg moves are dirt cheap in my book.

> IIUC your JALR can choose which destination register to store the return
> address in, and if so, you could leave the original ra untouched (and recover
> that in the common trampoline). Have I misunderstood that?
>
> Maybe that doesn't play nicely with something else?

No, you're right, we can link to another register, and shave off an
instruction. I can imagine that some implementations prefer x1/x5 for
branch prediction reasons, but that's something that we can measure on. 

So, 1-2 movs + nop are unconditionally executed on the disabled case.
(1-2 depending on the ra save/jalr reg strategy).

>> B) Use jal, which can only take us +/-1M, and requires multiple
>>dispatchers (and tracking which one to use, and properly distribute
>>them. Ick.)
>> 
>>  |  # probably on a data cache-line != func .text 
>> to avoid ping-pong
>>  | ...
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
>>  |   ACTUAL_FUNC
>>  | 
>>  | within_1M_to_func_dispatch:
>>  |   load  based on ra
>>  |   jalr
>> 
>> C) Use jal, which can only take us +/-1M, and use a per-function
>>trampoline requires multiple dispatchers (and tracking which one to
>>use). Blows up text size A LOT.
>> 
>>  |  # somewhere, but probably on a different 
>> cacheline than the .text to avoid ping-ongs
>>  | ...
>>  | per_func_dispatch
>>  |   load  based on ra
>>  |   jalr
>>  | func:
>>  |   ...make sure ra isn't messed up...
>>  |   nop <=> jal # Text patch point -> per_func_dispatch
>>  |   ACTUAL_FUNC
>
> Beware that with option (C) you'll need to handle that in your unwinder for
> RELIABLE_STACKTRACE. If you don't have a symbol for per_func_dispatch (or
> func_trace_target_data_8B), PC values within per_func_dispatch would be
> symbolized as the prior function/data.

Good point (but I don't like C much...)!

>> It's a bit sad that we'll always have to have a dispatcher/trampoline,
>> but it's still better than stop_machine(). (And we'll need a fencei IPI
>> as well, but only one. ;-))
>> 
>> Today, I'm leaning towards A (which is what Mark suggested, and also
>> Robbin).. Any other options?
>
> Assuming my understanding of JALR above is correct, I reckon A is the nicest
> option out of A/B/C.

Yes! +1!


Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-21 Thread Björn Töpel
Andy,

Pulling out the A option:

>> > A) Use auipc/jalr, only patch jalr to take us to a common
>> >dispatcher/trampoline
>> >
>> >  |  # probably on a data cache-line != func 
>> > .text to avoid ping-pong
>> >  | ...
>> >  | func:
>> >  |   ...make sure ra isn't messed up...
>> >  |   aupic
>> >  |   nop <=> jalr # Text patch point -> common_dispatch
>> >  |   ACTUAL_FUNC
>> >  |
>> >  | common_dispatch:
>> >  |   load  based on ra
>> >  |   jalr
>> >  |   ...
>> >
>> > The auipc is never touched, and will be overhead. Also, we need a mv to
>> > store ra in a scratch register as well -- like Arm. We'll have two insn
>> > per-caller overhead for a disabled caller.
>
> My patch series takes a similar "in-function dispatch" approach. A
> difference is that the  is
> embedded within each function entry. I'd like to have it moved to a
> run-time allocated array to reduce total text size.

This is what arm64 has as well. It's 8B + 1-2 dirt cheap mov-ish
instructions (save ra, prepare jump with auipc). I think that's a
reasonable overhead.

> Another difference is that my series changes the first instruction to
> "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the
> architecture guarantees the atomicity of the first instruction, then
> we are safe. For example, we are safe if the first instruction could
> only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is
> always valid, we can fix "mv + jalr" down so we don't have to
> play with the exception handler trick. The guarantee from arch would
> require ziccif (in RVA22) though, but I think it is the same for us
> (unless with stop_machine). For ziccif, I would rather call that out
> during boot than blindly assume.

I'm maybe biased, but I'd prefer the A) over your version with the
unconditional jump. A) has the overhead of two, I'd say, free
instructions (again "Meten is Weten!" ;-)).

> However, one thing I am not very sure is: do we need a destination
> address in a "per-function" manner? It seems like most of the time the
> destination address can only be ftrace_call, or ftrace_regs_call. If
> the number of destination addresses is very few, then we could
> potentially reduce the size of
> .

Yes, we do need a per-function manner. BPF, e.g., uses
dynamically/JIT:ed trampolines/targets.



Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-14 Thread Björn Töpel
Björn Töpel  writes:

> Puranjay Mohan  writes:
>
>> Björn Töpel  writes:
>>
>>>
>>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
>>> trampolines changes quite a bit.
>>>
>>> The more I look at the pains of patching two instruction ("split
>>> immediates"), the better "patch data" + one insn patching look.
>>
>> I was looking at how dynamic trampolines would be implemented for RISC-V.
>>
>> With CALL-OPS we need to patch the auipc+jalr at function entry only, the
>> ops pointer above the function can be patched atomically.
>>
>> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump
>> to the trampoline and then another auipc+jalr pair to jump from trampoline to
>> ops->func. When the ops->func is modified, we would need to update the
>> auipc+jalr at in the trampoline.
>>
>> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines?
>
> Yeah. Honestly, we need to figure out the patching story prior
> choosing the path, so let's start there.
>
> After reading Mark's reply, and discussing with OpenJDK folks (who does
> the most crazy text patching on all platforms), having to patch multiple
> instructions (where the address materialization is split over multiple
> instructions) is a no-go. It's just a too big can of worms. So, if we
> can only patch one insn, it's CALL_OPS.
>
> A couple of options (in addition to Andy's), and all require a
> per-function landing address ala CALL_OPS) tweaking what Mark is doing
> on Arm (given the poor branch range).
>
> ...and maybe we'll get RISC-V rainbows/unicorns in the future getting
> better reach (full 64b! ;-)).
>
> A) Use auipc/jalr, only patch jalr to take us to a common
>dispatcher/trampoline
>   
>  |  # probably on a data cache-line != func .text 
> to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   aupic
>  |   nop <=> jalr # Text patch point -> common_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | common_dispatch:
>  |   load  based on ra
>  |   jalr
>  |   ...
>
> The auipc is never touched, and will be overhead. Also, we need a mv to
> store ra in a scratch register as well -- like Arm. We'll have two insn
> per-caller overhead for a disabled caller.
>
> B) Use jal, which can only take us +/-1M, and requires multiple
>dispatchers (and tracking which one to use, and properly distribute
>them. Ick.)
>
>  |  # probably on a data cache-line != func .text 
> to avoid ping-pong
>  | ...
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
>  |   ACTUAL_FUNC
>  | 
>  | within_1M_to_func_dispatch:
>  |   load  based on ra
>  |   jalr
>
> C) Use jal, which can only take us +/-1M, and use a per-function
>trampoline requires multiple dispatchers (and tracking which one to
>use). Blows up text size A LOT.
>
>  |  # somewhere, but probably on a different 
> cacheline than the .text to avoid ping-ongs
>  | ...
>  | per_func_dispatch
>  |   load  based on ra
>  |   jalr
>  | func:
>  |   ...make sure ra isn't messed up...
>  |   nop <=> jal # Text patch point -> per_func_dispatch
>  |   ACTUAL_FUNC

Brendan proposed yet another option, "in-function dispatch":

D) 
  |  # idk somewhere
  | ...
  | func:
  |mv tmp1, ra
  |auipc tmp2, 
  |mv tmp3, zero <=> ld tmp3, tmp2
  |nop <=> jalr ra, tmp3
  |ACTUAL_FUNC

There are 4 CMODX possibilities:
  mv, nop:   fully disabled, no problems
  mv, jalr:  We will jump to zero. We would need to have the inst
             page/access fault handler take care of this case. Especially
             if we align the instructions so that they can be patched
             together, being interrupted in the middle and taking this
             path will be rare.
  ld, nop:   no problems
  ld, jalr:  fully enabled, no problems

Patching is a 64b store/sd, and we only need a fence.i at the end, since
we can handle all 4 possibilities.
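
(Rough sketch of that store -- assuming the two patched slots are kept in
one naturally aligned 64-bit word; the helper name is made up:)

/* patch the <mv/ld, nop/jalr> pair with a single aligned 64-bit store */
static void patch_insn_pair(u64 *slot, u32 first, u32 second)
{
	u64 insns = ((u64)second << 32) | first;	/* RV64 little-endian */

	WRITE_ONCE(*slot, insns);
	flush_icache_all();	/* the single fence.i/IPI mentioned above */
}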

For the disabled case we'll have:
A) mv, auipc, nop
D) mv, auipc, mv, nop.

Puranjay, I've flipped. Let's go Mark's CALL_OPS together with a new
text patch mechanism w/o stop_machine().


Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-14 Thread Björn Töpel
Puranjay Mohan  writes:

> Björn Töpel  writes:
>
>>
>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic
>> trampolines changes quite a bit.
>>
>> The more I look at the pains of patching two instruction ("split
>> immediates"), the better "patch data" + one insn patching look.
>
> I was looking at how dynamic trampolines would be implemented for RISC-V.
>
> With CALL-OPS we need to patch the auipc+jalr at function entry only, the
> ops pointer above the function can be patched atomically.
>
> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump
> to the trampoline and then another auipc+jalr pair to jump from trampoline to
> ops->func. When the ops->func is modified, we would need to update the
> auipc+jalr at in the trampoline.
>
> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines?

Yeah. Honestly, we need to figure out the patching story prior to
choosing the path, so let's start there.

After reading Mark's reply, and discussing with OpenJDK folks (who do
the most crazy text patching on all platforms), having to patch multiple
instructions (where the address materialization is split over multiple
instructions) is a no-go. It's just too big a can of worms. So, if we
can only patch one insn, it's CALL_OPS.

A couple of options (in addition to Andy's), all requiring a
per-function landing address a la CALL_OPS, tweaking what Mark is doing
on Arm (given the poor branch range).

...and maybe we'll get RISC-V rainbows/unicorns in the future getting
better reach (full 64b! ;-)).

A) Use auipc/jalr, only patch jalr to take us to a common
   dispatcher/trampoline
  
 |  # probably on a data cache-line != func .text to 
avoid ping-pong
 | ...
 | func:
 |   ...make sure ra isn't messed up...
 |   auipc
 |   nop <=> jalr # Text patch point -> common_dispatch
 |   ACTUAL_FUNC
 | 
 | common_dispatch:
 |   load  based on ra
 |   jalr
 |   ...

The auipc is never touched, and will be overhead. Also, we need a mv to
store ra in a scratch register as well -- like Arm. We'll have two insn
per-caller overhead for a disabled caller.

B) Use jal, which can only take us +/-1M, and requires multiple
   dispatchers (and tracking which one to use, and properly distribute
   them. Ick.)

 |  # probably on a data cache-line != func .text to 
avoid ping-pong
 | ...
 | func:
 |   ...make sure ra isn't messed up...
 |   nop <=> jal # Text patch point -> within_1M_to_func_dispatch
 |   ACTUAL_FUNC
 | 
 | within_1M_to_func_dispatch:
 |   load  based on ra
 |   jalr

C) Use jal, which can only take us +/-1M, and use a per-function
   trampoline requires multiple dispatchers (and tracking which one to
   use). Blows up text size A LOT.

 |  # somewhere, but probably on a different 
cacheline than the .text to avoid ping-pongs
 | ...
 | per_func_dispatch
 |   load  based on ra
 |   jalr
 | func:
 |   ...make sure ra isn't messed up...
 |   nop <=> jal # Text patch point -> per_func_dispatch
 |   ACTUAL_FUNC

It's a bit sad that we'll always have to have a dispatcher/trampoline,
but it's still better than stop_machine(). (And we'll need a fence.i IPI
as well, but only one. ;-))

Today, I'm leaning towards A (which is what Mark suggested, and also
Robbin). Any other options?

[Now how do we implement OPTPROBES? I'm kidding. ;-)]


Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-13 Thread Björn Töpel
Mark Rutland  writes:

> Hi Bjorn
>
> (apologies, my corporate mail server has butchered your name here).

Ha! That's the price I have to pay for carrying double-umlauts
everywhere. Thanks for getting back with a really useful answer!

>> On Arm64, CALL_OPS makes it possible to implement direct calls, while
>> only patching one BL instruction -- nice!
>
> The key thing here isn't that we patch a single instruction (since we have ot
> patch the ops pointer too!); it's that we can safely patch either of the ops
> pointer or BL/NOP at any time while threads are concurrently executing.

...which indeed is a very nice property!

> If you have a multi-instruction sequence, then threads can be preempted
> mid-sequence, and it's very painful/complex to handle all of the races that
> entails.

Oh yes. RISC-V is currently using auipc/jalr with stop_machine(), and
also requires that preemption is off. Unusable, to put it bluntly.

> For example, if your callsites use a sequence:
>
>   AUIPC , 
>   JALR , ()
>
> Using stop_machine() won't allow you to patch that safely as some threads
> could be stuck mid-sequence, e.g.
>
>   AUIPC , 
>   [ preempted here ]
>   JALR , ()
>
> ... and you can't update the JALR to use a new funcptr immediate until those
> have completed the sequence.
>
> There are ways around that, but they're complicated and/or expensive, e.g.
>
> * Use a sequence of multiple patches, starting with replacing the JALR with an
>   exception-generating instruction with a fixup handler, which is sort-of what
>   x86 does with UD2. This may require multiple passes with
>   synchronize_rcu_tasks() to make sure all threads have seen the latest
>   instructions, and that cannot be done under stop_machine(), so if you need
>   stop_machine() for CMODx reasons, you may need to use that several times 
> with
>   intervening calls to synchronize_rcu_tasks().
>
> * Have the patching logic manually go over each thread and fix up the pt_regs
>   for the interrupted thread. This is pretty horrid since you could have 
> nested
>   exceptions and a task could have several pt_regs which might require
>   updating.

Yup, and both of these have rather unpleasant overhead.

> The CALL_OPS approach is a bit easier to deal with as we can patch the
> per-callsite pointer atomically, then we can (possibly) enable/disable the
> callsite's branch, then wait for threads to drain once. 
>
> As a heads-up, there are some latent/generic issues with DYNAMIC_FTRACE
> generally in this area (CALL_OPs happens to side-step those, but trampoline
> usage is currently affected):
>
>   https://lore.kernel.org/lkml/Zenx_Q0UiwMbSAdP@FVFF77S0Q05N/
>
> ... I'm looking into fixing that at the moment, and it looks like that's 
> likely
> to require some per-architecture changes.
>
>> On RISC-V we cannot use use the same ideas as Arm64 straight off,
>> because the range of jal (compare to BL) is simply too short (+/-1M).
>> So, on RISC-V we need to use a full auipc/jal pair (the text patching
>> story is another chapter, but let's leave that aside for now). Since we
>> have to patch multiple instructions, the cmodx situation doesn't really
>> improve with CALL_OPS.
>
> The branch range thing is annoying, but I think this boils down to the same
> problem as arm64 has with needing a "MOV , LR" instruction that we have 
> to
> patch in once at boot time. You could do the same and patch in the AUIPC once,
> e.g. have
>
> | NOP
> | NOP 
> | func:
> | AUIPC , 
> | JALR , () // patched with NOP
>
> ... which'd look very similar to arm64's sequence:
>
> | NOP
> | NOP
> | func:
> | MOV X9, LR
> | BL ftrace_caller // patched with NOP
>
> ... which I think means it *might* be better from a cmodx perspective?
>
>> Let's say that we continue building on your patch and implement direct
>> calls on CALL_OPS for RISC-V as well.
>> 
>> From Florent's commit message for direct calls:
>> 
>>   |There are a few cases to distinguish:
>>   |- If a direct call ops is the only one tracing a function:
>>   |  - If the direct called trampoline is within the reach of a BL
>>   |instruction
>>   | -> the ftrace patchsite jumps to the trampoline
>>   |  - Else
>>   | -> the ftrace patchsite jumps to the ftrace_caller trampoline 
>> which
>>   |reads the ops pointer in the patchsite and jumps to the direct
>>   |call address stored in the ops
>>   |- Else
>>   |  -> the ftrace patchsite jumps to the ftrace_caller trampoline and 
>> its
>>   | ops literal points to ftrace_list_ops so it iterates over all
>>   | registered ftrace ops, including the direct call ops and calls 
>> its
>>   | call_direct_funcs handler which stores the direct called
>>   | trampoline's address in the ftrace_regs and the ftrace_caller
>>   | trampoline will return to that address instead of returning to 
>> the
>>   | traced function
>> 

Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-08 Thread Björn Töpel
Puranjay Mohan  writes:

>> Now, I think a better approach for RISC-V would be implementing what x86
>> has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V.
>>
>> Thoughts?
>
> I am going to spin some patches for implementing
> arch_ftrace_update_trampoline() for
> RISC-V, then we can compare the two approaches and see which is
> better. But I agree
> that arch_ftrace_update_trampoline() is a better approach given that
> we can jump anywhere
> with auipc/jalr.

Yup, and the text size wont blow up.


Cheers,
Björn



Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-08 Thread Björn Töpel
Puranjay Mohan  writes:

> Hi Björn,
>
> On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel  wrote:
>>
>> Puranjay!
>>
>> Puranjay Mohan  writes:
>>
>> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
>> > This allows each ftrace callsite to provide an ftrace_ops to the common
>> > ftrace trampoline, allowing each callsite to invoke distinct tracer
>> > functions without the need to fall back to list processing or to
>> > allocate custom trampolines for each callsite. This significantly speeds
>> > up cases where multiple distinct trace functions are used and callsites
>> > are mostly traced by a single tracer.
>> >
>> > The idea and most of the implementation is taken from the ARM64's
>> > implementation of the same feature. The idea is to place a pointer to
>> > the ftrace_ops as a literal at a fixed offset from the function entry
>> > point, which can be recovered by the common ftrace trampoline.
>>
>> Not really a review, but some more background; Another rationale (on-top
>> of the improved per-call performance!) for CALL_OPS was to use it to
>> build ftrace direct call support (which BPF uses a lot!). Mark, please
>> correct me if I'm lying here!
>>
>> On Arm64, CALL_OPS makes it possible to implement direct calls, while
>> only patching one BL instruction -- nice!
>>
>> On RISC-V we cannot use use the same ideas as Arm64 straight off,
>> because the range of jal (compare to BL) is simply too short (+/-1M).
>> So, on RISC-V we need to use a full auipc/jal pair (the text patching
>> story is another chapter, but let's leave that aside for now). Since we
>> have to patch multiple instructions, the cmodx situation doesn't really
>> improve with CALL_OPS.
>>
>> Let's say that we continue building on your patch and implement direct
>> calls on CALL_OPS for RISC-V as well.
>>
>> From Florent's commit message for direct calls:
>>
>>   |There are a few cases to distinguish:
>>   |- If a direct call ops is the only one tracing a function:
>>   |  - If the direct called trampoline is within the reach of a BL
>>   |instruction
>>   | -> the ftrace patchsite jumps to the trampoline
>>   |  - Else
>>   | -> the ftrace patchsite jumps to the ftrace_caller trampoline 
>> which
>>   |reads the ops pointer in the patchsite and jumps to the direct
>>   |call address stored in the ops
>>   |- Else
>>   |  -> the ftrace patchsite jumps to the ftrace_caller trampoline and 
>> its
>>   | ops literal points to ftrace_list_ops so it iterates over all
>>   | registered ftrace ops, including the direct call ops and calls 
>> its
>>   | call_direct_funcs handler which stores the direct called
>>   | trampoline's address in the ftrace_regs and the ftrace_caller
>>   | trampoline will return to that address instead of returning to 
>> the
>>   | traced function
>>
>> On RISC-V, where auipc/jalr is used, the direct called trampoline would
>> always be reachable, and then first Else-clause would never be entered.
>> This means the the performance for direct calls would be the same as the
>> one we have today (i.e. no regression!).
>>
>> RISC-V does like x86 does (-ish) -- patch multiple instructions, long
>> reach.
>>
>> Arm64 uses CALL_OPS and patch one instruction BL.
>>
>> Now, with this background in mind, compared to what we have today,
>> CALL_OPS would give us (again assuming we're using it for direct calls):
>>
>> * Better performance for tracer per-call (faster ops lookup) GOOD
>
> ^ this was the only motivation for me to implement this patch.
>
> I don't think implementing direct calls over call ops is fruitful for
> RISC-V because once
> the auipc/jalr can be patched atomically, the direct call trampoline
> is always reachable.
> Solving the atomic text patching problem would be fun!! I am eager to
> see how it will be
> solved.

Given the upcoming Zjid spec, we'll soon be in a much better place where
we can reason about cmodx.

>> * Larger text size (function alignment + extra nops) BAD
>> * Same direct call performance NEUTRAL
>> * Same complicated text patching required NEUTRAL
>>
>> It would be interesting to see how the per-call performance would
>> improve on x86 with CALL_OPS! ;-)
>
> If I remember from Steven's talk, x86 uses dynamically allocated trampolines
> for per callsite tracers, would CALL_OPS provide better performance than that?

Probably not, and it was really a tongue-in-cheek comment -- nothing I
encourage you to do!

Now, I think a better approach for RISC-V would be implementing what x86
has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V.
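
For reference, the hook has this shape on x86 (the body below is only a
placeholder sketch, not an implementation):

/* called by ftrace when an ftrace_ops gets/updates its own trampoline;
 * a RISC-V version would allocate the trampoline and point it at
 * ops->func with an auipc/jalr pair. */
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
	/* allocate or re-patch the per-ops trampoline here */
}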

Thoughts?


Björn




Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS

2024-03-07 Thread Björn Töpel
Puranjay!

Puranjay Mohan  writes:

> This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
> This allows each ftrace callsite to provide an ftrace_ops to the common
> ftrace trampoline, allowing each callsite to invoke distinct tracer
> functions without the need to fall back to list processing or to
> allocate custom trampolines for each callsite. This significantly speeds
> up cases where multiple distinct trace functions are used and callsites
> are mostly traced by a single tracer.
>
> The idea and most of the implementation is taken from the ARM64's
> implementation of the same feature. The idea is to place a pointer to
> the ftrace_ops as a literal at a fixed offset from the function entry
> point, which can be recovered by the common ftrace trampoline.

Not really a review, but some more background; another rationale (on top
of the improved per-call performance!) for CALL_OPS was to use it to
build ftrace direct call support (which BPF uses a lot!). Mark, please
correct me if I'm lying here!

On Arm64, CALL_OPS makes it possible to implement direct calls, while
only patching one BL instruction -- nice!

On RISC-V we cannot use the same ideas as Arm64 straight off,
because the range of jal (compare to BL) is simply too short (+/-1M).
So, on RISC-V we need to use a full auipc/jal pair (the text patching
story is another chapter, but let's leave that aside for now). Since we
have to patch multiple instructions, the cmodx situation doesn't really
improve with CALL_OPS.

Let's say that we continue building on your patch and implement direct
calls on CALL_OPS for RISC-V as well.

From Florent's commit message for direct calls:

  |There are a few cases to distinguish:
  |- If a direct call ops is the only one tracing a function:
  |  - If the direct called trampoline is within the reach of a BL
  |instruction
  | -> the ftrace patchsite jumps to the trampoline
  |  - Else
  | -> the ftrace patchsite jumps to the ftrace_caller trampoline which
  |reads the ops pointer in the patchsite and jumps to the direct
  |call address stored in the ops
  |- Else
  |  -> the ftrace patchsite jumps to the ftrace_caller trampoline and its
  | ops literal points to ftrace_list_ops so it iterates over all
  | registered ftrace ops, including the direct call ops and calls its
  | call_direct_funcs handler which stores the direct called
  | trampoline's address in the ftrace_regs and the ftrace_caller
  | trampoline will return to that address instead of returning to the
  | traced function

On RISC-V, where auipc/jalr is used, the direct called trampoline would
always be reachable, and then the first Else-clause would never be entered.
This means the performance for direct calls would be the same as the
one we have today (i.e. no regression!).

RISC-V does like x86 does (-ish) -- patch multiple instructions, long
reach.

Arm64 uses CALL_OPS and patches one BL instruction.

Now, with this background in mind, compared to what we have today,
CALL_OPS would give us (again assuming we're using it for direct calls):

* Better performance for tracer per-call (faster ops lookup) GOOD
* Larger text size (function alignment + extra nops) BAD
* Same direct call performance NEUTRAL
* Same complicated text patching required NEUTRAL

It would be interesting to see how the per-call performance would
improve on x86 with CALL_OPS! ;-)

I'm trying to wrap my head around whether it makes sense to have it on
RISC-V, given that we're a bit different from Arm64. Does the scale tip
to the GOOD side?

Oh, and we really need to see performance numbers on real HW! I have a
VF2 that I could try this series on.


Björn



Re: [PATCH v3 2/2] riscv: Fix text patching when IPI are used

2024-03-05 Thread Björn Töpel
Conor Dooley  writes:

> On Tue, Mar 05, 2024 at 08:33:30AM +0530, Anup Patel wrote:
>> On Tue, Mar 5, 2024 at 1:54 AM Björn Töpel  wrote:
>> >
>> > Conor Dooley  writes:
>> >
>> > > On Thu, Feb 29, 2024 at 01:10:56PM +0100, Alexandre Ghiti wrote:
>> > >> For now, we use stop_machine() to patch the text and when we use IPIs 
>> > >> for
>> > >> remote icache flushes (which is emitted in patch_text_nosync()), the 
>> > >> system
>> > >> hangs.
>> > >>
>> > >> So instead, make sure every CPU executes the stop_machine() patching
>> > >> function and emit a local icache flush there.
>> > >>
>> > >> Co-developed-by: Björn Töpel 
>> > >> Signed-off-by: Björn Töpel 
>> > >> Signed-off-by: Alexandre Ghiti 
>> > >> Reviewed-by: Andrea Parri 
>> > >
>> > > What commit does this fix?
>> >
>> > Hmm. The bug is exposed when the AIA IPI are introduced, and used
>> > (instead of the firmware-based).
>> >
>> > I'm not sure this is something we'd like backported, but rather a
>> > prerequisite to AIA.
>> >
>> > @Anup @Alex WDYT?
>> >
>> 
>> The current text patching never considered IPIs being injected
>> directly in S-mode from hart to another so we are seeing this
>> issue now with AIA IPIs.
>> 
>> We certainly don't need to backport this fix since it's more
>> of a preparatory fix for AIA IPIs.
>
> Whether or not this is backportable, if it fixes a bug, it should get
> a Fixes: tag for the commit that it fixes. Fixes: isn't "the backport"
> tag, cc: stable is.

I guess the question is if this *is* a fix, or rather a change required
for AIA (not a fix).



Re: [PATCH v3 2/2] riscv: Fix text patching when IPI are used

2024-03-04 Thread Björn Töpel
Conor Dooley  writes:

> On Thu, Feb 29, 2024 at 01:10:56PM +0100, Alexandre Ghiti wrote:
>> For now, we use stop_machine() to patch the text and when we use IPIs for
>> remote icache flushes (which is emitted in patch_text_nosync()), the system
>> hangs.
>> 
>> So instead, make sure every CPU executes the stop_machine() patching
>> function and emit a local icache flush there.
>> 
>> Co-developed-by: Björn Töpel 
>> Signed-off-by: Björn Töpel 
>> Signed-off-by: Alexandre Ghiti 
>> Reviewed-by: Andrea Parri 
>
> What commit does this fix?

Hmm. The bug is exposed when the AIA IPIs are introduced and used
(instead of the firmware-based ones).

I'm not sure this is something we'd like backported, but rather a
prerequisite to AIA.

@Anup @Alex WDYT?



Re: [PATCH] riscv: Fix text patching when icache flushes use IPIs

2024-02-06 Thread Björn Töpel
Alexandre Ghiti  writes:

> For now, we use stop_machine() to patch the text and when we use IPIs for
> remote icache flushes, the system hangs since the irqs are disabled on all
> cpus.
>
> So instead, make sure every cpu executes the stop_machine() patching
> function which emits a local icache flush and then avoids the use of
> IPIs.
>
> Co-developed-by: Björn Töpel 
> Signed-off-by: Björn Töpel 
> Signed-off-by: Alexandre Ghiti 

FWIW, the BPF selftests pass nicely with this (especially the
fentry/fexit tests ;-)). I don't know if it's worth much saying that
your own stuff was tested, but here goes:

Tested-by: Björn Töpel 



[PATCH v12 for-next 4/4] samples: ftrace: Add RISC-V support for SAMPLE_FTRACE_DIRECT[_MULTI]

2023-11-30 Thread Björn Töpel
From: Song Shuai 

Add RISC-V variants of the ftrace-direct* samples.

Tested-by: Evgenii Shatokhin 
Signed-off-by: Song Shuai 
Tested-by: Guo Ren 
Signed-off-by: Guo Ren 
Acked-by: Björn Töpel 
---
 arch/riscv/Kconfig  |  2 +
 samples/ftrace/ftrace-direct-modify.c   | 35 ++
 samples/ftrace/ftrace-direct-multi-modify.c | 41 +
 samples/ftrace/ftrace-direct-multi.c| 25 +
 samples/ftrace/ftrace-direct-too.c  | 28 ++
 samples/ftrace/ftrace-direct.c  | 24 
 6 files changed, 155 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4684cdc754a0..0ee79a92918d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -142,6 +142,8 @@ config RISCV
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RETHOOK if !XIP_KERNEL
select HAVE_RSEQ
+   select HAVE_SAMPLE_FTRACE_DIRECT
+   select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
select HAVE_STACKPROTECTOR
select HAVE_SYSCALL_TRACEPOINTS
select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
diff --git a/samples/ftrace/ftrace-direct-modify.c 
b/samples/ftrace/ftrace-direct-modify.c
index e2a6a69352df..81220390851a 100644
--- a/samples/ftrace/ftrace-direct-modify.c
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -24,6 +24,41 @@ extern void my_tramp2(void *);
 
 static unsigned long my_ip = (unsigned long)schedule;
 
+#ifdef CONFIG_RISCV
+#include 
+
+asm (
+"  .pushsection.text, \"ax\", @progbits\n"
+"  .type   my_tramp1, @function\n"
+"  .globl  my_tramp1\n"
+"   my_tramp1:\n"
+"  addisp,sp,-2*"SZREG"\n"
+"  "REG_S" t0,0*"SZREG"(sp)\n"
+"  "REG_S" ra,1*"SZREG"(sp)\n"
+"  callmy_direct_func1\n"
+"  "REG_L" t0,0*"SZREG"(sp)\n"
+"  "REG_L" ra,1*"SZREG"(sp)\n"
+"  addisp,sp,2*"SZREG"\n"
+"  jr  t0\n"
+"  .size   my_tramp1, .-my_tramp1\n"
+"  .type   my_tramp2, @function\n"
+"  .globl  my_tramp2\n"
+
+"   my_tramp2:\n"
+"  addisp,sp,-2*"SZREG"\n"
+"  "REG_S" t0,0*"SZREG"(sp)\n"
+"  "REG_S" ra,1*"SZREG"(sp)\n"
+"  callmy_direct_func2\n"
+"  "REG_L" t0,0*"SZREG"(sp)\n"
+"  "REG_L" ra,1*"SZREG"(sp)\n"
+"  addisp,sp,2*"SZREG"\n"
+"  jr  t0\n"
+"  .size   my_tramp2, .-my_tramp2\n"
+"  .popsection\n"
+);
+
+#endif /* CONFIG_RISCV */
+
 #ifdef CONFIG_X86_64
 
 #include 
diff --git a/samples/ftrace/ftrace-direct-multi-modify.c 
b/samples/ftrace/ftrace-direct-multi-modify.c
index 2e349834d63c..f943e40d57fd 100644
--- a/samples/ftrace/ftrace-direct-multi-modify.c
+++ b/samples/ftrace/ftrace-direct-multi-modify.c
@@ -22,6 +22,47 @@ void my_direct_func2(unsigned long ip)
 extern void my_tramp1(void *);
 extern void my_tramp2(void *);
 
+#ifdef CONFIG_RISCV
+#include 
+
+asm (
+"  .pushsection.text, \"ax\", @progbits\n"
+"  .type   my_tramp1, @function\n"
+"  .globl  my_tramp1\n"
+"   my_tramp1:\n"
+"   addi   sp,sp,-3*"SZREG"\n"
+"   "REG_S"a0,0*"SZREG"(sp)\n"
+"   "REG_S"t0,1*"SZREG"(sp)\n"
+"   "REG_S"ra,2*"SZREG"(sp)\n"
+"   mv a0,t0\n"
+"   call   my_direct_func1\n"
+"   "REG_L"a0,0*"SZREG"(sp)\n"
+"   "REG_L"t0,1*"SZREG"(sp)\n"
+"   "REG_L"ra,2*"SZREG"(sp)\n"
+"   addi   sp,sp,3*"SZREG"\n"
+"  jr  t0\n"
+"  .size   my_tramp1, .-my_tramp1\n"
+
+"  .type   my_tramp2, @function\n"
+"  .globl  my_tramp2\n"
+"   my_tramp2:\n"
+"   addi   sp,sp,-3*"SZREG"\n"
+"   "REG_S"a0,0*"SZREG"(sp)\n"
+"   "REG_S"t0,1*"SZREG"(sp)\n"
+"   "REG_S"ra,2*"SZREG"(sp)\n"
+"   mv a0,t0\n"
+"   call   my_direct_func2\n"
+"   "REG_L"a0,0*"SZREG"(sp)\n"
+"   "REG_L"t0,1*"SZREG"(sp)\n"
+"   "REG_L"ra,2*&quo

[PATCH v12 for-next 3/4] riscv: ftrace: Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support

2023-11-30 Thread Björn Töpel
From: Song Shuai 

Select DYNAMIC_FTRACE_WITH_DIRECT_CALLS to provide the
register_ftrace_direct[_multi] interfaces, allowing users to register
a custom trampoline (direct_caller) as the mcount for one or more
target functions. modify_ftrace_direct[_multi] are also provided
for modifying direct_caller.

To make the direct_caller and the other ftrace hooks (e.g.
function/fgraph tracer, k[ret]probes) co-exist, a temporary register
is nominated to store the address of direct_caller in
ftrace_regs_caller. After direct_ops->func has set that address and
RESTORE_REGS has run in ftrace_regs_caller, direct_caller is jumped
to via the `jr` instruction.

Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support for RISC-V.

Signed-off-by: Song Shuai 
Tested-by: Guo Ren 
Signed-off-by: Guo Ren 
Acked-by: Björn Töpel 
---
 arch/riscv/Kconfig  |  1 +
 arch/riscv/include/asm/ftrace.h |  7 +++
 arch/riscv/kernel/mcount-dyn.S  | 10 ++
 3 files changed, 18 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 69c95e42be9f..4684cdc754a0 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -114,6 +114,7 @@ config RISCV
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && 
(CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
+   select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index b383926f73be..329172122952 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -135,6 +135,13 @@ struct ftrace_regs;
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs);
 #define ftrace_graph_func ftrace_graph_func
+
+static inline void __arch_ftrace_set_direct_caller(struct pt_regs *regs, 
unsigned long addr)
+{
+   regs->t1 = addr;
+}
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+   __arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index c902a7ddb310..b7ce001779c1 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -229,6 +229,7 @@ SYM_FUNC_END(ftrace_caller)
 
 #else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
 SYM_FUNC_START(ftrace_regs_caller)
+   mv  t1, zero
SAVE_ABI_REGS 1
PREPARE_ARGS
 
@@ -236,7 +237,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
call    ftrace_stub
 
RESTORE_ABI_REGS 1
+   bnez    t1, .Ldirect
jr  t0
+.Ldirect:
+   jr  t1
 SYM_FUNC_END(ftrace_regs_caller)
 
 SYM_FUNC_START(ftrace_caller)
@@ -250,3 +254,9 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
jr  t0
 SYM_FUNC_END(ftrace_caller)
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+SYM_CODE_START(ftrace_stub_direct_tramp)
+   jr  t0
+SYM_CODE_END(ftrace_stub_direct_tramp)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
-- 
2.40.1
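
(An aside, not part of the series: once this lands, wiring up a custom
trampoline from a module looks roughly like the sketch below, assuming
the ops-based register_ftrace_direct(ops, addr) form of the interface
and an assembly trampoline my_tramp like the ones in the samples patch.)

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sched.h>

extern void my_tramp(void *);	/* asm trampoline, as in the samples */

static struct ftrace_ops direct;

static int __init direct_init(void)
{
	int ret;

	/* trace wake_up_process() only; chosen just for the example */
	ret = ftrace_set_filter_ip(&direct, (unsigned long)wake_up_process, 0, 0);
	if (ret)
		return ret;

	return register_ftrace_direct(&direct, (unsigned long)my_tramp);
}

static void __exit direct_exit(void)
{
	unregister_ftrace_direct(&direct, (unsigned long)my_tramp, true);
}

module_init(direct_init);
module_exit(direct_exit);
MODULE_LICENSE("GPL");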




[PATCH v12 for-next 0/4] riscv: ftrace: Miscellaneous ftrace improvements

2023-11-30 Thread Björn Töpel
From: Björn Töpel 

NB! Song told me that he would not have the time to work on this series,
so I picked it up.

This series includes three ftrace improvements for RISC-V:

1. Do not require to run recordmcount at build time (patch 1)
2. Simplification of the function graph functionality (patch 2)
3. Enable DYNAMIC_FTRACE_WITH_DIRECT_CALLS (patch 3 and 4)

The series has been tested on Qemu/rv64 virt/Debian sid with the
following test configs:
  CONFIG_FTRACE_SELFTEST=y
  CONFIG_FTRACE_STARTUP_TEST=y
  CONFIG_SAMPLE_FTRACE_DIRECT=m
  CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m
  CONFIG_SAMPLE_FTRACE_OPS=m

All tests pass.


Cheers,
Björn


Changes in v12:
- Massaged the commit messages a bit.
- Squashed the samples patches, so that the rv32 support is included
  from the start.
- Minor whitespace changes in the mcount-dyn.S.
- Minor style changes.

Changes in v11:
https://lore.kernel.org/linux-riscv/20230627111612.761164-1-suagrfil...@gmail.com/

- append a patch that makes the DIRECT_CALL samples support RV32I in
  this series fixing the rv32 build failure reported by Palmer
- validated with ftrace boottime selftest and manual sample modules test
  in qemu-system for RV32I and RV64I

Changes in v10:
https://lore.kernel.org/all/20230511093234.3123181-1-suagrfil...@gmail.com/

- add Acked-by from Björn Töpel in patch 2 and patch 4 
- replace `move` with `mv` in patch3 
- prettify patch 2/4 with proper tabs

Changes in v9:
https://lore.kernel.org/linux-riscv/20230510101857.2953955-1-suagrfil...@gmail.com/

1. add Acked-by from Björn Töpel in patch 1

2. rebase patch2/patch3 on Linux v6.4-rc1

  - patch 2: to make the `SAVE_ABI_REGS` configurable, revert the
modification of mcount-dyn.S from commit (45b32b946a97 "riscv:
entry: Consolidate general regs saving/restoring")

  - patch 3: to pass the trace_selftest, add the implementation of
`ftrace_stub_direct_tramp` from commit (fee86a4ed536 "ftrace:
selftest: remove broken trace_direct_tramp") ; and fixup the context
conflict in Kconfig 

Changes in v8:
https://lore.kernel.org/linux-riscv/20230324033342.3177979-1-suagrfil...@gmail.com/
 - Fix incorrect address values in the 4nd patch 
 - Rebased on v6.3-rc2

Changes in v7:
https://lore.kernel.org/linux-riscv/20230112090603.1295340-1-guo...@kernel.org/
 - Fixup RESTORE_ABI_REGS by remove PT_T0(sp) overwrite.
 - Add FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY [1]
 - Fixup kconfig with HAVE_SAMPLE_FTRACE_DIRECT &
   HAVE_SAMPLE_FTRACE_DIRECT_MULTI

Changes in v6:
https://lore.kernel.org/linux-riscv/20230107133549.4192639-1-guo...@kernel.org/
 - Replace 8 with MCOUNT_INSN_SIZE
 - Replace "REG_L a1, PT_RA(sp)" with "mv a1, ra"
 - Add Evgenii Shatokhin comment

Changes in v5:
https://lore.kernel.org/linux-riscv/20221208091244.203407-1-guo...@kernel.org/
 - Sort Kconfig entries in alphabetical order.

Changes in v4:
https://lore.kernel.org/linux-riscv/20221129033230.255947-1-guo...@kernel.org/
 - Include [3] for maintenance. [Song Shuai]

Changes in V3:
https://lore.kernel.org/linux-riscv/20221123153950.2911981-1-guo...@kernel.org/
 - Include [2] for maintenance. [Song Shuai]

[1]: 
https://lore.kernel.org/linux-riscv/CAAYs2=j3eak9vu6xbaw0zpuoh00rh8v5c2u3fepkokzfibw...@mail.gmail.com/T/#t
[2]: https://lore.kernel.org/lkml/20221120084230.910152-1-suagrfil...@gmail.com/
[3]: 
https://lore.kernel.org/linux-riscv/20221123142025.1504030-1-suagrfil...@gmail.com/
 


Song Shuai (4):
  riscv: select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
  riscv: ftrace: Make function graph use ftrace directly
  riscv: ftrace: Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support
  samples: ftrace: Add RISC-V support for SAMPLE_FTRACE_DIRECT[_MULTI]

 arch/riscv/Kconfig  |   4 +
 arch/riscv/include/asm/ftrace.h |  18 +-
 arch/riscv/kernel/ftrace.c  |  30 ++-
 arch/riscv/kernel/mcount-dyn.S  | 198 
 samples/ftrace/ftrace-direct-modify.c   |  35 
 samples/ftrace/ftrace-direct-multi-modify.c |  41 
 samples/ftrace/ftrace-direct-multi.c|  25 +++
 samples/ftrace/ftrace-direct-too.c  |  28 +++
 samples/ftrace/ftrace-direct.c  |  24 +++
 9 files changed, 348 insertions(+), 55 deletions(-)


base-commit: 3ca112b71f35dd5d99fc4571a56b5fc6f0c15814
-- 
2.40.1




Re: [PATCH -fixes] riscv: Fix ftrace syscall handling which are now prefixed with __riscv_

2023-10-03 Thread Björn Töpel
Alexandre Ghiti  writes:

> ftrace creates entries for each syscall in the tracefs but has failed
> since commit 08d0ce30e0e4 ("riscv: Implement syscall wrappers") which
> prefixes all riscv syscalls with __riscv_.
>
> So fix this by implementing arch_syscall_match_sym_name() which allows us
> to ignore this prefix.
>
> And also ignore compat syscalls like x86/arm64 by implementing
> arch_trace_is_compat_syscall().
>
> Fixes: 08d0ce30e0e4 ("riscv: Implement syscall wrappers")
> Signed-off-by: Alexandre Ghiti 

This fix makes the BPF test suite happier!

Tested-by: Björn Töpel 



Re: [PATCH -fixes] riscv: Fix ftrace syscall handling which are now prefixed with __riscv_

2023-10-03 Thread Björn Töpel
Alexandre Ghiti  writes:

> @Conor Dooley This fails checkpatch but the documentation here states
> that this is how to do:
> https://elixir.bootlin.com/linux/latest/source/Documentation/trace/ftrace-design.rst#L246

FWIW, I'll relax the PW CI checkpatch test going forward. It's way too
strict...



[PATCH v3] riscv: Only consider swbp/ss handlers for correct privileged mode

2023-09-12 Thread Björn Töpel
From: Björn Töpel 

RISC-V software breakpoint trap handlers are used for {k,u}probes.

When trapping from kernelmode, only the kernelmode handlers should be
considered. Vice versa, only usermode handlers for usermode
traps. This is not the case on RISC-V, which can trigger a bug if a
userspace process uses uprobes, and a WARN() is triggered from
kernelmode (which is implemented via {c.,}ebreak).

The kernel will trap on the kernelmode {c.,}ebreak, look for uprobes
handlers, realize incorrectly that uprobes need to be handled, and
exit the trap handler early. The trap returns to re-executing the
{c.,}ebreak, and enters an infinite trap-loop.

The issue was found running the BPF selftest [1].

Fix this issue by only considering the swbp/ss handlers for
kernel/usermode respectively. Also, move CONFIG ifdeffery from traps.c
to the asm/{k,u}probes.h headers.

Note that linux/uprobes.h only includes asm/uprobes.h if CONFIG_UPROBES
is defined, which is why asm/uprobes.h needs to be unconditionally
included in traps.c

Link: 
https://lore.kernel.org/linux-riscv/87v8d19aun@all.your.base.are.belong.to.us/
 # [1]
Fixes: 74784081aac8 ("riscv: Add uprobes supported")
Reviewed-by: Guo Ren 
Reviewed-by: Nam Cao 
Tested-by: Puranjay Mohan 
Signed-off-by: Björn Töpel 
---
v2->v3: Remove incorrect tags (Conor)
Collect review/test tags
v1->v2: Fix Clang build warning (kernel test robot)
---
 arch/riscv/include/asm/kprobes.h | 11 ++-
 arch/riscv/include/asm/uprobes.h | 13 -
 arch/riscv/kernel/traps.c| 28 ++--
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h
index e7882ccb0fd4..78ea44f76718 100644
--- a/arch/riscv/include/asm/kprobes.h
+++ b/arch/riscv/include/asm/kprobes.h
@@ -40,6 +40,15 @@ void arch_remove_kprobe(struct kprobe *p);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr);
 bool kprobe_breakpoint_handler(struct pt_regs *regs);
 bool kprobe_single_step_handler(struct pt_regs *regs);
-
+#else
+static inline bool kprobe_breakpoint_handler(struct pt_regs *regs)
+{
+   return false;
+}
+
+static inline bool kprobe_single_step_handler(struct pt_regs *regs)
+{
+   return false;
+}
 #endif /* CONFIG_KPROBES */
 #endif /* _ASM_RISCV_KPROBES_H */
diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h
index f2183e00fdd2..3fc7deda9190 100644
--- a/arch/riscv/include/asm/uprobes.h
+++ b/arch/riscv/include/asm/uprobes.h
@@ -34,7 +34,18 @@ struct arch_uprobe {
bool simulate;
 };
 
+#ifdef CONFIG_UPROBES
 bool uprobe_breakpoint_handler(struct pt_regs *regs);
 bool uprobe_single_step_handler(struct pt_regs *regs);
-
+#else
+static inline bool uprobe_breakpoint_handler(struct pt_regs *regs)
+{
+   return false;
+}
+
+static inline bool uprobe_single_step_handler(struct pt_regs *regs)
+{
+   return false;
+}
+#endif /* CONFIG_UPROBES */
 #endif /* _ASM_RISCV_UPROBES_H */
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 19807c4d3805..fae8f610d867 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -13,6 +13,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -247,22 +249,28 @@ static inline unsigned long 
get_break_insn_length(unsigned long pc)
return GET_INSN_LENGTH(insn);
 }
 
+static bool probe_single_step_handler(struct pt_regs *regs)
+{
+   bool user = user_mode(regs);
+
+   return user ? uprobe_single_step_handler(regs) : 
kprobe_single_step_handler(regs);
+}
+
+static bool probe_breakpoint_handler(struct pt_regs *regs)
+{
+   bool user = user_mode(regs);
+
+   return user ? uprobe_breakpoint_handler(regs) : 
kprobe_breakpoint_handler(regs);
+}
+
 void handle_break(struct pt_regs *regs)
 {
-#ifdef CONFIG_KPROBES
-   if (kprobe_single_step_handler(regs))
+   if (probe_single_step_handler(regs))
return;
 
-   if (kprobe_breakpoint_handler(regs))
-   return;
-#endif
-#ifdef CONFIG_UPROBES
-   if (uprobe_single_step_handler(regs))
+   if (probe_breakpoint_handler(regs))
return;
 
-   if (uprobe_breakpoint_handler(regs))
-   return;
-#endif
current->thread.bad_cause = regs->cause;
 
if (user_mode(regs))

base-commit: 0bb80ecc33a8fb5a682236443c1e740d5c917d1d
-- 
2.39.2



Re: linux-next: build failure after merge of the bpf-next tree

2021-03-10 Thread Björn Töpel

On 2021-03-11 01:47, Stephen Rothwell wrote:

Hi all,

After merging the bpf-next tree, today's linux-next build (perf) failed
like this:

make[3]: *** No rule to make target 'libbpf_util.h', needed by 
'/home/sfr/next/perf/staticobjs/xsk.o'.  Stop.



Hi Stephen,

It's an incremental build issue, as pointed out here [1], that is
resolved by cleaning the build.


Cheers,
Björn

[1] 
https://lore.kernel.org/bpf/CAEf4BzYPDF87At4=_gsndxof84oiqyjxgahl7_jvpuntovu...@mail.gmail.com/




Caused by commit

   7e8bbe24cb8b ("libbpf: xsk: Move barriers from libbpf_util.h to xsk.h")

I have used the bpf tree from next-20210310 for today.



Re: [PATCH] tools/memory-model: Fix smp_mb__after_spinlock() spelling

2021-03-05 Thread Björn Töpel

On 2021-03-05 16:36, Alan Stern wrote:

On Fri, Mar 05, 2021 at 11:28:23AM +0100, Björn Töpel wrote:

From: Björn Töpel 

A misspelled invokation of git-grep, revealed that

---^

Smetimes my brain is a little slow...  Do you confirm that this is a
joke?



I wish, Alan. I wish.

Looks like I can only spel function names correctly.


Have a nice weekend!
Björn



Alan



[PATCH] tools/memory-model: Fix smp_mb__after_spinlock() spelling

2021-03-05 Thread Björn Töpel
From: Björn Töpel 

A misspelled invokation of git-grep, revealed that
smp_mb__after_spinlock() was misspelled in explaination.txt.

Add missing "_" to smp_mb__after_spinlock().

Fixes: 1c27b644c0fd ("Automate memory-barriers.txt; provide Linux-kernel memory 
model")
Signed-off-by: Björn Töpel 
---
 tools/memory-model/Documentation/explanation.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/memory-model/Documentation/explanation.txt 
b/tools/memory-model/Documentation/explanation.txt
index f9d610d5a1a4..5d72f3112e56 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -2510,7 +2510,7 @@ they behave as follows:
smp_mb__after_atomic() orders po-earlier atomic updates and
the events preceding them against all po-later events;
 
-   smp_mb_after_spinlock() orders po-earlier lock acquisition
+   smp_mb__after_spinlock() orders po-earlier lock acquisition
events and the events preceding them against all po-later
events.
 

base-commit: 7f58c0fb9238abaa3997185ceec319201b6f5a94
-- 
2.27.0



Re: XDP socket rings, and LKMM litmus tests

2021-03-02 Thread Björn Töpel
On Tue, 2 Mar 2021 at 21:41, Paul E. McKenney  wrote:
>
> On Tue, Mar 02, 2021 at 09:24:04PM +0100, Björn Töpel wrote:
> > On Tue, 2 Mar 2021 at 20:57, Paul E. McKenney  wrote:
> > >
> > > On Tue, Mar 02, 2021 at 07:46:27PM +0100, Björn Töpel wrote:
> >
> > [...]
> >
> > >
> > > Before digging in too deeply, does the following simplification
> > > still capture your intent?
> > >
> >
> > Thanks for having a look, Paul!
> >
> > > P0(int *prod, int *cons, int *data)
> > > {
> > > int p;
> > > int cond = 0;
> > >
> > > p = READ_ONCE(*prod);
> > > if (p == READ_ONCE(*cons))
> > > cond = 1;
> >
> > With this, yes!
> >
> > > if (cond) {
> > > smp_mb();
> > > WRITE_ONCE(*data, 1);
> > > smp_wmb();
> > > WRITE_ONCE(*prod, p ^ 1);
> > > }
> > > }
> > >
> > > P1(int *prod, int *cons, int *data)
> > > {
> > > int c;
> > > int d = -1;
> > > int cond = 0;
> > >
> > > c = READ_ONCE(*cons);
> > > if (READ_ONCE(*prod) == c)
> > > cond = 1;
> >
> > Hmm, this would not be the correct state transition.
> >
> > c==1 && p==1 would set cond to 1, right?
> >
> > I would agree with:
> >   c = READ_ONCE(*cons);
> >   if (READ_ONCE(*prod) != c)
>
> Right you are!
>
> With that, it looks to me like LKMM is OK with removing the smp_mb().
> My guess is that the issue is that LKMM confines the effect of control
> dependencies to a single "if" statement, hence my reworking of your
> original.
>

Interesting!

I tried the acquire/release version:

P0(int *prod, int *cons, int *data)
{
int p;
int cond = 0;

p = READ_ONCE(*prod);
if (p == READ_ONCE(*cons)) {
WRITE_ONCE(*data, 1);
smp_store_release(prod, p ^ 1);
}
}

P1(int *prod, int *cons, int *data)
{
int c;
int d = -1;

c = READ_ONCE(*cons);
if (smp_load_acquire(prod) != c) {
d = READ_ONCE(*data);
smp_store_release(cons, c ^ 1);
}
}

and as with the previous example, restructuring the if-statement makes
"if (p == READ_ONCE(*cons)) {" sufficient, instead of "if (p ==
smp_load_acquire(cons)) {".

Yay!


Björn


> Thanx, Paul
>
> > >
> > > if (cond == 1) {
> > > smp_rmb();
> > > d = READ_ONCE(*data);
> > > smp_mb();
> > > WRITE_ONCE(*cons, c ^ 1);
> > > }
> > > }
> > >
> > > Thanx, Paul
> > >
> >
> > [...]
> >
> > Björn


Re: XDP socket rings, and LKMM litmus tests

2021-03-02 Thread Björn Töpel
On Tue, 2 Mar 2021 at 21:04, Paul E. McKenney  wrote:
>

[...]

>
> And if the answer is "yes", how about this one?
>

With the == to != change in P1, this is OK!

> P0(int *prod, int *cons, int *data)
> {
> int p;
>
> p = READ_ONCE(*prod);
> if (p == READ_ONCE(*cons)) {

...and now d==1 is not a valid state anymore according to herd. I
think I need some input here! :-D

> WRITE_ONCE(*data, 1);
> smp_wmb();
> WRITE_ONCE(*prod, p ^ 1);
> }
> }
>
> P1(int *prod, int *cons, int *data)
> {
> int c;
> int d = -1;
>
> c = READ_ONCE(*cons);
> if (READ_ONCE(*prod) == c) {
> smp_rmb();
> d = READ_ONCE(*data);
> smp_mb();
> WRITE_ONCE(*cons, c ^ 1);
> }
> }
>

[...]

Björn


Re: XDP socket rings, and LKMM litmus tests

2021-03-02 Thread Björn Töpel
On Tue, 2 Mar 2021 at 20:57, Paul E. McKenney  wrote:
>
> On Tue, Mar 02, 2021 at 07:46:27PM +0100, Björn Töpel wrote:

[...]

>
> Before digging in too deeply, does the following simplification
> still capture your intent?
>

Thanks for having a look, Paul!

> P0(int *prod, int *cons, int *data)
> {
> int p;
> int cond = 0;
>
> p = READ_ONCE(*prod);
> if (p == READ_ONCE(*cons))
> cond = 1;

With this, yes!

> if (cond) {
> smp_mb();
> WRITE_ONCE(*data, 1);
> smp_wmb();
> WRITE_ONCE(*prod, p ^ 1);
> }
> }
>
> P1(int *prod, int *cons, int *data)
> {
> int c;
> int d = -1;
> int cond = 0;
>
> c = READ_ONCE(*cons);
> if (READ_ONCE(*prod) == c)
> cond = 1;

Hmm, this would not be the correct state transition.

c==1 && p==1 would set cond to 1, right?

I would agree with:
  c = READ_ONCE(*cons);
  if (READ_ONCE(*prod) != c)


>
> if (cond == 1) {
> smp_rmb();
> d = READ_ONCE(*data);
> smp_mb();
> WRITE_ONCE(*cons, c ^ 1);
> }
> }
>
> Thanx, Paul
>

[...]

Björn


XDP socket rings, and LKMM litmus tests

2021-03-02 Thread Björn Töpel
Hi!

Firstly: the long Cc-list is to reach the LKMM folks.

Some background; the XDP sockets use a ring-buffer to communicate
between the kernel and userland. It's a
single-consumer/single-producer ring, and described in
net/xdp/xsk_queue.h.

--8<---
/* The structure of the shared state of the rings are the same as the
 * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion
 * ring, the kernel is the producer and user space is the consumer. For
 * the Tx and fill rings, the kernel is the consumer and user space is
 * the producer.
 *
 * producer consumer
 *
 * if (LOAD ->consumer) {   LOAD ->producer
 *(A)   smp_rmb()   (C)
 *STORE $data   LOAD $data
 *smp_wmb()   (B)   smp_mb()(D)
 *STORE ->producer  STORE ->consumer
 * }
 *
 * (A) pairs with (D), and (B) pairs with (C).
...
-->8---

I'd like to replace the smp_{r,w,}mb() barriers with acquire-release
semantics [1], without breaking existing userspace applications.
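
(Roughly the shape I have in mind -- a sketch with made-up names, not the
actual xsk code:)

/* single-producer/single-consumer ring, power-of-2 size */
struct spsc_ring {
	unsigned long producer;
	unsigned long consumer;
	unsigned long size;
	int data[];
};

static inline bool ring_produce(struct spsc_ring *r, int val)
{
	unsigned long p = READ_ONCE(r->producer);

	if (p - smp_load_acquire(&r->consumer) == r->size)	/* pairs with consumer release */
		return false;					/* full */

	r->data[p & (r->size - 1)] = val;
	smp_store_release(&r->producer, p + 1);			/* replaces smp_wmb() + STORE */
	return true;
}

static inline bool ring_consume(struct spsc_ring *r, int *val)
{
	unsigned long c = READ_ONCE(r->consumer);

	if (smp_load_acquire(&r->producer) == c)		/* replaces LOAD + smp_rmb() */
		return false;					/* empty */

	*val = r->data[c & (r->size - 1)];
	smp_store_release(&r->consumer, c + 1);			/* replaces smp_mb() + STORE */
	return true;
}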

So, I figured I'd use herd7 and the LKMM model to build a litmus test
for the barrier version, then for the acquire-release version, and
finally permutations of both.

The idea is to use a one element ring, with a state machine outlined
in the litmus test.

The basic test for the existing smp_{r,w,}mb() barriers looks like:

$ cat spsc-rb+1p1c.litmus
C spsc-rb+1p1c

// Stupid one entry ring:
// prod cons allowed action   prod cons
//00 =>   prod  =>   10
//01 =>   cons  =>   00
//10 =>   cons  =>   11
//11 =>   prod  =>   01

{ prod = 1; }

// Here, we start at prod==1,cons==0, data==0, i.e. producer has
// written data=0, so from here only the consumer can start, and should
// consume data==0. Afterwards, producer can continue and write 1 to
// data. Can we enter state prod==0, cons==1, but consumer observed
// the write of 1?

P0(int *prod, int *cons, int *data)
{
int p;
int c;
int cond = 0;

p = READ_ONCE(*prod);
c = READ_ONCE(*cons);
if (p == 0)
if (c == 0)
cond = 1;
if (p == 1)
if (c == 1)
cond = 1;

if (cond) {
smp_mb();
WRITE_ONCE(*data, 1);
smp_wmb();
WRITE_ONCE(*prod, p ^ 1);
}
}

P1(int *prod, int *cons, int *data)
{
int p;
int c;
int d = -1;
int cond = 0;

p = READ_ONCE(*prod);
c = READ_ONCE(*cons);
if (p == 1)
if (c == 0)
cond = 1;
if (p == 0)
if (c == 1)
cond = 1;

if (cond == 1) {
smp_rmb();
d = READ_ONCE(*data);
smp_mb();
WRITE_ONCE(*cons, c ^ 1);
}
}

exists( 1:d=1 /\ prod=0 /\ cons=1 );

--

The weird state-changing if-statements are there because I didn't get
'&&' and '||' to work with herd.
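
(In other words, the nested ifs are just standing in for

    if ((p == 0 && c == 0) || (p == 1 && c == 1))
            cond = 1;

in P0, and the corresponding swapped condition in P1; I simply couldn't
get herd to accept that form.)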

When this is run:

$ herd7 -conf linux-kernel.cfg litmus-tests/spsc-rb+1p1c.litmus
Test spsc-rb+1p1c Allowed
States 2
1:d=0; cons=1; prod=0;
1:d=0; cons=1; prod=1;
No
Witnesses
Positive: 0 Negative: 2
Condition exists (1:d=1 /\ prod=0 /\ cons=1)
Observation spsc-rb+1p1c Never 0 2
Time spsc-rb+1p1c 0.04
Hash=b399756d6a1301ca5bda042f32130791

Now to my question; In P0 there's an smp_mb(). Without that, the d==1
can be observed from P1 (consumer):

$ herd7 -conf linux-kernel.cfg litmus-tests/spsc-rb+1p1c.litmus
Test spsc-rb+1p1c Allowed
States 3
1:d=0; cons=1; prod=0;
1:d=0; cons=1; prod=1;
1:d=1; cons=1; prod=0;
Ok
Witnesses
Positive: 1 Negative: 2
Condition exists (1:d=1 /\ prod=0 /\ cons=1)
Observation spsc-rb+1p1c Sometimes 1 2
Time spsc-rb+1p1c 0.04
Hash=0047fc21fa77da9a9aee15e35ec367ef

Commit c7f2e3cd6c1f ("perf: Optimize ring-buffer write by depending
on control dependencies") removes the corresponding smp_mb(), and the
circular buffer in circular-buffers.txt (prior to commit 6c43c091bdc5
("documentation: Update circular buffer for
load-acquire/store-release")) is also missing the smp_mb() on the
producer side.

I'm trying to wrap my head around why it's OK to remove the smp_mb()
in the cases above. I'm worried that the current XDP socket ring
implementation (which is missing the smp_mb()) might be broken.


If you read this far, thanks! :-)
Björn


[1] https://lore.kernel.org/bpf/20210301104318.263262-2-bjorn.to...@gmail.com/


Re: general protection fault in xsk_recvmsg

2021-01-15 Thread Björn Töpel
#syz fix: xsk: Validate socket state in xsk_recvmsg, prior touching 
socket members


Re: memory leak in xskq_create

2020-12-16 Thread Björn Töpel



On 2020-12-16 19:11, Peilin Ye wrote:

Hi all,

On Sun, Dec 13, 2020 at 06:53:10AM -0800, syzbot wrote:

BUG: memory leak
unreferenced object 0x88810f897940 (size 64):
   comm "syz-executor991", pid 8502, jiffies 4294942194 (age 14.080s)
   hex dump (first 32 bytes):
 7f 00 00 00 80 00 00 00 00 00 00 00 00 00 00 00  
 00 a0 37 0c 81 88 ff ff 00 00 00 00 00 00 00 00  ..7.
   backtrace:
 [<639d0dd1>] xskq_create+0x23/0xd0 include/linux/slab.h:552
 [] xsk_init_queue net/xdp/xsk.c:508 [inline]
 [] xsk_setsockopt+0x1c4/0x590 net/xdp/xsk.c:875
 [<2b302260>] __sys_setsockopt+0x1b0/0x360 net/socket.c:2132
 [] __do_sys_setsockopt net/socket.c:2143 [inline]
 [] __se_sys_setsockopt net/socket.c:2140 [inline]
 [] __x64_sys_setsockopt+0x22/0x30 net/socket.c:2140
 [<05c2b4a0>] do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 [<03db140f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9


I have tested the following diff locally against syzbot's reproducer,
and sent a patch to it [1] for testing.  I will send a real patch here
tomorrow if syzbot is happy about it.  Please see explanation below.



Hi Peilin Ye!

Thanks for taking a look! Magnus has already addressed this problem in
another patch [1].


Cheers,
Björn



[1] 
https://lore.kernel.org/bpf/20201214085127.3960-1-magnus.karls...@gmail.com/


Re: [PATCH bpf-next v5 04/11] bpf: Rename BPF_XADD and prepare to encode other atomics in .imm

2020-12-15 Thread Björn Töpel
On Tue, 15 Dec 2020 at 13:25, Brendan Jackman  wrote:
>
> A subsequent patch will add additional atomic operations. These new
> operations will use the same opcode field as the existing XADD, with
> the immediate discriminating different operations.
>
> In preparation, rename the instruction mode BPF_ATOMIC and start
> calling the zero immediate BPF_ADD.
>
> This is possible (doesn't break existing valid BPF progs) because the
> immediate field is currently reserved MBZ and BPF_ADD is zero.
>
> All uses are removed from the tree but the BPF_XADD definition is
> kept around to avoid breaking builds for people including kernel
> headers.
>
> Signed-off-by: Brendan Jackman 
> ---
>  Documentation/networking/filter.rst   | 30 +++-
>  arch/arm/net/bpf_jit_32.c |  7 ++-
>  arch/arm64/net/bpf_jit_comp.c | 16 +--
>  arch/mips/net/ebpf_jit.c  | 11 +++--
>  arch/powerpc/net/bpf_jit_comp64.c | 25 --
>  arch/riscv/net/bpf_jit_comp32.c   | 20 ++--
>  arch/riscv/net/bpf_jit_comp64.c   | 16 +--

For RISC-V:

Acked-by: Björn Töpel 


Re: linux-next: ERROR: build error for arm64

2020-12-06 Thread Björn Töpel

On 2020-12-06 08:32, Hui Su wrote:

hi, all:

The error came out like this when i build the linux-next kernel
with ARCH=arm64, with the arm64_defconfig:

   CC  drivers/net/ethernet/freescale/dpaa/dpaa_eth.o
../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c: In function ‘dpaa_fq_init’:
../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:1135:9: error: too few 
arguments to function ‘xdp_rxq_info_reg’
  1135 |   err = xdp_rxq_info_reg(&dpaa_fq->xdp_rxq, dpaa_fq->net_dev,
   | ^~~~
In file included from ../include/linux/netdevice.h:42,
  from ../include/uapi/linux/if_arp.h:27,
  from ../include/linux/if_arp.h:23,
  from ../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:40:
../include/net/xdp.h:229:5: note: declared here
   229 | int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
   | ^~~~
make[6]: *** [../scripts/Makefile.build:283: 
drivers/net/ethernet/freescale/dpaa/dpaa_eth.o] Error 1


Branch: git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
HEAD: 2996bd3f6ca9ea529b40c369a94b247657abdb4d
ARCH: arm64
CONFIG: arch/arm64/configs/defconfig
CMD: make ARCH=arm64 CROSS_COMPILE=/usr/bin/aarch64-linux-gnu- Image

It maybe introduced by the commit b02e5a0ebb172(xsk: Propagate napi_id to XDP 
socket Rx path),
and if anyone solved this, please add:

Reported-by: Hui Su 



Hui, this was already resolved in commit fdd8b8249ef8 ("dpaa_eth: fix build
errorr in dpaa_fq_init").

Thanks,
Björn



Re: [PATCH][next] samples/bpf: Fix spelling mistake "recieving" -> "receiving"

2020-12-03 Thread Björn Töpel
On Thu, 3 Dec 2020 at 12:46, Colin King  wrote:
>
> From: Colin Ian King 
>
> There is a spelling mistake in an error message. Fix it.
>
> Signed-off-by: Colin Ian King 

Acked-by: Björn Töpel 

> ---
>  samples/bpf/xdpsock_user.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
> index 0fee7f3aef3c..9553c7c47fc4 100644
> --- a/samples/bpf/xdpsock_user.c
> +++ b/samples/bpf/xdpsock_user.c
> @@ -1570,7 +1570,7 @@ recv_xsks_map_fd(int *xsks_map_fd)
>
> err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd);
> if (err) {
> -   fprintf(stderr, "Error %d recieving fd\n", err);
> +   fprintf(stderr, "Error %d receiving fd\n", err);
> return err;
> }
> return 0;
> --
> 2.29.2
>


Re: [PATCH] dpaa_eth: fix build errorr in dpaa_fq_init

2020-12-03 Thread Björn Töpel
On Thu, 3 Dec 2020 at 15:46, Anders Roxell  wrote:
>
> When building FSL_DPAA_ETH the following build error shows up:
>
> /tmp/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c: In function 
> ‘dpaa_fq_init’:
> /tmp/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:1135:9: error: too few 
> arguments to function ‘xdp_rxq_info_reg’
>  1135 |   err = xdp_rxq_info_reg(&dpaa_fq->xdp_rxq, dpaa_fq->net_dev,
>   | ^~~~
>
> Commit b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path")
> added an extra argument to function xdp_rxq_info_reg and commit
> d57e57d0cd04 ("dpaa_eth: add XDP_TX support") didn't know about that
> extra argument.
>
> Signed-off-by: Anders Roxell 
> ---
>
> I think this issue is seen since both patches went in at the same time
> to bpf-next and net-next.
>

Thanks Anders!

Indeed, when bpf-next is pulled into net-next this needs to be applied.

Acked-by: Björn Töpel 

>  drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c 
> b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
> index 947b3d2f9c7e..6cc8c4e078de 100644
> --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
> +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
> @@ -1133,7 +1133,7 @@ static int dpaa_fq_init(struct dpaa_fq *dpaa_fq, bool 
> td_enable)
> if (dpaa_fq->fq_type == FQ_TYPE_RX_DEFAULT ||
> dpaa_fq->fq_type == FQ_TYPE_RX_PCD) {
> err = xdp_rxq_info_reg(&dpaa_fq->xdp_rxq, dpaa_fq->net_dev,
> -  dpaa_fq->fqid);
> +  dpaa_fq->fqid, 0);
> if (err) {
> dev_err(dev, "xdp_rxq_info_reg() = %d\n", err);
> return err;
> --
> 2.29.2
>


Re: [PATCH] xsk: add cq event

2020-11-17 Thread Björn Töpel

On 2020-11-16 17:12, Xuan Zhuo wrote:

On Mon, 16 Nov 2020 15:31:20 +0100, Björn Töpel wrote:

On 2020-11-16 09:10, Xuan Zhuo wrote:

When we write all cq items to tx, we have to wait for a new event based
on poll to indicate that it is writable. But the current writability is
triggered based on whether tx is full or not, and In fact, when tx is
dissatisfied, the user of cq's item may not necessarily get it, because it
may still be occupied by the network card. In this case, we need to know
when cq is available, so this patch adds a socket option, When the user
configures this option using setsockopt, when cq is available, a
readable event is generated for all xsk bound to this umem.

I can't find a better description of this event,
I think it can also be 'readable', although it is indeed different from
the 'readable' of the new data. But the overhead of xsk checking whether
cq or rx is readable is small.

Signed-off-by: Xuan Zhuo 

Thanks for the patch!

I'm not a fan of having two different "readable" events (both Rx and cq).
Could you explain a bit what the use case is, so I get a better
understanding.

The Tx queue has a back-pressure mechanism, determined by the number of
elements in cq. Is it related to that?

Please explain a bit more what you're trying to solve, and maybe we can
figure out a better way forward!


Thanks!
Björn


I want to implement a tool for mass sending. For example, the size of cq is
1024, and I set the size of tx also to 1024, so that I will put all cq in tx at
once, and then I have to wait for an event, come Indicates that there is new
write space or new cq is available.

At present, we can only monitor the event of write able. This indicates whether
tx is full, but in fact, tx is basically not full, because the full state is
very short, and those tx items are being used by the network card. And
epoll_wait will be awakened directly every time, without waiting, but I cannot
get the cq item, so I still cannot successfully send the package again.

Of course, I don't like the "readable" event very much. This is a suitable
one I found in the existing epoll event. ^_^


More questions! By "Mass sending" do you mean maximum throughput, or
does that mean "in very large batches"?

For the latter to do 1k batches, you could increase the Tx/cq buffer
size to say 4k.

For maximum throughput it's better to use smaller batches (e.g. what the
txpush scenario in samples/xdpsock does).

You're right that even if there's space in the Tx ring, it won't be sent
unless there's sufficient space in the cq ring. Maybe it would make
sense to be more restrictive when triggering the "writable" socket
event? E.g. only trigger it when there's space in Tx *and* sufficient cq
space?
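
Something along these lines (only a sketch to illustrate the idea; the
xsk_tx_writeable()/free-slot helpers below are made-up names, not
existing functions):

static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	/* Report EPOLLOUT only when a send can actually make progress:
	 * free descriptors in the Tx ring *and* room in the completion
	 * ring for the descriptors a send would eventually complete.
	 */
	return xsk_tx_free_slots(xs) > 0 && xsk_cq_free_slots(xs) > 0;
}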


Björn




Thanks.



Re: [PATCH] xsk: add cq event

2020-11-16 Thread Björn Töpel

On 2020-11-16 09:10, Xuan Zhuo wrote:

When we write all cq items to tx, we have to wait for a new event based
on poll to indicate that it is writable. But the current writability is
triggered based on whether tx is full or not, and In fact, when tx is
dissatisfied, the user of cq's item may not necessarily get it, because it
may still be occupied by the network card. In this case, we need to know
when cq is available, so this patch adds a socket option, When the user
configures this option using setsockopt, when cq is available, a
readable event is generated for all xsk bound to this umem.

I can't find a better description of this event,
I think it can also be 'readable', although it is indeed different from
the 'readable' of the new data. But the overhead of xsk checking whether
cq or rx is readable is small.

Signed-off-by: Xuan Zhuo 


Thanks for the patch!

I'm not a fan of having two different "readable" events (both Rx and cq).
Could you explain a bit what the use case is, so I get a better
understanding.

The Tx queue has a back-pressure mechanism, determined by the number of
elements in cq. Is it related to that?

Please explain a bit more what you're trying to solve, and maybe we can
figure out a better way forward!


Thanks!
Björn


Re: [PATCH][next] xsk: Fix null check on error return path

2020-09-02 Thread Björn Töpel

On 2020-09-02 17:07, Gustavo A. R. Silva wrote:

Currently, dma_map is being checked, when the right object identifier
to be null-checked is dma_map->dma_pages, instead.

Fix this by null-checking dma_map->dma_pages.

Addresses-Coverity-ID: 1496811 ("Logically dead code")
Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings")
Signed-off-by: Gustavo A. R. Silva 


Nice catch!

Acked-by: Björn Töpel 


---
  net/xdp/xsk_buff_pool.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 795d7c81c0ca..5b00bc5707f2 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -287,7 +287,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device 
*dev, struct net_devi
return NULL;
  
  	dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);

-   if (!dma_map) {
+   if (!dma_map->dma_pages) {
kfree(dma_map);
return NULL;
}



Re: [Linux-kernel-mentees] [PATCH net v2] xdp: Prevent kernel-infoleak in xsk_getsockopt()

2020-07-28 Thread Björn Töpel
On Tue, 28 Jul 2020 at 07:37, Peilin Ye  wrote:
>
> xsk_getsockopt() is copying uninitialized stack memory to userspace when
> `extra_stats` is `false`. Fix it.
>
> Fixes: 8aa5a33578e9 ("xsk: Add new statistics")
> Suggested-by: Dan Carpenter 
> Signed-off-by: Peilin Ye 
> ---

Acked-by: Björn Töpel 

> Doing `= {};` is sufficient since currently `struct xdp_statistics` is
> defined as follows:
>
> struct xdp_statistics {
> __u64 rx_dropped;
> __u64 rx_invalid_descs;
> __u64 tx_invalid_descs;
> __u64 rx_ring_full;
> __u64 rx_fill_ring_empty_descs;
> __u64 tx_ring_empty_descs;
> };
>
> When being copied to the userspace, `stats` will not contain any
> uninitialized "holes" between struct fields.
>
> Changes in v2:
> - Remove the "Cc: sta...@vger.kernel.org" tag. (Suggested by Song Liu
>   )
> - Initialize `stats` by assignment instead of using memset().
>   (Suggested by Song Liu )
>
>  net/xdp/xsk.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 26e3bba8c204..b2b533eddebf 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -840,7 +840,7 @@ static int xsk_getsockopt(struct socket *sock, int level, 
> int optname,
> switch (optname) {
> case XDP_STATISTICS:
> {
> -   struct xdp_statistics stats;
> +   struct xdp_statistics stats = {};
> bool extra_stats = true;
> size_t stats_size;
>
> --
> 2.25.1
>


Re: [PATCH bpf-next v1 0/3] bpf, riscv: Add compressed instructions to rv64 JIT

2020-07-21 Thread Björn Töpel
On Tue, 21 Jul 2020 at 04:52, Luke Nelson  wrote:
>
> This patch series enables using compressed riscv (RVC) instructions
> in the rv64 BPF JIT.
>
> RVC is a standard riscv extension that adds a set of compressed,
> 2-byte instructions that can replace some regular 4-byte instructions
> for improved code density.
>
> This series first modifies the JIT to support using 2-byte instructions
> (e.g., in jump offset computations), then adds RVC encoding and
> helper functions, and finally uses the helper functions to optimize
> the rv64 JIT.
>
> I used our formal verification framework, Serval, to verify the
> correctness of the RVC encodings and their uses in the rv64 JIT.
>
> The JIT continues to pass all tests in lib/test_bpf.c, and introduces
> no new failures to test_verifier; both with and without RVC being enabled.
>
> The following are examples of the JITed code for the verifier selftest
> "direct packet read test#3 for CGROUP_SKB OK", without and with RVC
> enabled, respectively. The former uses 178 bytes, and the latter uses 112,
> for a ~37% reduction in code size for this example.
>
> Without RVC:
>
>0: 02000813addi  a6,zero,32
>4: fd010113addi  sp,sp,-48
>8: 02813423sds0,40(sp)
>c: 02913023sds1,32(sp)
>   10: 01213c23sds2,24(sp)
>   14: 01313823sds3,16(sp)
>   18: 01413423sds4,8(sp)
>   1c: 03010413addi  s0,sp,48
>   20: 03056683lwu   a3,48(a0)
>   24: 02069693slli  a3,a3,0x20
>   28: 0206d693srli  a3,a3,0x20
>   2c: 03456703lwu   a4,52(a0)
>   30: 02071713slli  a4,a4,0x20
>   34: 02075713srli  a4,a4,0x20
>   38: 03856483lwu   s1,56(a0)
>   3c: 02049493slli  s1,s1,0x20
>   40: 0204d493srli  s1,s1,0x20
>   44: 03c56903lwu   s2,60(a0)
>   48: 02091913slli  s2,s2,0x20
>   4c: 02095913srli  s2,s2,0x20
>   50: 04056983lwu   s3,64(a0)
>   54: 0203slli  s3,s3,0x20
>   58: 0209d993srli  s3,s3,0x20
>   5c: 09056a03lwu   s4,144(a0)
>   60: 020a1a13slli  s4,s4,0x20
>   64: 020a5a13srli  s4,s4,0x20
>   68: 00900313addi  t1,zero,9
>   6c: 006a7463bgeu  s4,t1,0x74
>   70: 0a13addi  s4,zero,0
>   74: 02d52823swa3,48(a0)
>   78: 02e52a23swa4,52(a0)
>   7c: 02952c23sws1,56(a0)
>   80: 03252e23sws2,60(a0)
>   84: 05352023sws3,64(a0)
>   88: 0793addi  a5,zero,0
>   8c: 02813403lds0,40(sp)
>   90: 02013483lds1,32(sp)
>   94: 01813903lds2,24(sp)
>   98: 01013983lds3,16(sp)
>   9c: 00813a03lds4,8(sp)
>   a0: 03010113addi  sp,sp,48
>   a4: 00078513addi  a0,a5,0
>   a8: 8067jalr  zero,0(ra)
>
> With RVC:
>
>0:   02000813addia6,zero,32
>4:   7179c.addi16sp  sp,-48
>6:   f422c.sdsp  s0,40(sp)
>8:   f026c.sdsp  s1,32(sp)
>a:   ec4ac.sdsp  s2,24(sp)
>c:   e84ec.sdsp  s3,16(sp)
>e:   e452c.sdsp  s4,8(sp)
>   10:   1800c.addi4spn  s0,sp,48
>   12:   03056683lwu a3,48(a0)
>   16:   1682c.slli  a3,0x20
>   18:   9281c.srli  a3,0x20
>   1a:   03456703lwu a4,52(a0)
>   1e:   1702c.slli  a4,0x20
>   20:   9301c.srli  a4,0x20
>   22:   03856483lwu s1,56(a0)
>   26:   1482c.slli  s1,0x20
>   28:   9081c.srli  s1,0x20
>   2a:   03c56903lwu s2,60(a0)
>   2e:   1902c.slli  s2,0x20
>   30:   02095913srlis2,s2,0x20
>   34:   04056983lwu s3,64(a0)
>   38:   1982c.slli  s3,0x20
>   3a:   0209d993srlis3,s3,0x20
>   3e:   09056a03lwu s4,144(a0)
>   42:   1a02c.slli  s4,0x20
>   44:   020a5a13srlis4,s4,0x20
>   48:   4325c.lit1,9
>   4a:   006a7363bgeus4,t1,0x50
>   4e:   4a01c.lis4,0
>   50:   d914c.swa3,48(a0)
>   52:   d958c.swa4,52(a0)
>   54:   dd04c.sws1,56(a0)
>   56:   03252e23sw  s2,60(a0)
>   5a:   05352023sw  s3,64(a0)
>   5e:   4781c.lia5,0
>   60:   7422c.ldsp  s0,40(sp)
>   62:   7482c.ldsp  s1,32(sp)
>   64:   6962c.ldsp  s2,24(sp)
>   66:   69c2c.ldsp  s3,16(sp)
>   68:   6a22c.ldsp  s4,8(sp)
>   6a:   6145c.addi16sp  sp,48
>   6c:   853ec.mva0,a5
>   6e:   8082c.jrra
>
> RFC -> v1:
>   - From Björn Töpel:
> * Changed RVOFF macro to static inline "ninsns_rvoff".
> * Changed return type of rvc_ functions from u32 to u16.
> * Changed sizeof(u16) to sizeof(*ctx->insns).
>   * F

Re: [PATCH bpf-next v1 3/3] bpf, riscv: Use compressed instructions in the rv64 JIT

2020-07-21 Thread Björn Töpel
0x50
>   4e:   4a01c.lis4,0
>   50:   d914c.swa3,48(a0)
>   52:   d958c.swa4,52(a0)
>   54:   dd04c.sws1,56(a0)
>   56:   03252e23sw  s2,60(a0)
>   5a:   05352023sw  s3,64(a0)
>   5e:   4781c.lia5,0
>   60:   7422c.ldsp  s0,40(sp)
>   62:   7482c.ldsp  s1,32(sp)
>   64:   6962c.ldsp  s2,24(sp)
>   66:   69c2c.ldsp  s3,16(sp)
>   68:   6a22c.ldsp  s4,8(sp)
>   6a:   6145c.addi16sp  sp,48
>   6c:   853ec.mva0,a5
>   6e:   8082c.jrra
>
> Cc: Björn Töpel 
> Signed-off-by: Luke Nelson 
> ---
> Björn: I added you as Cc instead of Acked-by for this patch so you
> would have a chance to review the further changes I made in emit_imm
> (described above).

Thanks! LGTM! I'll add the Acked-by/Reviewed-by to the cover letter!

Really nice work!
Björn

> ---
>  arch/riscv/net/bpf_jit_comp64.c | 281 +---
>  1 file changed, 147 insertions(+), 134 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index 55861269da2a..8a56b5293117 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -132,19 +132,23 @@ static void emit_imm(u8 rd, s64 val, struct 
> rv_jit_context *ctx)
>  *
>  * This also means that we need to process LSB to MSB.
>  */
> -   s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
> +   s64 upper = (val + (1 << 11)) >> 12;
> +   /* Sign-extend lower 12 bits to 64 bits since immediates for li, 
> addiw,
> +* and addi are signed and RVC checks will perform signed comparisons.
> +*/
> +   s64 lower = ((val & 0xfff) << 52) >> 52;
> int shift;
>
> if (is_32b_int(val)) {
> if (upper)
> -   emit(rv_lui(rd, upper), ctx);
> +   emit_lui(rd, upper, ctx);
>
> if (!upper) {
> -   emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
> +   emit_li(rd, lower, ctx);
> return;
> }
>
> -   emit(rv_addiw(rd, rd, lower), ctx);
> +   emit_addiw(rd, rd, lower, ctx);
> return;
> }
>
> @@ -154,9 +158,9 @@ static void emit_imm(u8 rd, s64 val, struct 
> rv_jit_context *ctx)
>
> emit_imm(rd, upper, ctx);
>
> -   emit(rv_slli(rd, rd, shift), ctx);
> +   emit_slli(rd, rd, shift, ctx);
> if (lower)
> -   emit(rv_addi(rd, rd, lower), ctx);
> +   emit_addi(rd, rd, lower, ctx);
>  }
>
>  static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
> @@ -164,43 +168,43 @@ static void __build_epilogue(bool is_tail_call, struct 
> rv_jit_context *ctx)
> int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
>
> if (seen_reg(RV_REG_RA, ctx)) {
> -   emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_RA, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> -   emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_FP, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> if (seen_reg(RV_REG_S1, ctx)) {
> -   emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S1, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> if (seen_reg(RV_REG_S2, ctx)) {
> -   emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S2, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> if (seen_reg(RV_REG_S3, ctx)) {
> -   emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S3, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> if (seen_reg(RV_REG_S4, ctx)) {
> -   emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S4, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> if (seen_reg(RV_REG_S5, ctx)) {
> -   emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S5, store_offset, RV_REG_SP, ctx);
> store_offset -= 8;
> }
> if (seen_reg(RV_REG_S6, ctx)) {
> -   emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
> +   emit_ld(RV_REG_S6, 

Re: [RFC PATCH bpf-next 3/3] bpf, riscv: Use compressed instructions in the rv64 JIT

2020-07-15 Thread Björn Töpel
On Mon, 13 Jul 2020 at 20:37, Luke Nelson  wrote:
>
> This patch uses the RVC support and encodings from bpf_jit.h to optimize
> the rv64 jit.
>
> The optimizations work by replacing emit(rv_X(...)) with a call to a
> helper function emit_X, which will emit a compressed version of the
> instruction when possible, and when RVC is enabled.
>
> The JIT continues to pass all tests in lib/test_bpf.c, and introduces
> no new failures to test_verifier; both with and without RVC being enabled.
>
> The following are examples of the JITed code for the verifier selftest
> "direct packet read test#3 for CGROUP_SKB OK", without and with RVC
> enabled, respectively. The former uses 178 bytes, and the latter uses 112,
> for a ~37% reduction in code size for this example.
>
> Without RVC:
>
>0: 02000813addi  a6,zero,32
>4: fd010113addi  sp,sp,-48
>8: 02813423sds0,40(sp)
>c: 02913023sds1,32(sp)
>   10: 01213c23sds2,24(sp)
>   14: 01313823sds3,16(sp)
>   18: 01413423sds4,8(sp)
>   1c: 03010413addi  s0,sp,48
>   20: 03056683lwu   a3,48(a0)
>   24: 02069693slli  a3,a3,0x20
>   28: 0206d693srli  a3,a3,0x20
>   2c: 03456703lwu   a4,52(a0)
>   30: 02071713slli  a4,a4,0x20
>   34: 02075713srli  a4,a4,0x20
>   38: 03856483lwu   s1,56(a0)
>   3c: 02049493slli  s1,s1,0x20
>   40: 0204d493srli  s1,s1,0x20
>   44: 03c56903lwu   s2,60(a0)
>   48: 02091913slli  s2,s2,0x20
>   4c: 02095913srli  s2,s2,0x20
>   50: 04056983lwu   s3,64(a0)
>   54: 0203slli  s3,s3,0x20
>   58: 0209d993srli  s3,s3,0x20
>   5c: 09056a03lwu   s4,144(a0)
>   60: 020a1a13slli  s4,s4,0x20
>   64: 020a5a13srli  s4,s4,0x20
>   68: 00900313addi  t1,zero,9
>   6c: 006a7463bgeu  s4,t1,0x74
>   70: 0a13addi  s4,zero,0
>   74: 02d52823swa3,48(a0)
>   78: 02e52a23swa4,52(a0)
>   7c: 02952c23sws1,56(a0)
>   80: 03252e23sws2,60(a0)
>   84: 05352023sws3,64(a0)
>   88: 0793addi  a5,zero,0
>   8c: 02813403lds0,40(sp)
>   90: 02013483lds1,32(sp)
>   94: 01813903lds2,24(sp)
>   98: 01013983lds3,16(sp)
>   9c: 00813a03lds4,8(sp)
>   a0: 03010113addi  sp,sp,48
>   a4: 00078513addi  a0,a5,0
>   a8: 8067jalr  zero,0(ra)
>
> With RVC:
>
>0:   02000813addia6,zero,32
>4:   7179c.addi16sp  sp,-48
>6:   f422c.sdsp  s0,40(sp)
>8:   f026c.sdsp  s1,32(sp)
>a:   ec4ac.sdsp  s2,24(sp)
>c:   e84ec.sdsp  s3,16(sp)
>e:   e452c.sdsp  s4,8(sp)
>   10:   1800c.addi4spn  s0,sp,48
>   12:   03056683lwu a3,48(a0)
>   16:   1682c.slli  a3,0x20
>   18:   9281c.srli  a3,0x20
>   1a:   03456703lwu a4,52(a0)
>   1e:   1702c.slli  a4,0x20
>   20:   9301c.srli  a4,0x20
>   22:   03856483lwu s1,56(a0)
>   26:   1482c.slli  s1,0x20
>   28:   9081c.srli  s1,0x20
>   2a:   03c56903lwu s2,60(a0)
>   2e:   1902c.slli  s2,0x20
>   30:   02095913srlis2,s2,0x20
>   34:   04056983lwu s3,64(a0)
>   38:   1982c.slli  s3,0x20
>   3a:   0209d993srlis3,s3,0x20
>   3e:   09056a03lwu s4,144(a0)
>   42:   1a02c.slli  s4,0x20
>   44:   020a5a13srlis4,s4,0x20
>   48:   4325c.lit1,9
>   4a:   006a7363bgeus4,t1,0x50
>   4e:   4a01c.lis4,0
>   50:   d914c.swa3,48(a0)
>   52:   d958c.swa4,52(a0)
>   54:   dd04c.sws1,56(a0)
>   56:   03252e23sw  s2,60(a0)
>   5a:   05352023sw  s3,64(a0)
>   5e:   4781c.lia5,0
>   60:   7422c.ldsp  s0,40(sp)
>   62:   7482c.ldsp  s1,32(sp)
>   64:   6962    c.ldsp  s2,24(sp)
>   66:   69c2c.ldsp  s3,16(sp)
>   68:   6a22c.ldsp  s4,8(sp)
>   6a:   6145c.addi16sp  sp,48
>   6c:   853ec.mva0,a5
>   6e:   8082c.jrra
>
> Signed-off-by: Luke Nelson 

Very nice work! For the proper series, feel free to add:

Acked-by: Björn Töpel 

> ---
>  arch/riscv/net/bpf_jit_comp64.c | 275 +---
>  1 file changed, 142 insertions(+), 133 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
> index 26feed92f1bc..823fe800f406 100644
> --- a/arch/riscv/net/bpf_jit_comp64.c
> +++ b/arch/riscv/net/bpf_jit_comp64.c
> @@ -137,14 +137,14 @@ static void emit_imm(u8 rd, s64 val, struct 
> rv_jit_context *ctx)
>
> if (is_32b_int(val))

Re: [RFC PATCH bpf-next 2/3] bpf, riscv: Add encodings for compressed instructions

2020-07-15 Thread Björn Töpel
On Mon, 13 Jul 2020 at 20:37, Luke Nelson  wrote:
>
> This patch adds functions for encoding and emitting compressed riscv
> (RVC) instructions to the BPF JIT.
>
> Some regular riscv instructions can be compressed into an RVC instruction
> if the instruction fields meet some requirements. For example, "add rd,
> rs1, rs2" can be compressed into "c.add rd, rs2" when rd == rs1.
>
> To make using RVC encodings simpler, this patch also adds helper
> functions that selectively emit either a regular instruction or a
> compressed instruction if possible.
>
> For example, emit_add will produce a "c.add" if possible and regular
> "add" otherwise.
>
> Signed-off-by: Luke Nelson 
> ---
>  arch/riscv/net/bpf_jit.h | 474 +++
>  1 file changed, 474 insertions(+)
>
> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
> index 5c89ea904c1a..f3ac2d4a50f7 100644
> --- a/arch/riscv/net/bpf_jit.h
> +++ b/arch/riscv/net/bpf_jit.h
> @@ -13,6 +13,15 @@
>  #include 
>  #include 
>
> +static inline bool rvc_enabled(void)
> +{
> +#ifdef CONFIG_RISCV_ISA_C
> +   return true;
> +#else
> +   return false;
> +#endif
> +}
> +
>  enum {
> RV_REG_ZERO =   0,  /* The constant value 0 */
> RV_REG_RA = 1,  /* Return address */
> @@ -48,6 +57,18 @@ enum {
> RV_REG_T6 = 31,
>  };
>
> +static inline bool is_creg(u8 reg)
> +{
> +   return (1 << reg) & (BIT(RV_REG_FP) |
> +BIT(RV_REG_S1) |
> +BIT(RV_REG_A0) |
> +BIT(RV_REG_A1) |
> +BIT(RV_REG_A2) |
> +BIT(RV_REG_A3) |
> +BIT(RV_REG_A4) |
> +BIT(RV_REG_A5));
> +}
> +
>  struct rv_jit_context {
> struct bpf_prog *prog;
> u16 *insns; /* RV insns */
> @@ -134,6 +155,16 @@ static inline int invert_bpf_cond(u8 cond)
> return -1;
>  }
>
> +static inline bool is_6b_int(long val)
> +{
> +   return -(1L << 5) <= val && val < (1L << 5);
> +}
> +
> +static inline bool is_10b_int(long val)
> +{
> +   return -(1L << 9) <= val && val < (1L << 9);
> +}
> +
>  static inline bool is_12b_int(long val)
>  {
> return -(1L << 11) <= val && val < (1L << 11);
> @@ -224,6 +255,59 @@ static inline u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, 
> u8 rs2, u8 rs1,
> return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode);
>  }
>
> +/* RISC-V compressed instruction formats. */
> +
> +static inline u32 rv_cr_insn(u8 funct4, u8 rd, u8 rs2, u8 op)

Please change so that the return type is u16, so it matches emitc.

> +{
> +   return (funct4 << 12) | (rd << 7) | (rs2 << 2) | op;
> +}
> +
> +static inline u32 rv_ci_insn(u8 funct3, u32 imm6, u8 rd, u8 op)
> +{
> +   u16 imm;
> +
> +   imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2);
> +   return (funct3 << 13) | (rd << 7) | op | imm;
> +}
> +
> +static inline u32 rv_css_insn(u8 funct3, u32 uimm, u8 rs2, u8 op)
> +{
> +   return (funct3 << 13) | (uimm << 7) | (rs2 << 2) | op;
> +}
> +
> +static inline u32 rv_ciw_insn(u8 funct3, u32 uimm, u8 rd, u8 op)
> +{
> +   return (funct3 << 13) | (uimm << 5) | ((rd & 0x7) << 2) | op;
> +}
> +
> +static inline u32 rv_cl_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 
> rd,
> +u8 op)
> +{
> +   return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) |
> +   (imm_lo << 5) | ((rd & 0x7) << 2) | op;
> +}
> +
> +static inline u32 rv_cs_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 
> rs2,
> +u8 op)
> +{
> +   return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) |
> +   (imm_lo << 5) | ((rs2 & 0x7) << 2) | op;
> +}
> +
> +static inline u32 rv_ca_insn(u8 funct6, u8 rd, u8 funct2, u8 rs2, u8 op)
> +{
> +   return (funct6 << 10) | ((rd & 0x7) << 7) | (funct2 << 5) |
> +   ((rs2 & 0x7) << 2) | op;
> +}
> +
> +static inline u32 rv_cb_insn(u8 funct3, u32 imm6, u8 funct2, u8 rd, u8 op)
> +{
> +   u16 imm;
> +
> +   imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2);
> +   return (funct3 << 13) | (funct2 << 10) | ((rd & 0x7) << 7) | op | imm;
> +}
> +
>  /* Instructions shared by both RV32 and RV64. */
>
>  static inline u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0)
> @@ -431,6 +515,135 @@ static inline u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 
> aq, u8 rl)
> return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f);
>  }
>
> +/* RVC instrutions. */
> +
> +static inline u32 rvc_addi4spn(u8 rd, u32 imm10)

And same here. Change to u16 for the return value.

> +{
> +   u32 imm;
> +
> +   imm = ((imm10 & 0x30) << 2) | ((imm10 & 0x3c0) >> 4) |
> +   ((imm10 & 0x4) >> 1) | ((imm10 & 0x8) >> 3);
> +   return rv_ciw_insn(0x0, imm, rd, 0x0);
> +}
> +
> +static inline u32 rvc_lw(u8 rd, u32 imm7, u8 rs1)
> +{
> +   u32 imm_hi, imm_lo;
> +

Re: [RFC PATCH bpf-next 1/3] bpf, riscv: Modify JIT ctx to support compressed instructions

2020-07-15 Thread Björn Töpel
On Mon, 13 Jul 2020 at 20:37, Luke Nelson  wrote:
>
> This patch makes the necessary changes to struct rv_jit_context and to
> bpf_int_jit_compile to support compressed riscv (RVC) instructions in
> the BPF JIT.
>
> It changes the JIT image to be u16 instead of u32, since RVC instructions
> are 2 bytes as opposed to 4.
>
> It also changes ctx->offset and ctx->ninsns to refer to 2-byte
> instructions rather than 4-byte ones. The riscv PC is always 16-bit
> aligned, so this is sufficient to refer to any valid riscv offset.
>
> The code for computing jump offsets in bytes is updated accordingly,
> and factored in to the RVOFF macro to simplify the code.
>
> Signed-off-by: Luke Nelson 
> ---
>  arch/riscv/net/bpf_jit.h| 23 ---
>  arch/riscv/net/bpf_jit_comp32.c | 14 +++---
>  arch/riscv/net/bpf_jit_comp64.c | 12 ++--
>  arch/riscv/net/bpf_jit_core.c   |  6 +++---
>  4 files changed, 36 insertions(+), 19 deletions(-)
>
> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
> index 20e235d06f66..5c89ea904c1a 100644
> --- a/arch/riscv/net/bpf_jit.h
> +++ b/arch/riscv/net/bpf_jit.h
> @@ -50,7 +50,7 @@ enum {
>
>  struct rv_jit_context {
> struct bpf_prog *prog;
> -   u32 *insns; /* RV insns */
> +   u16 *insns; /* RV insns */
> int ninsns;
> int epilogue_offset;
> int *offset;/* BPF to RV */
> @@ -58,6 +58,9 @@ struct rv_jit_context {
> int stack_size;
>  };
>
> +/* Convert from ninsns to bytes. */
> +#define RVOFF(ninsns)  ((ninsns) << 1)
> +

I guess it's a matter of taste, but I'd prefer a simple static inline
function instead of the macro.
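
E.g. something like this (just a sketch; pick whatever name you prefer):

static inline int ninsns_rvoff(int ninsns)
{
	/* One "ninsn" is a 2-byte unit, so the byte offset is ninsns * 2. */
	return ninsns << 1;
}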

>  struct rv_jit_data {
> struct bpf_binary_header *header;
> u8 *image;
> @@ -74,8 +77,22 @@ static inline void bpf_flush_icache(void *start, void *end)
> flush_icache_range((unsigned long)start, (unsigned long)end);
>  }
>
> +/* Emit a 4-byte riscv instruction. */
>  static inline void emit(const u32 insn, struct rv_jit_context *ctx)
>  {
> +   if (ctx->insns) {
> +   ctx->insns[ctx->ninsns] = insn;
> +   ctx->insns[ctx->ninsns + 1] = (insn >> 16);
> +   }
> +
> +   ctx->ninsns += 2;
> +}
> +
> +/* Emit a 2-byte riscv compressed instruction. */
> +static inline void emitc(const u16 insn, struct rv_jit_context *ctx)
> +{
> +   BUILD_BUG_ON(!rvc_enabled());
> +
> if (ctx->insns)
> ctx->insns[ctx->ninsns] = insn;
>
> @@ -86,7 +103,7 @@ static inline int epilogue_offset(struct rv_jit_context 
> *ctx)
>  {
> int to = ctx->epilogue_offset, from = ctx->ninsns;
>
> -   return (to - from) << 2;
> +   return RVOFF(to - from);
>  }
>
>  /* Return -1 or inverted cond. */
> @@ -149,7 +166,7 @@ static inline int rv_offset(int insn, int off, struct 
> rv_jit_context *ctx)
> off++; /* BPF branch is from PC+1, RV is from PC */
> from = (insn > 0) ? ctx->offset[insn - 1] : 0;
> to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0;
> -   return (to - from) << 2;
> +   return RVOFF(to - from);
>  }
>
>  /* Instruction formats. */
> diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
> index b198eaa74456..d22001aa0057 100644
> --- a/arch/riscv/net/bpf_jit_comp32.c
> +++ b/arch/riscv/net/bpf_jit_comp32.c
> @@ -644,7 +644,7 @@ static int emit_branch_r64(const s8 *src1, const s8 
> *src2, s32 rvoff,
>
> e = ctx->ninsns;
> /* Adjust for extra insns. */
> -   rvoff -= (e - s) << 2;
> +   rvoff -= RVOFF(e - s);
> emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx);
> return 0;
>  }
> @@ -713,7 +713,7 @@ static int emit_bcc(u8 op, u8 rd, u8 rs, int rvoff, 
> struct rv_jit_context *ctx)
> if (far) {
> e = ctx->ninsns;
> /* Adjust for extra insns. */
> -   rvoff -= (e - s) << 2;
> +   rvoff -= RVOFF(e - s);
> emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx);
> }
> return 0;
> @@ -731,7 +731,7 @@ static int emit_branch_r32(const s8 *src1, const s8 
> *src2, s32 rvoff,
>
> e = ctx->ninsns;
> /* Adjust for extra insns. */
> -   rvoff -= (e - s) << 2;
> +   rvoff -= RVOFF(e - s);
>
> if (emit_bcc(op, lo(rs1), lo(rs2), rvoff, ctx))
> return -1;
> @@ -795,7 +795,7 @@ static int emit_bpf_tail_call(int insn, struct 
> rv_jit_context *ctx)
>  * if (index >= max_entries)
>  *   goto out;
>  */
> -   off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
> +   off = RVOFF(tc_ninsn - (ctx->ninsns - start_insn));
> emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx);
>
> /*
> @@ -804,7 +804,7 @@ static int emit_bpf_tail_call(int insn, struct 
> rv_jit_context *ctx)
>  *   goto out;
>  */
> emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx);
> -   off = (tc_ninsn - 

Re: [RFC PATCH bpf-next 0/3] bpf, riscv: Add compressed instructions to rv64 JIT

2020-07-15 Thread Björn Töpel
On Mon, 13 Jul 2020 at 20:37, Luke Nelson  wrote:
>
> This patch series enables using compressed riscv (RVC) instructions
> in the rv64 BPF JIT.
>

First of all; Really nice work. I like this, and it makes the code
easier to read as well (e.g. emit_mv). I'm a bit curious why you only
did it for RV64, and not RV32? I have some minor comments on the
patches. I strongly encourage you to submit this as a proper (non-RFC)
set for bpf-next.


Cheers,
Björn

> RVC is a standard riscv extension that adds a set of compressed,
> 2-byte instructions that can replace some regular 4-byte instructions
> for improved code density.
>
> This series first modifies the JIT to support using 2-byte instructions
> (e.g., in jump offset computations), then adds RVC encoding and
> helper functions, and finally uses the helper functions to optimize
> the rv64 JIT.
>
> I used our formal verification framework, Serval, to verify the
> correctness of the RVC encodings and their uses in the rv64 JIT.
>
> The JIT continues to pass all tests in lib/test_bpf.c, and introduces
> no new failures to test_verifier; both with and without RVC being enabled.
>
> The following are examples of the JITed code for the verifier selftest
> "direct packet read test#3 for CGROUP_SKB OK", without and with RVC
> enabled, respectively. The former uses 178 bytes, and the latter uses 112,
> for a ~37% reduction in code size for this example.
>
> Without RVC:
>
>0: 02000813addi  a6,zero,32
>4: fd010113addi  sp,sp,-48
>8: 02813423sds0,40(sp)
>c: 02913023sds1,32(sp)
>   10: 01213c23sds2,24(sp)
>   14: 01313823sds3,16(sp)
>   18: 01413423sds4,8(sp)
>   1c: 03010413addi  s0,sp,48
>   20: 03056683lwu   a3,48(a0)
>   24: 02069693slli  a3,a3,0x20
>   28: 0206d693srli  a3,a3,0x20
>   2c: 03456703lwu   a4,52(a0)
>   30: 02071713slli  a4,a4,0x20
>   34: 02075713srli  a4,a4,0x20
>   38: 03856483lwu   s1,56(a0)
>   3c: 02049493slli  s1,s1,0x20
>   40: 0204d493srli  s1,s1,0x20
>   44: 03c56903lwu   s2,60(a0)
>   48: 02091913slli  s2,s2,0x20
>   4c: 02095913srli  s2,s2,0x20
>   50: 04056983lwu   s3,64(a0)
>   54: 0203slli  s3,s3,0x20
>   58: 0209d993srli  s3,s3,0x20
>   5c: 09056a03lwu   s4,144(a0)
>   60: 020a1a13slli  s4,s4,0x20
>   64: 020a5a13srli  s4,s4,0x20
>   68: 00900313addi  t1,zero,9
>   6c: 006a7463bgeu  s4,t1,0x74
>   70: 0a13addi  s4,zero,0
>   74: 02d52823swa3,48(a0)
>   78: 02e52a23swa4,52(a0)
>   7c: 02952c23sws1,56(a0)
>   80: 03252e23sws2,60(a0)
>   84: 05352023sws3,64(a0)
>   88: 0793addi  a5,zero,0
>   8c: 02813403lds0,40(sp)
>   90: 02013483lds1,32(sp)
>   94: 01813903lds2,24(sp)
>   98: 01013983lds3,16(sp)
>   9c: 00813a03lds4,8(sp)
>   a0: 03010113addi  sp,sp,48
>   a4: 00078513addi  a0,a5,0
>   a8: 8067jalr  zero,0(ra)
>
> With RVC:
>
>0:   02000813addia6,zero,32
>4:   7179c.addi16sp  sp,-48
>6:   f422c.sdsp  s0,40(sp)
>8:   f026c.sdsp  s1,32(sp)
>a:   ec4ac.sdsp  s2,24(sp)
>c:   e84ec.sdsp  s3,16(sp)
>e:   e452c.sdsp  s4,8(sp)
>   10:   1800c.addi4spn  s0,sp,48
>   12:   03056683lwu a3,48(a0)
>   16:   1682c.slli  a3,0x20
>   18:   9281c.srli  a3,0x20
>   1a:   03456703lwu a4,52(a0)
>   1e:   1702c.slli  a4,0x20
>   20:   9301c.srli  a4,0x20
>   22:   03856483lwu s1,56(a0)
>   26:   1482c.slli  s1,0x20
>   28:   9081c.srli  s1,0x20
>   2a:   03c56903lwu s2,60(a0)
>   2e:   1902c.slli  s2,0x20
>   30:   02095913srlis2,s2,0x20
>   34:   04056983lwu s3,64(a0)
>   38:   1982c.slli  s3,0x20
>   3a:   0209d993srlis3,s3,0x20
>   3e:   09056a03lwu s4,144(a0)
>   42:   1a02c.slli  s4,0x20
>   44:   020a5a13srlis4,s4,0x20
>   48:   4325c.lit1,9
>   4a:   006a7363bgeus4,t1,0x50
>   4e:   4a01c.lis4,0
>   50:   d914c.swa3,48(a0)
>   52:   d958c.swa4,52(a0)
>   54:   dd04c.sws1,56(a0)
>   56:   03252e23sw  s2,60(a0)
>   5a:   05352023sw  s3,64(a0)
>   5e:   4781c.lia5,0
>   60:   7422c.ldsp  s0,40(sp)
>   62:   7482c.ldsp  s1,32(sp)
>   64:   6962c.ldsp  s2,24(sp)
>   66:   69c2c.ldsp  s3,16(sp)
>   68:   6a22c.ldsp  s4,8(sp)
>   6a:   6145c.addi16sp  sp,48
>   6c:   853ec.mva0,a5
>   6e:   8082c.jrra
>
> Luke Nelson (3):
>   bpf, riscv: Modify JIT ctx to support compressed instructions
>   bpf, riscv: Add encodings for compressed instructions
>   bpf, riscv: Use compressed instructions in the rv64 JIT
>
>  arch/riscv/net/bpf_jit.h| 495 +++-
>  

Re: [PATCH v2 2/2] riscv: Add jump-label implementation

2020-07-09 Thread Björn Töpel
On Wed, 8 Jul 2020 at 23:10, Emil Renner Berthing  wrote:
>
> Add jump-label implementation based on the ARM64 version
> and add CONFIG_JUMP_LABEL=y to the defconfigs.
>
> Signed-off-by: Emil Renner Berthing 
> Reviewed-by: Björn Töpel 

Tested-by: Björn Töpel 

> ---
>
> Tested on the HiFive Unleashed board.
>
> Changes since v1:
> - WARN and give up gracefully if the jump offset cannot be
>   represented in a JAL instruction.
> - Add missing braces.
> - Add CONFIG_JUMP_LABEL=y to defconfigs.
>
> All suggested by Björn Töpel.
>
> Changes since RFC:
> - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table
>   also in 32bit kernels.
> - Remove unneeded branch ? 1 : 0, thanks Björn
> - Fix \n\n instead of \n\t mistake
>
>  .../core/jump-labels/arch-support.txt |  2 +-
>  arch/riscv/Kconfig|  2 +
>  arch/riscv/configs/defconfig  |  1 +
>  arch/riscv/configs/nommu_k210_defconfig   |  1 +
>  arch/riscv/configs/nommu_virt_defconfig   |  1 +
>  arch/riscv/configs/rv32_defconfig |  1 +
>  arch/riscv/include/asm/jump_label.h   | 59 +++
>  arch/riscv/kernel/Makefile|  2 +
>  arch/riscv/kernel/jump_label.c| 49 +++
>  9 files changed, 117 insertions(+), 1 deletion(-)
>  create mode 100644 arch/riscv/include/asm/jump_label.h
>  create mode 100644 arch/riscv/kernel/jump_label.c
>
> diff --git a/Documentation/features/core/jump-labels/arch-support.txt 
> b/Documentation/features/core/jump-labels/arch-support.txt
> index 632a1c7aefa2..760243d18ed7 100644
> --- a/Documentation/features/core/jump-labels/arch-support.txt
> +++ b/Documentation/features/core/jump-labels/arch-support.txt
> @@ -23,7 +23,7 @@
>  |openrisc: | TODO |
>  |  parisc: |  ok  |
>  | powerpc: |  ok  |
> -|   riscv: | TODO |
> +|   riscv: |  ok  |
>  |s390: |  ok  |
>  |  sh: | TODO |
>  |   sparc: |  ok  |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index fd639937e251..d2f5c53fdc19 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -46,6 +46,8 @@ config RISCV
> select GENERIC_TIME_VSYSCALL if MMU && 64BIT
> select HANDLE_DOMAIN_IRQ
> select HAVE_ARCH_AUDITSYSCALL
> +   select HAVE_ARCH_JUMP_LABEL
> +   select HAVE_ARCH_JUMP_LABEL_RELATIVE
> select HAVE_ARCH_KASAN if MMU && 64BIT
> select HAVE_ARCH_KGDB
> select HAVE_ARCH_KGDB_QXFER_PKT
> diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
> index 4da4886246a4..d58c93efb603 100644
> --- a/arch/riscv/configs/defconfig
> +++ b/arch/riscv/configs/defconfig
> @@ -17,6 +17,7 @@ CONFIG_BPF_SYSCALL=y
>  CONFIG_SOC_SIFIVE=y
>  CONFIG_SOC_VIRT=y
>  CONFIG_SMP=y
> +CONFIG_JUMP_LABEL=y
>  CONFIG_MODULES=y
>  CONFIG_MODULE_UNLOAD=y
>  CONFIG_NET=y
> diff --git a/arch/riscv/configs/nommu_k210_defconfig 
> b/arch/riscv/configs/nommu_k210_defconfig
> index b48138e329ea..cd1df62b13c7 100644
> --- a/arch/riscv/configs/nommu_k210_defconfig
> +++ b/arch/riscv/configs/nommu_k210_defconfig
> @@ -33,6 +33,7 @@ CONFIG_SMP=y
>  CONFIG_NR_CPUS=2
>  CONFIG_CMDLINE="earlycon console=ttySIF0"
>  CONFIG_CMDLINE_FORCE=y
> +CONFIG_JUMP_LABEL=y
>  # CONFIG_BLOCK is not set
>  CONFIG_BINFMT_FLAT=y
>  # CONFIG_COREDUMP is not set
> diff --git a/arch/riscv/configs/nommu_virt_defconfig 
> b/arch/riscv/configs/nommu_virt_defconfig
> index cf74e179bf90..f27596e9663e 100644
> --- a/arch/riscv/configs/nommu_virt_defconfig
> +++ b/arch/riscv/configs/nommu_virt_defconfig
> @@ -30,6 +30,7 @@ CONFIG_MAXPHYSMEM_2GB=y
>  CONFIG_SMP=y
>  CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x1000,115200n8 
> console=ttyS0"
>  CONFIG_CMDLINE_FORCE=y
> +CONFIG_JUMP_LABEL=y
>  # CONFIG_BLK_DEV_BSG is not set
>  CONFIG_PARTITION_ADVANCED=y
>  # CONFIG_MSDOS_PARTITION is not set
> diff --git a/arch/riscv/configs/rv32_defconfig 
> b/arch/riscv/configs/rv32_defconfig
> index 05bbf5240569..3a55f0e00d6c 100644
> --- a/arch/riscv/configs/rv32_defconfig
> +++ b/arch/riscv/configs/rv32_defconfig
> @@ -17,6 +17,7 @@ CONFIG_BPF_SYSCALL=y
>  CONFIG_SOC_VIRT=y
>  CONFIG_ARCH_RV32I=y
>  CONFIG_SMP=y
> +CONFIG_JUMP_LABEL=y
>  CONFIG_MODULES=y
>  CONFIG_MODULE_UNLOAD=y
>  CONFIG_NET=y
> diff --git a/arch/riscv/include/asm/jump_label.h 
> b/arch/riscv/include/asm/jump_label.h
> new file mode 100644
> index ..d5fb342bfccf
> --- /dev/null
> +++ b/arch/riscv/include/asm/jump_label.

Re: [PATCH v2 1/2] riscv: Support R_RISCV_ADD64 and R_RISCV_SUB64 relocs

2020-07-09 Thread Björn Töpel
On Wed, 8 Jul 2020 at 23:10, Emil Renner Berthing  wrote:
>
> These are needed for the __jump_table in modules using
> static keys/jump-labels with the layout from
> HAVE_ARCH_JUMP_LABEL_RELATIVE on 64bit kernels.
>
> Signed-off-by: Emil Renner Berthing 

Reviewed-by: Björn Töpel 
Tested-by: Björn Töpel 

> ---
>
> Tested on the HiFive Unleashed board.
>
> This patch is new in v2. It fixes an error loading modules
> containing static keys found by Björn Töpel.
>
>  arch/riscv/kernel/module.c | 16 
>  1 file changed, 16 insertions(+)
>
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 7191342c54da..104fba889cf7 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -263,6 +263,13 @@ static int apply_r_riscv_add32_rela(struct module *me, 
> u32 *location,
> return 0;
>  }
>
> +static int apply_r_riscv_add64_rela(struct module *me, u32 *location,
> +   Elf_Addr v)
> +{
> +   *(u64 *)location += (u64)v;
> +   return 0;
> +}
> +
>  static int apply_r_riscv_sub32_rela(struct module *me, u32 *location,
> Elf_Addr v)
>  {
> @@ -270,6 +277,13 @@ static int apply_r_riscv_sub32_rela(struct module *me, 
> u32 *location,
> return 0;
>  }
>
> +static int apply_r_riscv_sub64_rela(struct module *me, u32 *location,
> +   Elf_Addr v)
> +{
> +   *(u64 *)location -= (u64)v;
> +   return 0;
> +}
> +
>  static int (*reloc_handlers_rela[]) (struct module *me, u32 *location,
> Elf_Addr v) = {
> [R_RISCV_32]= apply_r_riscv_32_rela,
> @@ -290,7 +304,9 @@ static int (*reloc_handlers_rela[]) (struct module *me, 
> u32 *location,
> [R_RISCV_RELAX] = apply_r_riscv_relax_rela,
> [R_RISCV_ALIGN] = apply_r_riscv_align_rela,
> [R_RISCV_ADD32] = apply_r_riscv_add32_rela,
> +   [R_RISCV_ADD64] = apply_r_riscv_add64_rela,
> [R_RISCV_SUB32] = apply_r_riscv_sub32_rela,
> +   [R_RISCV_SUB64] = apply_r_riscv_sub64_rela,
>  };
>
>  int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
> --
> 2.27.0
>


Re: [PATCH v1] riscv: Add jump-label implementation

2020-07-08 Thread Björn Töpel
On Tue, 7 Jul 2020 at 17:08, Emil Renner Berthing  wrote:
>
> Add jump-label implementation based on the ARM64 version.
>
> Tested on the HiFive Unleashed board.
>

I took your patch for a spin on qemu. The boot selftest
(CONFIG_STATIC_KEYS_SELFTEST=y) passes, but the module test
(CONFIG_TEST_STATIC_KEYS=m) does not.

When I run the test in tools/testing/selftests/static_keys (this
simply loads the test_static_key_base.ko and test_static_keys.ko
modules) I get:

[  134.090464] test_static_keys: Unknown relocation type 36

36 is the relocation type R_RISCV_ADD64, which is not handled in
arch/riscv/kernel/module.c.

If you dump the relocation entries for test_static_keys.ko
(riscv64-linux-gnu-objdump -r test_static_keys.ko), you'll see that:

RELOCATION RECORDS FOR [__jump_table]:
OFFSET   TYPE  VALUE
 R_RISCV_ADD32 .L1^B1
 R_RISCV_SUB32 .L0
0004 R_RISCV_ADD32 .L3
0004 R_RISCV_SUB32 .L0
0008 R_RISCV_ADD64 old_true_key+0x0001
0008 R_RISCV_SUB64 .L0
...

It would be great if you could add a patch for that as well (separate,
same series). R_RISCV_ADD64 *and* R_RISCV_SUB64 are currently
unhandled by module.c.


Cheers,
Björn

> Signed-off-by: Emil Renner Berthing 
> ---
>
> Changes since RFC:
> - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table
>   also in 32bit kernels.
> - Remove unneeded branch ? 1 : 0, thanks Björn
> - Fix \n\n instead of \n\t mistake
>
>  .../core/jump-labels/arch-support.txt |  2 +-
>  arch/riscv/Kconfig|  2 +
>  arch/riscv/include/asm/jump_label.h   | 59 +++
>  arch/riscv/kernel/Makefile|  2 +
>  arch/riscv/kernel/jump_label.c| 44 ++
>  5 files changed, 108 insertions(+), 1 deletion(-)
>  create mode 100644 arch/riscv/include/asm/jump_label.h
>  create mode 100644 arch/riscv/kernel/jump_label.c
>
> diff --git a/Documentation/features/core/jump-labels/arch-support.txt 
> b/Documentation/features/core/jump-labels/arch-support.txt
> index 632a1c7aefa2..760243d18ed7 100644
> --- a/Documentation/features/core/jump-labels/arch-support.txt
> +++ b/Documentation/features/core/jump-labels/arch-support.txt
> @@ -23,7 +23,7 @@
>  |openrisc: | TODO |
>  |  parisc: |  ok  |
>  | powerpc: |  ok  |
> -|   riscv: | TODO |
> +|   riscv: |  ok  |
>  |s390: |  ok  |
>  |  sh: | TODO |
>  |   sparc: |  ok  |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index fd639937e251..d2f5c53fdc19 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -46,6 +46,8 @@ config RISCV
> select GENERIC_TIME_VSYSCALL if MMU && 64BIT
> select HANDLE_DOMAIN_IRQ
> select HAVE_ARCH_AUDITSYSCALL
> +   select HAVE_ARCH_JUMP_LABEL
> +   select HAVE_ARCH_JUMP_LABEL_RELATIVE
> select HAVE_ARCH_KASAN if MMU && 64BIT
> select HAVE_ARCH_KGDB
> select HAVE_ARCH_KGDB_QXFER_PKT
> diff --git a/arch/riscv/include/asm/jump_label.h 
> b/arch/riscv/include/asm/jump_label.h
> new file mode 100644
> index ..d5fb342bfccf
> --- /dev/null
> +++ b/arch/riscv/include/asm/jump_label.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 Emil Renner Berthing
> + *
> + * Based on arch/arm64/include/asm/jump_label.h
> + */
> +#ifndef __ASM_JUMP_LABEL_H
> +#define __ASM_JUMP_LABEL_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include 
> +
> +#define JUMP_LABEL_NOP_SIZE 4
> +
> +static __always_inline bool arch_static_branch(struct static_key *key,
> +  bool branch)
> +{
> +   asm_volatile_goto(
> +   "   .option push\n\t"
> +   "   .option norelax \n\t"
> +   "   .option norvc   \n\t"
> +   "1: nop \n\t"
> +   "   .option pop \n\t"
> +   "   .pushsection__jump_table, \"aw\"\n\t"
> +   "   .align  " RISCV_LGPTR " \n\t"
> +   "   .long   1b - ., %l[label] - .   \n\t"
> +   "   " RISCV_PTR "   %0 - .  \n\t"
> +   "   .popsection \n\t"
> +   :  :  "i"(&((char *)key)[branch]) :  : label);
> +
> +   return false;
> +label:
> +   return true;
> +}
> +
> +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> +   bool branch)
> +{
> +   asm_volatile_goto(
> +   "   .option push\n\t"
> +   "   .option norelax \n\t"
> +

Re: [PATCH v1] riscv: Add jump-label implementation

2020-07-08 Thread Björn Töpel
On Tue, 7 Jul 2020 at 17:08, Emil Renner Berthing  wrote:
>
> Add jump-label implementation based on the ARM64 version.
>
> Tested on the HiFive Unleashed board.
>
> Signed-off-by: Emil Renner Berthing 
> ---
>
> Changes since RFC:
> - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table
>   also in 32bit kernels.
> - Remove unneeded branch ? 1 : 0, thanks Björn
> - Fix \n\n instead of \n\t mistake
>
>  .../core/jump-labels/arch-support.txt |  2 +-
>  arch/riscv/Kconfig|  2 +
>  arch/riscv/include/asm/jump_label.h   | 59 +++
>  arch/riscv/kernel/Makefile|  2 +
>  arch/riscv/kernel/jump_label.c| 44 ++
>  5 files changed, 108 insertions(+), 1 deletion(-)
>  create mode 100644 arch/riscv/include/asm/jump_label.h
>  create mode 100644 arch/riscv/kernel/jump_label.c
>
> diff --git a/Documentation/features/core/jump-labels/arch-support.txt 
> b/Documentation/features/core/jump-labels/arch-support.txt
> index 632a1c7aefa2..760243d18ed7 100644
> --- a/Documentation/features/core/jump-labels/arch-support.txt
> +++ b/Documentation/features/core/jump-labels/arch-support.txt
> @@ -23,7 +23,7 @@
>  |openrisc: | TODO |
>  |  parisc: |  ok  |
>  | powerpc: |  ok  |
> -|   riscv: | TODO |
> +|   riscv: |  ok  |
>  |s390: |  ok  |
>  |  sh: | TODO |
>  |   sparc: |  ok  |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index fd639937e251..d2f5c53fdc19 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -46,6 +46,8 @@ config RISCV
> select GENERIC_TIME_VSYSCALL if MMU && 64BIT
> select HANDLE_DOMAIN_IRQ
> select HAVE_ARCH_AUDITSYSCALL
> +   select HAVE_ARCH_JUMP_LABEL
> +   select HAVE_ARCH_JUMP_LABEL_RELATIVE

Missed one thing in the last reply; Please update defconfig with
CONFIG_JUMP_LABEL=y.


Björn

> select HAVE_ARCH_KASAN if MMU && 64BIT
> select HAVE_ARCH_KGDB
> select HAVE_ARCH_KGDB_QXFER_PKT
> diff --git a/arch/riscv/include/asm/jump_label.h 
> b/arch/riscv/include/asm/jump_label.h
> new file mode 100644
> index ..d5fb342bfccf
> --- /dev/null
> +++ b/arch/riscv/include/asm/jump_label.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2020 Emil Renner Berthing
> + *
> + * Based on arch/arm64/include/asm/jump_label.h
> + */
> +#ifndef __ASM_JUMP_LABEL_H
> +#define __ASM_JUMP_LABEL_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include 
> +
> +#define JUMP_LABEL_NOP_SIZE 4
> +
> +static __always_inline bool arch_static_branch(struct static_key *key,
> +  bool branch)
> +{
> +   asm_volatile_goto(
> +   "   .option push\n\t"
> +   "   .option norelax \n\t"
> +   "   .option norvc   \n\t"
> +   "1: nop \n\t"
> +   "   .option pop \n\t"
> +   "   .pushsection__jump_table, \"aw\"\n\t"
> +   "   .align  " RISCV_LGPTR " \n\t"
> +   "   .long   1b - ., %l[label] - .   \n\t"
> +   "   " RISCV_PTR "   %0 - .  \n\t"
> +   "   .popsection \n\t"
> +   :  :  "i"(&((char *)key)[branch]) :  : label);
> +
> +   return false;
> +label:
> +   return true;
> +}
> +
> +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> +   bool branch)
> +{
> +   asm_volatile_goto(
> +   "   .option push\n\t"
> +   "   .option norelax \n\t"
> +   "   .option norvc   \n\t"
> +   "1: jal zero, %l[label] \n\t"
> +   "   .option pop \n\t"
> +   "   .pushsection__jump_table, \"aw\"\n\t"
> +   "   .align  " RISCV_LGPTR " \n\t"
> +   "   .long   1b - ., %l[label] - .   \n\t"
> +   "   " RISCV_PTR "   %0 - .  \n\t"
> +   "   .popsection \n\t"
> +   :  :  "i"(&((char *)key)[branch]) :  : label);
> +
> +   return false;
> +label:
> +   return true;
> +}
> +
> +#endif  /* __ASSEMBLY__ */
> +#endif /* __ASM_JUMP_LABEL_H */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index b355cf485671..a5287ab9f7f2 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -53,4 +53,6 @@ endif
>  obj-$(CONFIG_HOTPLUG_CPU)  += 

Re: [PATCH v1] riscv: Add jump-label implementation

2020-07-08 Thread Björn Töpel
   \n\t"
> +   "   .popsection \n\t"
> +   :  :  "i"(&((char *)key)[branch]) :  : label);
> +
> +   return false;
> +label:
> +   return true;
> +}
> +
> +#endif  /* __ASSEMBLY__ */
> +#endif /* __ASM_JUMP_LABEL_H */
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index b355cf485671..a5287ab9f7f2 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -53,4 +53,6 @@ endif
>  obj-$(CONFIG_HOTPLUG_CPU)  += cpu-hotplug.o
>  obj-$(CONFIG_KGDB) += kgdb.o
>
> +obj-$(CONFIG_JUMP_LABEL)   += jump_label.o
> +
>  clean:
> diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c
> new file mode 100644
> index ..55b2d742efe1
> --- /dev/null
> +++ b/arch/riscv/kernel/jump_label.c
> @@ -0,0 +1,44 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2020 Emil Renner Berthing
> + *
> + * Based on arch/arm64/kernel/jump_label.c
> + */
> +#include 
> +#include 
> +#include 
> +
> +#define RISCV_INSN_NOP 0x00000013
> +#define RISCV_INSN_JAL 0x0000006f
> +
> +void arch_jump_label_transform(struct jump_entry *entry,
> +  enum jump_label_type type)
> +{
> +   void *addr = (void *)jump_entry_code(entry);
> +   u32 insn;
> +
> +   if (type == JUMP_LABEL_JMP) {
> +   u32 offset = jump_entry_target(entry) - 
> jump_entry_code(entry);

The 20b jump offset of JAL is enough, but to catch future changes that
might break this, I would add a WARN if the offset no longer fits in
those 20 bits (falling back to JALR would be overkill -- don't do that).

Somewhat related to that: the UNIX platform specification for RISC-V
requires support for RVC. Would it make sense to use the compressed
instructions C.J and C.NOP? Really a nit, or more of a question -- I'm
fine with this 32b version!
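
A minimal sketch of the check I have in mind (hypothetical, not part of
the posted patch; helper name and WARN form are just for illustration):

	/* JAL carries a 21-bit signed, 2-byte-aligned offset (+-1 MiB). */
	static bool riscv_jal_offset_ok(long offset)
	{
		return offset >= -(1L << 20) && offset < (1L << 20);
	}

	...
	long offset = jump_entry_target(entry) - jump_entry_code(entry);

	/* Catch future changes that push a jump site out of JAL range. */
	if (WARN_ON_ONCE(!riscv_jal_offset_ok(offset)))
		return;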

> +
> +   insn = RISCV_INSN_JAL |
> +   ((offset & GENMASK(19, 12)) << (12 - 12)) |
> +   ((offset & GENMASK(11, 11)) << (20 - 11)) |
> +   ((offset & GENMASK(10,  1)) << (21 -  1)) |
> +   ((offset & GENMASK(20, 20)) << (31 - 20));
> +   } else
> +   insn = RISCV_INSN_NOP;

Since the if-branch needs {}, the else-branch should have them too.

Feel free to add:
Reviewed-by: Björn Töpel 


Cheers,
Björn
> +
> +   patch_text_nosync(addr, &insn, sizeof(insn));
> +}
> +
> +void arch_jump_label_transform_static(struct jump_entry *entry,
> + enum jump_label_type type)
> +{
> +   /*
> +* We use the same instructions in the arch_static_branch and
> +* arch_static_branch_jump inline functions, so there's no
> +* need to patch them up here.
> +* The core will call arch_jump_label_transform  when those
> +* instructions need to be replaced.
> +*/
> +}
> --
> 2.27.0
>


Re: [RFC] riscv: Add jump-label implementation

2020-07-04 Thread Björn Töpel
On Sat, 4 Jul 2020 at 13:35, Emil Renner Berthing  wrote:
>
> On Sat, 4 Jul 2020 at 13:23, Björn Töpel  wrote:
[...]
> > Indeed. And nice work! Can you respin the patch with the 32b fix
> > above, and also without the RFC tag?
>
> Yes, of course. If you don't mind I'll wait a bit and let this collect
> a bit more comments.
>

Certainly!

> > Curious: why is [branch ? 1 : 0] needed when encoding the boolean into
> > the key pointer (arm64 just uses [branch])? Different encoding of
> > booleans (branch in this case)?
>
> No, that was just me being unsure exactly how bool works when used as
> an index. After reading up on it, it seems the original code is right:
> you can actually trust that _Bool is either 0 or 1. I'll fix it in the
> next version. Thanks!
>

Cool! Thanks for clearing that up for me!

Cheers,
Björn
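
For the record, the trick being discussed: &((char *)key)[branch] is just
(unsigned long)key + branch, so the branch flag ends up in bit 0 of the key
address stored in the __jump_table entry. A simplified sketch of how the
generic code recovers both pieces (illustrative only; the real generic code
also keeps an "init" flag in bit 1):

	/* store: struct static_key is word-aligned, so bit 0 is free */
	unsigned long stored = (unsigned long)key + branch;

	/* load: split the flag and the real key pointer apart again */
	bool is_branch_entry = stored & 1UL;
	struct static_key *real_key = (struct static_key *)(stored & ~1UL);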


Re: [RFC] riscv: Add jump-label implementation

2020-07-04 Thread Björn Töpel
On Fri, 3 Jul 2020 at 17:43, Emil Renner Berthing  wrote:
>
> On Thu, 2 Jul 2020 at 22:07, Emil Renner Berthing  wrote:
> >
> > Add basic jump-label implementation heavily based
> > on the ARM64 version.
> >
> > Tested on the HiFive Unleashed.
> >
> > Signed-off-by: Emil Renner Berthing 
> > ---
> >
> > This seems to work on my HiFive Unleashed. At least it boots, runs fine,
> > and the static-key self-tests don't complain, but I'm sure I've missed
> > something, so I'm sending this as an RFC.
> >
> > /Emil
> >
> >  .../core/jump-labels/arch-support.txt |  2 +-
> >  arch/riscv/Kconfig|  2 +
> >  arch/riscv/include/asm/jump_label.h   | 59 +++
> >  arch/riscv/kernel/Makefile|  2 +
> >  arch/riscv/kernel/jump_label.c| 44 ++
> >  5 files changed, 108 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/riscv/include/asm/jump_label.h
> >  create mode 100644 arch/riscv/kernel/jump_label.c
> >
> > diff --git a/Documentation/features/core/jump-labels/arch-support.txt 
> > b/Documentation/features/core/jump-labels/arch-support.txt
> > index 632a1c7aefa2..760243d18ed7 100644
> > --- a/Documentation/features/core/jump-labels/arch-support.txt
> > +++ b/Documentation/features/core/jump-labels/arch-support.txt
> > @@ -23,7 +23,7 @@
> >  |openrisc: | TODO |
> >  |  parisc: |  ok  |
> >  | powerpc: |  ok  |
> > -|   riscv: | TODO |
> > +|   riscv: |  ok  |
> >  |s390: |  ok  |
> >  |  sh: | TODO |
> >  |   sparc: |  ok  |
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index fd639937e251..d2f5c53fdc19 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -46,6 +46,8 @@ config RISCV
> > select GENERIC_TIME_VSYSCALL if MMU && 64BIT
> > select HANDLE_DOMAIN_IRQ
> > select HAVE_ARCH_AUDITSYSCALL
> > +   select HAVE_ARCH_JUMP_LABEL
> > +   select HAVE_ARCH_JUMP_LABEL_RELATIVE
> > select HAVE_ARCH_KASAN if MMU && 64BIT
> > select HAVE_ARCH_KGDB
> > select HAVE_ARCH_KGDB_QXFER_PKT
> > diff --git a/arch/riscv/include/asm/jump_label.h 
> > b/arch/riscv/include/asm/jump_label.h
> > new file mode 100644
> > index ..29be6d351866
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/jump_label.h
> > @@ -0,0 +1,59 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2020 Emil Renner Berthing
> > + *
> > + * Based on arch/arm64/include/asm/jump_label.h
> > + */
> > +#ifndef __ASM_JUMP_LABEL_H
> > +#define __ASM_JUMP_LABEL_H
> > +
> > +#ifndef __ASSEMBLY__
> > +
> > +#include 
> > +
> > +#define JUMP_LABEL_NOP_SIZE 4
> > +
> > +static __always_inline bool arch_static_branch(struct static_key *key,
> > +  bool branch)
> > +{
> > +   asm_volatile_goto(
> > +   "   .option push\n\n"
> > +   "   .option norelax \n\n"
> > +   "   .option norvc   \n\n"
> > +   "1: nop \n\t"
> > +   "   .option pop \n\n"
> > +   "   .pushsection__jump_table, \"aw\"\n\t"
> > +   "   .align  3   \n\t"
> > +   "   .long   1b - ., %l[l_yes] - .   \n\t"
> > +   "   .quad   %0 - .  \n\t"
>
> With HAVE_ARCH_JUMP_LABEL_RELATIVE we get
> struct jump_entry {
>   s32 code;
>   s32 target;
>   long key;
> }
> ...so this .quad and the one below should be replaced by the RISCV_PTR
> macro to match the struct on 32-bit kernels.
>
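
(For reference, a simplified sketch of what HAVE_ARCH_JUMP_LABEL_RELATIVE
implies; the resolution below mirrors the generic helpers but is not the
verbatim kernel code:)

	struct jump_entry {
		s32 code;	/* site address, relative to &entry->code      */
		s32 target;	/* jump target, relative to &entry->target     */
		long key;	/* key address (+ flags), relative to &entry->key */
	};

	/* e.g. the instruction address is recovered as: */
	unsigned long addr = (unsigned long)&entry->code + entry->code;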

Indeed. And nice work! Can you respin the patch with the 32b fix
above, and also without the RFC tag?

Curious: why is [branch ? 1 : 0] needed when encoding the boolean into
the key pointer (arm64 just uses [branch])? Different encoding of
booleans (branch in this case)?


Cheers,
Björn

> > +   "   .popsection \n\t"
> > +   :  :  "i"(&((char *)key)[branch ? 1 : 0]) :  : l_yes);
> > +
> > +   return false;
> > +l_yes:
> > +   return true;
> > +}
> > +
> > +static __always_inline bool arch_static_branch_jump(struct static_key *key,
> > +   bool branch)
> > +{
> > +   asm_volatile_goto(
> > +   "   .option push\n\n"
> > +   "   .option norelax \n\n"
> > +   "   .option norvc   \n\n"
> > +   "1: jal zero, %l[l_yes] \n\t"
> > +   "   .option pop \n\n"
> > +   "   .pushsection__jump_table, \"aw\"\n\t"
> > +   "  

Re: [PATCH net] xsk: remove cheap_dma optimization

2020-07-01 Thread Björn Töpel

On 2020-06-29 17:41, Robin Murphy wrote:

On 2020-06-28 18:16, Björn Töpel wrote:

[...]

Somewhat related to the DMA API: it would have performance benefits for
AF_XDP if the DMA range of the mapped memory were linear, e.g. via IOMMU
utilization. I've started hacking on this a little bit, but it would be
nice if such an API were part of the mapping core.

Input: array of pages. Output: array of dma addrs (and obviously dev,
flags and such).

For non-IOMMU len(array of pages) == len(array of dma addrs)
For best-case IOMMU len(array of dma addrs) == 1 (large linear space)

But that's for later. :-)


FWIW you will typically get that behaviour from IOMMU-based 
implementations of dma_map_sg() right now, although it's not strictly 
guaranteed. If you can weather some additional setup cost of calling 
sg_alloc_table_from_pages() plus walking the list after mapping to test 
whether you did get a contiguous result, you could start taking 
advantage of it as some of the dma-buf code in DRM and v4l2 does already 
(although those cases actually treat it as a strict dependency rather 
than an optimisation).


I'm inclined to agree that if we're going to see more of these cases, a 
new API call that did formally guarantee a DMA-contiguous mapping 
(either via IOMMU or bounce buffering) or failure might indeed be handy.
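
A rough sketch of that approach (assuming 5.8-era scatterlist/DMA APIs;
dev/pages/nr_pages are the caller's, error unwinding trimmed):

	/* Map all pages as one sg_table and check whether the result is one
	 * linear DMA range (typical with an IOMMU, but never guaranteed).
	 */
	static bool map_is_dma_contig(struct device *dev, struct page **pages,
				      unsigned int nr_pages, struct sg_table *sgt)
	{
		struct scatterlist *sg;
		dma_addr_t next;
		int i, n;

		if (sg_alloc_table_from_pages(sgt, pages, nr_pages, 0,
					      (unsigned long)nr_pages * PAGE_SIZE,
					      GFP_KERNEL))
			return false;

		n = dma_map_sg(dev, sgt->sgl, sgt->orig_nents, DMA_BIDIRECTIONAL);
		if (n == 0)
			return false;

		next = sg_dma_address(sgt->sgl);
		for_each_sg(sgt->sgl, sg, n, i) {
			if (sg_dma_address(sg) != next)
				return false;	/* gaps: use per-entry addresses */
			next += sg_dma_len(sg);
		}
		return true;	/* one range starting at sg_dma_address(sgt->sgl) */
	}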




I forgot to reply to this one! My current hack is using the iommu code 
directly, similar to what vfio-pci does (hopefully not gutting the API 
this time ;-)).


Your approach sounds much nicer, and easier. I'll try that out! Thanks a
lot for the pointers, and I might be back with more questions.



Cheers,
Björn


Robin.


Re: [PATCH net] xsk: remove cheap_dma optimization

2020-06-29 Thread Björn Töpel

On 2020-06-29 15:52, Daniel Borkmann wrote:


Ok, fair enough, please work with DMA folks to get this properly
integrated and restored then. Applied, thanks!


Daniel, you were too quick! Please revert this one; Christoph just
submitted a 4-patch series that addresses both the DMA API issue and the
perf regression!



Björn


Re: add an API to check if a streamming mapping needs sync calls

2020-06-29 Thread Björn Töpel

On 2020-06-29 15:03, Christoph Hellwig wrote:

Hi all,

this series lifts the somewhat hacky checks in the XSK code if a DMA
streaming mapping needs dma_sync_single_for_{device,cpu} calls to the
DMA API.



Thanks a lot for working on, and fixing this, Christoph!

I took the series for a spin, and there are (obviously) no performance
regressions.

Would the patches go through the net/bpf trees or somewhere else?

For the series:
Tested-by: Björn Töpel 
Acked-by: Björn Töpel 


Björn



Diffstat:
  Documentation/core-api/dma-api.rst |8 +
  include/linux/dma-direct.h |1
  include/linux/dma-mapping.h|5 +++
  include/net/xsk_buff_pool.h|6 ++--
  kernel/dma/direct.c|6 
  kernel/dma/mapping.c   |   10 ++
  net/xdp/xsk_buff_pool.c|   54 ++---
  7 files changed, 37 insertions(+), 53 deletions(-)
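
The net effect for XSK is a single query at DMA-map time instead of poking
DMA internals per packet. Roughly how such a query is meant to be consumed
(a sketch; dma_need_sync() is the helper that landed upstream, the XSK-side
field and variable names here are illustrative):

	/* once, when the umem is DMA mapped */
	pool->dma_need_sync = dma_need_sync(dev, dma_addr);

	/* per packet: a plain flag test instead of is_swiotlb_buffer() etc. */
	if (pool->dma_need_sync)
		dma_sync_single_range_for_cpu(pool->dev, dma, 0,
					      pool->frame_len, DMA_BIDIRECTIONAL);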



Re: [PATCH net] xsk: remove cheap_dma optimization

2020-06-29 Thread Björn Töpel



On 2020-06-29 17:18, Daniel Borkmann wrote:
Nice, tossed from bpf tree then! (Looks like it didn't land on the bpf
list yet, but seems other mails are currently stuck as well on vger. I
presume it will be routed to Linus via Christoph?)


Thanks!

Christoph (according to the other mail) was OK taking the series via 
your bpf, Dave's net, or his dma-mapping tree.



Björn


Re: [PATCH net] xsk: remove cheap_dma optimization

2020-06-28 Thread Björn Töpel



On 2020-06-27 09:04, Christoph Hellwig wrote:

On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:

Given there is roughly a ~5 weeks window at max where this removal could
still be applied in the worst case, could we come up with a fix / proposal
first that moves this into the DMA mapping core? If there is something that
can be agreed upon by all parties, then we could avoid re-adding the 9%
slowdown. :/


I'd rather turn it upside down - this abuse of the internals blocks work
that has basically just missed the previous window and I'm not going
to wait weeks to sort out the API misuse.  But we can add optimizations
back later if we find a sane way.



I'm not super excited about the performance loss, but I do get
Christoph's frustration about gutting the DMA API and making it harder for
DMA people to get work done. Let's try to solve this properly using
proper DMA APIs.



That being said I really can't see how this would make so much of a
difference.  What architecture and what dma_ops are you using for
those measurements?  What is the workload?



The 9% is for an AF_XDP (fast raw Ethernet socket; think AF_PACKET, but
faster) benchmark: receive the packet from the NIC, and drop it. The
DMA syncs stand out in perf top:


  28.63%  [kernel]   [k] i40e_clean_rx_irq_zc
  17.12%  [kernel]   [k] xp_alloc
   8.80%  [kernel]   [k] __xsk_rcv_zc
   7.69%  [kernel]   [k] xdp_do_redirect
   5.35%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629
   4.77%  [kernel]   [k] xsk_rcv.part.0
   4.07%  [kernel]   [k] __xsk_map_redirect
   3.80%  [kernel]   [k] dma_direct_sync_single_for_cpu
   3.03%  [kernel]   [k] dma_direct_sync_single_for_device
   2.76%  [kernel]   [k] i40e_alloc_rx_buffers_zc
   1.83%  [kernel]   [k] xsk_flush
...

For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
the main issue is that SWIOTLB is now unconditionally enabled [1] for
x86, so for each sync we have to check is_swiotlb_buffer(), which
involves some costly indirection.

That was pretty much what my hack avoided. Instead we did all the checks
upfront, since AF_XDP has long-term DMA mappings, and just set a flag
for that.

Avoiding the whole "is this address swiotlb" check in
dma_direct_sync_single_for_{cpu,device}() per packet
would help a lot.
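
Roughly the shape of that per-sync path (paraphrased from the 5.8-era
dma-direct code, not verbatim):

	static inline void dma_direct_sync_single_for_cpu(struct device *dev,
			dma_addr_t addr, size_t size, enum dma_data_direction dir)
	{
		phys_addr_t paddr = dma_to_phys(dev, addr);

		if (!dev_is_dma_coherent(dev))
			arch_sync_dma_for_cpu(paddr, size, dir);

		/* this is the check we pay for on every single packet */
		if (unlikely(is_swiotlb_buffer(paddr)))
			swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
	}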

Somewhat related to the DMA API: it would have performance benefits for
AF_XDP if the DMA range of the mapped memory were linear, e.g. via IOMMU
utilization. I've started hacking on this a little bit, but it would be
nice if such an API were part of the mapping core.

Input: array of pages. Output: array of dma addrs (and obviously dev,
flags and such).

For non-IOMMU len(array of pages) == len(array of dma addrs)
For best-case IOMMU len(array of dma addrs) == 1 (large linear space)

But that's for later. :-)


Björn


[1] commit: 09230cbc1bab ("swiotlb: move the SWIOTLB config symbol to 
lib/Kconfig")
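
A hedged sketch of what such a "pages in, DMA addresses out" helper's
contract could look like (entirely hypothetical; no such API exists in the
mapping core at the time of this thread):

	/*
	 * Map nr_pages pages for dev and fill dma_addrs[] with the start of
	 * each resulting DMA range. Returns the number of ranges: nr_pages in
	 * the non-IOMMU case, ideally 1 when an IOMMU can provide one large
	 * linear IOVA range, or a negative errno on failure.
	 */
	int dma_map_page_array(struct device *dev, struct page **pages,
			       unsigned int nr_pages, enum dma_data_direction dir,
			       unsigned long attrs, dma_addr_t *dma_addrs);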




[PATCH net] xsk: remove cheap_dma optimization

2020-06-26 Thread Björn Töpel
From: Björn Töpel 

When the AF_XDP buffer allocation API was introduced it had an
optimization, "cheap_dma". The idea was that when the umem was DMA
mapped, the pool also checked whether the mapping required a
synchronization (CPU to device, and vice versa). If not, it would be
marked as "cheap_dma" and the synchronization would be elided.

In [1] Christoph points out that the optimization above breaks the DMA
API abstraction, and should be removed. Further, Christoph points out
that optimizations like this should be done within the DMA mapping
core, and not elsewhere.

Unfortunately this has implications for the packet rate
performance. The AF_XDP rxdrop scenario shows a 9% decrease in packets
per second.

[1] https://lore.kernel.org/netdev/20200626074725.ga21...@lst.de/

Cc: Christoph Hellwig 
Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
Signed-off-by: Björn Töpel 
---
 include/net/xsk_buff_pool.h | 16 +++--
 net/xdp/xsk_buff_pool.c | 69 ++---
 2 files changed, 6 insertions(+), 79 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index a4ff226505c9..3ea9b9654632 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -40,7 +40,6 @@ struct xsk_buff_pool {
u32 headroom;
u32 chunk_size;
u32 frame_len;
-   bool cheap_dma;
bool unaligned;
void *addrs;
struct device *dev;
@@ -77,24 +76,17 @@ static inline dma_addr_t xp_get_frame_dma(struct 
xdp_buff_xsk *xskb)
return xskb->frame_dma;
 }
 
-void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
 static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
 {
-   if (xskb->pool->cheap_dma)
-   return;
-
-   xp_dma_sync_for_cpu_slow(xskb);
+   dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+ xskb->pool->frame_len, DMA_BIDIRECTIONAL);
 }
 
-void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
-size_t size);
 static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
  dma_addr_t dma, size_t size)
 {
-   if (pool->cheap_dma)
-   return;
-
-   xp_dma_sync_for_device_slow(pool, dma, size);
+   dma_sync_single_range_for_device(pool->dev, dma, 0,
+size, DMA_BIDIRECTIONAL);
 }
 
 /* Masks for xdp_umem_page flags.
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 540ed75e4482..c330e5f3aadf 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -2,9 +2,6 @@
 
 #include <net/xsk_buff_pool.h>
 #include <net/xdp_sock.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/swiotlb.h>
 
 #include "xsk_queue.h"
 
@@ -55,7 +52,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 
nr_pages, u32 chunks,
pool->free_heads_cnt = chunks;
pool->headroom = headroom;
pool->chunk_size = chunk_size;
-   pool->cheap_dma = true;
pool->unaligned = unaligned;
pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
INIT_LIST_HEAD(&pool->free_list);
@@ -125,48 +121,6 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool 
*pool)
}
 }
 
-static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
-{
-#if defined(CONFIG_SWIOTLB)
-   phys_addr_t paddr;
-   u32 i;
-
-   for (i = 0; i < pool->dma_pages_cnt; i++) {
-   paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
-   if (is_swiotlb_buffer(paddr))
-   return false;
-   }
-#endif
-   return true;
-}
-
-static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
-{
-#if defined(CONFIG_HAS_DMA)
-   const struct dma_map_ops *ops = get_dma_ops(pool->dev);
-
-   if (ops) {
-   return !ops->sync_single_for_cpu &&
-   !ops->sync_single_for_device;
-   }
-
-   if (!dma_is_direct(ops))
-   return false;
-
-   if (!xp_check_swiotlb_dma(pool))
-   return false;
-
-   if (!dev_is_dma_coherent(pool->dev)) {
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) ||   \
-   defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) ||\
-   defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
-   return false;
-#endif
-   }
-#endif
-   return true;
-}
-
 int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
   unsigned long attrs, struct page **pages, u32 nr_pages)
 {
@@ -195,7 +149,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device 
*dev,
xp_check_dma_contiguity(pool);
 
pool->dev = dev;
-   pool->cheap_dma = xp_check_cheap_dma(pool);
return 0;
 }
 EXPORT_SYMBOL(xp_dma_map);
@@ -280,11 +233,8 @@ struct xdp_buff *xp_alloc(

Re: the XSK buffer pool needs be to reverted

2020-06-26 Thread Björn Töpel

On 2020-06-26 14:41, Christoph Hellwig wrote:

On Fri, Jun 26, 2020 at 02:22:41PM +0200, Björn Töpel wrote:

[...]


Understood. Wdyt about something along the lines of the diff below? It's
build-tested only, but removes all non-DMA API usage ("poking
internals"). Would that be a way forward, with the next step being a
solution that gives similar benefits but lives in the DMA mapping core?


Yes, that would solve the immediate issues.



Good. I'll cook a proper patch and post it.


Björn

