[PATCH v4 11/11] riscv: Enable DAX VMEMMAP optimization
From: Björn Töpel Now that DAX is usable, enable the DAX VMEMMAP optimization as well. Signed-off-by: Björn Töpel --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8a49b5f4c017..1631bf568158 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -73,6 +73,7 @@ config RISCV select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE -- 2.43.0
[PATCH v4 10/11] riscv: mm: Add support for ZONE_DEVICE
From: Björn Töpel ZONE_DEVICE pages need DEVMAP PTEs support to function (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit in the PTE for DEVMAP mark, add the corresponding helpers, and enable ARCH_HAS_PTE_DEVMAP for riscv64. Reviewed-by: Alexandre Ghiti Signed-off-by: Björn Töpel --- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/pgtable-64.h | 20 arch/riscv/include/asm/pgtable-bits.h | 1 + arch/riscv/include/asm/pgtable.h | 17 + 4 files changed, 39 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8d860ad3b171..8a49b5f4c017 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -37,6 +37,7 @@ config RISCV select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API select ARCH_HAS_PREPARE_SYNC_CORE_CMD + select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 8c36a8818432..0897dd99ab8d 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -398,4 +398,24 @@ static inline struct page *pgd_page(pgd_t pgd) #define p4d_offset p4d_offset p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pte_devmap(pte_t pte); +static inline pte_t pmd_pte(pmd_t pmd); + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} +#endif + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 179bd4afece4..a8f5205cea54 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -19,6 +19,7 @@ #define _PAGE_SOFT (3 << 8)/* Reserved for software */ #define _PAGE_SPECIAL (1 << 8)/* RSW: 0x1 */ +#define 
_PAGE_DEVMAP(1 << 9)/* RSW, devmap */ #define _PAGE_TABLE _PAGE_PRESENT /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 41f1b2c6f949..dd4d52940106 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -390,6 +390,13 @@ static inline int pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return pte_val(pte) & _PAGE_DEVMAP; +} +#endif + /* static inline pte_t pte_rdprotect(pte_t pte) */ static inline pte_t pte_wrprotect(pte_t pte) @@ -431,6 +438,11 @@ static inline pte_t pte_mkspecial(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL); } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DEVMAP); +} + static inline pte_t pte_mkhuge(pte_t pte) { return pte; @@ -721,6 +733,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pte_pmd(pte_mkdirty(pmd_pte(pmd))); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); +} + static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { -- 2.43.0
[PATCH v4 09/11] virtio-mem: Enable virtio-mem for RISC-V
From: Björn Töpel Now that RISC-V has memory hotplugging support, virtio-mem can be used on the platform. Acked-by: David Hildenbrand Signed-off-by: Björn Töpel --- drivers/virtio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 6284538a8184..42a48ac763ee 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -122,7 +122,7 @@ config VIRTIO_BALLOON config VIRTIO_MEM tristate "Virtio mem driver" - depends on X86_64 || ARM64 + depends on X86_64 || ARM64 || RISCV depends on VIRTIO depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE -- 2.43.0
[PATCH v4 05/11] riscv: mm: Add pfn_to_kaddr() implementation
From: Björn Töpel The pfn_to_kaddr() function is used by KASAN's memory hotplugging path. Add the missing function to the RISC-V port, so that it can be built with MHP and CONFIG_KASAN. Signed-off-by: Björn Töpel --- arch/riscv/include/asm/page.h | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 115ac98b8d72..235fd45d998d 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -188,6 +188,11 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); unsigned long kaslr_offset(void); +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ -- 2.43.0
[PATCH v4 08/11] riscv: Enable memory hotplugging for RISC-V
From: Björn Töpel Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for RISC-V. Reviewed-by: Alexandre Ghiti Signed-off-by: Björn Töpel --- arch/riscv/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0525ee2d63c7..8d860ad3b171 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,8 @@ config RISCV select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP + select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_BINFMT_FLAT @@ -46,6 +48,7 @@ config RISCV select ARCH_HAS_UBSAN select ARCH_HAS_VDSO_DATA select ARCH_KEEP_MEMBLOCK if ACPI + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK -- 2.43.0
[PATCH v4 07/11] riscv: mm: Take memory hotplug read-lock during kernel page table dump
From: Björn Töpel During memory hot remove, the ptdump functionality can end up touching stale data. Avoid any potential crashes (or worse), by holding the memory hotplug read-lock while traversing the page table. This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm: Hold memory hotplug lock while walking for kernel page table dump"). Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/mm/ptdump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 1289cc6d3700..9d5f657a251b 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -370,7 +371,9 @@ bool ptdump_check_wx(void) static int ptdump_show(struct seq_file *m, void *v) { + get_online_mems(); ptdump_walk(m, m->private); + put_online_mems(); return 0; } -- 2.43.0
[PATCH v4 06/11] riscv: mm: Add memory hotplugging support
From: Björn Töpel For an architecture to support memory hotplugging, a couple of callbacks needs to be implemented: arch_add_memory() This callback is responsible for adding the physical memory into the direct map, and call into the memory hotplugging generic code via __add_pages() that adds the corresponding struct page entries, and updates the vmemmap mapping. arch_remove_memory() This is the inverse of the callback above. vmemmap_free() This function tears down the vmemmap mappings (if CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the backing vmemmap pages. Note that for persistent memory, an alternative allocator for the backing pages can be used; The vmem_altmap. This means that when the backing pages are cleared, extra care is needed so that the correct deallocation method is used. arch_get_mappable_range() This functions returns the PA range that the direct map can map. Used by the MHP internals for sanity checks. The page table unmap/teardown functions are heavily based on code from the x86 tree. The same remove_pgd_mapping() function is used in both vmemmap_free() and arch_remove_memory(), but in the latter function the backing pages are not removed. 
Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 267 +++ 1 file changed, 267 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1f7e7c223bec..bfa2dea95354 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1534,3 +1534,270 @@ struct execmem_info __init *execmem_arch_setup(void) } #endif /* CONFIG_MMU */ #endif /* CONFIG_EXECMEM */ + +#ifdef CONFIG_MEMORY_HOTPLUG +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + struct ptdesc *ptdesc = page_ptdesc(page); + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pagetable_pte_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pmd_clear(pmd); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + struct page *page = pud_page(*pud); + struct ptdesc *ptdesc = page_ptdesc(page); + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pagetable_pmd_dtor(ptdesc); + if (PageReserved(page)) + free_reserved_page(page); + else + pagetable_free(ptdesc); + pud_clear(pud); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + struct page *page = p4d_page(*p4d); + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + if (PageReserved(page)) + free_reserved_page(page); + else + free_pages((unsigned long)page_address(page), 0); + p4d_clear(p4d); +} + +static void __meminit free_vmemmap_storage(struct page *page, size_t size, + struct vmem_altmap *altmap) +{ + int order = get_order(size); + + if (altmap) { + vmem_altmap_free(altmap, size >> PAGE_SHIFT); + return; + } + + if (PageReserved(page)) { + unsigned int nr_pages = 1 << order; + + while (nr_pages--) + free_reserved_page(page++); + return; + } + + 
free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *ptep, pte; + + for (; addr < end; addr = next) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + ptep = pte_base + pte_index(addr); + pte = ptep_get(ptep); + if (!pte_present(*ptep)) + continue; + + pte_clear(&init_mm, addr, ptep); + if (is_vmemmap) + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap); + } +} + +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmdp, pmd; + + for (; addr < end; addr = next) {
[PATCH v4 04/11] riscv: mm: Refactor create_linear_mapping_range() for memory hot add
From: Björn Töpel Add a parameter to the direct map setup function, so it can be used in arch_add_memory() later. Reviewed-by: Alexandre Ghiti Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0dd04cedc0d2..1f7e7c223bec 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1232,7 +1232,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) } static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end, - uintptr_t fixed_map_size) + uintptr_t fixed_map_size, const pgprot_t *pgprot) { phys_addr_t pa; uintptr_t va, map_size; @@ -1243,7 +1243,7 @@ static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t best_map_size(pa, va, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); + pgprot ? *pgprot : pgprot_from_va(va)); } } @@ -1287,22 +1287,19 @@ static void __init create_linear_mapping_page_table(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - create_linear_mapping_range(start, end, 0); + create_linear_mapping_range(start, end, 0, NULL); } #ifdef CONFIG_STRICT_KERNEL_RWX - create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0); - create_linear_mapping_range(krodata_start, - krodata_start + krodata_size, 0); + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL); + create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL); memblock_clear_nomap(ktext_start, ktext_size); memblock_clear_nomap(krodata_start, krodata_size); #endif #ifdef CONFIG_KFENCE - create_linear_mapping_range(kfence_pool, - kfence_pool + KFENCE_POOL_SIZE, - PAGE_SIZE); + create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); #endif -- 2.43.0
[PATCH v4 03/11] riscv: mm: Change attribute from __init to __meminit for page functions
From: Björn Töpel Prepare for memory hotplugging support by changing from __init to __meminit for the page table functions that are used by the upcoming architecture specific callbacks. Changing the __init attribute to __meminit, avoids that the functions are removed after init. The __meminit attribute makes sure the functions are kept in the kernel text post init, but only if memory hotplugging is enabled for the build. Reviewed-by: Alexandre Ghiti Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/include/asm/mmu.h | 4 +-- arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/mm/init.c | 56 ++-- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 947fd60f9051..c9e03e9da3dc 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -31,8 +31,8 @@ typedef struct { #define cntx2asid(cntx)((cntx) & SATP_ASID_MASK) #define cntx2version(cntx) ((cntx) & ~SATP_ASID_MASK) -void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot); +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot); #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index aad8b8ca51f1..41f1b2c6f949 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -165,7 +165,7 @@ struct pt_alloc_ops { #endif }; -extern struct pt_alloc_ops pt_ops __initdata; +extern struct pt_alloc_ops pt_ops __meminitdata; #ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index a5b3bc1f3b88..0dd04cedc0d2 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -297,7 +297,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -struct pt_alloc_ops pt_ops __initdata; 
+struct pt_alloc_ops pt_ops __meminitdata; pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; @@ -359,7 +359,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) return (pte_t *)set_fixmap_offset(FIX_PTE, pa); } -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) { return (pte_t *) __va(pa); } @@ -378,7 +378,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pte_late(uintptr_t va) +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -386,9 +386,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va) return __pa((pte_t *)ptdesc_address(ptdesc)); } -static void __init create_pte_mapping(pte_t *ptep, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, +pgprot_t prot) { uintptr_t pte_idx = pte_index(va); @@ -442,7 +441,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); } -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) { return (pmd_t *) __va(pa); } @@ -459,7 +458,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pmd_late(uintptr_t va) +static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -467,9 +466,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va) return __pa((pmd_t *)ptdesc_address(ptdesc)); } -static void __init create_pmd_mapping(pmd_t *pmdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t 
prot) +static void __meminit create_pmd_mapping(pmd_t *pmdp, +uintptr_t va, phys_addr_t pa, +phys_addr_t sz, pgprot_t prot) { pte_t *ptep; phys_addr_t pte_phys; @@ -505,7 +504,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) return (pud_t *)set_fixmap_offset(FIX_PUD, pa); } -static pud_t *__init get_pud_virt_late(phys_addr_t pa) +static pud_t *__meminit get_pud_virt_late(phys_addr_t pa)
[PATCH v4 02/11] riscv: mm: Pre-allocate vmemmap/direct map/kasan PGD entries
From: Björn Töpel The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to all userland page tables, which means that if the PGD level table is changed, other page tables have to be updated as well. Instead of having the PGD changes ripple out to all tables, the synchronization can be avoided by pre-allocating the PGD entries/pages at boot, avoiding the synchronization altogether. This is currently done for the bpf/modules, and vmalloc PGD regions. Extend this scheme for the PGD regions touched by memory hotplugging. Prepare the RISC-V port for memory hotplug by pre-allocating vmemmap/direct map/kasan entries at the PGD level. This will roughly waste ~128 (plus 32 if KASAN is enabled) worth of 4K pages when memory hotplugging is enabled in the kernel configuration. Reviewed-by: Alexandre Ghiti Signed-off-by: Björn Töpel --- arch/riscv/include/asm/kasan.h | 4 ++-- arch/riscv/mm/init.c | 9 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 0b85e363e778..e6a0071bdb56 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -6,8 +6,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_KASAN - /* * The following comment was copied from arm64: * KASAN_SHADOW_START: beginning of the kernel virtual addresses. 
@@ -34,6 +32,8 @@ */ #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR + +#ifdef CONFIG_KASAN #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fe5072f66c8c..a5b3bc1f3b88 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1493,11 +1494,19 @@ static void __init preallocate_pgd_pages_range(unsigned long start, unsigned lon panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); } +#define PAGE_END KASAN_SHADOW_START + void __init pgtable_cache_init(void) { preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); if (IS_ENABLED(CONFIG_MODULES)) preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap"); + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map"); + if (IS_ENABLED(CONFIG_KASAN)) + preallocate_pgd_pages_range(KASAN_SHADOW_START, KASAN_SHADOW_END, "kasan"); + } } #endif -- 2.43.0
[PATCH v4 00/11] riscv: Memory Hot(Un)Plug support
From: Björn Töpel Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port (For the restless folks: change log in the bottom!) Introduction To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory hot(un)plug allows for increasing and decreasing the size of physical memory available to a machine at runtime." This series adds memory hot(un)plugging, and ZONE_DEVICE support for the RISC-V Linux port. MM configuration RISC-V MM has the following configuration: * Memory blocks are 128M, analogous to x86-64. It uses PMD ("hugepage") vmemmaps. From that follows that 2M (PMD) worth of vmemmap spans 32768 pages á 4K which gets us 128M. * The pageblock size is the minimum virtio_mem size, and on RISC-V it's 2M (2^9 * 4K). Implementation == The PGD table on RISC-V is shared/copied between all processes. To avoid doing page table synchronization, the first patch (patch 1) pre-allocates the PGD entries for vmemmap/direct map. By doing that the init_mm PGD will be fixed at kernel init, and synchronization can be avoided altogether. The following two patches (patch 2-3) do some preparations, followed by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem are enabled (patch 6-7), and finally ZONE_DEVICE support is added (patch 8). MHP and locking === TL;DR: The MHP does not step on any toes, except for ptdump. Additional locking is required for ptdump. Long version: For v2 I spent some time digging into init_mm synchronization/update. Here are my findings, and I'd love them to be corrected if incorrect. It's been a gnarly path... The `init_mm` structure is a special mm (perhaps not a "real" one). It's a "lazy context" that tracks kernel page table resources, e.g., the kernel page table (swapper_pg_dir), a kernel page_table_lock (more about the usage below), mmap_lock, and such. `init_mm` does not track/contain any VMAs. Having the `init_mm` is convenient, so that the regular kernel page table walk/modify functions can be used. 
Now, `init_mm` being special means that the locking for kernel page tables is special as well. On RISC-V the PGD (top-level page table structure), similar to x86, is shared (copied) with user processes. If the kernel PGD is modified, it has to be synched to user-mode processes' PGDs. This is avoided by pre-populating the PGD, so it'll be fixed from boot. The in-kernel pgd regions are documented in `Documentation/arch/riscv/vm-layout.rst`. The distinct regions are: * vmemmap * vmalloc/ioremap space * direct mapping of all physical memory * kasan * modules, BPF * kernel Memory hotplug is the process of adding/removing memory to/from the kernel. Adding is done in two phases: 1. Add the memory to the kernel 2. Online memory, making it available to the page allocator. Step 1 is partially architecture dependent, and updates the init_mm page table: * Update the direct map page tables. The direct map is a linear map, representing all physical memory: `virt = phys + PAGE_OFFSET` * Add a `struct page` for each added page of memory. Update the vmemmap (virtual mapping to the `struct page`, so we can easily transform a kernel virtual address to a `struct page *` address. From an MHP perspective, there are two regions of the PGD that are updated: * vmemmap * direct mapping of all physical memory The `struct mm_struct` has a couple of locks in play: * `spinlock_t page_table_lock` protects the page table, and some counters * `struct rw_semaphore mmap_lock` protects an mm's VMAs Note again that `init_mm` does not contain any VMAs, but still uses the mmap_lock in some places. The `page_table_lock` was originally used to protect all page tables, but more recently a split page table lock has been introduced. The split lock has a per-table lock for the PTE and PMD tables. If split lock is disabled, all tables are guarded by `mm->page_table_lock` (for user processes). Split page table locks are not used for init_mm. 
MHP operations is typically synchronized using `DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`. Actors -- The following non-MHP actors in the kernel traverses (read), and/or modifies the kernel PGD. * `ptdump` Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the `mmap_write_lock(init_mm)` taken. Observation: ptdump can race with MHP, and needs additional locking to avoid crashes/races. * `set_direct_*` / `arch/riscv/mm/pageattr.c` The `set_direct_*` functionality is used to "synchronize" the direct map to other kernel mappings, e.g. modules/kernel text. The direct map is using "as large huge table mappings as possible", which means that the `set_direct_*` might need to split the direct
[PATCH v4 01/11] riscv: mm: Properly forward vmemmap_populate() altmap parameter
From: Björn Töpel Make sure that the altmap parameter is properly passed on to vmemmap_populate_hugepages(). Reviewed-by: Alexandre Ghiti Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index e3405e4b99af..fe5072f66c8c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1439,7 +1439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * memory hotplug, we are not able to update all the page tables with * the new PMDs. */ - return vmemmap_populate_hugepages(start, end, node, NULL); + return vmemmap_populate_hugepages(start, end, node, altmap); } #endif -- 2.43.0
Re: [PATCH] riscv: Fix early ftrace nop patching
Alexandre Ghiti writes: > Commit c97bf629963e ("riscv: Fix text patching when IPI are used") > converted ftrace_make_nop() to use patch_insn_write() which does not > emit any icache flush relying entirely on __ftrace_modify_code() to do > that. > > But we missed that ftrace_make_nop() was called very early directly when > converting mcount calls into nops (actually on riscv it converts 2B nops > emitted by the compiler into 4B nops). > > This caused crashes on multiple HW as reported by Conor and Björn since > the booting core could have half-patched instructions in its icache > which would trigger an illegal instruction trap: fix this by emitting a > local flush icache when early patching nops. > > Fixes: c97bf629963e ("riscv: Fix text patching when IPI are used") > Signed-off-by: Alexandre Ghiti Nice! I've managed to reproduce the crash on the VisionFive2 board (however only triggered when CONFIG_RELOCATABLE=y), and can verify that this fix solves the issue. Reviewed-by: Björn Töpel Tested-by: Björn Töpel
Re: [PATCH v3 2/9] riscv: mm: Pre-allocate vmemmap/direct map PGD entries
Björn Töpel writes: > From: Björn Töpel > > The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to > all userland page tables, which means that if the PGD level table is > changed, other page tables has to be updated as well. > > Instead of having the PGD changes ripple out to all tables, the > synchronization can be avoided by pre-allocating the PGD entries/pages > at boot, avoiding the synchronization all together. > > This is currently done for the bpf/modules, and vmalloc PGD regions. > Extend this scheme for the PGD regions touched by memory hotplugging. > > Prepare the RISC-V port for memory hotplug by pre-allocate > vmemmap/direct map entries at the PGD level. This will roughly waste > ~128 worth of 4K pages when memory hotplugging is enabled in the > kernel configuration. > > Reviewed-by: Alexandre Ghiti > Signed-off-by: Björn Töpel > --- > arch/riscv/include/asm/kasan.h | 4 ++-- > arch/riscv/mm/init.c | 7 +++ > 2 files changed, 9 insertions(+), 2 deletions(-) > > diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h > index 0b85e363e778..e6a0071bdb56 100644 > --- a/arch/riscv/include/asm/kasan.h > +++ b/arch/riscv/include/asm/kasan.h > @@ -6,8 +6,6 @@ > > #ifndef __ASSEMBLY__ > > -#ifdef CONFIG_KASAN > - > /* > * The following comment was copied from arm64: > * KASAN_SHADOW_START: beginning of the kernel virtual addresses. 
> @@ -34,6 +32,8 @@ > */ > #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & > PGDIR_MASK) > #define KASAN_SHADOW_END MODULES_LOWEST_VADDR > + > +#ifdef CONFIG_KASAN > #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) > > void kasan_init(void); > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index b66f846e7634..c98010ede810 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -27,6 +27,7 @@ > > #include > #include > +#include > #include > #include > #include > @@ -1488,10 +1489,16 @@ static void __init > preallocate_pgd_pages_range(unsigned long start, unsigned lon > panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); > } > > +#define PAGE_END KASAN_SHADOW_START > + > void __init pgtable_cache_init(void) > { > preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); > if (IS_ENABLED(CONFIG_MODULES)) > preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, > "bpf/modules"); > + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { > + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, > "vmemmap"); > + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct > map"); Alex pointed out that KASAN PGDs should be preallocated as well! I'll address this in the next revision. Björn
Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support
Alexandre Ghiti writes: > On Tue, May 21, 2024 at 1:49 PM Björn Töpel wrote: >> >> From: Björn Töpel >> >> For an architecture to support memory hotplugging, a couple of >> callbacks needs to be implemented: >> >> arch_add_memory() >> This callback is responsible for adding the physical memory into the >> direct map, and call into the memory hotplugging generic code via >> __add_pages() that adds the corresponding struct page entries, and >> updates the vmemmap mapping. >> >> arch_remove_memory() >> This is the inverse of the callback above. >> >> vmemmap_free() >> This function tears down the vmemmap mappings (if >> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the >> backing vmemmap pages. Note that for persistent memory, an >> alternative allocator for the backing pages can be used; The >> vmem_altmap. This means that when the backing pages are cleared, >> extra care is needed so that the correct deallocation method is >> used. >> >> arch_get_mappable_range() >> This functions returns the PA range that the direct map can map. >> Used by the MHP internals for sanity checks. >> >> The page table unmap/teardown functions are heavily based on code from >> the x86 tree. The same remove_pgd_mapping() function is used in both >> vmemmap_free() and arch_remove_memory(), but in the latter function >> the backing pages are not removed. >> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/mm/init.c | 261 +++ >> 1 file changed, 261 insertions(+) >> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 6f72b0b2b854..6693b742bf2f 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void) >> } >> } >> #endif >> + >> +#ifdef CONFIG_MEMORY_HOTPLUG >> +static void __meminit free_pagetable(struct page *page, int order) >> +{ >> + unsigned int nr_pages = 1 << order; >> + >> + /* >> +* vmemmap/direct page tables can be reserved, if added at >> +* boot. 
>> +*/ >> + if (PageReserved(page)) { >> + __ClearPageReserved(page); > > What's the difference between __ClearPageReserved() and > ClearPageReserved()? Because it seems like free_reserved_page() calls > the latter already, so why would you need to call > __ClearPageReserved() on the first page? Indeed! x86 copy pasta (which uses bootmem info page that RV doesn't). >> + while (nr_pages--) >> + free_reserved_page(page++); >> + return; >> + } >> + >> + free_pages((unsigned long)page_address(page), order); >> +} >> + >> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) >> +{ >> + pte_t *pte; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PTE; i++) { >> + pte = pte_start + i; >> + if (!pte_none(*pte)) >> + return; >> + } >> + >> + free_pagetable(pmd_page(*pmd), 0); >> + pmd_clear(pmd); >> +} >> + >> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) >> +{ >> + pmd_t *pmd; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PMD; i++) { >> + pmd = pmd_start + i; >> + if (!pmd_none(*pmd)) >> + return; >> + } >> + >> + free_pagetable(pud_page(*pud), 0); >> + pud_clear(pud); >> +} >> + >> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) >> +{ >> + pud_t *pud; >> + int i; >> + >> + for (i = 0; i < PTRS_PER_PUD; i++) { >> + pud = pud_start + i; >> + if (!pud_none(*pud)) >> + return; >> + } >> + >> + free_pagetable(p4d_page(*p4d), 0); >> + p4d_clear(p4d); >> +} >> + >> +static void __meminit free_vmemmap_storage(struct page *page, size_t size, >> + struct vmem_altmap *altmap) >> +{ >> + if (altmap) >> + vmem_altmap_free(altmap, size >> PAGE_SHIFT); >> + else >> + free_pagetable(page, get_order(size)); >> +} >> + >> +static v
Re: [PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE
Alexandre Ghiti writes: > On Tue, May 21, 2024 at 1:49 PM Björn Töpel wrote: >> >> From: Björn Töpel >> >> ZONE_DEVICE pages need DEVMAP PTEs support to function >> (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit >> in the PTE for DEVMAP mark, add the corresponding helpers, and enable >> ARCH_HAS_PTE_DEVMAP for riscv64. >> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/Kconfig| 1 + >> arch/riscv/include/asm/pgtable-64.h | 20 >> arch/riscv/include/asm/pgtable-bits.h | 1 + >> arch/riscv/include/asm/pgtable.h | 17 + >> 4 files changed, 39 insertions(+) >> >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >> index 2724dc2af29f..0b74698c63c7 100644 >> --- a/arch/riscv/Kconfig >> +++ b/arch/riscv/Kconfig >> @@ -36,6 +36,7 @@ config RISCV >> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE >> select ARCH_HAS_PMEM_API >> select ARCH_HAS_PREPARE_SYNC_CORE_CMD >> + select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU >> select ARCH_HAS_PTE_SPECIAL >> select ARCH_HAS_SET_DIRECT_MAP if MMU >> select ARCH_HAS_SET_MEMORY if MMU >> diff --git a/arch/riscv/include/asm/pgtable-64.h >> b/arch/riscv/include/asm/pgtable-64.h >> index 221a5c1ee287..c67a9bbfd010 100644 >> --- a/arch/riscv/include/asm/pgtable-64.h >> +++ b/arch/riscv/include/asm/pgtable-64.h >> @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd) >> #define p4d_offset p4d_offset >> p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); >> >> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE >> +static inline int pte_devmap(pte_t pte); >> +static inline pte_t pmd_pte(pmd_t pmd); >> + >> +static inline int pmd_devmap(pmd_t pmd) >> +{ >> + return pte_devmap(pmd_pte(pmd)); >> +} >> + >> +static inline int pud_devmap(pud_t pud) >> +{ >> + return 0; >> +} >> + >> +static inline int pgd_devmap(pgd_t pgd) >> +{ >> + return 0; >> +} >> +#endif >> + >> #endif /* _ASM_RISCV_PGTABLE_64_H */ >> diff --git a/arch/riscv/include/asm/pgtable-bits.h >> b/arch/riscv/include/asm/pgtable-bits.h >> index 
179bd4afece4..a8f5205cea54 100644 >> --- a/arch/riscv/include/asm/pgtable-bits.h >> +++ b/arch/riscv/include/asm/pgtable-bits.h >> @@ -19,6 +19,7 @@ >> #define _PAGE_SOFT (3 << 8)/* Reserved for software */ >> >> #define _PAGE_SPECIAL (1 << 8)/* RSW: 0x1 */ >> +#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */ >> #define _PAGE_TABLE _PAGE_PRESENT >> >> /* >> diff --git a/arch/riscv/include/asm/pgtable.h >> b/arch/riscv/include/asm/pgtable.h >> index 7933f493db71..02fadc276064 100644 >> --- a/arch/riscv/include/asm/pgtable.h >> +++ b/arch/riscv/include/asm/pgtable.h >> @@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte) >> return pte_val(pte) & _PAGE_SPECIAL; >> } >> >> +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP >> +static inline int pte_devmap(pte_t pte) >> +{ >> + return pte_val(pte) & _PAGE_DEVMAP; >> +} >> +#endif > > Not sure you need the #ifdef here. W/o it 32b builds break (!defined(CONFIG_ARCH_HAS_PTE_DEVMAP) will have a default implementation).. Maybe it's cleaner just to use that instead? >> + >> /* static inline pte_t pte_rdprotect(pte_t pte) */ >> >> static inline pte_t pte_wrprotect(pte_t pte) >> @@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte) >> return __pte(pte_val(pte) | _PAGE_SPECIAL); >> } >> >> +static inline pte_t pte_mkdevmap(pte_t pte) >> +{ >> + return __pte(pte_val(pte) | _PAGE_DEVMAP); >> +} >> + >> static inline pte_t pte_mkhuge(pte_t pte) >> { >> return pte; >> @@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) >> return pte_pmd(pte_mkdirty(pmd_pte(pmd))); >> } >> >> +static inline pmd_t pmd_mkdevmap(pmd_t pmd) >> +{ >> + return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); >> +} >> + >> static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, >> pmd_t *pmdp, pmd_t pmd) >> { >> -- >> 2.40.1 >> > > Otherwise, you can add: > > Reviewed-by: Alexandre Ghiti Thank you! Björn
[PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE
From: Björn Töpel ZONE_DEVICE pages need DEVMAP PTEs support to function (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit in the PTE for DEVMAP mark, add the corresponding helpers, and enable ARCH_HAS_PTE_DEVMAP for riscv64. Signed-off-by: Björn Töpel --- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/pgtable-64.h | 20 arch/riscv/include/asm/pgtable-bits.h | 1 + arch/riscv/include/asm/pgtable.h | 17 + 4 files changed, 39 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 2724dc2af29f..0b74698c63c7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -36,6 +36,7 @@ config RISCV select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API select ARCH_HAS_PREPARE_SYNC_CORE_CMD + select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 221a5c1ee287..c67a9bbfd010 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd) #define p4d_offset p4d_offset p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pte_devmap(pte_t pte); +static inline pte_t pmd_pte(pmd_t pmd); + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} +#endif + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 179bd4afece4..a8f5205cea54 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -19,6 +19,7 @@ #define _PAGE_SOFT (3 << 8)/* Reserved for software */ #define _PAGE_SPECIAL (1 << 8)/* RSW: 0x1 */ +#define _PAGE_DEVMAP(1 << 9)/* RSW, 
devmap */ #define _PAGE_TABLE _PAGE_PRESENT /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7933f493db71..02fadc276064 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return pte_val(pte) & _PAGE_DEVMAP; +} +#endif + /* static inline pte_t pte_rdprotect(pte_t pte) */ static inline pte_t pte_wrprotect(pte_t pte) @@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL); } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DEVMAP); +} + static inline pte_t pte_mkhuge(pte_t pte) { return pte; @@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pte_pmd(pte_mkdirty(pmd_pte(pmd))); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); +} + static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { -- 2.40.1
[PATCH v3 8/9] virtio-mem: Enable virtio-mem for RISC-V
From: Björn Töpel Now that RISC-V has memory hotplugging support, virtio-mem can be used on the platform. Acked-by: David Hildenbrand Signed-off-by: Björn Töpel --- drivers/virtio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index c17193544268..4e5cebf1b82a 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -122,7 +122,7 @@ config VIRTIO_BALLOON config VIRTIO_MEM tristate "Virtio mem driver" - depends on X86_64 || ARM64 + depends on X86_64 || ARM64 || RISCV depends on VIRTIO depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE -- 2.40.1
[PATCH v3 7/9] riscv: Enable memory hotplugging for RISC-V
From: Björn Töpel Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for RISC-V. Signed-off-by: Björn Töpel --- arch/riscv/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index fe5281398543..2724dc2af29f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,8 @@ config RISCV select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP && 64BIT && MMU + select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_BINFMT_FLAT -- 2.40.1
[PATCH v3 6/9] riscv: mm: Take memory hotplug read-lock during kernel page table dump
From: Björn Töpel During memory hot remove, the ptdump functionality can end up touching stale data. Avoid any potential crashes (or worse), by holding the memory hotplug read-lock while traversing the page table. This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm: Hold memory hotplug lock while walking for kernel page table dump"). Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/mm/ptdump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 1289cc6d3700..9d5f657a251b 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -370,7 +371,9 @@ bool ptdump_check_wx(void) static int ptdump_show(struct seq_file *m, void *v) { + get_online_mems(); ptdump_walk(m, m->private); + put_online_mems(); return 0; } -- 2.40.1
[PATCH v3 5/9] riscv: mm: Add memory hotplugging support
From: Björn Töpel For an architecture to support memory hotplugging, a couple of callbacks needs to be implemented: arch_add_memory() This callback is responsible for adding the physical memory into the direct map, and call into the memory hotplugging generic code via __add_pages() that adds the corresponding struct page entries, and updates the vmemmap mapping. arch_remove_memory() This is the inverse of the callback above. vmemmap_free() This function tears down the vmemmap mappings (if CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the backing vmemmap pages. Note that for persistent memory, an alternative allocator for the backing pages can be used; The vmem_altmap. This means that when the backing pages are cleared, extra care is needed so that the correct deallocation method is used. arch_get_mappable_range() This functions returns the PA range that the direct map can map. Used by the MHP internals for sanity checks. The page table unmap/teardown functions are heavily based on code from the x86 tree. The same remove_pgd_mapping() function is used in both vmemmap_free() and arch_remove_memory(), but in the latter function the backing pages are not removed. Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 261 +++ 1 file changed, 261 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 6f72b0b2b854..6693b742bf2f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void) } } #endif + +#ifdef CONFIG_MEMORY_HOTPLUG +static void __meminit free_pagetable(struct page *page, int order) +{ + unsigned int nr_pages = 1 << order; + + /* +* vmemmap/direct page tables can be reserved, if added at +* boot. 
+*/ + if (PageReserved(page)) { + __ClearPageReserved(page); + while (nr_pages--) + free_reserved_page(page++); + return; + } + + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + free_pagetable(pmd_page(*pmd), 0); + pmd_clear(pmd); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + free_pagetable(pud_page(*pud), 0); + pud_clear(pud); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + free_pagetable(p4d_page(*p4d), 0); + p4d_clear(p4d); +} + +static void __meminit free_vmemmap_storage(struct page *page, size_t size, + struct vmem_altmap *altmap) +{ + if (altmap) + vmem_altmap_free(altmap, size >> PAGE_SHIFT); + else + free_pagetable(page, get_order(size)); +} + +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *ptep, pte; + + for (; addr < end; addr = next) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + ptep = pte_base + pte_index(addr); + pte = READ_ONCE(*ptep); + + if (!pte_present(*ptep)) + continue; + + pte_clear(&init_mm, addr, ptep); + if (is_vmemmap) + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap); + } +} + +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmdp, pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + pmdp = 
pmd_base + pmd_index(addr); + pmd = READ_ONCE(*pmdp); + + if (!pmd_present(pmd)) + continue; + + if (pmd_leaf(pmd)) { + pmd_clear(pmdp); + if (is_vmemmap) + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap)
[PATCH v3 4/9] riscv: mm: Refactor create_linear_mapping_range() for memory hot add
From: Björn Töpel Add a parameter to the direct map setup function, so it can be used in arch_add_memory() later. Reviewed-by: Alexandre Ghiti Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c969427eab88..6f72b0b2b854 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1227,7 +1227,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) } static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end, - uintptr_t fixed_map_size) + uintptr_t fixed_map_size, const pgprot_t *pgprot) { phys_addr_t pa; uintptr_t va, map_size; @@ -1238,7 +1238,7 @@ static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t best_map_size(pa, va, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); + pgprot ? *pgprot : pgprot_from_va(va)); } } @@ -1282,22 +1282,19 @@ static void __init create_linear_mapping_page_table(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - create_linear_mapping_range(start, end, 0); + create_linear_mapping_range(start, end, 0, NULL); } #ifdef CONFIG_STRICT_KERNEL_RWX - create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0); - create_linear_mapping_range(krodata_start, - krodata_start + krodata_size, 0); + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL); + create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL); memblock_clear_nomap(ktext_start, ktext_size); memblock_clear_nomap(krodata_start, krodata_size); #endif #ifdef CONFIG_KFENCE - create_linear_mapping_range(kfence_pool, - kfence_pool + KFENCE_POOL_SIZE, - PAGE_SIZE); + create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); #endif -- 2.40.1
[PATCH v3 3/9] riscv: mm: Change attribute from __init to __meminit for page functions
From: Björn Töpel Prepare for memory hotplugging support by changing from __init to __meminit for the page table functions that are used by the upcoming architecture specific callbacks. Changing the __init attribute to __meminit, avoids that the functions are removed after init. The __meminit attribute makes sure the functions are kept in the kernel text post init, but only if memory hotplugging is enabled for the build. Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Björn Töpel --- arch/riscv/include/asm/mmu.h | 4 +-- arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/mm/init.c | 56 ++-- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 947fd60f9051..c9e03e9da3dc 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -31,8 +31,8 @@ typedef struct { #define cntx2asid(cntx)((cntx) & SATP_ASID_MASK) #define cntx2version(cntx) ((cntx) & ~SATP_ASID_MASK) -void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot); +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot); #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 58fd7b70b903..7933f493db71 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -162,7 +162,7 @@ struct pt_alloc_ops { #endif }; -extern struct pt_alloc_ops pt_ops __initdata; +extern struct pt_alloc_ops pt_ops __meminitdata; #ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c98010ede810..c969427eab88 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -295,7 +295,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -struct pt_alloc_ops pt_ops __initdata; +struct pt_alloc_ops pt_ops 
__meminitdata; pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; @@ -357,7 +357,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) return (pte_t *)set_fixmap_offset(FIX_PTE, pa); } -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) { return (pte_t *) __va(pa); } @@ -376,7 +376,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pte_late(uintptr_t va) +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va) return __pa((pte_t *)ptdesc_address(ptdesc)); } -static void __init create_pte_mapping(pte_t *ptep, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, +pgprot_t prot) { uintptr_t pte_idx = pte_index(va); @@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); } -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) { return (pmd_t *) __va(pa); } @@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pmd_late(uintptr_t va) +static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -465,9 +464,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va) return __pa((pmd_t *)ptdesc_address(ptdesc)); } -static void __init create_pmd_mapping(pmd_t *pmdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit 
create_pmd_mapping(pmd_t *pmdp, +uintptr_t va, phys_addr_t pa, +phys_addr_t sz, pgprot_t prot) { pte_t *ptep; phys_addr_t pte_phys; @@ -503,7 +502,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) return (pud_t *)set_fixmap_offset(FIX_PUD, pa); } -static pud_t *__init get_pud_virt_late(phys_addr_t pa) +static pud_t *__meminit get_pud_virt_late(phys_addr_t pa) { return (pud_t *)__va(p
[PATCH v3 2/9] riscv: mm: Pre-allocate vmemmap/direct map PGD entries
From: Björn Töpel The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to all userland page tables, which means that if the PGD level table is changed, other page tables has to be updated as well. Instead of having the PGD changes ripple out to all tables, the synchronization can be avoided by pre-allocating the PGD entries/pages at boot, avoiding the synchronization all together. This is currently done for the bpf/modules, and vmalloc PGD regions. Extend this scheme for the PGD regions touched by memory hotplugging. Prepare the RISC-V port for memory hotplug by pre-allocate vmemmap/direct map entries at the PGD level. This will roughly waste ~128 worth of 4K pages when memory hotplugging is enabled in the kernel configuration. Reviewed-by: Alexandre Ghiti Signed-off-by: Björn Töpel --- arch/riscv/include/asm/kasan.h | 4 ++-- arch/riscv/mm/init.c | 7 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 0b85e363e778..e6a0071bdb56 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -6,8 +6,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_KASAN - /* * The following comment was copied from arm64: * KASAN_SHADOW_START: beginning of the kernel virtual addresses. 
@@ -34,6 +32,8 @@ */ #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR + +#ifdef CONFIG_KASAN #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b66f846e7634..c98010ede810 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -1488,10 +1489,16 @@ static void __init preallocate_pgd_pages_range(unsigned long start, unsigned lon panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); } +#define PAGE_END KASAN_SHADOW_START + void __init pgtable_cache_init(void) { preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); if (IS_ENABLED(CONFIG_MODULES)) preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap"); + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map"); + } } #endif -- 2.40.1
[PATCH v3 1/9] riscv: mm: Properly forward vmemmap_populate() altmap parameter
From: Björn Töpel Make sure that the altmap parameter is properly passed on to vmemmap_populate_hugepages(). Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 2574f6a3b0e7..b66f846e7634 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1434,7 +1434,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * memory hotplug, we are not able to update all the page tables with * the new PMDs. */ - return vmemmap_populate_hugepages(start, end, node, NULL); + return vmemmap_populate_hugepages(start, end, node, altmap); } #endif -- 2.40.1
[PATCH v3 0/9] riscv: Memory Hot(Un)Plug support
From: Björn Töpel Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port Introduction To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory hot(un)plug allows for increasing and decreasing the size of physical memory available to a machine at runtime." This series adds memory hot(un)plugging, and ZONE_DEVICE support for the RISC-V Linux port. MM configuration RISC-V MM has the following configuration: * Memory blocks are 128M, analogous to x86-64. It uses PMD ("hugepage") vmemmaps. From that follows that 2M (PMD) worth of vmemmap spans 32768 pages á 4K which gets us 128M. * The pageblock size is the minimum minimum virtio_mem size, and on RISC-V it's 2M (2^9 * 4K). Implementation == The PGD table on RISC-V is shared/copied between for all processes. To avoid doing page table synchronization, the first patch (patch 1) pre-allocated the PGD entries for vmemmap/direct map. By doing that the init_mm PGD will be fixed at kernel init, and synchronization can be avoided all together. The following two patches (patch 2-3) does some preparations, followed by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem are enabled (patch 6-7), and finally ZONE_DEVICE support is added (patch 8). MHP and locking === TL;DR: The MHP does not step on any toes, except for ptdump. Additional locking is required for ptdump. Long version: For v2 I spent some time digging into init_mm synchronization/update. Here are my findings, and I'd love them to be corrected if incorrect. It's been a gnarly path... The `init_mm` structure is a special mm (perhaps not a "real" one). It's a "lazy context" that tracks kernel page table resources, e.g., the kernel page table (swapper_pg_dir), a kernel page_table_lock (more about the usage below), mmap_lock, and such. `init_mm` does not track/contain any VMAs. Having the `init_mm` is convenient, so that the regular kernel page table walk/modify functions can be used. 
Now, `init_mm` being special means that the locking for kernel page tables are special as well. On RISC-V the PGD (top-level page table structure), similar to x86, is shared (copied) with user processes. If the kernel PGD is modified, it has to be synched to user-mode processes PGDs. This is avoided by pre-populating the PGD, so it'll be fixed from boot. The in-kernel pgd regions are documented in `Documentation/arch/riscv/vm-layout.rst`. The distinct regions are: * vmemmap * vmalloc/ioremap space * direct mapping of all physical memory * kasan * modules, BPF * kernel Memory hotplug is the process of adding/removing memory to/from the kernel. Adding is done in two phases: 1. Add the memory to the kernel 2. Online memory, making it available to the page allocator. Step 1 is partially architecture dependent, and updates the init_mm page table: * Update the direct map page tables. The direct map is a linear map, representing all physical memory: `virt = phys + PAGE_OFFSET` * Add a `struct page` for each added page of memory. Update the vmemmap (virtual mapping to the `struct page`, so we can easily transform a kernel virtual address to a `struct page *` address. From an MHP perspective, there are two regions of the PGD that are updated: * vmemmap * direct mapping of all physical memory The `struct mm_struct` has a couple of locks in play: * `spinlock_t page_table_lock` protects the page table, and some counters * `struct rw_semaphore mmap_lock` protect an mm's VMAs Note again that `init_mm` does not contain any VMAs, but still uses the mmap_lock in some places. The `page_table_lock` was originally used to protect all pages tables, but more recently a split page table lock has been introduced. The split lock has a per-table lock for the PTE and PMD tables. If split lock is disabled, all tables are guarded by `mm->page_table_lock` (for user processes). Split page table locks are not used for init_mm. 
MHP operations is typically synchronized using `DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`. Actors -- The following non-MHP actors in the kernel traverses (read), and/or modifies the kernel PGD. * `ptdump` Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the `mmap_write_lock(init_mm)` taken. Observation: ptdump can race with MHP, and needs additional locking to avoid crashes/races. * `set_direct_*` / `arch/riscv/mm/pageattr.c` The `set_direct_*` functionality is used to "synchronize" the direct map to other kernel mappings, e.g. modules/kernel text. The direct map is using "as large huge table mappings as possible", which means that the `set_direct_*` might need to split the direct map. The `set_direct_*` functions operates with
Re: [PATCH v2 8/8] riscv: mm: Add support for ZONE_DEVICE
Björn Töpel writes: > From: Björn Töpel > > ZONE_DEVICE pages need DEVMAP PTEs support to function > (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit > in the PTE for DEVMAP mark, add the corresponding helpers, and enable > ARCH_HAS_PTE_DEVMAP for riscv64. ...and this patch has rv32 build issues. Will fix as well.
Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support
Oscar Salvador writes: > On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote: >> +static void __meminit free_vmemmap_storage(struct page *page, size_t size, >> + struct vmem_altmap *altmap) >> +{ >> +if (altmap) >> +vmem_altmap_free(altmap, size >> PAGE_SHIFT); >> +else >> +free_pages((unsigned long)page_address(page), get_order(size)); > > David already pointed this out, but can check > arch/x86/mm/init_64.c:free_pagetable(). > > You will see that we have to do some magic for bootmem memory (DIMMs > which were not hotplugged but already present) Thank you! >> +#ifdef CONFIG_SPARSEMEM_VMEMMAP >> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct >> vmem_altmap *altmap) >> +{ >> +remove_pgd_mapping(start, end, true, altmap); >> +} >> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ >> +#endif /* CONFIG_MEMORY_HOTPLUG */ > > I will comment on the patch where you add support for hotplug and the > dependency, but on a track in LSFMM today, we decided that most likely > we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP > environments. > So, since you are adding this plain fresh, please consider to tight the > hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP. > As a bonus, you will only have to maintain one flavour of functions. Ah, yeah, I saw it mentioned on the LSF/MM/BPF topics. Less is definitely more -- I'll make the next version depend on SPARSEMEM_VMEMMAP. Björn
Re: [PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions
Oscar Salvador writes: > On Tue, May 14, 2024 at 04:04:40PM +0200, Björn Töpel wrote: >> From: Björn Töpel >> >> Prepare for memory hotplugging support by changing from __init to >> __meminit for the page table functions that are used by the upcoming >> architecture specific callbacks. >> >> Changing the __init attribute to __meminit, avoids that the functions >> are removed after init. The __meminit attribute makes sure the >> functions are kept in the kernel text post init, but only if memory >> hotplugging is enabled for the build. >> >> Also, make sure that the altmap parameter is properly passed on to >> vmemmap_populate_hugepages(). >> >> Signed-off-by: Björn Töpel > > Reviewed-by: Oscar Salvador > >> +static void __meminit create_linear_mapping_range(phys_addr_t start, >> phys_addr_t end, >> + uintptr_t fixed_map_size) >> { >> phys_addr_t pa; >> uintptr_t va, map_size; >> @@ -1435,7 +1429,7 @@ int __meminit vmemmap_populate(unsigned long start, >> unsigned long end, int node, >> * memory hotplug, we are not able to update all the page tables with >> * the new PMDs. >> */ >> -return vmemmap_populate_hugepages(start, end, node, NULL); >> +return vmemmap_populate_hugepages(start, end, node, altmap); > > I would have put this into a separate patch. Thanks for the review, Oscar! I'll split this up (also suggested by Alex!). Cheers, Björn
Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support
Alexandre Ghiti writes: > On Tue, May 14, 2024 at 4:05 PM Björn Töpel wrote: >> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params >> *params) >> +{ >> + int ret; >> + >> + create_linear_mapping_range(start, start + size, 0, &params->pgprot); >> + flush_tlb_all(); >> + ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, >> params); >> + if (ret) { >> + remove_linear_mapping(start, size); >> + return ret; >> + } >> + > > You need to flush the TLB here too since __add_pages() populates the > page table with the new vmemmap mapping (only because riscv allows to > cache invalid entries, I'll adapt this in my next version of Svvptc > support). > >> + max_pfn = PFN_UP(start + size); >> + max_low_pfn = max_pfn; >> + return 0; >> +} >> + >> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap >> *altmap) >> +{ >> + __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap); >> + remove_linear_mapping(start, size); > > You need to flush the TLB here too. I'll address all of the above in the next version. Thanks for reviewing the series! Björn
Re: [PATCH v2 6/8] riscv: Enable memory hotplugging for RISC-V
Alexandre Ghiti writes: > On Tue, May 14, 2024 at 4:05 PM Björn Töpel wrote: >> >> From: Björn Töpel >> >> Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for >> RISC-V. >> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/Kconfig | 2 ++ >> 1 file changed, 2 insertions(+) >> >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig >> index 6bec1bce6586..b9398b64bb69 100644 >> --- a/arch/riscv/Kconfig >> +++ b/arch/riscv/Kconfig >> @@ -16,6 +16,8 @@ config RISCV >> select ACPI_REDUCED_HARDWARE_ONLY if ACPI >> select ARCH_DMA_DEFAULT_COHERENT >> select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION >> + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && 64BIT && MMU > > I think this should be SPARSEMEM_VMEMMAP here. Hmm, care to elaborate? I thought that was optional.
Re: [PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions
Alexandre Ghiti writes: > On Tue, May 14, 2024 at 4:05 PM Björn Töpel wrote: >> >> From: Björn Töpel >> >> Prepare for memory hotplugging support by changing from __init to >> __meminit for the page table functions that are used by the upcoming >> architecture specific callbacks. >> >> Changing the __init attribute to __meminit, avoids that the functions >> are removed after init. The __meminit attribute makes sure the >> functions are kept in the kernel text post init, but only if memory >> hotplugging is enabled for the build. >> >> Also, make sure that the altmap parameter is properly passed on to >> vmemmap_populate_hugepages(). >> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/include/asm/mmu.h | 4 +-- >> arch/riscv/include/asm/pgtable.h | 2 +- >> arch/riscv/mm/init.c | 58 ++-- >> 3 files changed, 29 insertions(+), 35 deletions(-) >> >> diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h >> index 60be458e94da..c09c3c79f496 100644 >> --- a/arch/riscv/include/asm/mmu.h >> +++ b/arch/riscv/include/asm/mmu.h >> @@ -28,8 +28,8 @@ typedef struct { >> #endif >> } mm_context_t; >> >> -void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, >> - phys_addr_t sz, pgprot_t prot); >> +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t >> pa, phys_addr_t sz, >> + pgprot_t prot); >> #endif /* __ASSEMBLY__ */ >> >> #endif /* _ASM_RISCV_MMU_H */ >> diff --git a/arch/riscv/include/asm/pgtable.h >> b/arch/riscv/include/asm/pgtable.h >> index 58fd7b70b903..7933f493db71 100644 >> --- a/arch/riscv/include/asm/pgtable.h >> +++ b/arch/riscv/include/asm/pgtable.h >> @@ -162,7 +162,7 @@ struct pt_alloc_ops { >> #endif >> }; >> >> -extern struct pt_alloc_ops pt_ops __initdata; >> +extern struct pt_alloc_ops pt_ops __meminitdata; >> >> #ifdef CONFIG_MMU >> /* Number of PGD entries that a user-mode program can use */ >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 5b8cdfafb52a..c969427eab88 100644 >> --- 
a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -295,7 +295,7 @@ static void __init setup_bootmem(void) >> } >> >> #ifdef CONFIG_MMU >> -struct pt_alloc_ops pt_ops __initdata; >> +struct pt_alloc_ops pt_ops __meminitdata; >> >> pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; >> pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; >> @@ -357,7 +357,7 @@ static inline pte_t *__init >> get_pte_virt_fixmap(phys_addr_t pa) >> return (pte_t *)set_fixmap_offset(FIX_PTE, pa); >> } >> >> -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) >> +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) >> { >> return (pte_t *) __va(pa); >> } >> @@ -376,7 +376,7 @@ static inline phys_addr_t __init >> alloc_pte_fixmap(uintptr_t va) >> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >> } >> >> -static phys_addr_t __init alloc_pte_late(uintptr_t va) >> +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) >> { >> struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, >> 0); >> >> @@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va) >> return __pa((pte_t *)ptdesc_address(ptdesc)); >> } >> >> -static void __init create_pte_mapping(pte_t *ptep, >> - uintptr_t va, phys_addr_t pa, >> - phys_addr_t sz, pgprot_t prot) >> +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, >> phys_addr_t pa, phys_addr_t sz, >> +pgprot_t prot) >> { >> uintptr_t pte_idx = pte_index(va); >> >> @@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) >> return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); >> } >> >> -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) >> +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) >> { >> return (pmd_t *) __va(pa); >> } >> @@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) >> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >> } >> >> -static phys_addr_t __init alloc_pmd_late(uintptr_t va) >&g
Re: [PATCH v2 4/8] riscv: mm: Add memory hotplugging support
David Hildenbrand writes: > On 14.05.24 16:04, Björn Töpel wrote: >> From: Björn Töpel >> >> For an architecture to support memory hotplugging, a couple of >> callbacks needs to be implemented: >> >> arch_add_memory() >>This callback is responsible for adding the physical memory into the >>direct map, and call into the memory hotplugging generic code via >>__add_pages() that adds the corresponding struct page entries, and >>updates the vmemmap mapping. >> >> arch_remove_memory() >>This is the inverse of the callback above. >> >> vmemmap_free() >>This function tears down the vmemmap mappings (if >>CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the >>backing vmemmap pages. Note that for persistent memory, an >>alternative allocator for the backing pages can be used; The >>vmem_altmap. This means that when the backing pages are cleared, >>extra care is needed so that the correct deallocation method is >>used. >> >> arch_get_mappable_range() >>This functions returns the PA range that the direct map can map. >>Used by the MHP internals for sanity checks. >> >> The page table unmap/teardown functions are heavily based on code from >> the x86 tree. The same remove_pgd_mapping() function is used in both >> vmemmap_free() and arch_remove_memory(), but in the latter function >> the backing pages are not removed. 
>> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/mm/init.c | 242 +++ >> 1 file changed, 242 insertions(+) >> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 6f72b0b2b854..7f0b921a3d3a 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void) >> } >> } >> #endif >> + >> +#ifdef CONFIG_MEMORY_HOTPLUG >> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) >> +{ >> +pte_t *pte; >> +int i; >> + >> +for (i = 0; i < PTRS_PER_PTE; i++) { >> +pte = pte_start + i; >> +if (!pte_none(*pte)) >> +return; >> +} >> + >> +free_pages((unsigned long)page_address(pmd_page(*pmd)), 0); >> +pmd_clear(pmd); >> +} >> + >> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) >> +{ >> +pmd_t *pmd; >> +int i; >> + >> +for (i = 0; i < PTRS_PER_PMD; i++) { >> +pmd = pmd_start + i; >> +if (!pmd_none(*pmd)) >> +return; >> +} >> + >> +free_pages((unsigned long)page_address(pud_page(*pud)), 0); >> +pud_clear(pud); >> +} >> + >> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) >> +{ >> +pud_t *pud; >> +int i; >> + >> +for (i = 0; i < PTRS_PER_PUD; i++) { >> +pud = pud_start + i; >> +if (!pud_none(*pud)) >> +return; >> +} >> + >> +free_pages((unsigned long)page_address(p4d_page(*p4d)), 0); >> +p4d_clear(p4d); >> +} >> + >> +static void __meminit free_vmemmap_storage(struct page *page, size_t size, >> + struct vmem_altmap *altmap) >> +{ >> +if (altmap) >> +vmem_altmap_free(altmap, size >> PAGE_SHIFT); >> +else >> +free_pages((unsigned long)page_address(page), get_order(size)); > > If you unplug a DIMM that was added during boot (can happen on x86-64, > can it happen on riscv?), free_pages() would not be sufficient. You'd be > freeing a PG_reserved page that has to be freed differently. I'd say if it can happen on x86-64, it probably can on RISC-V. I'll look into this for the next spin! Thanks for spending time on the series! Cheers, Björn
Re: [PATCH v2 1/8] riscv: mm: Pre-allocate vmemmap/direct map PGD entries
Alexandre Ghiti writes: > Hi Björn, > > On Tue, May 14, 2024 at 4:05 PM Björn Töpel wrote: >> >> From: Björn Töpel >> >> The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to >> all userland page tables, which means that if the PGD level table is >> changed, other page tables has to be updated as well. >> >> Instead of having the PGD changes ripple out to all tables, the >> synchronization can be avoided by pre-allocating the PGD entries/pages >> at boot, avoiding the synchronization all together. >> >> This is currently done for the bpf/modules, and vmalloc PGD regions. >> Extend this scheme for the PGD regions touched by memory hotplugging. >> >> Prepare the RISC-V port for memory hotplug by pre-allocate >> vmemmap/direct map entries at the PGD level. This will roughly waste >> ~128 worth of 4K pages when memory hotplugging is enabled in the >> kernel configuration. >> >> Signed-off-by: Björn Töpel >> --- >> arch/riscv/include/asm/kasan.h | 4 ++-- >> arch/riscv/mm/init.c | 7 +++ >> 2 files changed, 9 insertions(+), 2 deletions(-) >> >> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h >> index 0b85e363e778..e6a0071bdb56 100644 >> --- a/arch/riscv/include/asm/kasan.h >> +++ b/arch/riscv/include/asm/kasan.h >> @@ -6,8 +6,6 @@ >> >> #ifndef __ASSEMBLY__ >> >> -#ifdef CONFIG_KASAN >> - >> /* >> * The following comment was copied from arm64: >> * KASAN_SHADOW_START: beginning of the kernel virtual addresses. 
>> @@ -34,6 +32,8 @@ >> */ >> #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & >> PGDIR_MASK) >> #define KASAN_SHADOW_END MODULES_LOWEST_VADDR >> + >> +#ifdef CONFIG_KASAN >> #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL) >> >> void kasan_init(void); >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 2574f6a3b0e7..5b8cdfafb52a 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -27,6 +27,7 @@ >> >> #include >> #include >> +#include >> #include >> #include >> #include >> @@ -1488,10 +1489,16 @@ static void __init >> preallocate_pgd_pages_range(unsigned long start, unsigned lon >> panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); >> } >> >> +#define PAGE_END KASAN_SHADOW_START >> + >> void __init pgtable_cache_init(void) >> { >> preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); >> if (IS_ENABLED(CONFIG_MODULES)) >> preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, >> "bpf/modules"); >> + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { >> + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, >> "vmemmap"); >> + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct >> map"); >> + } >> } >> #endif >> -- >> 2.40.1 >> > > As you asked, with > https://lore.kernel.org/linux-riscv/20240514133614.87813-1-alexgh...@rivosinc.com/T/#u, > you will be able to remove the usage of KASAN_SHADOW_START. Very nice -- consistency! I'll need to respin, so I'll clean this up for the next version. > But anyhow, you can add: > > Reviewed-by: Alexandre Ghiti Thank you! Björn
[PATCH v2 8/8] riscv: mm: Add support for ZONE_DEVICE
From: Björn Töpel ZONE_DEVICE pages need DEVMAP PTEs support to function (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit in the PTE for DEVMAP mark, add the corresponding helpers, and enable ARCH_HAS_PTE_DEVMAP for riscv64. Signed-off-by: Björn Töpel --- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/pgtable-64.h | 20 arch/riscv/include/asm/pgtable-bits.h | 1 + arch/riscv/include/asm/pgtable.h | 15 +++ 4 files changed, 37 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b9398b64bb69..6d426afdd904 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -36,6 +36,7 @@ config RISCV select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API select ARCH_HAS_PREPARE_SYNC_CORE_CMD + select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 221a5c1ee287..c67a9bbfd010 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd) #define p4d_offset p4d_offset p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pte_devmap(pte_t pte); +static inline pte_t pmd_pte(pmd_t pmd); + +static inline int pmd_devmap(pmd_t pmd) +{ + return pte_devmap(pmd_pte(pmd)); +} + +static inline int pud_devmap(pud_t pud) +{ + return 0; +} + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} +#endif + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 179bd4afece4..a8f5205cea54 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -19,6 +19,7 @@ #define _PAGE_SOFT (3 << 8)/* Reserved for software */ #define _PAGE_SPECIAL (1 << 8)/* RSW: 0x1 */ +#define _PAGE_DEVMAP(1 << 9)/* RSW, 
devmap */ #define _PAGE_TABLE _PAGE_PRESENT /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7933f493db71..216de1db3cd0 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -387,6 +387,11 @@ static inline int pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } +static inline int pte_devmap(pte_t pte) +{ + return pte_val(pte) & _PAGE_DEVMAP; +} + /* static inline pte_t pte_rdprotect(pte_t pte) */ static inline pte_t pte_wrprotect(pte_t pte) @@ -428,6 +433,11 @@ static inline pte_t pte_mkspecial(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL); } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DEVMAP); +} + static inline pte_t pte_mkhuge(pte_t pte) { return pte; @@ -711,6 +721,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pte_pmd(pte_mkdirty(pmd_pte(pmd))); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); +} + static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { -- 2.40.1
[PATCH v2 6/8] riscv: Enable memory hotplugging for RISC-V
From: Björn Töpel Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for RISC-V. Signed-off-by: Björn Töpel --- arch/riscv/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6bec1bce6586..b9398b64bb69 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,8 @@ config RISCV select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && 64BIT && MMU + select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_BINFMT_FLAT -- 2.40.1
[PATCH v2 7/8] virtio-mem: Enable virtio-mem for RISC-V
From: Björn Töpel Now that RISC-V has memory hotplugging support, virtio-mem can be used on the platform. Signed-off-by: Björn Töpel --- drivers/virtio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index c17193544268..4e5cebf1b82a 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -122,7 +122,7 @@ config VIRTIO_BALLOON config VIRTIO_MEM tristate "Virtio mem driver" - depends on X86_64 || ARM64 + depends on X86_64 || ARM64 || RISCV depends on VIRTIO depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE -- 2.40.1
[PATCH v2 4/8] riscv: mm: Add memory hotplugging support
From: Björn Töpel For an architecture to support memory hotplugging, a couple of callbacks needs to be implemented: arch_add_memory() This callback is responsible for adding the physical memory into the direct map, and call into the memory hotplugging generic code via __add_pages() that adds the corresponding struct page entries, and updates the vmemmap mapping. arch_remove_memory() This is the inverse of the callback above. vmemmap_free() This function tears down the vmemmap mappings (if CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the backing vmemmap pages. Note that for persistent memory, an alternative allocator for the backing pages can be used; The vmem_altmap. This means that when the backing pages are cleared, extra care is needed so that the correct deallocation method is used. arch_get_mappable_range() This functions returns the PA range that the direct map can map. Used by the MHP internals for sanity checks. The page table unmap/teardown functions are heavily based on code from the x86 tree. The same remove_pgd_mapping() function is used in both vmemmap_free() and arch_remove_memory(), but in the latter function the backing pages are not removed. 
Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 242 +++ 1 file changed, 242 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 6f72b0b2b854..7f0b921a3d3a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void) } } #endif + +#ifdef CONFIG_MEMORY_HOTPLUG +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + free_pages((unsigned long)page_address(pmd_page(*pmd)), 0); + pmd_clear(pmd); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + free_pages((unsigned long)page_address(pud_page(*pud)), 0); + pud_clear(pud); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + free_pages((unsigned long)page_address(p4d_page(*p4d)), 0); + p4d_clear(p4d); +} + +static void __meminit free_vmemmap_storage(struct page *page, size_t size, + struct vmem_altmap *altmap) +{ + if (altmap) + vmem_altmap_free(altmap, size >> PAGE_SHIFT); + else + free_pages((unsigned long)page_address(page), get_order(size)); +} + +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *ptep, pte; + + for (; addr < end; addr = next) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + ptep = pte_base + pte_index(addr); + pte = READ_ONCE(*ptep); + + if (!pte_present(*ptep)) + continue; + + pte_clear(_mm, addr, ptep); + if (is_vmemmap) + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap); + } +} + +static void __meminit 
remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end, +bool is_vmemmap, struct vmem_altmap *altmap) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmdp, pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + pmdp = pmd_base + pmd_index(addr); + pmd = READ_ONCE(*pmdp); + + if (!pmd_present(pmd)) + continue; + + if (pmd_leaf(pmd)) { + pmd_clear(pmdp); + if (is_vmemmap) + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap); + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmdp); + remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap); + free_pte_table(pte_base, pmdp); + } +} + +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end, +bool is_vmemmap,
[PATCH v2 5/8] riscv: mm: Take memory hotplug read-lock during kernel page table dump
From: Björn Töpel During memory hot remove, the ptdump functionality can end up touching stale data. Avoid any potential crashes (or worse), by holding the memory hotplug read-lock while traversing the page table. This change is analogous to arm64's commit bf2b59f60ee1 ("arm64/mm: Hold memory hotplug lock while walking for kernel page table dump"). Signed-off-by: Björn Töpel --- arch/riscv/mm/ptdump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 1289cc6d3700..9d5f657a251b 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -370,7 +371,9 @@ bool ptdump_check_wx(void) static int ptdump_show(struct seq_file *m, void *v) { + get_online_mems(); ptdump_walk(m, m->private); + put_online_mems(); return 0; } -- 2.40.1
[PATCH v2 3/8] riscv: mm: Refactor create_linear_mapping_range() for memory hot add
From: Björn Töpel Add a parameter to the direct map setup function, so it can be used in arch_add_memory() later. Signed-off-by: Björn Töpel --- arch/riscv/mm/init.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c969427eab88..6f72b0b2b854 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1227,7 +1227,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) } static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end, - uintptr_t fixed_map_size) + uintptr_t fixed_map_size, const pgprot_t *pgprot) { phys_addr_t pa; uintptr_t va, map_size; @@ -1238,7 +1238,7 @@ static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t best_map_size(pa, va, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); + pgprot ? *pgprot : pgprot_from_va(va)); } } @@ -1282,22 +1282,19 @@ static void __init create_linear_mapping_page_table(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - create_linear_mapping_range(start, end, 0); + create_linear_mapping_range(start, end, 0, NULL); } #ifdef CONFIG_STRICT_KERNEL_RWX - create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0); - create_linear_mapping_range(krodata_start, - krodata_start + krodata_size, 0); + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL); + create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL); memblock_clear_nomap(ktext_start, ktext_size); memblock_clear_nomap(krodata_start, krodata_size); #endif #ifdef CONFIG_KFENCE - create_linear_mapping_range(kfence_pool, - kfence_pool + KFENCE_POOL_SIZE, - PAGE_SIZE); + create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); #endif -- 2.40.1
[PATCH v2 2/8] riscv: mm: Change attribute from __init to __meminit for page functions
From: Björn Töpel Prepare for memory hotplugging support by changing from __init to __meminit for the page table functions that are used by the upcoming architecture specific callbacks. Changing the __init attribute to __meminit, avoids that the functions are removed after init. The __meminit attribute makes sure the functions are kept in the kernel text post init, but only if memory hotplugging is enabled for the build. Also, make sure that the altmap parameter is properly passed on to vmemmap_populate_hugepages(). Signed-off-by: Björn Töpel --- arch/riscv/include/asm/mmu.h | 4 +-- arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/mm/init.c | 58 ++-- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 60be458e94da..c09c3c79f496 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -28,8 +28,8 @@ typedef struct { #endif } mm_context_t; -void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot); +void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, + pgprot_t prot); #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_MMU_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 58fd7b70b903..7933f493db71 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -162,7 +162,7 @@ struct pt_alloc_ops { #endif }; -extern struct pt_alloc_ops pt_ops __initdata; +extern struct pt_alloc_ops pt_ops __meminitdata; #ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 5b8cdfafb52a..c969427eab88 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -295,7 +295,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -struct pt_alloc_ops pt_ops __initdata; +struct pt_alloc_ops pt_ops __meminitdata; pgd_t 
swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; @@ -357,7 +357,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) return (pte_t *)set_fixmap_offset(FIX_PTE, pa); } -static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa) { return (pte_t *) __va(pa); } @@ -376,7 +376,7 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pte_late(uintptr_t va) +static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -384,9 +384,8 @@ static phys_addr_t __init alloc_pte_late(uintptr_t va) return __pa((pte_t *)ptdesc_address(ptdesc)); } -static void __init create_pte_mapping(pte_t *ptep, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz, +pgprot_t prot) { uintptr_t pte_idx = pte_index(va); @@ -440,7 +439,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); } -static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa) { return (pmd_t *) __va(pa); } @@ -457,7 +456,7 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); } -static phys_addr_t __init alloc_pmd_late(uintptr_t va) +static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); @@ -465,9 +464,9 @@ static phys_addr_t __init alloc_pmd_late(uintptr_t va) return __pa((pmd_t *)ptdesc_address(ptdesc)); } -static void __init create_pmd_mapping(pmd_t *pmdp, - uintptr_t va, phys_addr_t pa, - phys_addr_t sz, pgprot_t prot) +static void __meminit 
create_pmd_mapping(pmd_t *pmdp, +uintptr_t va, phys_addr_t pa, +phys_addr_t sz, pgprot_t prot) { pte_t *ptep; phys_addr_t pte_phys; @@ -503,7 +502,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) return (pud_t *)set_fixmap_offset(FIX_PUD, pa); } -static pud_t *__init get_pud_virt_late(phys_addr_t pa) +static pud_t *__meminit get_pud_virt_late(phys_addr_t pa) { return (pud_t *)__va(pa); } @@ -521,7 +520,7 @@ static phys_addr_t __init alloc_pud_fixmap(uint
[PATCH v2 1/8] riscv: mm: Pre-allocate vmemmap/direct map PGD entries
From: Björn Töpel The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to all userland page tables, which means that if the PGD level table is changed, other page tables has to be updated as well. Instead of having the PGD changes ripple out to all tables, the synchronization can be avoided by pre-allocating the PGD entries/pages at boot, avoiding the synchronization all together. This is currently done for the bpf/modules, and vmalloc PGD regions. Extend this scheme for the PGD regions touched by memory hotplugging. Prepare the RISC-V port for memory hotplug by pre-allocate vmemmap/direct map entries at the PGD level. This will roughly waste ~128 worth of 4K pages when memory hotplugging is enabled in the kernel configuration. Signed-off-by: Björn Töpel --- arch/riscv/include/asm/kasan.h | 4 ++-- arch/riscv/mm/init.c | 7 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 0b85e363e778..e6a0071bdb56 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -6,8 +6,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_KASAN - /* * The following comment was copied from arm64: * KASAN_SHADOW_START: beginning of the kernel virtual addresses. 
@@ -34,6 +32,8 @@ */ #define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR + +#ifdef CONFIG_KASAN #define KASAN_SHADOW_OFFSET_AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 2574f6a3b0e7..5b8cdfafb52a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -1488,10 +1489,16 @@ static void __init preallocate_pgd_pages_range(unsigned long start, unsigned lon panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); } +#define PAGE_END KASAN_SHADOW_START + void __init pgtable_cache_init(void) { preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); if (IS_ENABLED(CONFIG_MODULES)) preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap"); + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map"); + } } #endif -- 2.40.1
[PATCH v2 0/8] riscv: Memory Hot(Un)Plug support
From: Björn Töpel Memory Hot(Un)Plug support (and ZONE_DEVICE) for the RISC-V port Introduction To quote "Documentation/admin-guide/mm/memory-hotplug.rst": "Memory hot(un)plug allows for increasing and decreasing the size of physical memory available to a machine at runtime." This series adds memory hot(un)plugging, and ZONE_DEVICE support for the RISC-V Linux port. I'm sending this series while LSF/MM/BPF is on-going, and with some luck some MM person can review the series while zoning out on a talk. ;-) MM configuration RISC-V MM has the following configuration: * Memory blocks are 128M, analogous to x86-64. It uses PMD ("hugepage") vmemmaps. From that follows that 2M (PMD) worth of vmemmap spans 32768 pages á 4K which gets us 128M. * The pageblock size is the minimum virtio_mem size, and on RISC-V it's 2M (2^9 * 4K). Implementation == The PGD table on RISC-V is shared/copied between all processes. To avoid doing page table synchronization, the first patch (patch 1) pre-allocates the PGD entries for vmemmap/direct map. By doing that the init_mm PGD will be fixed at kernel init, and synchronization can be avoided altogether. The following two patches (patch 2-3) do some preparations, followed by the actual MHP implementation (patch 4-5). Then, MHP and virtio-mem are enabled (patch 6-7), and finally ZONE_DEVICE support is added (patch 8). MHP and locking === TL;DR: The MHP does not step on any toes, except for ptdump. Additional locking is required for ptdump. Long version: For v2 I spent some time digging into init_mm synchronization/update. Here are my findings, and I'd love them to be corrected if incorrect. It's been a gnarly path... The `init_mm` structure is a special mm (perhaps not a "real" one). It's a "lazy context" that tracks kernel page table resources, e.g., the kernel page table (swapper_pg_dir), a kernel page_table_lock (more about the usage below), mmap_lock, and such. `init_mm` does not track/contain any VMAs. 
Having the `init_mm` is convenient, so that the regular kernel page table walk/modify functions can be used. Now, `init_mm` being special means that the locking for kernel page tables is special as well. On RISC-V the PGD (top-level page table structure), similar to x86, is shared (copied) with user processes. If the kernel PGD is modified, it has to be synched to user-mode processes' PGDs. This is avoided by pre-populating the PGD, so it'll be fixed from boot. The in-kernel pgd regions are documented in `Documentation/arch/riscv/vm-layout.rst`. The distinct regions are: * vmemmap * vmalloc/ioremap space * direct mapping of all physical memory * kasan * modules, BPF * kernel Memory hotplug is the process of adding/removing memory to/from the kernel. Adding is done in two phases: 1. Add the memory to the kernel 2. Online memory, making it available to the page allocator. Step 1 is partially architecture dependent, and updates the init_mm page table: * Update the direct map page tables. The direct map is a linear map, representing all physical memory: `virt = phys + PAGE_OFFSET` * Add a `struct page` for each added page of memory. Update the vmemmap (virtual mapping to the `struct page`, so we can easily transform a kernel virtual address to a `struct page *` address. From an MHP perspective, there are two regions of the PGD that are updated: * vmemmap * direct mapping of all physical memory The `struct mm_struct` has a couple of locks in play: * `spinlock_t page_table_lock` protects the page table, and some counters * `struct rw_semaphore mmap_lock` protects an mm's VMAs Note again that `init_mm` does not contain any VMAs, but still uses the mmap_lock in some places. The `page_table_lock` was originally used to protect all page tables, but more recently a split page table lock has been introduced. The split lock has a per-table lock for the PTE and PMD tables. If split lock is disabled, all tables are guarded by `mm->page_table_lock` (for user processes). 
Split page table locks are not used for init_mm. MHP operations are typically synchronized using `DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock)`. Actors -- The following non-MHP actors in the kernel traverse (read), and/or modify the kernel PGD. * `ptdump` Walks the entire `init_mm`, via `ptdump_walk_pgd()` with the `mmap_write_lock(init_mm)` taken. Observation: ptdump can race with MHP, and needs additional locking to avoid crashes/races. * `set_direct_*` / `arch/riscv/mm/pageattr.c` The `set_direct_*` functionality is used to "synchronize" the direct map to other kernel mappings, e.g. modules/kernel text. The direct map is using "as large huge table m
Re: [PATCH] ftrace: riscv: move from REGS to ARGS
Puranjay Mohan writes: > This commit replaces riscv's support for FTRACE_WITH_REGS with support > for FTRACE_WITH_ARGS. This is required for the ongoing effort to stop > relying on stop_machine() for RISCV's implementation of ftrace. > > The main relevant benefit that this change will bring for the above > use-case is that now we don't have separate ftrace_caller and > ftrace_regs_caller trampolines. This will allow the callsite to call > ftrace_caller by modifying a single instruction. Now the callsite can > do something similar to: > > When not tracing:| When tracing: > > func: func: > auipc t0, ftrace_caller_topauipc t0, ftrace_caller_top > nop <==> jalr t0, ftrace_caller_bottom > [...] [...] > > The above assumes that we are dropping the support of calling a direct > trampoline from the callsite. We need to drop this as the callsite can't > change the target address to call, it can only enable/disable a call to > a preset target (ftrace_caller in the above diagram). We can later optimize > this by calling an intermediate dispatcher trampoline before ftrace_caller. > > Currently, ftrace_regs_caller saves all CPU registers in the format of > struct pt_regs and allows the tracer to modify them. We don't need to > save all of the CPU registers because at function entry only a subset of > pt_regs is live: > > |--+--+-| > | Register | ABI Name | Description | > |--+--+-| > | x1 | ra | Return address for traced function | > | x2 | sp | Stack pointer | > | x5 | t0 | Return address for ftrace_caller trampoline | > | x8 | s0/fp| Frame pointer | > | x10-11 | a0-1 | Function arguments/return values| > | x12-17 | a2-7 | Function arguments | > |--+--+-| > > See RISCV calling convention[1] for the above table. > > Saving just the live registers decreases the amount of stack space > required from 288 Bytes to 112 Bytes. > > Basic testing was done with this on the VisionFive 2 development board. 
> > Note: > - Moving from REGS to ARGS will mean that RISCV will stop supporting > KPROBES_ON_FTRACE as it requires full pt_regs to be saved. > - KPROBES_ON_FTRACE will be supplanted by FPROBES see [2]. Puranjay, Apologies for the slow response. Restating from the RFC; This change is a prerequisite for moving towards a "FTRACE_WITH_CALL_OPS" model on riscv, and finally getting rid of the stop_machine() text patching. I've tested this on QEMU, and on the VisionFive2. No regressions (FTRACE_STARTUP_*, and kselftest), other than that KPROBES_ON_FTRACE no longer works. (Which will be addressed by [2]). Palmer, my preference is that this should go on for-next, rather than being part of the upcoming text patching support (worked on by Andy), but really up to you. Tested-by: Björn Töpel Reviewed-by: Björn Töpel
Re: [RFC PATCH] ftrace: riscv: move from REGS to ARGS
Puranjay Mohan writes: > This commit replaces riscv's support for FTRACE_WITH_REGS with support > for FTRACE_WITH_ARGS. This is required for the ongoing effort to stop > relying on stop_machine() for RISCV's implementation of ftrace. > > The main relevant benefit that this change will bring for the above > use-case is that now we don't have separate ftrace_caller and > ftrace_regs_caller trampolines. This will allow the callsite to call > ftrace_caller by modifying a single instruction. Now the callsite can > do something similar to: > > When not tracing:| When tracing: > > func: func: > auipc t0, ftrace_caller_topauipc t0, ftrace_caller_top > nop <==> jalr t0, ftrace_caller_bottom > [...] [...] > > The above assumes that we are dropping the support of calling a direct > trampoline from the callsite. We need to drop this as the callsite can't > change the target address to call, it can only enable/disable a call to > a preset target (ftrace_caller in the above diagram). > > Currently, ftrace_regs_caller saves all CPU registers in the format of > struct pt_regs and allows the tracer to modify them. We don't need to > save all of the CPU registers because at function entry only a subset of > pt_regs is live: > > |--+--+-| > | Register | ABI Name | Description | > |--+--+-| > | x1 | ra | Return address for traced function | > | x2 | sp | Stack pointer | > | x5 | t0 | Return address for ftrace_caller trampoline | > | x8 | s0/fp| Frame pointer | > | x10-11 | a0-1 | Function arguments/return values| > | x12-17 | a2-7 | Function arguments | > |--+--+-| > > See RISCV calling convention[1] for the above table. > > Saving just the live registers decreases the amount of stack space > required from 288 Bytes to 112 Bytes. > > Basic testing was done with this on the VisionFive 2 development board. > > Note: > - Moving from REGS to ARGS will mean that RISCV will stop supporting > KPROBES_ON_FTRACE as it requires full pt_regs to be saved. ...and FPROBES, but Masami is (still?) 
working on a series to address that [1]. My perspective; This is following the work Mark et al has done for arm64, and it does make sense for RISC-V as well. I'm in favor having this change as part of the bigger call_ops/text patch change for RISC-V. [...] > diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h > index 1276d7d9ca8b..1e95bf4ded6c 100644 > --- a/arch/riscv/include/asm/ftrace.h > +++ b/arch/riscv/include/asm/ftrace.h > @@ -124,20 +124,87 @@ struct dyn_ftrace; > int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); > #define ftrace_init_nop ftrace_init_nop > > -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS > +#define arch_ftrace_get_regs(regs) NULL > struct ftrace_ops; > -struct ftrace_regs; > +struct ftrace_regs { > + unsigned long epc; > + unsigned long ra; > + unsigned long sp; > + unsigned long s0; > + unsigned long t1; > + union { > + unsigned long args[8]; > + struct { > + unsigned long a0; > + unsigned long a1; > + unsigned long a2; > + unsigned long a3; > + unsigned long a4; > + unsigned long a5; > + unsigned long a6; > + unsigned long a7; > + }; > + }; > +}; > + > +static __always_inline unsigned long > +ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) > +{ > + return fregs->epc; > +} > + > +static __always_inline void > +ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, > + unsigned long pc) > +{ > + fregs->epc = pc; > +} > + > +static __always_inline unsigned long > +ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs) > +{ > + return fregs->sp; > +} > + > +static __always_inline unsigned long > +ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n) > +{ > + if (n < 8) > + return fregs->args[n]; > + return 0; > +} > + > +static __always_inline unsigned long > +ftrace_regs_get_return_value(const struct ftrace_regs *fregs) > +{ > + return fregs->a0; > +} > + > +static __always_inline void > 
+ftrace_regs_set_return_value(struct ftrace_regs *fregs, > + unsigned long ret) > +{ > + fregs->a0 = ret; > +} > + > +static __always_inline void >
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Andy Chiu writes: > On Thu, Mar 21, 2024 at 4:48 PM Björn Töpel wrote: >> >> Andy, >> >> Pulling out the A option: >> >> >> > A) Use auipc/jalr, only patch jalr to take us to a common >> >> >dispatcher/trampoline >> >> > >> >> > | # probably on a data cache-line != func >> >> > .text to avoid ping-pong >> >> > | ... >> >> > | func: >> >> > | ...make sure ra isn't messed up... >> >> > | aupic >> >> > | nop <=> jalr # Text patch point -> common_dispatch >> >> > | ACTUAL_FUNC >> >> > | >> >> > | common_dispatch: >> >> > | load based on ra >> >> > | jalr >> >> > | ... >> >> > >> >> > The auipc is never touched, and will be overhead. Also, we need a mv to >> >> > store ra in a scratch register as well -- like Arm. We'll have two insn >> >> > per-caller overhead for a disabled caller. >> > >> > My patch series takes a similar "in-function dispatch" approach. A >> > difference is that the is >> > embedded within each function entry. I'd like to have it moved to a >> > run-time allocated array to reduce total text size. >> >> This is what arm64 has as well. It's a 8B + 1-2 dirt cheap movish like >> instructions (save ra, prepare jump with auipc). I think that's a >> reasonable overhead. >> >> > Another difference is that my series changes the first instruction to >> > "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the >> > architecture guarantees the atomicity of the first instruction, then >> > we are safe. For example, we are safe if the first instruction could >> > only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is >> > always valid, we can fix "mv + jalr" down so we don't have to >> > play with the exception handler trick. The guarantee from arch would >> > require ziccif (in RVA22) though, but I think it is the same for us >> > (unless with stop_machine). For ziccif, I would rather call that out >> > during boot than blindly assume. >> >> I'm maybe biased, but I'd prefer the A) over your version with the >> unconditional jump. 
A) has the overhead of two, I'd say, free >> instructions (again "Meten is Weten!" ;-)). > Yes, I'd also prefer A for less overall patch size. We can also > optimize the overhead with a direct jump if that makes sense. Though, > we need to sort out a way to map functions to corresponding > trampolines. A direct way I could imagine is CALL_OPS'ish patching > style, if the ftrace destination has to be patched in a per-function > manner. For example: > > > func_symbol: > auipc t0, common_dispatch:high <=> j actual_func: > jalr t0, common_dispatch:low(t0) > > common_dispatch: > load t1, index + dispatch-list > ld t1, 0(t1) > jr t1 Yup, exactly like that (but I'd put the actual target ptr in there, instead of an additional indirection. Exactly what Mark does for arm64). When we enter the common_dispatch, the ptr would be -12(t0). As for patching auipc or jalr, I guess we need to measure what's best. My knee-jerk would be always auipc is better than jump -- but let's measure. ;-) Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Mark, Mark Rutland writes: >> A) Use auipc/jalr, only patch jalr to take us to a common >>dispatcher/trampoline >> >> | # probably on a data cache-line != func .text >> to avoid ping-pong >> | ... >> | func: >> | ...make sure ra isn't messed up... >> | aupic >> | nop <=> jalr # Text patch point -> common_dispatch >> | ACTUAL_FUNC >> | >> | common_dispatch: >> | load based on ra >> | jalr >> | ... >> >> The auipc is never touched, and will be overhead. Also, we need a mv to >> store ra in a scratch register as well -- like Arm. We'll have two insn >> per-caller overhead for a disabled caller. > > Is the AUIPC a significant overhead? IIUC that's similar to Arm's ADRP, and > I'd > have expected that to be pretty cheap. No, reg-to-reg moves are dirt cheap in my book. > IIUC your JALR can choose which destination register to store the return > address in, and if so, you could leave the original ra untouched (and recover > that in the common trampoline). Have I misunderstood that? > > Maybe that doesn't play nicely with something else? No, you're right, we can link to another register, and shave off an instruction. I can imagine that some implementation prefer x1/x5 for branch prediction reasons, but that's something that we can measure on. So, 1-2 movs + nop are unconditionally executed on the disabled case. (1-2 depending on the ra save/jalr reg strategy). >> B) Use jal, which can only take us +/-1M, and requires multiple >>dispatchers (and tracking which one to use, and properly distribute >>them. Ick.) >> >> | # probably on a data cache-line != func .text >> to avoid ping-pong >> | ... >> | func: >> | ...make sure ra isn't messed up... >> | nop <=> jal # Text patch point -> within_1M_to_func_dispatch >> | ACTUAL_FUNC >> | >> | within_1M_to_func_dispatch: >> | load based on ra >> | jalr >> >> C) Use jal, which can only take us +/-1M, and use a per-function >>trampoline requires multiple dispatchers (and tracking which one to >>use). Blows up text size A LOT. 
>> >> | # somewhere, but probably on a different >> cacheline than the .text to avoid ping-ongs >> | ... >> | per_func_dispatch >> | load based on ra >> | jalr >> | func: >> | ...make sure ra isn't messed up... >> | nop <=> jal # Text patch point -> per_func_dispatch >> | ACTUAL_FUNC > > Beware that with option (C) you'll need to handle that in your unwinder for > RELIABLE_STACKTRACE. If you don't have a symbol for per_func_dispatch (or > func_trace_target_data_8B), PC values within per_func_dispatch would be > symbolized as the prior function/data. Good point (but I don't like C much...)! >> It's a bit sad that we'll always have to have a dispatcher/trampoline, >> but it's still better than stop_machine(). (And we'll need a fencei IPI >> as well, but only one. ;-)) >> >> Today, I'm leaning towards A (which is what Mark suggested, and also >> Robbin).. Any other options? > > Assuming my understanding of JALR above is correct, I reckon A is the nicest > option out of A/B/C. Yes! +1! Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Andy, Pulling out the A option: >> > A) Use auipc/jalr, only patch jalr to take us to a common >> >dispatcher/trampoline >> > >> > | # probably on a data cache-line != func >> > .text to avoid ping-pong >> > | ... >> > | func: >> > | ...make sure ra isn't messed up... >> > | aupic >> > | nop <=> jalr # Text patch point -> common_dispatch >> > | ACTUAL_FUNC >> > | >> > | common_dispatch: >> > | load based on ra >> > | jalr >> > | ... >> > >> > The auipc is never touched, and will be overhead. Also, we need a mv to >> > store ra in a scratch register as well -- like Arm. We'll have two insn >> > per-caller overhead for a disabled caller. > > My patch series takes a similar "in-function dispatch" approach. A > difference is that the is > embedded within each function entry. I'd like to have it moved to a > run-time allocated array to reduce total text size. This is what arm64 has as well. It's a 8B + 1-2 dirt cheap movish like instructions (save ra, prepare jump with auipc). I think that's a reasonable overhead. > Another difference is that my series changes the first instruction to > "j ACTUAL_FUNC" for the "ftrace disable" case. As long as the > architecture guarantees the atomicity of the first instruction, then > we are safe. For example, we are safe if the first instruction could > only be "mv tmp, ra" or "j ACTUAL_FUNC". And since the loaded address is > always valid, we can fix "mv + jalr" down so we don't have to > play with the exception handler trick. The guarantee from arch would > require ziccif (in RVA22) though, but I think it is the same for us > (unless with stop_machine). For ziccif, I would rather call that out > during boot than blindly assume. I'm maybe biased, but I'd prefer the A) over your version with the unconditional jump. A) has the overhead of two, I'd say, free instructions (again "Meten is Weten!" ;-)). > However, one thing I am not very sure is: do we need a destination > address in a "per-function" manner? 
It seems like most of the time the > destination address can only be ftrace_call, or ftrace_regs_call. If > the number of destination addresses is very few, then we could > potentially reduce the size of > . Yes, we do need a per-function manner. BPF, e.g., uses dynamically/JIT:ed trampolines/targets. Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Björn Töpel writes: > Puranjay Mohan writes: > >> Björn Töpel writes: >> >>> >>> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic >>> trampolines changes quite a bit. >>> >>> The more I look at the pains of patching two instruction ("split >>> immediates"), the better "patch data" + one insn patching look. >> >> I was looking at how dynamic trampolines would be implemented for RISC-V. >> >> With CALL-OPS we need to patch the auipc+jalr at function entry only, the >> ops pointer above the function can be patched atomically. >> >> With a dynamic trampoline we need a auipc+jalr pair at function entry to jump >> to the trampoline and then another auipc+jalr pair to jump from trampoline to >> ops->func. When the ops->func is modified, we would need to update the >> auipc+jalr at in the trampoline. >> >> So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines? > > Yeah. Honestly, we need to figure out the patching story prior > choosing the path, so let's start there. > > After reading Mark's reply, and discussing with OpenJDK folks (who does > the most crazy text patching on all platforms), having to patch multiple > instructions (where the address materialization is split over multiple > instructions) is a no-go. It's just a too big can of worms. So, if we > can only patch one insn, it's CALL_OPS. > > A couple of options (in addition to Andy's), and all require a > per-function landing address ala CALL_OPS) tweaking what Mark is doing > on Arm (given the poor branch range). > > ...and maybe we'll get RISC-V rainbows/unicorns in the future getting > better reach (full 64b! ;-)). > > A) Use auipc/jalr, only patch jalr to take us to a common >dispatcher/trampoline > > | # probably on a data cache-line != func .text > to avoid ping-pong > | ... > | func: > | ...make sure ra isn't messed up... 
> | auipc > | nop <=> jalr # Text patch point -> common_dispatch > | ACTUAL_FUNC > | > | common_dispatch: > | load based on ra > | jalr > | ... > > The auipc is never touched, and will be overhead. Also, we need a mv to > store ra in a scratch register as well -- like Arm. We'll have two insn > per-caller overhead for a disabled caller. > > B) Use jal, which can only take us +/-1M, and requires multiple >dispatchers (and tracking which one to use, and properly distribute >them. Ick.) > > | # probably on a data cache-line != func .text > to avoid ping-pong > | ... > | func: > | ...make sure ra isn't messed up... > | nop <=> jal # Text patch point -> within_1M_to_func_dispatch > | ACTUAL_FUNC > | > | within_1M_to_func_dispatch: > | load based on ra > | jalr > > C) Use jal, which can only take us +/-1M, and use a per-function >trampoline requires multiple dispatchers (and tracking which one to >use). Blows up text size A LOT. > > | # somewhere, but probably on a different > cacheline than the .text to avoid ping-pongs > | ... > | per_func_dispatch > | load based on ra > | jalr > | func: > | ...make sure ra isn't messed up... > | nop <=> jal # Text patch point -> per_func_dispatch > | ACTUAL_FUNC Brendan proposed yet another option, "in-function dispatch": D) | # idk somewhere | ... | func: |mv tmp1, ra |auipc tmp2, |mv tmp3, zero <=> ld tmp3, tmp2 |nop <=> jalr ra, tmp3 |ACTUAL_FUNC There are 4 CMODX possibilities: mv, nop: fully disabled, no problems mv, jalr: We will jump to zero. We would need to have the inst page/access fault handler take care of this case. Especially if we align the instructions so that they can be patched together, being interrupted in the middle and taking this path will be rare. ld, nop: no problems ld, jalr: fully enabled, no problems Patching is a 64b store/sd, and we only need a fence.i at the end, since we can handle all 4 possibilities. For the disabled case we'll have: A) mv, aupic, nop D) mv, aupic, mv, nop. Puranjay, I've flipped. 
Let's go Mark's CALL_OPS together with a new text patch mechanism w/o stop_machine(). Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Puranjay Mohan writes: > Björn Töpel writes: > >> >> Hmm, depending on RISC-V's CMODX path, the pro/cons CALL_OPS vs dynamic >> trampolines changes quite a bit. >> >> The more I look at the pains of patching two instruction ("split >> immediates"), the better "patch data" + one insn patching look. > > I was looking at how dynamic trampolines would be implemented for RISC-V. > > With CALL-OPS we need to patch the auipc+jalr at function entry only, the > ops pointer above the function can be patched atomically. > > With a dynamic trampoline we need a auipc+jalr pair at function entry to jump > to the trampoline and then another auipc+jalr pair to jump from trampoline to > ops->func. When the ops->func is modified, we would need to update the > auipc+jalr at in the trampoline. > > So, I am not sure how to move forward here, CALL-OPS or Dynamic trampolines? Yeah. Honestly, we need to figure out the patching story prior choosing the path, so let's start there. After reading Mark's reply, and discussing with OpenJDK folks (who does the most crazy text patching on all platforms), having to patch multiple instructions (where the address materialization is split over multiple instructions) is a no-go. It's just a too big can of worms. So, if we can only patch one insn, it's CALL_OPS. A couple of options (in addition to Andy's), and all require a per-function landing address ala CALL_OPS) tweaking what Mark is doing on Arm (given the poor branch range). ...and maybe we'll get RISC-V rainbows/unicorns in the future getting better reach (full 64b! ;-)). A) Use auipc/jalr, only patch jalr to take us to a common dispatcher/trampoline | # probably on a data cache-line != func .text to avoid ping-pong | ... | func: | ...make sure ra isn't messed up... | aupic | nop <=> jalr # Text patch point -> common_dispatch | ACTUAL_FUNC | | common_dispatch: | load based on ra | jalr | ... The auipc is never touched, and will be overhead. 
Also, we need a mv to store ra in a scratch register as well -- like Arm. We'll have two insn per-caller overhead for a disabled caller. B) Use jal, which can only take us +/-1M, and requires multiple dispatchers (and tracking which one to use, and properly distribute them. Ick.) | # probably on a data cache-line != func .text to avoid ping-pong | ... | func: | ...make sure ra isn't messed up... | nop <=> jal # Text patch point -> within_1M_to_func_dispatch | ACTUAL_FUNC | | within_1M_to_func_dispatch: | load based on ra | jalr C) Use jal, which can only take us +/-1M, and use a per-function trampoline requires multiple dispatchers (and tracking which one to use). Blows up text size A LOT. | # somewhere, but probably on a different cacheline than the .text to avoid ping-ongs | ... | per_func_dispatch | load based on ra | jalr | func: | ...make sure ra isn't messed up... | nop <=> jal # Text patch point -> per_func_dispatch | ACTUAL_FUNC It's a bit sad that we'll always have to have a dispatcher/trampoline, but it's still better than stop_machine(). (And we'll need a fencei IPI as well, but only one. ;-)) Today, I'm leaning towards A (which is what Mark suggested, and also Robbin).. Any other options? [Now how do we implement OPTPROBES? I'm kidding. ;-)] Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Mark Rutland writes: > Hi Bjorn > > (apologies, my corporate mail server has butchered your name here). Ha! That's the price I have to pay for carrying double-umlauts everywhere. Thanks for getting back with a really useful answer! >> On Arm64, CALL_OPS makes it possible to implement direct calls, while >> only patching one BL instruction -- nice! > > The key thing here isn't that we patch a single instruction (since we have ot > patch the ops pointer too!); it's that we can safely patch either of the ops > pointer or BL/NOP at any time while threads are concurrently executing. ...which indeed is a very nice property! > If you have a multi-instruction sequence, then threads can be preempted > mid-sequence, and it's very painful/complex to handle all of the races that > entails. Oh yes. RISC-V is currently using auipc/jalr with stop_machine(), and also requires that preemtion is off. Unusable to put it blunt. > For example, if your callsites use a sequence: > > AUIPC , > JALR , () > > Using stop_machine() won't allow you to patch that safely as some threads > could be stuck mid-sequence, e.g. > > AUIPC , > [ preempted here ] > JALR , () > > ... and you can't update the JALR to use a new funcptr immediate until those > have completed the sequence. > > There are ways around that, but they're complicated and/or expensive, e.g. > > * Use a sequence of multiple patches, starting with replacing the JALR with an > exception-generating instruction with a fixup handler, which is sort-of what > x86 does with UD2. This may require multiple passes with > synchronize_rcu_tasks() to make sure all threads have seen the latest > instructions, and that cannot be done under stop_machine(), so if you need > stop_machine() for CMODx reasons, you may need to use that several times > with > intervening calls to synchronize_rcu_tasks(). > > * Have the patching logic manually go over each thread and fix up the pt_regs > for the interrupted thread. 
This is pretty horrid since you could have > nested > exceptions and a task could have several pt_regs which might require > updating. Yup, and both of these have rather unplesant overhead. > The CALL_OPS approach is a bit easier to deal with as we can patch the > per-callsite pointer atomically, then we can (possibly) enable/disable the > callsite's branch, then wait for threads to drain once. > > As a heads-up, there are some latent/generic issues with DYNAMIC_FTRACE > generally in this area (CALL_OPs happens to side-step those, but trampoline > usage is currently affected): > > https://lore.kernel.org/lkml/Zenx_Q0UiwMbSAdP@FVFF77S0Q05N/ > > ... I'm looking into fixing that at the moment, and it looks like that's > likely > to require some per-architecture changes. > >> On RISC-V we cannot use use the same ideas as Arm64 straight off, >> because the range of jal (compare to BL) is simply too short (+/-1M). >> So, on RISC-V we need to use a full auipc/jal pair (the text patching >> story is another chapter, but let's leave that aside for now). Since we >> have to patch multiple instructions, the cmodx situation doesn't really >> improve with CALL_OPS. > > The branch range thing is annoying, but I think this boils down to the same > problem as arm64 has with needing a "MOV , LR" instruction that we have > to > patch in once at boot time. You could do the same and patch in the AUIPC once, > e.g. have > > | NOP > | NOP > | func: > | AUIPC , > | JALR , () // patched with NOP > > ... which'd look very similar to arm64's sequence: > > | NOP > | NOP > | func: > | MOV X9, LR > | BL ftrace_caller // patched with NOP > > ... which I think means it *might* be better from a cmodx perspective? > >> Let's say that we continue building on your patch and implement direct >> calls on CALL_OPS for RISC-V as well. 
>> >> From Florent's commit message for direct calls: >> >> |There are a few cases to distinguish: >> |- If a direct call ops is the only one tracing a function: >> | - If the direct called trampoline is within the reach of a BL >> |instruction >> | -> the ftrace patchsite jumps to the trampoline >> | - Else >> | -> the ftrace patchsite jumps to the ftrace_caller trampoline >> which >> |reads the ops pointer in the patchsite and jumps to the direct >> |call address stored in the ops >> |- Else >> | -> the ftrace patchsite jumps to the ftrace_caller trampoline and >> its >> | ops literal points to ftrace_list_ops so it iterates over all >> | registered ftrace ops, including the direct call ops and calls >> its >> | call_direct_funcs handler which stores the direct called >> | trampoline's address in the ftrace_regs and the ftrace_caller >> | trampoline will return to that address instead of returning to >> the >> | traced function >>
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Puranjay Mohan writes: >> Now, I think a better approach for RISC-V would be implementing what x86 >> has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V. >> >> Thoughts? > > I am going to spin some patches for implementing > arch_ftrace_update_trampoline() for > RISC-V, then we can compare the two approaches and see which is > better. But I agree > that arch_ftrace_update_trampoline() is a better approach given that > we can jump anywhere > with auipc/jalr. Yup, and the text size won't blow up. Cheers, Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Puranjay Mohan writes: > Hi Björn, > > On Thu, Mar 7, 2024 at 8:27 PM Björn Töpel wrote: >> >> Puranjay! >> >> Puranjay Mohan writes: >> >> > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V. >> > This allows each ftrace callsite to provide an ftrace_ops to the common >> > ftrace trampoline, allowing each callsite to invoke distinct tracer >> > functions without the need to fall back to list processing or to >> > allocate custom trampolines for each callsite. This significantly speeds >> > up cases where multiple distinct trace functions are used and callsites >> > are mostly traced by a single tracer. >> > >> > The idea and most of the implementation is taken from the ARM64's >> > implementation of the same feature. The idea is to place a pointer to >> > the ftrace_ops as a literal at a fixed offset from the function entry >> > point, which can be recovered by the common ftrace trampoline. >> >> Not really a review, but some more background; Another rationale (on-top >> of the improved per-call performance!) for CALL_OPS was to use it to >> build ftrace direct call support (which BPF uses a lot!). Mark, please >> correct me if I'm lying here! >> >> On Arm64, CALL_OPS makes it possible to implement direct calls, while >> only patching one BL instruction -- nice! >> >> On RISC-V we cannot use use the same ideas as Arm64 straight off, >> because the range of jal (compare to BL) is simply too short (+/-1M). >> So, on RISC-V we need to use a full auipc/jal pair (the text patching >> story is another chapter, but let's leave that aside for now). Since we >> have to patch multiple instructions, the cmodx situation doesn't really >> improve with CALL_OPS. >> >> Let's say that we continue building on your patch and implement direct >> calls on CALL_OPS for RISC-V as well. 
>> >> From Florent's commit message for direct calls: >> >> |There are a few cases to distinguish: >> |- If a direct call ops is the only one tracing a function: >> | - If the direct called trampoline is within the reach of a BL >> |instruction >> | -> the ftrace patchsite jumps to the trampoline >> | - Else >> | -> the ftrace patchsite jumps to the ftrace_caller trampoline >> which >> |reads the ops pointer in the patchsite and jumps to the direct >> |call address stored in the ops >> |- Else >> | -> the ftrace patchsite jumps to the ftrace_caller trampoline and >> its >> | ops literal points to ftrace_list_ops so it iterates over all >> | registered ftrace ops, including the direct call ops and calls >> its >> | call_direct_funcs handler which stores the direct called >> | trampoline's address in the ftrace_regs and the ftrace_caller >> | trampoline will return to that address instead of returning to >> the >> | traced function >> >> On RISC-V, where auipc/jalr is used, the direct called trampoline would >> always be reachable, and then first Else-clause would never be entered. >> This means the the performance for direct calls would be the same as the >> one we have today (i.e. no regression!). >> >> RISC-V does like x86 does (-ish) -- patch multiple instructions, long >> reach. >> >> Arm64 uses CALL_OPS and patch one instruction BL. >> >> Now, with this background in mind, compared to what we have today, >> CALL_OPS would give us (again assuming we're using it for direct calls): >> >> * Better performance for tracer per-call (faster ops lookup) GOOD > > ^ this was the only motivation for me to implement this patch. > > I don't think implementing direct calls over call ops is fruitful for > RISC-V because once > the auipc/jalr can be patched atomically, the direct call trampoline > is always reachable. > Solving the atomic text patching problem would be fun!! I am eager to > see how it will be > solved. 
Given the upcoming Zjid spec, we'll soon be in a much better place where we can reason about cmodx. >> * Larger text size (function alignment + extra nops) BAD >> * Same direct call performance NEUTRAL >> * Same complicated text patching required NEUTRAL >> >> It would be interesting to see how the per-call performance would >> improve on x86 with CALL_OPS! ;-) > > If I remember from Steven's talk, x86 uses dynamically allocated trampolines > for per callsite tracers, would CALL_OPS provide better performance than that? Probably not, and it was really a tongue-in-cheek comment -- nothing I encourage you to do! Now, I think a better approach for RISC-V would be implementing what x86 has (arch_ftrace_update_trampoline()), rather than CALL_OPS for RISC-V. Thoughts? Björn
Re: [RFC PATCH] riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
Puranjay! Puranjay Mohan writes: > This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V. > This allows each ftrace callsite to provide an ftrace_ops to the common > ftrace trampoline, allowing each callsite to invoke distinct tracer > functions without the need to fall back to list processing or to > allocate custom trampolines for each callsite. This significantly speeds > up cases where multiple distinct trace functions are used and callsites > are mostly traced by a single tracer. > > The idea and most of the implementation is taken from the ARM64's > implementation of the same feature. The idea is to place a pointer to > the ftrace_ops as a literal at a fixed offset from the function entry > point, which can be recovered by the common ftrace trampoline. Not really a review, but some more background; Another rationale (on-top of the improved per-call performance!) for CALL_OPS was to use it to build ftrace direct call support (which BPF uses a lot!). Mark, please correct me if I'm lying here! On Arm64, CALL_OPS makes it possible to implement direct calls, while only patching one BL instruction -- nice! On RISC-V we cannot use the same ideas as Arm64 straight off, because the range of jal (compare to BL) is simply too short (+/-1M). So, on RISC-V we need to use a full auipc/jal pair (the text patching story is another chapter, but let's leave that aside for now). Since we have to patch multiple instructions, the cmodx situation doesn't really improve with CALL_OPS. Let's say that we continue building on your patch and implement direct calls on CALL_OPS for RISC-V as well. 
>From Florent's commit message for direct calls: |There are a few cases to distinguish: |- If a direct call ops is the only one tracing a function: | - If the direct called trampoline is within the reach of a BL |instruction | -> the ftrace patchsite jumps to the trampoline | - Else | -> the ftrace patchsite jumps to the ftrace_caller trampoline which |reads the ops pointer in the patchsite and jumps to the direct |call address stored in the ops |- Else | -> the ftrace patchsite jumps to the ftrace_caller trampoline and its | ops literal points to ftrace_list_ops so it iterates over all | registered ftrace ops, including the direct call ops and calls its | call_direct_funcs handler which stores the direct called | trampoline's address in the ftrace_regs and the ftrace_caller | trampoline will return to that address instead of returning to the | traced function On RISC-V, where auipc/jalr is used, the direct called trampoline would always be reachable, and then the first Else-clause would never be entered. This means the performance for direct calls would be the same as the one we have today (i.e. no regression!). RISC-V does like x86 does (-ish) -- patch multiple instructions, long reach. Arm64 uses CALL_OPS and patch one instruction BL. Now, with this background in mind, compared to what we have today, CALL_OPS would give us (again assuming we're using it for direct calls): * Better performance for tracer per-call (faster ops lookup) GOOD * Larger text size (function alignment + extra nops) BAD * Same direct call performance NEUTRAL * Same complicated text patching required NEUTRAL It would be interesting to see how the per-call performance would improve on x86 with CALL_OPS! ;-) I'm trying to wrap my head if it makes sense to have it on RISC-V, given that we're a bit different from Arm64. Does the scale tip to the GOOD side? Oh, and we really need to see performance numbers on real HW! I have a VF2 that I could try this series on. Björn
Re: [PATCH v3 2/2] riscv: Fix text patching when IPI are used
Conor Dooley writes: > On Tue, Mar 05, 2024 at 08:33:30AM +0530, Anup Patel wrote: >> On Tue, Mar 5, 2024 at 1:54 AM Björn Töpel wrote: >> > >> > Conor Dooley writes: >> > >> > > On Thu, Feb 29, 2024 at 01:10:56PM +0100, Alexandre Ghiti wrote: >> > >> For now, we use stop_machine() to patch the text and when we use IPIs >> > >> for >> > >> remote icache flushes (which is emitted in patch_text_nosync()), the >> > >> system >> > >> hangs. >> > >> >> > >> So instead, make sure every CPU executes the stop_machine() patching >> > >> function and emit a local icache flush there. >> > >> >> > >> Co-developed-by: Björn Töpel >> > >> Signed-off-by: Björn Töpel >> > >> Signed-off-by: Alexandre Ghiti >> > >> Reviewed-by: Andrea Parri >> > > >> > > What commit does this fix? >> > >> > Hmm. The bug is exposed when the AIA IPI are introduced, and used >> > (instead of the firmware-based). >> > >> > I'm not sure this is something we'd like backported, but rather a >> > prerequisite to AIA. >> > >> > @Anup @Alex WDYT? >> > >> >> The current text patching never considered IPIs being injected >> directly in S-mode from hart to another so we are seeing this >> issue now with AIA IPIs. >> >> We certainly don't need to backport this fix since it's more >> of a preparatory fix for AIA IPIs. > > Whether or not this is backportable, if it fixes a bug, it should get > a Fixes: tag for the commit that it fixes. Fixes: isn't "the backport" > tag, cc: stable is. I guess the question is if this *is* a fix, or rather a change required for AIA (not a fix).
Re: [PATCH v3 2/2] riscv: Fix text patching when IPI are used
Conor Dooley writes: > On Thu, Feb 29, 2024 at 01:10:56PM +0100, Alexandre Ghiti wrote: >> For now, we use stop_machine() to patch the text and when we use IPIs for >> remote icache flushes (which is emitted in patch_text_nosync()), the system >> hangs. >> >> So instead, make sure every CPU executes the stop_machine() patching >> function and emit a local icache flush there. >> >> Co-developed-by: Björn Töpel >> Signed-off-by: Björn Töpel >> Signed-off-by: Alexandre Ghiti >> Reviewed-by: Andrea Parri > > What commit does this fix? Hmm. The bug is exposed when the AIA IPI are introduced, and used (instead of the firmware-based). I'm not sure this is something we'd like backported, but rather a prerequisite to AIA. @Anup @Alex WDYT?
Re: [PATCH] riscv: Fix text patching when icache flushes use IPIs
Alexandre Ghiti writes: > For now, we use stop_machine() to patch the text and when we use IPIs for > remote icache flushes, the system hangs since the irqs are disabled on all > cpus. > > So instead, make sure every cpu executes the stop_machine() patching > function which emits a local icache flush and then avoids the use of > IPIs. > > Co-developed-by: Björn Töpel > Signed-off-by: Björn Töpel > Signed-off-by: Alexandre Ghiti FWIW, the BPF selftests pass nicely with this (especially the fentry/fexit tests ;-)). I don't know if it's worth much saying that your own stuff was tested, but here goes: Tested-by: Björn Töpel
[PATCH v12 for-next 4/4] samples: ftrace: Add RISC-V support for SAMPLE_FTRACE_DIRECT[_MULTI]
From: Song Shuai Add RISC-V variants of the ftrace-direct* samples. Tested-by: Evgenii Shatokhin Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel --- arch/riscv/Kconfig | 2 + samples/ftrace/ftrace-direct-modify.c | 35 ++ samples/ftrace/ftrace-direct-multi-modify.c | 41 + samples/ftrace/ftrace-direct-multi.c| 25 + samples/ftrace/ftrace-direct-too.c | 28 ++ samples/ftrace/ftrace-direct.c | 24 6 files changed, 155 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4684cdc754a0..0ee79a92918d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -142,6 +142,8 @@ config RISCV select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RETHOOK if !XIP_KERNEL select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index e2a6a69352df..81220390851a 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -24,6 +24,41 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addisp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" callmy_direct_func1\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addisp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" + +" my_tramp2:\n" +" addisp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" callmy_direct_func2\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addisp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + 
+#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index 2e349834d63c..f943e40d57fd 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -22,6 +22,47 @@ void my_direct_func2(unsigned long ip) extern void my_tramp1(void *); extern void my_tramp2(void *); +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S"a0,0*"SZREG"(sp)\n" +" "REG_S"t0,1*"SZREG"(sp)\n" +" "REG_S"ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func1\n" +" "REG_L"a0,0*"SZREG"(sp)\n" +" "REG_L"t0,1*"SZREG"(sp)\n" +" "REG_L"ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S"a0,0*"SZREG"(sp)\n" +" "REG_S"t0,1*"SZREG"(sp)\n" +" "REG_S"ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func2\n" +" "REG_L"a0,0*"SZREG"(sp)\n" +" "REG_L"t0,1*"SZREG"(sp)\n" +" "REG_L"ra,2*&quo
[PATCH v12 for-next 3/4] riscv: ftrace: Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support
From: Song Shuai Select the DYNAMIC_FTRACE_WITH_DIRECT_CALLS to provide the register_ftrace_direct[_multi] interfaces allowing users to register the customed trampoline (direct_caller) as the mcount for one or more target functions. And modify_ftrace_direct[_multi] are also provided for modifying direct_caller. To make the direct_caller and the other ftrace hooks (e.g. function/fgraph tracer, k[ret]probes) co-exist, a temporary register is nominated to store the address of direct_caller in ftrace_regs_caller. After the setting of the address direct_caller by direct_ops->func and the RESTORE_REGS in ftrace_regs_caller, direct_caller will be jumped to by the `jr` inst. Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support for RISC-V. Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/ftrace.h | 7 +++ arch/riscv/kernel/mcount-dyn.S | 10 ++ 3 files changed, 18 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 69c95e42be9f..4684cdc754a0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -114,6 +114,7 @@ config RISCV select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index b383926f73be..329172122952 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -135,6 +135,13 @@ struct ftrace_regs; void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #define ftrace_graph_func ftrace_graph_func + +static inline void __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) 
+{ + regs->t1 = addr; +} +#define arch_ftrace_set_direct_caller(fregs, addr) \ + __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index c902a7ddb310..b7ce001779c1 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -229,6 +229,7 @@ SYM_FUNC_END(ftrace_caller) #else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ SYM_FUNC_START(ftrace_regs_caller) + mv t1, zero SAVE_ABI_REGS 1 PREPARE_ARGS @@ -236,7 +237,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) callftrace_stub RESTORE_ABI_REGS 1 + bnezt1, .Ldirect jr t0 +.Ldirect: + jr t1 SYM_FUNC_END(ftrace_regs_caller) SYM_FUNC_START(ftrace_caller) @@ -250,3 +254,9 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) jr t0 SYM_FUNC_END(ftrace_caller) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + jr t0 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -- 2.40.1
[PATCH v12 for-next 0/4] riscv: ftrace: Miscellaneous ftrace improvements
From: Björn Töpel NB! Song told me that he would not have the time work on this series, so I picked it up. This series includes a three ftrace improvements for RISC-V: 1. Do not require to run recordmcount at build time (patch 1) 2. Simplification of the function graph functionality (patch 2) 3. Enable DYNAMIC_FTRACE_WITH_DIRECT_CALLS (patch 3 and 4) The series has been tested on Qemu/rv64 virt/Debian sid with the following test configs: CONFIG_FTRACE_SELFTEST=y CONFIG_FTRACE_STARTUP_TEST=y CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_FTRACE_DIRECT_MULTI=m CONFIG_SAMPLE_FTRACE_OPS=m All tests pass. Cheers, Björn Changes in v12: - Massaged the commit messages a bit. - Squashed the samples patches, so that the rv32 support is included from the start. - Minor whitespace changes in the mcount-dyn.S. - Minor style changes. Changes in v11: https://lore.kernel.org/linux-riscv/20230627111612.761164-1-suagrfil...@gmail.com/ - append a patch that makes the DIRECT_CALL samples support RV32I in this series fixing the rv32 build failure reported by Palmer - validated with ftrace boottime selftest and manual sample modules test in qemu-system for RV32I and RV64I Changes in v10: https://lore.kernel.org/all/20230511093234.3123181-1-suagrfil...@gmail.com/ - add Acked-by from Björn Töpel in patch 2 and patch 4 - replace `move` with `mv` in patch3 - prettify patch 2/4 with proper tabs Changes in v9: https://lore.kernel.org/linux-riscv/20230510101857.2953955-1-suagrfil...@gmail.com/ 1. add Acked-by from Björn Töpel in patch 1 2. 
rebase patch2/patch3 on Linux v6.4-rc1 - patch 2: to make the `SAVE_ABI_REGS` configurable, revert the modification of mcount-dyn.S from commit (45b32b946a97 "riscv: entry: Consolidate general regs saving/restoring") - patch 3: to pass the trace_selftest, add the implement of `ftrace_stub_direct_tramp` from commit (fee86a4ed536 "ftrace: selftest: remove broken trace_direct_tramp") ; and fixup the context conflict in Kconfig Changes in v8: https://lore.kernel.org/linux-riscv/20230324033342.3177979-1-suagrfil...@gmail.com/ - Fix incorrect address values in the 4nd patch - Rebased on v6.3-rc2 Changes in v7: https://lore.kernel.org/linux-riscv/20230112090603.1295340-1-guo...@kernel.org/ - Fixup RESTORE_ABI_REGS by remove PT_T0(sp) overwrite. - Add FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY [1] - Fixup kconfig with HAVE_SAMPLE_FTRACE_DIRECT & HAVE_SAMPLE_FTRACE_DIRECT_MULTI Changes in v6: https://lore.kernel.org/linux-riscv/20230107133549.4192639-1-guo...@kernel.org/ - Replace 8 with MCOUNT_INSN_SIZE - Replace "REG_L a1, PT_RA(sp)" with "mv a1, ra" - Add Evgenii Shatokhin comment Changes in v5: https://lore.kernel.org/linux-riscv/20221208091244.203407-1-guo...@kernel.org/ - Sort Kconfig entries in alphabetical order. Changes in v4: https://lore.kernel.org/linux-riscv/20221129033230.255947-1-guo...@kernel.org/ - Include [3] for maintenance. [Song Shuai] Changes in V3: https://lore.kernel.org/linux-riscv/20221123153950.2911981-1-guo...@kernel.org/ - Include [2] for maintenance. 
[Song Shuai] [1]: https://lore.kernel.org/linux-riscv/CAAYs2=j3eak9vu6xbaw0zpuoh00rh8v5c2u3fepkokzfibw...@mail.gmail.com/T/#t [2]: https://lore.kernel.org/lkml/20221120084230.910152-1-suagrfil...@gmail.com/ [3]: https://lore.kernel.org/linux-riscv/20221123142025.1504030-1-suagrfil...@gmail.com/ Song Shuai (4): riscv: select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY riscv: ftrace: Make function graph use ftrace directly riscv: ftrace: Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support samples: ftrace: Add RISC-V support for SAMPLE_FTRACE_DIRECT[_MULTI] arch/riscv/Kconfig | 4 + arch/riscv/include/asm/ftrace.h | 18 +- arch/riscv/kernel/ftrace.c | 30 ++- arch/riscv/kernel/mcount-dyn.S | 198 samples/ftrace/ftrace-direct-modify.c | 35 samples/ftrace/ftrace-direct-multi-modify.c | 41 samples/ftrace/ftrace-direct-multi.c| 25 +++ samples/ftrace/ftrace-direct-too.c | 28 +++ samples/ftrace/ftrace-direct.c | 24 +++ 9 files changed, 348 insertions(+), 55 deletions(-) base-commit: 3ca112b71f35dd5d99fc4571a56b5fc6f0c15814 -- 2.40.1
Re: [PATCH -fixes] riscv: Fix ftrace syscall handling which are now prefixed with __riscv_
Alexandre Ghiti writes: > ftrace creates entries for each syscall in the tracefs but has failed > since commit 08d0ce30e0e4 ("riscv: Implement syscall wrappers") which > prefixes all riscv syscalls with __riscv_. > > So fix this by implementing arch_syscall_match_sym_name() which allows us > to ignore this prefix. > > And also ignore compat syscalls like x86/arm64 by implementing > arch_trace_is_compat_syscall(). > > Fixes: 08d0ce30e0e4 ("riscv: Implement syscall wrappers") > Signed-off-by: Alexandre Ghiti This fix does the BPF test suite happier! Tested-by: Björn Töpel
Re: [PATCH -fixes] riscv: Fix ftrace syscall handling which are now prefixed with __riscv_
Alexandre Ghiti writes: > @Conor Dooley This fails checkpatch but the documentation here states > that this is how to do: > https://elixir.bootlin.com/linux/latest/source/Documentation/trace/ftrace-design.rst#L246 FWIW, I'll relax the PW CI checkpatch test going forward. It's way too strict...
[PATCH v3] riscv: Only consider swbp/ss handlers for correct privileged mode
From: Björn Töpel RISC-V software breakpoint trap handlers are used for {k,u}probes. When trapping from kernelmode, only the kernelmode handlers should be considered. Vice versa, only usermode handlers for usermode traps. This is not the case on RISC-V, which can trigger a bug if a userspace process uses uprobes, and a WARN() is triggered from kernelmode (which is implemented via {c.,}ebreak). The kernel will trap on the kernelmode {c.,}ebreak, look for uprobes handlers, realize incorrectly that uprobes need to be handled, and exit the trap handler early. The trap returns to re-executing the {c.,}ebreak, and enter an infinite trap-loop. The issue was found running the BPF selftest [1]. Fix this issue by only considering the swbp/ss handlers for kernel/usermode respectively. Also, move CONFIG ifdeffery from traps.c to the asm/{k,u}probes.h headers. Note that linux/uprobes.h only include asm/uprobes.h if CONFIG_UPROBES is defined, which is why asm/uprobes.h needs to be unconditionally included in traps.c Link: https://lore.kernel.org/linux-riscv/87v8d19aun@all.your.base.are.belong.to.us/ # [1] Fixes: 74784081aac8 ("riscv: Add uprobes supported") Reviewed-by: Guo Ren Reviewed-by: Nam Cao Tested-by: Puranjay Mohan Signed-off-by: Björn Töpel --- v2->v3: Remove incorrect tags (Conor) Collect review/test tags v1->v2: Fix Clang build warning (kernel test robot) --- arch/riscv/include/asm/kprobes.h | 11 ++- arch/riscv/include/asm/uprobes.h | 13 - arch/riscv/kernel/traps.c| 28 ++-- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index e7882ccb0fd4..78ea44f76718 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -40,6 +40,15 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); bool kprobe_breakpoint_handler(struct pt_regs *regs); bool kprobe_single_step_handler(struct pt_regs *regs); - +#else 
+static inline bool kprobe_breakpoint_handler(struct pt_regs *regs) +{ + return false; +} + +static inline bool kprobe_single_step_handler(struct pt_regs *regs) +{ + return false; +} #endif /* CONFIG_KPROBES */ #endif /* _ASM_RISCV_KPROBES_H */ diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h index f2183e00fdd2..3fc7deda9190 100644 --- a/arch/riscv/include/asm/uprobes.h +++ b/arch/riscv/include/asm/uprobes.h @@ -34,7 +34,18 @@ struct arch_uprobe { bool simulate; }; +#ifdef CONFIG_UPROBES bool uprobe_breakpoint_handler(struct pt_regs *regs); bool uprobe_single_step_handler(struct pt_regs *regs); - +#else +static inline bool uprobe_breakpoint_handler(struct pt_regs *regs) +{ + return false; +} + +static inline bool uprobe_single_step_handler(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ #endif /* _ASM_RISCV_UPROBES_H */ diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 19807c4d3805..fae8f610d867 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -247,22 +249,28 @@ static inline unsigned long get_break_insn_length(unsigned long pc) return GET_INSN_LENGTH(insn); } +static bool probe_single_step_handler(struct pt_regs *regs) +{ + bool user = user_mode(regs); + + return user ? uprobe_single_step_handler(regs) : kprobe_single_step_handler(regs); +} + +static bool probe_breakpoint_handler(struct pt_regs *regs) +{ + bool user = user_mode(regs); + + return user ? 
uprobe_breakpoint_handler(regs) : kprobe_breakpoint_handler(regs); +} + void handle_break(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs)) + if (probe_single_step_handler(regs)) return; - if (kprobe_breakpoint_handler(regs)) - return; -#endif -#ifdef CONFIG_UPROBES - if (uprobe_single_step_handler(regs)) + if (probe_breakpoint_handler(regs)) return; - if (uprobe_breakpoint_handler(regs)) - return; -#endif current->thread.bad_cause = regs->cause; if (user_mode(regs)) base-commit: 0bb80ecc33a8fb5a682236443c1e740d5c917d1d -- 2.39.2
Re: linux-next: build failure after merge of the bpf-next tree
On 2021-03-11 01:47, Stephen Rothwell wrote: Hi all, After merging the bpf-next tree, today's linux-next build (perf) failed like this: make[3]: *** No rule to make target 'libbpf_util.h', needed by '/home/sfr/next/perf/staticobjs/xsk.o'. Stop. Hi Stephen, It's an incremental build issue, as pointed out here [1], that is resolved by cleaning the build. Cheers, Björn [1] https://lore.kernel.org/bpf/CAEf4BzYPDF87At4=_gsndxof84oiqyjxgahl7_jvpuntovu...@mail.gmail.com/ Caused by commit 7e8bbe24cb8b ("libbpf: xsk: Move barriers from libbpf_util.h to xsk.h") I have used the bpf tree from next-20210310 for today.
Re: [PATCH] tools/memory-model: Fix smp_mb__after_spinlock() spelling
On 2021-03-05 16:36, Alan Stern wrote: On Fri, Mar 05, 2021 at 11:28:23AM +0100, Björn Töpel wrote: From: Björn Töpel A misspelled invokation of git-grep, revealed that ---^ Smetimes my brain is a little slow... Do you confirm that this is a joke? I wish, Alan. I wish. Looks like I can only spel function names correctly. Have a nice weekend! Björn Alan
[PATCH] tools/memory-model: Fix smp_mb__after_spinlock() spelling
From: Björn Töpel A misspelled invokation of git-grep, revealed that smp_mb__after_spinlock() was misspelled in explaination.txt. Add missing "_" to smp_mb__after_spinlock(). Fixes: 1c27b644c0fd ("Automate memory-barriers.txt; provide Linux-kernel memory model") Signed-off-by: Björn Töpel --- tools/memory-model/Documentation/explanation.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index f9d610d5a1a4..5d72f3112e56 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -2510,7 +2510,7 @@ they behave as follows: smp_mb__after_atomic() orders po-earlier atomic updates and the events preceding them against all po-later events; - smp_mb_after_spinlock() orders po-earlier lock acquisition + smp_mb__after_spinlock() orders po-earlier lock acquisition events and the events preceding them against all po-later events. base-commit: 7f58c0fb9238abaa3997185ceec319201b6f5a94 -- 2.27.0
Re: XDP socket rings, and LKMM litmus tests
On Tue, 2 Mar 2021 at 21:41, Paul E. McKenney wrote: > > On Tue, Mar 02, 2021 at 09:24:04PM +0100, Björn Töpel wrote: > > On Tue, 2 Mar 2021 at 20:57, Paul E. McKenney wrote: > > > > > > On Tue, Mar 02, 2021 at 07:46:27PM +0100, Björn Töpel wrote: > > > > [...] > > > > > > > > Before digging in too deeply, does the following simplification > > > still capture your intent? > > > > > > > Thanks for having a look, Paul! > > > > > P0(int *prod, int *cons, int *data) > > > { > > > int p; > > > int cond = 0; > > > > > > p = READ_ONCE(*prod); > > > if (p == READ_ONCE(*cons)) > > > cond = 1; > > > > With this, yes! > > > > > if (cond) { > > > smp_mb(); > > > WRITE_ONCE(*data, 1); > > > smp_wmb(); > > > WRITE_ONCE(*prod, p ^ 1); > > > } > > > } > > > > > > P1(int *prod, int *cons, int *data) > > > { > > > int c; > > > int d = -1; > > > int cond = 0; > > > > > > c = READ_ONCE(*cons); > > > if (READ_ONCE(*prod) == c) > > > cond = 1; > > > > Hmm, this would not be the correct state transition. > > > > c==1 && p==1 would set cond to 1, right? > > > > I would agree with: > > c = READ_ONCE(*cons); > > if (READ_ONCE(*prod) != c) > > Right you are! > > With that, it looks to me like LKMM is OK with removing the smp_mb(). > My guess is that the issue is that LKMM confines the effect of control > dependencies to a single "if" statement, hence my reworking of your > original. > Interesting! I tried the acquire/release version: P0(int *prod, int *cons, int *data) { int p; int cond = 0; p = READ_ONCE(*prod); if (p == READ_ONCE(*cons)) { WRITE_ONCE(*data, 1); smp_store_release(prod, p ^ 1); } } P1(int *prod, int *cons, int *data) { int c; int d = -1; c = READ_ONCE(*cons); if (smp_load_acquire(prod) != c) { d = READ_ONCE(*data); smp_store_release(cons, c ^ 1); } } and as with the previous example, restructuring the if-statement makes "if (p == READ_ONCE(*cons)) {" sufficient, instead of "if (p == smp_load_acquire(cons)) {". Yay! 
Björn > Thanx, Paul > > > > > > > if (cond == 1) { > > > smp_rmb(); > > > d = READ_ONCE(*data); > > > smp_mb(); > > > WRITE_ONCE(*cons, c ^ 1); > > > } > > > } > > > > > > Thanx, Paul > > > > > > > [...] > > > > Björn
Re: XDP socket rings, and LKMM litmus tests
On Tue, 2 Mar 2021 at 21:04, Paul E. McKenney wrote: > [...] > > And if the answer is "yes", how about this one? > With the == to != change in P1, this is OK! > P0(int *prod, int *cons, int *data) > { > int p; > > p = READ_ONCE(*prod); > if (p == READ_ONCE(*cons)) { ...and now d==1 is not a valid state anymore according to herd. I think I need some input here! :-D > WRITE_ONCE(*data, 1); > smp_wmb(); > WRITE_ONCE(*prod, p ^ 1); > } > } > > P1(int *prod, int *cons, int *data) > { > int c; > int d = -1; > > c = READ_ONCE(*cons); > if (READ_ONCE(*prod) == c) { > smp_rmb(); > d = READ_ONCE(*data); > smp_mb(); > WRITE_ONCE(*cons, c ^ 1); > } > } > [...] Björn
Re: XDP socket rings, and LKMM litmus tests
On Tue, 2 Mar 2021 at 20:57, Paul E. McKenney wrote: > > On Tue, Mar 02, 2021 at 07:46:27PM +0100, Björn Töpel wrote: [...] > > Before digging in too deeply, does the following simplification > still capture your intent? > Thanks for having a look, Paul! > P0(int *prod, int *cons, int *data) > { > int p; > int cond = 0; > > p = READ_ONCE(*prod); > if (p == READ_ONCE(*cons)) > cond = 1; With this, yes! > if (cond) { > smp_mb(); > WRITE_ONCE(*data, 1); > smp_wmb(); > WRITE_ONCE(*prod, p ^ 1); > } > } > > P1(int *prod, int *cons, int *data) > { > int c; > int d = -1; > int cond = 0; > > c = READ_ONCE(*cons); > if (READ_ONCE(*prod) == c) > cond = 1; Hmm, this would not be the correct state transition. c==1 && p==1 would set cond to 1, right? I would agree with: c = READ_ONCE(*cons); if (READ_ONCE(*prod) != c) > > if (cond == 1) { > smp_rmb(); > d = READ_ONCE(*data); > smp_mb(); > WRITE_ONCE(*cons, c ^ 1); > } > } > > Thanx, Paul > [...] Björn
XDP socket rings, and LKMM litmus tests
Hi! Firstly; The long Cc-list is to reach the LKMM-folks. Some background; the XDP sockets use a ring-buffer to communicate between the kernel and userland. It's a single-consumer/single-producer ring, and described in net/xdp/xsk_queue.h. --8<--- /* The structure of the shared state of the rings are the same as the * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion * ring, the kernel is the producer and user space is the consumer. For * the Tx and fill rings, the kernel is the consumer and user space is * the producer. * * producer consumer * * if (LOAD ->consumer) { LOAD ->producer *(A) smp_rmb() (C) *STORE $data LOAD $data *smp_wmb() (B) smp_mb()(D) *STORE ->producer STORE ->consumer * } * * (A) pairs with (D), and (B) pairs with (C). ... -->8--- I'd like to replace the smp_{r,w,}mb() barriers with acquire-release semantics [1], without breaking existing userspace applications. So, I figured I'd use herd7 and the LKMM model to build a litmus test for the barrier version, then for the acquire-release version, and finally permutations of both. The idea is to use a one element ring, with a state machine outlined in the litmus test. The basic test for the existing smp_{r,w,}mb() barriers looks like: $ cat spsc-rb+1p1c.litmus C spsc-rb+1p1c // Stupid one entry ring: // prod cons allowed action prod cons //00 => prod => 10 //01 => cons => 00 //10 => cons => 11 //11 => prod => 01 { prod = 1; } // Here, we start at prod==1,cons==0, data==0, i.e. producer has // written data=0, so from here only the consumer can start, and should // consume data==0. Afterwards, producer can continue and write 1 to // data. Can we enter state prod==0, cons==1, but consumer observerd // the write of 1? 
P0(int *prod, int *cons, int *data) { int p; int c; int cond = 0; p = READ_ONCE(*prod); c = READ_ONCE(*cons); if (p == 0) if (c == 0) cond = 1; if (p == 1) if (c == 1) cond = 1; if (cond) { smp_mb(); WRITE_ONCE(*data, 1); smp_wmb(); WRITE_ONCE(*prod, p ^ 1); } } P1(int *prod, int *cons, int *data) { int p; int c; int d = -1; int cond = 0; p = READ_ONCE(*prod); c = READ_ONCE(*cons); if (p == 1) if (c == 0) cond = 1; if (p == 0) if (c == 1) cond = 1; if (cond == 1) { smp_rmb(); d = READ_ONCE(*data); smp_mb(); WRITE_ONCE(*cons, c ^ 1); } } exists( 1:d=1 /\ prod=0 /\ cons=1 ); -- The weird state changing if-statements is because that I didn't get '&&' and '||' to work with herd. When this is run: $ herd7 -conf linux-kernel.cfg litmus-tests/spsc-rb+1p1c.litmus Test spsc-rb+1p1c Allowed States 2 1:d=0; cons=1; prod=0; 1:d=0; cons=1; prod=1; No Witnesses Positive: 0 Negative: 2 Condition exists (1:d=1 /\ prod=0 /\ cons=1) Observation spsc-rb+1p1c Never 0 2 Time spsc-rb+1p1c 0.04 Hash=b399756d6a1301ca5bda042f32130791 Now to my question; In P0 there's an smp_mb(). Without that, the d==1 can be observed from P1 (consumer): $ herd7 -conf linux-kernel.cfg litmus-tests/spsc-rb+1p1c.litmus Test spsc-rb+1p1c Allowed States 3 1:d=0; cons=1; prod=0; 1:d=0; cons=1; prod=1; 1:d=1; cons=1; prod=0; Ok Witnesses Positive: 1 Negative: 2 Condition exists (1:d=1 /\ prod=0 /\ cons=1) Observation spsc-rb+1p1c Sometimes 1 2 Time spsc-rb+1p1c 0.04 Hash=0047fc21fa77da9a9aee15e35ec367ef In commit c7f2e3cd6c1f ("perf: Optimize ring-buffer write by depending on control dependencies") removes the corresponding smp_mb(), and also the circular buffer in circular-buffers.txt (pre commit 6c43c091bdc5 ("documentation: Update circular buffer for load-acquire/store-release")) is missing the smp_mb() at the producer-side. I'm trying to wrap my head around why it's OK to remove the smp_mb() in the cases above? 
I'm worried that the current XDP socket ring implementation (which is missing smp_mb()) might be broken. If you read this far, thanks! :-) Björn [1] https://lore.kernel.org/bpf/20210301104318.263262-2-bjorn.to...@gmail.com/
Re: general protection fault in xsk_recvmsg
#syz fix: xsk: Validate socket state in xsk_recvmsg, prior touching socket members
Re: memory leak in xskq_create
On 2020-12-16 19:11, Peilin Ye wrote: Hi all, On Sun, Dec 13, 2020 at 06:53:10AM -0800, syzbot wrote: BUG: memory leak unreferenced object 0x88810f897940 (size 64): comm "syz-executor991", pid 8502, jiffies 4294942194 (age 14.080s) hex dump (first 32 bytes): 7f 00 00 00 80 00 00 00 00 00 00 00 00 00 00 00 00 a0 37 0c 81 88 ff ff 00 00 00 00 00 00 00 00 ..7. backtrace: [<639d0dd1>] xskq_create+0x23/0xd0 include/linux/slab.h:552 [] xsk_init_queue net/xdp/xsk.c:508 [inline] [ ] xsk_setsockopt+0x1c4/0x590 net/xdp/xsk.c:875 [<2b302260>] __sys_setsockopt+0x1b0/0x360 net/socket.c:2132 [ ] __do_sys_setsockopt net/socket.c:2143 [inline] [ ] __se_sys_setsockopt net/socket.c:2140 [inline] [ ] __x64_sys_setsockopt+0x22/0x30 net/socket.c:2140 [<05c2b4a0>] do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 [<03db140f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 I have tested the following diff locally against syzbot's reproducer, and sent a patch to it [1] for testing. I will send a real patch here tomorrow if syzbot is happy about it. Please see explanation below. Hi Peilin Ye! Thanks for taking a look! Magnus has already addressed this problem in another patch [1]. Cheers, Björn [1] https://lore.kernel.org/bpf/20201214085127.3960-1-magnus.karls...@gmail.com/
Re: [PATCH bpf-next v5 04/11] bpf: Rename BPF_XADD and prepare to encode other atomics in .imm
On Tue, 15 Dec 2020 at 13:25, Brendan Jackman wrote: > > A subsequent patch will add additional atomic operations. These new > operations will use the same opcode field as the existing XADD, with > the immediate discriminating different operations. > > In preparation, rename the instruction mode BPF_ATOMIC and start > calling the zero immediate BPF_ADD. > > This is possible (doesn't break existing valid BPF progs) because the > immediate field is currently reserved MBZ and BPF_ADD is zero. > > All uses are removed from the tree but the BPF_XADD definition is > kept around to avoid breaking builds for people including kernel > headers. > > Signed-off-by: Brendan Jackman > --- > Documentation/networking/filter.rst | 30 +++- > arch/arm/net/bpf_jit_32.c | 7 ++- > arch/arm64/net/bpf_jit_comp.c | 16 +-- > arch/mips/net/ebpf_jit.c | 11 +++-- > arch/powerpc/net/bpf_jit_comp64.c | 25 -- > arch/riscv/net/bpf_jit_comp32.c | 20 ++-- > arch/riscv/net/bpf_jit_comp64.c | 16 +-- For RISC-V: Acked-by: Björn Töpel
Re: linux-next: ERROR: build error for arm64
On 2020-12-06 08:32, Hui Su wrote: hi, all: The error came out like this when i build the linux-next kernel with ARCH=arm64, with the arm64_defconfig: CC drivers/net/ethernet/freescale/dpaa/dpaa_eth.o ../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c: In function ‘dpaa_fq_init’: ../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:1135:9: error: too few arguments to function ‘xdp_rxq_info_reg’ 1135 | err = xdp_rxq_info_reg(_fq->xdp_rxq, dpaa_fq->net_dev, | ^~~~ In file included from ../include/linux/netdevice.h:42, from ../include/uapi/linux/if_arp.h:27, from ../include/linux/if_arp.h:23, from ../drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:40: ../include/net/xdp.h:229:5: note: declared here 229 | int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, | ^~~~ make[6]: *** [../scripts/Makefile.build:283: drivers/net/ethernet/freescale/dpaa/dpaa_eth.o] Error 1 Branch: git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git HEAD: 2996bd3f6ca9ea529b40c369a94b247657abdb4d ARCH: arm64 CONFIG: arch/arm64/configs/defconfig CMD: make ARCH=arm64 CROSS_COMPILE=/usr/bin/aarch64-linux-gnu- Image It maybe introduced by the commit b02e5a0ebb172(xsk: Propagate napi_id to XDP socket Rx path), and if anyone solved this, please add: Reported-by: Hui Su Hui, was already resolved in commit fdd8b8249ef8 ("dpaa_eth: fix build errorr in dpaa_fq_init"). Thanks, Björn
Re: [PATCH][next] samples/bpf: Fix spelling mistake "recieving" -> "receiving"
On Thu, 3 Dec 2020 at 12:46, Colin King wrote: > > From: Colin Ian King > > There is a spelling mistake in an error message. Fix it. > > Signed-off-by: Colin Ian King Acked-by: Björn Töpel > --- > samples/bpf/xdpsock_user.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c > index 0fee7f3aef3c..9553c7c47fc4 100644 > --- a/samples/bpf/xdpsock_user.c > +++ b/samples/bpf/xdpsock_user.c > @@ -1570,7 +1570,7 @@ recv_xsks_map_fd(int *xsks_map_fd) > > err = recv_xsks_map_fd_from_ctrl_node(sock, xsks_map_fd); > if (err) { > - fprintf(stderr, "Error %d recieving fd\n", err); > + fprintf(stderr, "Error %d receiving fd\n", err); > return err; > } > return 0; > -- > 2.29.2 >
Re: [PATCH] dpaa_eth: fix build errorr in dpaa_fq_init
On Thu, 3 Dec 2020 at 15:46, Anders Roxell wrote: > > When building FSL_DPAA_ETH the following build error shows up: > > /tmp/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c: In function > ‘dpaa_fq_init’: > /tmp/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c:1135:9: error: too few > arguments to function ‘xdp_rxq_info_reg’ > 1135 | err = xdp_rxq_info_reg(_fq->xdp_rxq, dpaa_fq->net_dev, > | ^~~~ > > Commit b02e5a0ebb17 ("xsk: Propagate napi_id to XDP socket Rx path") > added an extra argument to function xdp_rxq_info_reg and commit > d57e57d0cd04 ("dpaa_eth: add XDP_TX support") didn't know about that > extra argument. > > Signed-off-by: Anders Roxell > --- > > I think this issue is seen since both patches went in at the same time > to bpf-next and net-next. > Thanks Anders! Indeed, when bpf-next is pulled into net-next this needs to be applied. Acked-by: Björn Töpel > drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c > b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c > index 947b3d2f9c7e..6cc8c4e078de 100644 > --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c > +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c > @@ -1133,7 +1133,7 @@ static int dpaa_fq_init(struct dpaa_fq *dpaa_fq, bool > td_enable) > if (dpaa_fq->fq_type == FQ_TYPE_RX_DEFAULT || > dpaa_fq->fq_type == FQ_TYPE_RX_PCD) { > err = xdp_rxq_info_reg(_fq->xdp_rxq, dpaa_fq->net_dev, > - dpaa_fq->fqid); > + dpaa_fq->fqid, 0); > if (err) { > dev_err(dev, "xdp_rxq_info_reg() = %d\n", err); > return err; > -- > 2.29.2 >
Re: [PATCH] xsk: add cq event
On 2020-11-16 17:12, Xuan Zhuo wrote: On Mon, 16 Nov 2020 15:31:20 +0100, =?UTF-8?B?QmrDtnJuIFTDtnBlbA==?= wrote: On 2020-11-16 09:10, Xuan Zhuo wrote: When we write all cq items to tx, we have to wait for a new event based on poll to indicate that it is writable. But the current writability is triggered based on whether tx is full or not, and In fact, when tx is dissatisfied, the user of cq's item may not necessarily get it, because it may still be occupied by the network card. In this case, we need to know when cq is available, so this patch adds a socket option, When the user configures this option using setsockopt, when cq is available, a readable event is generated for all xsk bound to this umem. I can't find a better description of this event, I think it can also be 'readable', although it is indeed different from the 'readable' of the new data. But the overhead of xsk checking whether cq or rx is readable is small. Signed-off-by: Xuan Zhuo Thanks for the patch! I'm not a fan of having two different "readable" event (both Rx and cq). Could you explain a bit what the use case is, so I get a better understanding. The Tx queues has a back-pressure mechanism, determined of the number of elements in cq. Is it related to that? Please explain a bit more what you're trying to solve, and maybe we can figure out a better way forward! Thanks! Björn I want to implement a tool for mass sending. For example, the size of cq is 1024, and I set the size of tx also to 1024, so that I will put all cq in tx at once, and then I have to wait for an event, come Indicates that there is new write space or new cq is available. At present, we can only monitor the event of write able. This indicates whether tx is full, but in fact, tx is basically not full, because the full state is very short, and those tx items are being used by the network card. 
And epoll_wait will be awakened directly every time, without waiting, but I cannot get the cq item, so I still cannot successfully send the package again. Of course, I don't like the "readable" event very much. This is a suitable one I found in the existing epoll event. ^_^ More questions! By "Mass sending" do you mean maximum throughput, or does that mean "in very large batches"? For the latter to do 1k batches, you could increase the Tx/cq buffer size to say 4k. For maximum throughput it's better to use smaller batches (e.g. what the txpush scenario in samples/xdpsock does). You're right that even if there's space in the Tx ring, it won't be sent unless there's sufficient space in the cq ring. Maybe it would make sense to be more restrictive when triggering the "writable" socket event? E.g. only trigger it when there's space in Tx *and* sufficient cq space? Björn Thanks.
Re: [PATCH] xsk: add cq event
On 2020-11-16 09:10, Xuan Zhuo wrote: When we write all cq items to tx, we have to wait for a new event based on poll to indicate that it is writable. But the current writability is triggered based on whether tx is full or not, and In fact, when tx is dissatisfied, the user of cq's item may not necessarily get it, because it may still be occupied by the network card. In this case, we need to know when cq is available, so this patch adds a socket option, When the user configures this option using setsockopt, when cq is available, a readable event is generated for all xsk bound to this umem. I can't find a better description of this event, I think it can also be 'readable', although it is indeed different from the 'readable' of the new data. But the overhead of xsk checking whether cq or rx is readable is small. Signed-off-by: Xuan Zhuo Thanks for the patch! I'm not a fan of having two different "readable" event (both Rx and cq). Could you explain a bit what the use case is, so I get a better understanding. The Tx queues has a back-pressure mechanism, determined of the number of elements in cq. Is it related to that? Please explain a bit more what you're trying to solve, and maybe we can figure out a better way forward! Thanks! Björn
Re: [PATCH][next] xsk: Fix null check on error return path
On 2020-09-02 17:07, Gustavo A. R. Silva wrote: Currently, dma_map is being checked, when the right object identifier to be null-checked is dma_map->dma_pages, instead. Fix this by null-checking dma_map->dma_pages. Addresses-Coverity-ID: 1496811 ("Logically dead code") Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings") Signed-off-by: Gustavo A. R. Silva Nice catch! Acked-by: Björn Töpel --- net/xdp/xsk_buff_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 795d7c81c0ca..5b00bc5707f2 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -287,7 +287,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi return NULL; dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); - if (!dma_map) { + if (!dma_map->dma_pages) { kfree(dma_map); return NULL; }
Re: [Linux-kernel-mentees] [PATCH net v2] xdp: Prevent kernel-infoleak in xsk_getsockopt()
On Tue, 28 Jul 2020 at 07:37, Peilin Ye wrote: > > xsk_getsockopt() is copying uninitialized stack memory to userspace when > `extra_stats` is `false`. Fix it. > > Fixes: 8aa5a33578e9 ("xsk: Add new statistics") > Suggested-by: Dan Carpenter > Signed-off-by: Peilin Ye > --- Acked-by: Björn Töpel > Doing `= {};` is sufficient since currently `struct xdp_statistics` is > defined as follows: > > struct xdp_statistics { > __u64 rx_dropped; > __u64 rx_invalid_descs; > __u64 tx_invalid_descs; > __u64 rx_ring_full; > __u64 rx_fill_ring_empty_descs; > __u64 tx_ring_empty_descs; > }; > > When being copied to the userspace, `stats` will not contain any > uninitialized "holes" between struct fields. > > Changes in v2: > - Remove the "Cc: sta...@vger.kernel.org" tag. (Suggested by Song Liu > ) > - Initialize `stats` by assignment instead of using memset(). > (Suggested by Song Liu ) > > net/xdp/xsk.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > index 26e3bba8c204..b2b533eddebf 100644 > --- a/net/xdp/xsk.c > +++ b/net/xdp/xsk.c > @@ -840,7 +840,7 @@ static int xsk_getsockopt(struct socket *sock, int level, > int optname, > switch (optname) { > case XDP_STATISTICS: > { > - struct xdp_statistics stats; > + struct xdp_statistics stats = {}; > bool extra_stats = true; > size_t stats_size; > > -- > 2.25.1 >
Re: [PATCH bpf-next v1 0/3] bpf, riscv: Add compressed instructions to rv64 JIT
On Tue, 21 Jul 2020 at 04:52, Luke Nelson wrote: > > This patch series enables using compressed riscv (RVC) instructions > in the rv64 BPF JIT. > > RVC is a standard riscv extension that adds a set of compressed, > 2-byte instructions that can replace some regular 4-byte instructions > for improved code density. > > This series first modifies the JIT to support using 2-byte instructions > (e.g., in jump offset computations), then adds RVC encoding and > helper functions, and finally uses the helper functions to optimize > the rv64 JIT. > > I used our formal verification framework, Serval, to verify the > correctness of the RVC encodings and their uses in the rv64 JIT. > > The JIT continues to pass all tests in lib/test_bpf.c, and introduces > no new failures to test_verifier; both with and without RVC being enabled. > > The following are examples of the JITed code for the verifier selftest > "direct packet read test#3 for CGROUP_SKB OK", without and with RVC > enabled, respectively. The former uses 178 bytes, and the latter uses 112, > for a ~37% reduction in code size for this example. 
> > Without RVC: > >0: 02000813addi a6,zero,32 >4: fd010113addi sp,sp,-48 >8: 02813423sds0,40(sp) >c: 02913023sds1,32(sp) > 10: 01213c23sds2,24(sp) > 14: 01313823sds3,16(sp) > 18: 01413423sds4,8(sp) > 1c: 03010413addi s0,sp,48 > 20: 03056683lwu a3,48(a0) > 24: 02069693slli a3,a3,0x20 > 28: 0206d693srli a3,a3,0x20 > 2c: 03456703lwu a4,52(a0) > 30: 02071713slli a4,a4,0x20 > 34: 02075713srli a4,a4,0x20 > 38: 03856483lwu s1,56(a0) > 3c: 02049493slli s1,s1,0x20 > 40: 0204d493srli s1,s1,0x20 > 44: 03c56903lwu s2,60(a0) > 48: 02091913slli s2,s2,0x20 > 4c: 02095913srli s2,s2,0x20 > 50: 04056983lwu s3,64(a0) > 54: 0203slli s3,s3,0x20 > 58: 0209d993srli s3,s3,0x20 > 5c: 09056a03lwu s4,144(a0) > 60: 020a1a13slli s4,s4,0x20 > 64: 020a5a13srli s4,s4,0x20 > 68: 00900313addi t1,zero,9 > 6c: 006a7463bgeu s4,t1,0x74 > 70: 0a13addi s4,zero,0 > 74: 02d52823swa3,48(a0) > 78: 02e52a23swa4,52(a0) > 7c: 02952c23sws1,56(a0) > 80: 03252e23sws2,60(a0) > 84: 05352023sws3,64(a0) > 88: 0793addi a5,zero,0 > 8c: 02813403lds0,40(sp) > 90: 02013483lds1,32(sp) > 94: 01813903lds2,24(sp) > 98: 01013983lds3,16(sp) > 9c: 00813a03lds4,8(sp) > a0: 03010113addi sp,sp,48 > a4: 00078513addi a0,a5,0 > a8: 8067jalr zero,0(ra) > > With RVC: > >0: 02000813addia6,zero,32 >4: 7179c.addi16sp sp,-48 >6: f422c.sdsp s0,40(sp) >8: f026c.sdsp s1,32(sp) >a: ec4ac.sdsp s2,24(sp) >c: e84ec.sdsp s3,16(sp) >e: e452c.sdsp s4,8(sp) > 10: 1800c.addi4spn s0,sp,48 > 12: 03056683lwu a3,48(a0) > 16: 1682c.slli a3,0x20 > 18: 9281c.srli a3,0x20 > 1a: 03456703lwu a4,52(a0) > 1e: 1702c.slli a4,0x20 > 20: 9301c.srli a4,0x20 > 22: 03856483lwu s1,56(a0) > 26: 1482c.slli s1,0x20 > 28: 9081c.srli s1,0x20 > 2a: 03c56903lwu s2,60(a0) > 2e: 1902c.slli s2,0x20 > 30: 02095913srlis2,s2,0x20 > 34: 04056983lwu s3,64(a0) > 38: 1982c.slli s3,0x20 > 3a: 0209d993srlis3,s3,0x20 > 3e: 09056a03lwu s4,144(a0) > 42: 1a02c.slli s4,0x20 > 44: 020a5a13srlis4,s4,0x20 > 48: 4325c.lit1,9 > 4a: 006a7363bgeus4,t1,0x50 > 4e: 4a01c.lis4,0 > 50: d914c.swa3,48(a0) > 
52: d958c.swa4,52(a0) > 54: dd04c.sws1,56(a0) > 56: 03252e23sw s2,60(a0) > 5a: 05352023sw s3,64(a0) > 5e: 4781c.lia5,0 > 60: 7422c.ldsp s0,40(sp) > 62: 7482c.ldsp s1,32(sp) > 64: 6962c.ldsp s2,24(sp) > 66: 69c2c.ldsp s3,16(sp) > 68: 6a22c.ldsp s4,8(sp) > 6a: 6145c.addi16sp sp,48 > 6c: 853ec.mva0,a5 > 6e: 8082c.jrra > > RFC -> v1: > - From Björn Töpel: > * Changed RVOFF macro to static inline "ninsns_rvoff". > * Changed return type of rvc_ functions from u32 to u16. > * Changed sizeof(u16) to sizeof(*ctx->insns). > * F
Re: [PATCH bpf-next v1 3/3] bpf, riscv: Use compressed instructions in the rv64 JIT
0x50 > 4e: 4a01c.lis4,0 > 50: d914c.swa3,48(a0) > 52: d958c.swa4,52(a0) > 54: dd04c.sws1,56(a0) > 56: 03252e23sw s2,60(a0) > 5a: 05352023sw s3,64(a0) > 5e: 4781c.lia5,0 > 60: 7422c.ldsp s0,40(sp) > 62: 7482c.ldsp s1,32(sp) > 64: 6962c.ldsp s2,24(sp) > 66: 69c2c.ldsp s3,16(sp) > 68: 6a22c.ldsp s4,8(sp) > 6a: 6145c.addi16sp sp,48 > 6c: 853ec.mva0,a5 > 6e: 8082c.jrra > > Cc: Björn Töpel > Signed-off-by: Luke Nelson > --- > Björn: I added you as Cc instead of Acked-by for this patch so you > would have a chance to review the further changes I made in emit_imm > (described above). Thanks! LGTM! I'll add the Acked-by/Reviewed-by to the cover letter! Really nice work! Björn > --- > arch/riscv/net/bpf_jit_comp64.c | 281 +--- > 1 file changed, 147 insertions(+), 134 deletions(-) > > diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c > index 55861269da2a..8a56b5293117 100644 > --- a/arch/riscv/net/bpf_jit_comp64.c > +++ b/arch/riscv/net/bpf_jit_comp64.c > @@ -132,19 +132,23 @@ static void emit_imm(u8 rd, s64 val, struct > rv_jit_context *ctx) > * > * This also means that we need to process LSB to MSB. > */ > - s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff; > + s64 upper = (val + (1 << 11)) >> 12; > + /* Sign-extend lower 12 bits to 64 bits since immediates for li, > addiw, > +* and addi are signed and RVC checks will perform signed comparisons. 
> +*/ > + s64 lower = ((val & 0xfff) << 52) >> 52; > int shift; > > if (is_32b_int(val)) { > if (upper) > - emit(rv_lui(rd, upper), ctx); > + emit_lui(rd, upper, ctx); > > if (!upper) { > - emit(rv_addi(rd, RV_REG_ZERO, lower), ctx); > + emit_li(rd, lower, ctx); > return; > } > > - emit(rv_addiw(rd, rd, lower), ctx); > + emit_addiw(rd, rd, lower, ctx); > return; > } > > @@ -154,9 +158,9 @@ static void emit_imm(u8 rd, s64 val, struct > rv_jit_context *ctx) > > emit_imm(rd, upper, ctx); > > - emit(rv_slli(rd, rd, shift), ctx); > + emit_slli(rd, rd, shift, ctx); > if (lower) > - emit(rv_addi(rd, rd, lower), ctx); > + emit_addi(rd, rd, lower, ctx); > } > > static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) > @@ -164,43 +168,43 @@ static void __build_epilogue(bool is_tail_call, struct > rv_jit_context *ctx) > int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8; > > if (seen_reg(RV_REG_RA, ctx)) { > - emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_RA, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > - emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_FP, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > if (seen_reg(RV_REG_S1, ctx)) { > - emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_S1, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > if (seen_reg(RV_REG_S2, ctx)) { > - emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_S2, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > if (seen_reg(RV_REG_S3, ctx)) { > - emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_S3, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > if (seen_reg(RV_REG_S4, ctx)) { > - emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_S4, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > if (seen_reg(RV_REG_S5, ctx)) { > - emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx); > + 
emit_ld(RV_REG_S5, store_offset, RV_REG_SP, ctx); > store_offset -= 8; > } > if (seen_reg(RV_REG_S6, ctx)) { > - emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx); > + emit_ld(RV_REG_S6,
Re: [RFC PATCH bpf-next 3/3] bpf, riscv: Use compressed instructions in the rv64 JIT
On Mon, 13 Jul 2020 at 20:37, Luke Nelson wrote: > > This patch uses the RVC support and encodings from bpf_jit.h to optimize > the rv64 jit. > > The optimizations work by replacing emit(rv_X(...)) with a call to a > helper function emit_X, which will emit a compressed version of the > instruction when possible, and when RVC is enabled. > > The JIT continues to pass all tests in lib/test_bpf.c, and introduces > no new failures to test_verifier; both with and without RVC being enabled. > > The following are examples of the JITed code for the verifier selftest > "direct packet read test#3 for CGROUP_SKB OK", without and with RVC > enabled, respectively. The former uses 178 bytes, and the latter uses 112, > for a ~37% reduction in code size for this example. > > Without RVC: > >0: 02000813addi a6,zero,32 >4: fd010113addi sp,sp,-48 >8: 02813423sds0,40(sp) >c: 02913023sds1,32(sp) > 10: 01213c23sds2,24(sp) > 14: 01313823sds3,16(sp) > 18: 01413423sds4,8(sp) > 1c: 03010413addi s0,sp,48 > 20: 03056683lwu a3,48(a0) > 24: 02069693slli a3,a3,0x20 > 28: 0206d693srli a3,a3,0x20 > 2c: 03456703lwu a4,52(a0) > 30: 02071713slli a4,a4,0x20 > 34: 02075713srli a4,a4,0x20 > 38: 03856483lwu s1,56(a0) > 3c: 02049493slli s1,s1,0x20 > 40: 0204d493srli s1,s1,0x20 > 44: 03c56903lwu s2,60(a0) > 48: 02091913slli s2,s2,0x20 > 4c: 02095913srli s2,s2,0x20 > 50: 04056983lwu s3,64(a0) > 54: 0203slli s3,s3,0x20 > 58: 0209d993srli s3,s3,0x20 > 5c: 09056a03lwu s4,144(a0) > 60: 020a1a13slli s4,s4,0x20 > 64: 020a5a13srli s4,s4,0x20 > 68: 00900313addi t1,zero,9 > 6c: 006a7463bgeu s4,t1,0x74 > 70: 0a13addi s4,zero,0 > 74: 02d52823swa3,48(a0) > 78: 02e52a23swa4,52(a0) > 7c: 02952c23sws1,56(a0) > 80: 03252e23sws2,60(a0) > 84: 05352023sws3,64(a0) > 88: 0793addi a5,zero,0 > 8c: 02813403lds0,40(sp) > 90: 02013483lds1,32(sp) > 94: 01813903lds2,24(sp) > 98: 01013983lds3,16(sp) > 9c: 00813a03lds4,8(sp) > a0: 03010113addi sp,sp,48 > a4: 00078513addi a0,a5,0 > a8: 8067jalr zero,0(ra) > > With RVC: > >0: 
02000813addia6,zero,32 >4: 7179c.addi16sp sp,-48 >6: f422c.sdsp s0,40(sp) >8: f026c.sdsp s1,32(sp) >a: ec4ac.sdsp s2,24(sp) >c: e84ec.sdsp s3,16(sp) >e: e452c.sdsp s4,8(sp) > 10: 1800c.addi4spn s0,sp,48 > 12: 03056683lwu a3,48(a0) > 16: 1682c.slli a3,0x20 > 18: 9281c.srli a3,0x20 > 1a: 03456703lwu a4,52(a0) > 1e: 1702c.slli a4,0x20 > 20: 9301c.srli a4,0x20 > 22: 03856483lwu s1,56(a0) > 26: 1482c.slli s1,0x20 > 28: 9081c.srli s1,0x20 > 2a: 03c56903lwu s2,60(a0) > 2e: 1902c.slli s2,0x20 > 30: 02095913srlis2,s2,0x20 > 34: 04056983lwu s3,64(a0) > 38: 1982c.slli s3,0x20 > 3a: 0209d993srlis3,s3,0x20 > 3e: 09056a03lwu s4,144(a0) > 42: 1a02c.slli s4,0x20 > 44: 020a5a13srlis4,s4,0x20 > 48: 4325c.lit1,9 > 4a: 006a7363bgeus4,t1,0x50 > 4e: 4a01c.lis4,0 > 50: d914c.swa3,48(a0) > 52: d958c.swa4,52(a0) > 54: dd04c.sws1,56(a0) > 56: 03252e23sw s2,60(a0) > 5a: 05352023sw s3,64(a0) > 5e: 4781c.lia5,0 > 60: 7422c.ldsp s0,40(sp) > 62: 7482c.ldsp s1,32(sp) > 64: 6962 c.ldsp s2,24(sp) > 66: 69c2c.ldsp s3,16(sp) > 68: 6a22c.ldsp s4,8(sp) > 6a: 6145c.addi16sp sp,48 > 6c: 853ec.mva0,a5 > 6e: 8082c.jrra > > Signed-off-by: Luke Nelson Very nice work! For the proper series, feel free to add: Acked-by: Björn Töpel > --- > arch/riscv/net/bpf_jit_comp64.c | 275 +--- > 1 file changed, 142 insertions(+), 133 deletions(-) > > diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c > index 26feed92f1bc..823fe800f406 100644 > --- a/arch/riscv/net/bpf_jit_comp64.c > +++ b/arch/riscv/net/bpf_jit_comp64.c > @@ -137,14 +137,14 @@ static void emit_imm(u8 rd, s64 val, struct > rv_jit_context *ctx) > > if (is_32b_int(val))
Re: [RFC PATCH bpf-next 2/3] bpf, riscv: Add encodings for compressed instructions
On Mon, 13 Jul 2020 at 20:37, Luke Nelson wrote: > > This patch adds functions for encoding and emitting compressed riscv > (RVC) instructions to the BPF JIT. > > Some regular riscv instructions can be compressed into an RVC instruction > if the instruction fields meet some requirements. For example, "add rd, > rs1, rs2" can be compressed into "c.add rd, rs2" when rd == rs1. > > To make using RVC encodings simpler, this patch also adds helper > functions that selectively emit either a regular instruction or a > compressed instruction if possible. > > For example, emit_add will produce a "c.add" if possible and regular > "add" otherwise. > > Signed-off-by: Luke Nelson > --- > arch/riscv/net/bpf_jit.h | 474 +++ > 1 file changed, 474 insertions(+) > > diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h > index 5c89ea904c1a..f3ac2d4a50f7 100644 > --- a/arch/riscv/net/bpf_jit.h > +++ b/arch/riscv/net/bpf_jit.h > @@ -13,6 +13,15 @@ > #include > #include > > +static inline bool rvc_enabled(void) > +{ > +#ifdef CONFIG_RISCV_ISA_C > + return true; > +#else > + return false; > +#endif > +} > + > enum { > RV_REG_ZERO = 0, /* The constant value 0 */ > RV_REG_RA = 1, /* Return address */ > @@ -48,6 +57,18 @@ enum { > RV_REG_T6 = 31, > }; > > +static inline bool is_creg(u8 reg) > +{ > + return (1 << reg) & (BIT(RV_REG_FP) | > +BIT(RV_REG_S1) | > +BIT(RV_REG_A0) | > +BIT(RV_REG_A1) | > +BIT(RV_REG_A2) | > +BIT(RV_REG_A3) | > +BIT(RV_REG_A4) | > +BIT(RV_REG_A5)); > +} > + > struct rv_jit_context { > struct bpf_prog *prog; > u16 *insns; /* RV insns */ > @@ -134,6 +155,16 @@ static inline int invert_bpf_cond(u8 cond) > return -1; > } > > +static inline bool is_6b_int(long val) > +{ > + return -(1L << 5) <= val && val < (1L << 5); > +} > + > +static inline bool is_10b_int(long val) > +{ > + return -(1L << 9) <= val && val < (1L << 9); > +} > + > static inline bool is_12b_int(long val) > { > return -(1L << 11) <= val && val < (1L << 11); > @@ -224,6 +255,59 @@ static 
inline u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, > u8 rs2, u8 rs1, > return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode); > } > > +/* RISC-V compressed instruction formats. */ > + > +static inline u32 rv_cr_insn(u8 funct4, u8 rd, u8 rs2, u8 op) Please change so that the return type is u16, so it matches emitc. > +{ > + return (funct4 << 12) | (rd << 7) | (rs2 << 2) | op; > +} > + > +static inline u32 rv_ci_insn(u8 funct3, u32 imm6, u8 rd, u8 op) > +{ > + u16 imm; > + > + imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2); > + return (funct3 << 13) | (rd << 7) | op | imm; > +} > + > +static inline u32 rv_css_insn(u8 funct3, u32 uimm, u8 rs2, u8 op) > +{ > + return (funct3 << 13) | (uimm << 7) | (rs2 << 2) | op; > +} > + > +static inline u32 rv_ciw_insn(u8 funct3, u32 uimm, u8 rd, u8 op) > +{ > + return (funct3 << 13) | (uimm << 5) | ((rd & 0x7) << 2) | op; > +} > + > +static inline u32 rv_cl_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 > rd, > +u8 op) > +{ > + return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) | > + (imm_lo << 5) | ((rd & 0x7) << 2) | op; > +} > + > +static inline u32 rv_cs_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 > rs2, > +u8 op) > +{ > + return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) | > + (imm_lo << 5) | ((rs2 & 0x7) << 2) | op; > +} > + > +static inline u32 rv_ca_insn(u8 funct6, u8 rd, u8 funct2, u8 rs2, u8 op) > +{ > + return (funct6 << 10) | ((rd & 0x7) << 7) | (funct2 << 5) | > + ((rs2 & 0x7) << 2) | op; > +} > + > +static inline u32 rv_cb_insn(u8 funct3, u32 imm6, u8 funct2, u8 rd, u8 op) > +{ > + u16 imm; > + > + imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2); > + return (funct3 << 13) | (funct2 << 10) | ((rd & 0x7) << 7) | op | imm; > +} > + > /* Instructions shared by both RV32 and RV64. 
*/ > > static inline u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0) > @@ -431,6 +515,135 @@ static inline u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 > aq, u8 rl) > return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f); > } > > +/* RVC instrutions. */ > + > +static inline u32 rvc_addi4spn(u8 rd, u32 imm10) And same here. Change to u16 for the return value. > +{ > + u32 imm; > + > + imm = ((imm10 & 0x30) << 2) | ((imm10 & 0x3c0) >> 4) | > + ((imm10 & 0x4) >> 1) | ((imm10 & 0x8) >> 3); > + return rv_ciw_insn(0x0, imm, rd, 0x0); > +} > + > +static inline u32 rvc_lw(u8 rd, u32 imm7, u8 rs1) > +{ > + u32 imm_hi, imm_lo; > +
Re: [RFC PATCH bpf-next 1/3] bpf, riscv: Modify JIT ctx to support compressed instructions
On Mon, 13 Jul 2020 at 20:37, Luke Nelson wrote: > > This patch makes the necessary changes to struct rv_jit_context and to > bpf_int_jit_compile to support compressed riscv (RVC) instructions in > the BPF JIT. > > It changes the JIT image to be u16 instead of u32, since RVC instructions > are 2 bytes as opposed to 4. > > It also changes ctx->offset and ctx->ninsns to refer to 2-byte > instructions rather than 4-byte ones. The riscv PC is always 16-bit > aligned, so this is sufficient to refer to any valid riscv offset. > > The code for computing jump offsets in bytes is updated accordingly, > and factored in to the RVOFF macro to simplify the code. > > Signed-off-by: Luke Nelson > --- > arch/riscv/net/bpf_jit.h| 23 --- > arch/riscv/net/bpf_jit_comp32.c | 14 +++--- > arch/riscv/net/bpf_jit_comp64.c | 12 ++-- > arch/riscv/net/bpf_jit_core.c | 6 +++--- > 4 files changed, 36 insertions(+), 19 deletions(-) > > diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h > index 20e235d06f66..5c89ea904c1a 100644 > --- a/arch/riscv/net/bpf_jit.h > +++ b/arch/riscv/net/bpf_jit.h > @@ -50,7 +50,7 @@ enum { > > struct rv_jit_context { > struct bpf_prog *prog; > - u32 *insns; /* RV insns */ > + u16 *insns; /* RV insns */ > int ninsns; > int epilogue_offset; > int *offset;/* BPF to RV */ > @@ -58,6 +58,9 @@ struct rv_jit_context { > int stack_size; > }; > > +/* Convert from ninsns to bytes. */ > +#define RVOFF(ninsns) ((ninsns) << 1) > + I guess it's a matter of taste, but I'd prefer a simple static inline function instead of the macro. > struct rv_jit_data { > struct bpf_binary_header *header; > u8 *image; > @@ -74,8 +77,22 @@ static inline void bpf_flush_icache(void *start, void *end) > flush_icache_range((unsigned long)start, (unsigned long)end); > } > > +/* Emit a 4-byte riscv instruction. 
*/ > static inline void emit(const u32 insn, struct rv_jit_context *ctx) > { > + if (ctx->insns) { > + ctx->insns[ctx->ninsns] = insn; > + ctx->insns[ctx->ninsns + 1] = (insn >> 16); > + } > + > + ctx->ninsns += 2; > +} > + > +/* Emit a 2-byte riscv compressed instruction. */ > +static inline void emitc(const u16 insn, struct rv_jit_context *ctx) > +{ > + BUILD_BUG_ON(!rvc_enabled()); > + > if (ctx->insns) > ctx->insns[ctx->ninsns] = insn; > > @@ -86,7 +103,7 @@ static inline int epilogue_offset(struct rv_jit_context > *ctx) > { > int to = ctx->epilogue_offset, from = ctx->ninsns; > > - return (to - from) << 2; > + return RVOFF(to - from); > } > > /* Return -1 or inverted cond. */ > @@ -149,7 +166,7 @@ static inline int rv_offset(int insn, int off, struct > rv_jit_context *ctx) > off++; /* BPF branch is from PC+1, RV is from PC */ > from = (insn > 0) ? ctx->offset[insn - 1] : 0; > to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0; > - return (to - from) << 2; > + return RVOFF(to - from); > } > > /* Instruction formats. */ > diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c > index b198eaa74456..d22001aa0057 100644 > --- a/arch/riscv/net/bpf_jit_comp32.c > +++ b/arch/riscv/net/bpf_jit_comp32.c > @@ -644,7 +644,7 @@ static int emit_branch_r64(const s8 *src1, const s8 > *src2, s32 rvoff, > > e = ctx->ninsns; > /* Adjust for extra insns. */ > - rvoff -= (e - s) << 2; > + rvoff -= RVOFF(e - s); > emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx); > return 0; > } > @@ -713,7 +713,7 @@ static int emit_bcc(u8 op, u8 rd, u8 rs, int rvoff, > struct rv_jit_context *ctx) > if (far) { > e = ctx->ninsns; > /* Adjust for extra insns. */ > - rvoff -= (e - s) << 2; > + rvoff -= RVOFF(e - s); > emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx); > } > return 0; > @@ -731,7 +731,7 @@ static int emit_branch_r32(const s8 *src1, const s8 > *src2, s32 rvoff, > > e = ctx->ninsns; > /* Adjust for extra insns. 
*/ > - rvoff -= (e - s) << 2; > + rvoff -= RVOFF(e - s); > > if (emit_bcc(op, lo(rs1), lo(rs2), rvoff, ctx)) > return -1; > @@ -795,7 +795,7 @@ static int emit_bpf_tail_call(int insn, struct > rv_jit_context *ctx) > * if (index >= max_entries) > * goto out; > */ > - off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2; > + off = RVOFF(tc_ninsn - (ctx->ninsns - start_insn)); > emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx); > > /* > @@ -804,7 +804,7 @@ static int emit_bpf_tail_call(int insn, struct > rv_jit_context *ctx) > * goto out; > */ > emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx); > - off = (tc_ninsn -
Re: [RFC PATCH bpf-next 0/3] bpf, riscv: Add compressed instructions to rv64 JIT
On Mon, 13 Jul 2020 at 20:37, Luke Nelson wrote: > > This patch series enables using compressed riscv (RVC) instructions > in the rv64 BPF JIT. > First of all; Really nice work. I like this, and it makes the code easier to read as well (e.g. emit_mv). I'm a bit curious why you only did it for RV64, and not RV32? I have some minor comments on the patches. I strongly encourage you to submit this as a proper (non-RFC) set for bpf-next. Cheers, Björn > RVC is a standard riscv extension that adds a set of compressed, > 2-byte instructions that can replace some regular 4-byte instructions > for improved code density. > > This series first modifies the JIT to support using 2-byte instructions > (e.g., in jump offset computations), then adds RVC encoding and > helper functions, and finally uses the helper functions to optimize > the rv64 JIT. > > I used our formal verification framework, Serval, to verify the > correctness of the RVC encodings and their uses in the rv64 JIT. > > The JIT continues to pass all tests in lib/test_bpf.c, and introduces > no new failures to test_verifier; both with and without RVC being enabled. > > The following are examples of the JITed code for the verifier selftest > "direct packet read test#3 for CGROUP_SKB OK", without and with RVC > enabled, respectively. The former uses 178 bytes, and the latter uses 112, > for a ~37% reduction in code size for this example. 
> > Without RVC: > >0: 02000813addi a6,zero,32 >4: fd010113addi sp,sp,-48 >8: 02813423sds0,40(sp) >c: 02913023sds1,32(sp) > 10: 01213c23sds2,24(sp) > 14: 01313823sds3,16(sp) > 18: 01413423sds4,8(sp) > 1c: 03010413addi s0,sp,48 > 20: 03056683lwu a3,48(a0) > 24: 02069693slli a3,a3,0x20 > 28: 0206d693srli a3,a3,0x20 > 2c: 03456703lwu a4,52(a0) > 30: 02071713slli a4,a4,0x20 > 34: 02075713srli a4,a4,0x20 > 38: 03856483lwu s1,56(a0) > 3c: 02049493slli s1,s1,0x20 > 40: 0204d493srli s1,s1,0x20 > 44: 03c56903lwu s2,60(a0) > 48: 02091913slli s2,s2,0x20 > 4c: 02095913srli s2,s2,0x20 > 50: 04056983lwu s3,64(a0) > 54: 0203slli s3,s3,0x20 > 58: 0209d993srli s3,s3,0x20 > 5c: 09056a03lwu s4,144(a0) > 60: 020a1a13slli s4,s4,0x20 > 64: 020a5a13srli s4,s4,0x20 > 68: 00900313addi t1,zero,9 > 6c: 006a7463bgeu s4,t1,0x74 > 70: 0a13addi s4,zero,0 > 74: 02d52823swa3,48(a0) > 78: 02e52a23swa4,52(a0) > 7c: 02952c23sws1,56(a0) > 80: 03252e23sws2,60(a0) > 84: 05352023sws3,64(a0) > 88: 0793addi a5,zero,0 > 8c: 02813403lds0,40(sp) > 90: 02013483lds1,32(sp) > 94: 01813903lds2,24(sp) > 98: 01013983lds3,16(sp) > 9c: 00813a03lds4,8(sp) > a0: 03010113addi sp,sp,48 > a4: 00078513addi a0,a5,0 > a8: 8067jalr zero,0(ra) > > With RVC: > >0: 02000813addia6,zero,32 >4: 7179c.addi16sp sp,-48 >6: f422c.sdsp s0,40(sp) >8: f026c.sdsp s1,32(sp) >a: ec4ac.sdsp s2,24(sp) >c: e84ec.sdsp s3,16(sp) >e: e452c.sdsp s4,8(sp) > 10: 1800c.addi4spn s0,sp,48 > 12: 03056683lwu a3,48(a0) > 16: 1682c.slli a3,0x20 > 18: 9281c.srli a3,0x20 > 1a: 03456703lwu a4,52(a0) > 1e: 1702c.slli a4,0x20 > 20: 9301c.srli a4,0x20 > 22: 03856483lwu s1,56(a0) > 26: 1482c.slli s1,0x20 > 28: 9081c.srli s1,0x20 > 2a: 03c56903lwu s2,60(a0) > 2e: 1902c.slli s2,0x20 > 30: 02095913srlis2,s2,0x20 > 34: 04056983lwu s3,64(a0) > 38: 1982c.slli s3,0x20 > 3a: 0209d993srlis3,s3,0x20 > 3e: 09056a03lwu s4,144(a0) > 42: 1a02c.slli s4,0x20 > 44: 020a5a13srlis4,s4,0x20 > 48: 4325c.lit1,9 > 4a: 006a7363bgeus4,t1,0x50 > 4e: 4a01c.lis4,0 > 50: d914c.swa3,48(a0) > 
52: d958c.swa4,52(a0) > 54: dd04c.sws1,56(a0) > 56: 03252e23sw s2,60(a0) > 5a: 05352023sw s3,64(a0) > 5e: 4781c.lia5,0 > 60: 7422c.ldsp s0,40(sp) > 62: 7482c.ldsp s1,32(sp) > 64: 6962c.ldsp s2,24(sp) > 66: 69c2c.ldsp s3,16(sp) > 68: 6a22c.ldsp s4,8(sp) > 6a: 6145c.addi16sp sp,48 > 6c: 853ec.mva0,a5 > 6e: 8082c.jrra > > Luke Nelson (3): > bpf, riscv: Modify JIT ctx to support compressed instructions > bpf, riscv: Add encodings for compressed instructions > bpf, riscv: Use compressed instructions in the rv64 JIT > > arch/riscv/net/bpf_jit.h| 495 +++- >
Re: [PATCH v2 2/2] riscv: Add jump-label implementation
On Wed, 8 Jul 2020 at 23:10, Emil Renner Berthing wrote: > > Add jump-label implementation based on the ARM64 version > and add CONFIG_JUMP_LABEL=y to the defconfigs. > > Signed-off-by: Emil Renner Berthing > Reviewed-by: Björn Töpel Tested-by: Björn Töpel > --- > > Tested on the HiFive Unleashed board. > > Changes since v1: > - WARN and give up gracefully if the jump offset cannot be > represented in a JAL instruction. > - Add missing braces. > - Add CONFIG_JUMP_LABEL=y to defconfigs. > > All suggested by Björn Töpel. > > Changes since RFC: > - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table > also in 32bit kernels. > - Remove unneeded branch ? 1 : 0, thanks Björn > - Fix \n\n instead of \n\t mistake > > .../core/jump-labels/arch-support.txt | 2 +- > arch/riscv/Kconfig| 2 + > arch/riscv/configs/defconfig | 1 + > arch/riscv/configs/nommu_k210_defconfig | 1 + > arch/riscv/configs/nommu_virt_defconfig | 1 + > arch/riscv/configs/rv32_defconfig | 1 + > arch/riscv/include/asm/jump_label.h | 59 +++ > arch/riscv/kernel/Makefile| 2 + > arch/riscv/kernel/jump_label.c| 49 +++ > 9 files changed, 117 insertions(+), 1 deletion(-) > create mode 100644 arch/riscv/include/asm/jump_label.h > create mode 100644 arch/riscv/kernel/jump_label.c > > diff --git a/Documentation/features/core/jump-labels/arch-support.txt > b/Documentation/features/core/jump-labels/arch-support.txt > index 632a1c7aefa2..760243d18ed7 100644 > --- a/Documentation/features/core/jump-labels/arch-support.txt > +++ b/Documentation/features/core/jump-labels/arch-support.txt > @@ -23,7 +23,7 @@ > |openrisc: | TODO | > | parisc: | ok | > | powerpc: | ok | > -| riscv: | TODO | > +| riscv: | ok | > |s390: | ok | > | sh: | TODO | > | sparc: | ok | > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index fd639937e251..d2f5c53fdc19 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -46,6 +46,8 @@ config RISCV > select GENERIC_TIME_VSYSCALL if MMU && 64BIT > select HANDLE_DOMAIN_IRQ > 
select HAVE_ARCH_AUDITSYSCALL > + select HAVE_ARCH_JUMP_LABEL > + select HAVE_ARCH_JUMP_LABEL_RELATIVE > select HAVE_ARCH_KASAN if MMU && 64BIT > select HAVE_ARCH_KGDB > select HAVE_ARCH_KGDB_QXFER_PKT > diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig > index 4da4886246a4..d58c93efb603 100644 > --- a/arch/riscv/configs/defconfig > +++ b/arch/riscv/configs/defconfig > @@ -17,6 +17,7 @@ CONFIG_BPF_SYSCALL=y > CONFIG_SOC_SIFIVE=y > CONFIG_SOC_VIRT=y > CONFIG_SMP=y > +CONFIG_JUMP_LABEL=y > CONFIG_MODULES=y > CONFIG_MODULE_UNLOAD=y > CONFIG_NET=y > diff --git a/arch/riscv/configs/nommu_k210_defconfig > b/arch/riscv/configs/nommu_k210_defconfig > index b48138e329ea..cd1df62b13c7 100644 > --- a/arch/riscv/configs/nommu_k210_defconfig > +++ b/arch/riscv/configs/nommu_k210_defconfig > @@ -33,6 +33,7 @@ CONFIG_SMP=y > CONFIG_NR_CPUS=2 > CONFIG_CMDLINE="earlycon console=ttySIF0" > CONFIG_CMDLINE_FORCE=y > +CONFIG_JUMP_LABEL=y > # CONFIG_BLOCK is not set > CONFIG_BINFMT_FLAT=y > # CONFIG_COREDUMP is not set > diff --git a/arch/riscv/configs/nommu_virt_defconfig > b/arch/riscv/configs/nommu_virt_defconfig > index cf74e179bf90..f27596e9663e 100644 > --- a/arch/riscv/configs/nommu_virt_defconfig > +++ b/arch/riscv/configs/nommu_virt_defconfig > @@ -30,6 +30,7 @@ CONFIG_MAXPHYSMEM_2GB=y > CONFIG_SMP=y > CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x1000,115200n8 > console=ttyS0" > CONFIG_CMDLINE_FORCE=y > +CONFIG_JUMP_LABEL=y > # CONFIG_BLK_DEV_BSG is not set > CONFIG_PARTITION_ADVANCED=y > # CONFIG_MSDOS_PARTITION is not set > diff --git a/arch/riscv/configs/rv32_defconfig > b/arch/riscv/configs/rv32_defconfig > index 05bbf5240569..3a55f0e00d6c 100644 > --- a/arch/riscv/configs/rv32_defconfig > +++ b/arch/riscv/configs/rv32_defconfig > @@ -17,6 +17,7 @@ CONFIG_BPF_SYSCALL=y > CONFIG_SOC_VIRT=y > CONFIG_ARCH_RV32I=y > CONFIG_SMP=y > +CONFIG_JUMP_LABEL=y > CONFIG_MODULES=y > CONFIG_MODULE_UNLOAD=y > CONFIG_NET=y > diff --git 
a/arch/riscv/include/asm/jump_label.h > b/arch/riscv/include/asm/jump_label.h > new file mode 100644 > index ..d5fb342bfccf > --- /dev/null > +++ b/arch/riscv/include/asm/jump_label.
Re: [PATCH v2 1/2] riscv: Support R_RISCV_ADD64 and R_RISCV_SUB64 relocs
On Wed, 8 Jul 2020 at 23:10, Emil Renner Berthing wrote: > > These are needed for the __jump_table in modules using > static keys/jump-labels with the layout from > HAVE_ARCH_JUMP_LABEL_RELATIVE on 64bit kernels. > > Signed-off-by: Emil Renner Berthing Reviewed-by: Björn Töpel Tested-by: Björn Töpel > --- > > Tested on the HiFive Unleashed board. > > This patch is new in v2. It fixes an error loading modules > containing static keys found by Björn Töpel. > > arch/riscv/kernel/module.c | 16 > 1 file changed, 16 insertions(+) > > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > index 7191342c54da..104fba889cf7 100644 > --- a/arch/riscv/kernel/module.c > +++ b/arch/riscv/kernel/module.c > @@ -263,6 +263,13 @@ static int apply_r_riscv_add32_rela(struct module *me, > u32 *location, > return 0; > } > > +static int apply_r_riscv_add64_rela(struct module *me, u32 *location, > + Elf_Addr v) > +{ > + *(u64 *)location += (u64)v; > + return 0; > +} > + > static int apply_r_riscv_sub32_rela(struct module *me, u32 *location, > Elf_Addr v) > { > @@ -270,6 +277,13 @@ static int apply_r_riscv_sub32_rela(struct module *me, > u32 *location, > return 0; > } > > +static int apply_r_riscv_sub64_rela(struct module *me, u32 *location, > + Elf_Addr v) > +{ > + *(u64 *)location -= (u64)v; > + return 0; > +} > + > static int (*reloc_handlers_rela[]) (struct module *me, u32 *location, > Elf_Addr v) = { > [R_RISCV_32]= apply_r_riscv_32_rela, > @@ -290,7 +304,9 @@ static int (*reloc_handlers_rela[]) (struct module *me, > u32 *location, > [R_RISCV_RELAX] = apply_r_riscv_relax_rela, > [R_RISCV_ALIGN] = apply_r_riscv_align_rela, > [R_RISCV_ADD32] = apply_r_riscv_add32_rela, > + [R_RISCV_ADD64] = apply_r_riscv_add64_rela, > [R_RISCV_SUB32] = apply_r_riscv_sub32_rela, > + [R_RISCV_SUB64] = apply_r_riscv_sub64_rela, > }; > > int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > -- > 2.27.0 >
Re: [PATCH v1] riscv: Add jump-label implementation
On Tue, 7 Jul 2020 at 17:08, Emil Renner Berthing wrote: > > Add jump-label implementation based on the ARM64 version. > > Tested on the HiFive Unleashed board. > I took your patch for a spin on qemu. The boot selftest (CONFIG_STATIC_KEYS_SELFTEST=y) passes, but the module test (CONFIG_TEST_STATIC_KEYS=m) does not. When I run the test in "tools/testing/selftests/static_keys" (this simply loads the test_static_key_base.ko and test_static_keys.ko modules) I get: [ 134.090464] test_static_keys: Unknown relocation type 36 36 is the relocation type R_RISCV_ADD64, which is not handled in arch/riscv/kernel/module.c. If you dump the relocation entries for test_static_keys.ko (riscv64-linux-gnu-objdump -r test_static_keys.ko), you'll see that: RELOCATION RECORDS FOR [__jump_table]: OFFSET TYPE VALUE R_RISCV_ADD32 .L1^B1 R_RISCV_SUB32 .L0 0004 R_RISCV_ADD32 .L3 0004 R_RISCV_SUB32 .L0 0008 R_RISCV_ADD64 old_true_key+0x0001 0008 R_RISCV_SUB64 .L0 ... It would be great if you could add a patch for that as well (separate, same series). R_RISCV_ADD64 *and* R_RISCV_SUB64 are currently unhandled by module.c. Cheers, Björn > Signed-off-by: Emil Renner Berthing > --- > > Changes since RFC: > - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table > also in 32bit kernels. > - Remove unneeded branch ? 
1 : 0, thanks Björn > - Fix \n\n instead of \n\t mistake > > .../core/jump-labels/arch-support.txt | 2 +- > arch/riscv/Kconfig| 2 + > arch/riscv/include/asm/jump_label.h | 59 +++ > arch/riscv/kernel/Makefile| 2 + > arch/riscv/kernel/jump_label.c| 44 ++ > 5 files changed, 108 insertions(+), 1 deletion(-) > create mode 100644 arch/riscv/include/asm/jump_label.h > create mode 100644 arch/riscv/kernel/jump_label.c > > diff --git a/Documentation/features/core/jump-labels/arch-support.txt > b/Documentation/features/core/jump-labels/arch-support.txt > index 632a1c7aefa2..760243d18ed7 100644 > --- a/Documentation/features/core/jump-labels/arch-support.txt > +++ b/Documentation/features/core/jump-labels/arch-support.txt > @@ -23,7 +23,7 @@ > |openrisc: | TODO | > | parisc: | ok | > | powerpc: | ok | > -| riscv: | TODO | > +| riscv: | ok | > |s390: | ok | > | sh: | TODO | > | sparc: | ok | > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index fd639937e251..d2f5c53fdc19 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -46,6 +46,8 @@ config RISCV > select GENERIC_TIME_VSYSCALL if MMU && 64BIT > select HANDLE_DOMAIN_IRQ > select HAVE_ARCH_AUDITSYSCALL > + select HAVE_ARCH_JUMP_LABEL > + select HAVE_ARCH_JUMP_LABEL_RELATIVE > select HAVE_ARCH_KASAN if MMU && 64BIT > select HAVE_ARCH_KGDB > select HAVE_ARCH_KGDB_QXFER_PKT > diff --git a/arch/riscv/include/asm/jump_label.h > b/arch/riscv/include/asm/jump_label.h > new file mode 100644 > index ..d5fb342bfccf > --- /dev/null > +++ b/arch/riscv/include/asm/jump_label.h > @@ -0,0 +1,59 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2020 Emil Renner Berthing > + * > + * Based on arch/arm64/include/asm/jump_label.h > + */ > +#ifndef __ASM_JUMP_LABEL_H > +#define __ASM_JUMP_LABEL_H > + > +#ifndef __ASSEMBLY__ > + > +#include > + > +#define JUMP_LABEL_NOP_SIZE 4 > + > +static __always_inline bool arch_static_branch(struct static_key *key, > + bool branch) > +{ > + asm_volatile_goto( > 
+ " .option push\n\t" > + " .option norelax \n\t" > + " .option norvc \n\t" > + "1: nop \n\t" > + " .option pop \n\t" > + " .pushsection__jump_table, \"aw\"\n\t" > + " .align " RISCV_LGPTR " \n\t" > + " .long 1b - ., %l[label] - . \n\t" > + " " RISCV_PTR " %0 - . \n\t" > + " .popsection \n\t" > + : : "i"(&((char *)key)[branch]) : : label); > + > + return false; > +label: > + return true; > +} > + > +static __always_inline bool arch_static_branch_jump(struct static_key *key, > + bool branch) > +{ > + asm_volatile_goto( > + " .option push\n\t" > + " .option norelax \n\t" > +
Re: [PATCH v1] riscv: Add jump-label implementation
On Tue, 7 Jul 2020 at 17:08, Emil Renner Berthing wrote: > > Add jump-label implementation based on the ARM64 version. > > Tested on the HiFive Unleashed board. > > Signed-off-by: Emil Renner Berthing > --- > > Changes since RFC: > - Use RISCV_PTR and RISCV_LGPTR macros to match struct jump_table > also in 32bit kernels. > - Remove unneeded branch ? 1 : 0, thanks Björn > - Fix \n\n instead of \n\t mistake > > .../core/jump-labels/arch-support.txt | 2 +- > arch/riscv/Kconfig| 2 + > arch/riscv/include/asm/jump_label.h | 59 +++ > arch/riscv/kernel/Makefile| 2 + > arch/riscv/kernel/jump_label.c| 44 ++ > 5 files changed, 108 insertions(+), 1 deletion(-) > create mode 100644 arch/riscv/include/asm/jump_label.h > create mode 100644 arch/riscv/kernel/jump_label.c > > diff --git a/Documentation/features/core/jump-labels/arch-support.txt > b/Documentation/features/core/jump-labels/arch-support.txt > index 632a1c7aefa2..760243d18ed7 100644 > --- a/Documentation/features/core/jump-labels/arch-support.txt > +++ b/Documentation/features/core/jump-labels/arch-support.txt > @@ -23,7 +23,7 @@ > |openrisc: | TODO | > | parisc: | ok | > | powerpc: | ok | > -| riscv: | TODO | > +| riscv: | ok | > |s390: | ok | > | sh: | TODO | > | sparc: | ok | > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index fd639937e251..d2f5c53fdc19 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -46,6 +46,8 @@ config RISCV > select GENERIC_TIME_VSYSCALL if MMU && 64BIT > select HANDLE_DOMAIN_IRQ > select HAVE_ARCH_AUDITSYSCALL > + select HAVE_ARCH_JUMP_LABEL > + select HAVE_ARCH_JUMP_LABEL_RELATIVE Missed one thing in the last reply; Please update defconfig with CONFIG_JUMP_LABEL=y. 
Björn > select HAVE_ARCH_KASAN if MMU && 64BIT > select HAVE_ARCH_KGDB > select HAVE_ARCH_KGDB_QXFER_PKT > diff --git a/arch/riscv/include/asm/jump_label.h > b/arch/riscv/include/asm/jump_label.h > new file mode 100644 > index ..d5fb342bfccf > --- /dev/null > +++ b/arch/riscv/include/asm/jump_label.h > @@ -0,0 +1,59 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2020 Emil Renner Berthing > + * > + * Based on arch/arm64/include/asm/jump_label.h > + */ > +#ifndef __ASM_JUMP_LABEL_H > +#define __ASM_JUMP_LABEL_H > + > +#ifndef __ASSEMBLY__ > + > +#include > + > +#define JUMP_LABEL_NOP_SIZE 4 > + > +static __always_inline bool arch_static_branch(struct static_key *key, > + bool branch) > +{ > + asm_volatile_goto( > + " .option push\n\t" > + " .option norelax \n\t" > + " .option norvc \n\t" > + "1: nop \n\t" > + " .option pop \n\t" > + " .pushsection__jump_table, \"aw\"\n\t" > + " .align " RISCV_LGPTR " \n\t" > + " .long 1b - ., %l[label] - . \n\t" > + " " RISCV_PTR " %0 - . \n\t" > + " .popsection \n\t" > + : : "i"(&((char *)key)[branch]) : : label); > + > + return false; > +label: > + return true; > +} > + > +static __always_inline bool arch_static_branch_jump(struct static_key *key, > + bool branch) > +{ > + asm_volatile_goto( > + " .option push\n\t" > + " .option norelax \n\t" > + " .option norvc \n\t" > + "1: jal zero, %l[label] \n\t" > + " .option pop \n\t" > + " .pushsection__jump_table, \"aw\"\n\t" > + " .align " RISCV_LGPTR " \n\t" > + " .long 1b - ., %l[label] - . \n\t" > + " " RISCV_PTR " %0 - . \n\t" > + " .popsection \n\t" > + : : "i"(&((char *)key)[branch]) : : label); > + > + return false; > +label: > + return true; > +} > + > +#endif /* __ASSEMBLY__ */ > +#endif /* __ASM_JUMP_LABEL_H */ > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile > index b355cf485671..a5287ab9f7f2 100644 > --- a/arch/riscv/kernel/Makefile > +++ b/arch/riscv/kernel/Makefile > @@ -53,4 +53,6 @@ endif > obj-$(CONFIG_HOTPLUG_CPU) +=
Re: [PATCH v1] riscv: Add jump-label implementation
\n\t" > + " .popsection \n\t" > + : : "i"(&((char *)key)[branch]) : : label); > + > + return false; > +label: > + return true; > +} > + > +#endif /* __ASSEMBLY__ */ > +#endif /* __ASM_JUMP_LABEL_H */ > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile > index b355cf485671..a5287ab9f7f2 100644 > --- a/arch/riscv/kernel/Makefile > +++ b/arch/riscv/kernel/Makefile > @@ -53,4 +53,6 @@ endif > obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o > obj-$(CONFIG_KGDB) += kgdb.o > > +obj-$(CONFIG_JUMP_LABEL) += jump_label.o > + > clean: > diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c > new file mode 100644 > index ..55b2d742efe1 > --- /dev/null > +++ b/arch/riscv/kernel/jump_label.c > @@ -0,0 +1,44 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2020 Emil Renner Berthing > + * > + * Based on arch/arm64/kernel/jump_label.c > + */ > +#include > +#include > +#include > + > +#define RISCV_INSN_NOP 0x0013 > +#define RISCV_INSN_JAL 0x006f > + > +void arch_jump_label_transform(struct jump_entry *entry, > + enum jump_label_type type) > +{ > + void *addr = (void *)jump_entry_code(entry); > + u32 insn; > + > + if (type == JUMP_LABEL_JMP) { > + u32 offset = jump_entry_target(entry) - > jump_entry_code(entry); The 20b jump offset of JAL is enough, but to catch future changes that might potentially break this, I would add a WARN to catch if offset >20b (or overkill fallback to JALR -- don't do that.). Somewhat related to that; The UNIX specification for RISC-V requires support for RVC. Would it make sense to use the C.-instructions C.J and C.NOP? Really a nit, or more of a question. I'm fine with this 32b version! 
> + > + insn = RISCV_INSN_JAL | > + ((offset & GENMASK(19, 12)) << (12 - 12)) | > + ((offset & GENMASK(11, 11)) << (20 - 11)) | > + ((offset & GENMASK(10, 1)) << (21 - 1)) | > + ((offset & GENMASK(20, 20)) << (31 - 20)); > + } else > + insn = RISCV_INSN_NOP; Since the if-statement needs {}, the else-clause should have one too. Feel free to add: Reviewed-by: Björn Töpel Cheers, Björn > + > + patch_text_nosync(addr, &insn, sizeof(insn)); > +} > + > +void arch_jump_label_transform_static(struct jump_entry *entry, > + enum jump_label_type type) > +{ > + /* > +* We use the same instructions in the arch_static_branch and > +* arch_static_branch_jump inline functions, so there's no > +* need to patch them up here. > +* The core will call arch_jump_label_transform when those > +* instructions need to be replaced. > +*/ > +} > -- > 2.27.0 >
Re: [RFC] riscv: Add jump-label implementation
On Sat, 4 Jul 2020 at 13:35, Emil Renner Berthing wrote: > > On Sat, 4 Jul 2020 at 13:23, Björn Töpel wrote: [...] > > Indeed. And nice work! Can you respin the patch with the 32b fix > > above, and also without the RFC tag? > > Yes, of course. If you don't mind I'll wait a bit and let this collect > a bit more comments. > Certainly! > > Curious; Why is [branch ? 1 : 0] needed when coding the boolean into > > the key pointer (arm64 is just [branch]). Different encoding of > > booleans (branch in this case)? > > No, that was just me being unsure exactly how bool works when used as > an index. After reading up on it it seems the original code is right, > you can actually trust that _Bool is either 0 or 1. I'll fix it in the > next version. Thanks! > Cool! Thanks for clearing that up for me! Cheers, Björn
Re: [RFC] riscv: Add jump-label implementation
On Fri, 3 Jul 2020 at 17:43, Emil Renner Berthing wrote: > > On Thu, 2 Jul 2020 at 22:07, Emil Renner Berthing wrote: > > > > Add basic jump-label implementation heavily based > > on the ARM64 version. > > > > Tested on the HiFive Unleashed. > > > > Signed-off-by: Emil Renner Berthing > > --- > > > > This seems to work on my HiFive Unleashed. At least boots, runs fine > > and the static key self-tests doesn't complain, but I'm sure I've missed > > something, so I'm sending this as an RFC. > > > > /Emil > > > > .../core/jump-labels/arch-support.txt | 2 +- > > arch/riscv/Kconfig| 2 + > > arch/riscv/include/asm/jump_label.h | 59 +++ > > arch/riscv/kernel/Makefile| 2 + > > arch/riscv/kernel/jump_label.c| 44 ++ > > 5 files changed, 108 insertions(+), 1 deletion(-) > > create mode 100644 arch/riscv/include/asm/jump_label.h > > create mode 100644 arch/riscv/kernel/jump_label.c > > > > diff --git a/Documentation/features/core/jump-labels/arch-support.txt > > b/Documentation/features/core/jump-labels/arch-support.txt > > index 632a1c7aefa2..760243d18ed7 100644 > > --- a/Documentation/features/core/jump-labels/arch-support.txt > > +++ b/Documentation/features/core/jump-labels/arch-support.txt > > @@ -23,7 +23,7 @@ > > |openrisc: | TODO | > > | parisc: | ok | > > | powerpc: | ok | > > -| riscv: | TODO | > > +| riscv: | ok | > > |s390: | ok | > > | sh: | TODO | > > | sparc: | ok | > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > > index fd639937e251..d2f5c53fdc19 100644 > > --- a/arch/riscv/Kconfig > > +++ b/arch/riscv/Kconfig > > @@ -46,6 +46,8 @@ config RISCV > > select GENERIC_TIME_VSYSCALL if MMU && 64BIT > > select HANDLE_DOMAIN_IRQ > > select HAVE_ARCH_AUDITSYSCALL > > + select HAVE_ARCH_JUMP_LABEL > > + select HAVE_ARCH_JUMP_LABEL_RELATIVE > > select HAVE_ARCH_KASAN if MMU && 64BIT > > select HAVE_ARCH_KGDB > > select HAVE_ARCH_KGDB_QXFER_PKT > > diff --git a/arch/riscv/include/asm/jump_label.h > > b/arch/riscv/include/asm/jump_label.h > > new file mode 100644 
> > index ..29be6d351866 > > --- /dev/null > > +++ b/arch/riscv/include/asm/jump_label.h > > @@ -0,0 +1,59 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +/* > > + * Copyright (C) 2020 Emil Renner Berthing > > + * > > + * Based on arch/arm64/include/asm/jump_label.h > > + */ > > +#ifndef __ASM_JUMP_LABEL_H > > +#define __ASM_JUMP_LABEL_H > > + > > +#ifndef __ASSEMBLY__ > > + > > +#include > > + > > +#define JUMP_LABEL_NOP_SIZE 4 > > + > > +static __always_inline bool arch_static_branch(struct static_key *key, > > + bool branch) > > +{ > > + asm_volatile_goto( > > + " .option push\n\n" > > + " .option norelax \n\n" > > + " .option norvc \n\n" > > + "1: nop \n\t" > > + " .option pop \n\n" > > + " .pushsection__jump_table, \"aw\"\n\t" > > + " .align 3 \n\t" > > + " .long 1b - ., %l[l_yes] - . \n\t" > > + " .quad %0 - . \n\t" > > With HAVE_ARCH_JUMP_LABEL_RELATIVE we get > struct jump_entry { > s32 code; > s32 target; > long key; > } > ..so this .quad and the one below should be replaced by the RISCV_PTR > macro to match the struct in 32bit kernels. > Indeed. And nice work! Can you respin the patch with the 32b fix above, and also without the RFC tag? Curious; Why is [branch ? 1 : 0] needed when coding the boolean into the key pointer (arm64 is just [branch]). Different encoding of booleans (branch in this case)? Cheers, Björn > > + " .popsection \n\t" > > + : : "i"(&((char *)key)[branch ? 1 : 0]) : : l_yes); > > + > > + return false; > > +l_yes: > > + return true; > > +} > > + > > +static __always_inline bool arch_static_branch_jump(struct static_key *key, > > + bool branch) > > +{ > > + asm_volatile_goto( > > + " .option push\n\n" > > + " .option norelax \n\n" > > + " .option norvc \n\n" > > + "1: jal zero, %l[l_yes] \n\t" > > + " .option pop \n\n" > > + " .pushsection__jump_table, \"aw\"\n\t" > > + "
Re: [PATCH net] xsk: remove cheap_dma optimization
On 2020-06-29 17:41, Robin Murphy wrote: On 2020-06-28 18:16, Björn Töpel wrote: [...]> Somewhat related to the DMA API; It would have performance benefits for AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU utilization. I've started hacking a thing a little bit, but it would be nice if such API was part of the mapping core. Input: array of pages Output: array of dma addrs (and obviously dev, flags and such) For non-IOMMU len(array of pages) == len(array of dma addrs) For best-case IOMMU len(array of dma addrs) == 1 (large linear space) But that's for later. :-) FWIW you will typically get that behaviour from IOMMU-based implementations of dma_map_sg() right now, although it's not strictly guaranteed. If you can weather some additional setup cost of calling sg_alloc_table_from_pages() plus walking the list after mapping to test whether you did get a contiguous result, you could start taking advantage of it as some of the dma-buf code in DRM and v4l2 does already (although those cases actually treat it as a strict dependency rather than an optimisation). I'm inclined to agree that if we're going to see more of these cases, a new API call that did formally guarantee a DMA-contiguous mapping (either via IOMMU or bounce buffering) or failure might indeed be handy. I forgot to reply to this one! My current hack is using the iommu code directly, similar to what vfio-pci does (hopefully not gutting the API this time ;-)). Your approach sound much nicer, and easier. I'll try that out! Thanks a lot for the pointers, and I might be back with more questions. Cheers, Björn Robin.
Re: [PATCH net] xsk: remove cheap_dma optimization
On 2020-06-29 15:52, Daniel Borkmann wrote: Ok, fair enough, please work with DMA folks to get this properly integrated and restored then. Applied, thanks! Daniel, you were too quick! Please revert this one; Christoph just submitted a 4-patch-series that addresses both the DMA API, and the perf regression! Björn
Re: add an API to check if a streaming mapping needs sync calls
On 2020-06-29 15:03, Christoph Hellwig wrote: Hi all, this series lifts the somewhat hacky checks in the XSK code if a DMA streaming mapping needs dma_sync_single_for_{device,cpu} calls to the DMA API. Thanks a lot for working on, and fixing this, Christoph! I took the series for a spin, and there are (obviously) no performance regressions. Would the patches go through the net/bpf trees or somewhere else? For the series: Tested-by: Björn Töpel Acked-by: Björn Töpel Björn Diffstat: Documentation/core-api/dma-api.rst |8 + include/linux/dma-direct.h |1 include/linux/dma-mapping.h|5 +++ include/net/xsk_buff_pool.h|6 ++-- kernel/dma/direct.c|6 kernel/dma/mapping.c | 10 ++ net/xdp/xsk_buff_pool.c| 54 ++--- 7 files changed, 37 insertions(+), 53 deletions(-)
Re: [PATCH net] xsk: remove cheap_dma optimization
On 2020-06-29 17:18, Daniel Borkmann wrote: Nice, tossed from bpf tree then! (Looks like it didn't land on the bpf list yet, but seems other mails are currently stuck as well on vger. I presume it will be routed to Linus via Christoph?) Thanks! Christoph (according to the other mail) was OK taking the series via your bpf, Dave's net, or his dma-mapping tree. Björn
Re: [PATCH net] xsk: remove cheap_dma optimization
On 2020-06-27 09:04, Christoph Hellwig wrote: On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote: Given there is roughly a ~5 weeks window at max where this removal could still be applied in the worst case, could we come up with a fix / proposal first that moves this into the DMA mapping core? If there is something that can be agreed upon by all parties, then we could avoid re-adding the 9% slowdown. :/ I'd rather turn it upside down - this abuse of the internals blocks work that has basically just missed the previous window and I'm not going to wait weeks to sort out the API misuse. But we can add optimizations back later if we find a sane way. I'm not super excited about the performance loss, but I do get Christoph's frustration about gutting the DMA API making it harder for DMA people to get work done. Lets try to solve this properly using proper DMA APIs. That being said I really can't see how this would make so much of a difference. What architecture and what dma_ops are you using for those measurements? What is the workload? The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but faster.) benchmark: receive the packet from the NIC, and drop it. The DMA syncs stand out in the perf top: 28.63% [kernel] [k] i40e_clean_rx_irq_zc 17.12% [kernel] [k] xp_alloc 8.80% [kernel] [k] __xsk_rcv_zc 7.69% [kernel] [k] xdp_do_redirect 5.35% bpf_prog_992d9ddc835e5629 [k] bpf_prog_992d9ddc835e5629 4.77% [kernel] [k] xsk_rcv.part.0 4.07% [kernel] [k] __xsk_map_redirect 3.80% [kernel] [k] dma_direct_sync_single_for_cpu 3.03% [kernel] [k] dma_direct_sync_single_for_device 2.76% [kernel] [k] i40e_alloc_rx_buffers_zc 1.83% [kernel] [k] xsk_flush ... For this benchmark the dma_ops are NULL (dma_is_direct() == true), and the main issue is that SWIOTLB is now unconditionally enabled [1] for x86, and for each sync we have to check that if is_swiotlb_buffer() which involves a some costly indirection. That was pretty much what my hack avoided. 
Instead we did all the checks upfront, since AF_XDP has long-term DMA mappings, and just set a flag for that. Avoiding the whole "is this address swiotlb" in dma_direct_sync_single_for_{cpu, device}() per-packet would help a lot. Somewhat related to the DMA API; It would have performance benefits for AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU utilization. I've started hacking a thing a little bit, but it would be nice if such API was part of the mapping core. Input: array of pages Output: array of dma addrs (and obviously dev, flags and such) For non-IOMMU len(array of pages) == len(array of dma addrs) For best-case IOMMU len(array of dma addrs) == 1 (large linear space) But that's for later. :-) Björn [1] commit: 09230cbc1bab ("swiotlb: move the SWIOTLB config symbol to lib/Kconfig")
[PATCH net] xsk: remove cheap_dma optimization
From: Björn Töpel When the AF_XDP buffer allocation API was introduced it had an optimization, "cheap_dma". The idea was that when the umem was DMA mapped, the pool also checked whether the mapping required a synchronization (CPU to device, and vice versa). If not, it would be marked as "cheap_dma" and the synchronization would be elided. In [1] Christoph points out that the optimization above breaks the DMA API abstraction, and should be removed. Further, Christoph points out that optimizations like this should be done within the DMA mapping core, and not elsewhere. Unfortunately this has implications for the packet rate performance. The AF_XDP rxdrop scenario shows a 9% decrease in packets per second. [1] https://lore.kernel.org/netdev/20200626074725.ga21...@lst.de/ Cc: Christoph Hellwig Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API") Signed-off-by: Björn Töpel --- include/net/xsk_buff_pool.h | 16 +++-- net/xdp/xsk_buff_pool.c | 69 ++--- 2 files changed, 6 insertions(+), 79 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a4ff226505c9..3ea9b9654632 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -40,7 +40,6 @@ struct xsk_buff_pool { u32 headroom; u32 chunk_size; u32 frame_len; - bool cheap_dma; bool unaligned; void *addrs; struct device *dev; @@ -77,24 +76,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) return xskb->frame_dma; } -void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (xskb->pool->cheap_dma) - return; - - xp_dma_sync_for_cpu_slow(xskb); + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); } -void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, -size_t size); static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (pool->cheap_dma) 
- return; - - xp_dma_sync_for_device_slow(pool, dma, size); + dma_sync_single_range_for_device(pool->dev, dma, 0, +size, DMA_BIDIRECTIONAL); } /* Masks for xdp_umem_page flags. diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..c330e5f3aadf 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -2,9 +2,6 @@ #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/swiotlb.h> #include "xsk_queue.h" @@ -55,7 +52,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, pool->free_heads_cnt = chunks; pool->headroom = headroom; pool->chunk_size = chunk_size; - pool->cheap_dma = true; pool->unaligned = unaligned; pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; INIT_LIST_HEAD(&pool->free_list); @@ -125,48 +121,6 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) } } -static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) -{ -#if defined(CONFIG_SWIOTLB) - phys_addr_t paddr; - u32 i; - - for (i = 0; i < pool->dma_pages_cnt; i++) { - paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); - if (is_swiotlb_buffer(paddr)) - return false; - } -#endif - return true; -} - -static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) -{ -#if defined(CONFIG_HAS_DMA) - const struct dma_map_ops *ops = get_dma_ops(pool->dev); - - if (ops) { - return !ops->sync_single_for_cpu && - !ops->sync_single_for_device; - } - - if (!dma_is_direct(ops)) - return false; - - if (!xp_check_swiotlb_dma(pool)) - return false; - - if (!dev_is_dma_coherent(pool->dev)) { -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) ||\ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) - return false; -#endif - } -#endif - return true; -} - int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages) { @@ -195,7 +149,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, xp_check_dma_contiguity(pool);
pool->dev = dev; - pool->cheap_dma = xp_check_cheap_dma(pool); return 0; } EXPORT_SYMBOL(xp_dma_map); @@ -280,11 +233,8 @@ struct xdp_buff *xp_alloc(
Re: the XSK buffer pool needs be to reverted
On 2020-06-26 14:41, Christoph Hellwig wrote: On Fri, Jun 26, 2020 at 02:22:41PM +0200, Björn Töpel wrote: [...] Understood. Wdyt about something in the lines of the diff below? It's build tested only, but removes all non-dma API usage ("poking internals"). Would that be a way forward, and then as a next step work on a solution that would give similar benefits, but something that would live in the dma mapping core? Yes, that would solve the immediate issues. Good. I'll cook a proper patch and post it. Björn